# n-step TD Methods on the Random Walk

In [1]:
import random
import matplotlib.pyplot as plt

In [2]:
# Step-size parameter --> (0,1]
alpha = 0.1

# TERMINAL
terminal = [0, 20]

# n in n-step
n = 25

# Episodes (runs)
episodes = 100

In [3]:
def n_step_td(alpha, n, terminal, episodes):
    """
    n-step TD for estimating V = v_pi
    """

    # Input: a policy pi
    policy = (-1 ,1)
    
    # Initialize V(s) arbitrarily, for all s 2 S
    V = [0] * 21
    
    # All store and access operations (for St and Rt) can take their index mod n + 1
    store = []
    
    # Loop for each episode:
    for i in range(episodes):
        
        # Initialize S0 != terminal
        S = 10
        
        # Store S0 != terminal
        store.append((S, 0))
        
        # T <-- inf
        T = float('inf');  t = tau = 0
        
        # Loop for t = 0, 1, 2,... : Until t' = T - 1
        while tau != T-1:

            # If t < T, then:
            if t < T:
                
                # Take an action according to pi(·|St)
                S += random.choice(policy)
                
                # reset reward
                R = 0
                
                # Take action A, observe R, S'
                if S == terminal[0]: R = 0
                    
                # Take action A, observe R, S'
                if S == terminal[1]: R = 1

                # Store the next reward as Rt+1 and the next state as St+1
                store.append((S, R))
                
                # If St+1 is terminal, then T <-- t + 1
                if S in terminal: T = t + 1
        
            # t' <-- t - n +1 (t' is the time whose state’s estimate is being updated)
            tau = t - n + 1
            
            # If t' >= 0:
            if tau >= 0:
            
                # G <-- sum(i=t'+1 to min(t'+n,T)) gamma^i-t'-1*Ri
                G = sum([store[i][1] for i in range(tau + 1, min(tau + n, T)+1)])
                
                # If t'+n < T, then: G <-- G + gamma^n*V(St'+n)
                if tau+n < T: G = G + V[store[tau+n][0]]

                # V(St') <-- V(St') + a[G - V(St')]
                V[store[tau][0]] = V[store[tau][0]] + alpha*(G - V[store[tau][0]])
            
            # Until t' = T - 1
            t += 1

    # Return v_pi
    return V

In [4]:
print(n_step_td(alpha, n, terminal, episodes))

[0.476059110212517, 1.0647204726974508, 1.779717544739992, 1.686591961323786, 0.7610033578112437, 0.1262527173168598, 0.4037400517221632, 0.29114507930722644, 0.2041607386406197, 0.08360537419529404, 0.03684794867723369, 0.012195045362757407, 0.013831544568857702, 0.05549004361237179, 0.19077610921167798, 0.49197997242240327, 1.4345616707078281, 1.5150567677490887, 1.605532945825688, 1.747534720371962, 0.8048407157628558]
