#### Sutton and Barto, Reinforcement Learning 2nd. Edition, page 120.
![Sutton and Barto, Reinforcement Learning 2nd. Edition.](./TD0Prediction.png)

Tabular TD(0) for estimating V

In [1]:
from env import create_standard_grid, create_custom_grid_1
import numpy as np

code to play game and return states visited along with reward

In [2]:
def play_game(gw, policy, epsilon ):
    # game starting state
    state = (0,0) 
    # list of tuples that are (state, reward) pairs
#    states_and_rewards = [(state,0)] # list of tuples that are (state, reward) pairs
    states_and_rewards = []
    converged = False
    while not converged:
        # get action from policy
        action = policy[state] # get action from policy
        # get all valid actions at state
        all_actions = gw.valid_decisions(state) 
        # choose a random action with probability epsilon
        action = random_action(action, all_actions, epsilon) 
        # find reward for the action
        reward = gw.get_reward_for_action(state, action)
        # new state
        stateprime = move(state,action)
        # add new state and reward to the list
        states_and_rewards.append((stateprime,reward))
        # if you have moved to a terminal state, then stop
        if gw.is_terminal(stateprime):
            converged = True
        # update state to new state
        state = stateprime
    return states_and_rewards

def move(state, action): # only valid actions at states are sent to move
    i,j = state
    if action == 'left':
        j = j-1
    if action == 'right':
        j = j+1
    if action == 'down':
        i = i-1
    if action == 'up':
        i = i+1
    return (i,j)

def random_action(action, all_actions, epsilon ):
    p = np.random.random_sample()
    if p < (1 - epsilon):
        return action
    else:
        return np.random.choice(all_actions)

Create standard grid - with probability 0.1 choose off policy

In [33]:
gw = create_custom_grid_1()
policy = {
    (0, 0): 'right', (0, 1): 'right', (0, 2): 'up', (0, 3): 'left',
    (1, 0): 'down', (1, 1): '', (1, 2): '', (1, 3): '',
    (2, 0): 'right', (2, 1): 'right', (2, 2): 'down', (2, 3): 'left',
    (3, 0): 'right', (3, 1): 'right', (3, 2): 'down', (3, 3): 'left'
}
gamma = 0.9 # discount factor for future rewards
alpha = 0.99 # fraction for value update
epsilon = 0.99 # explore/exploit probability of exploration
number_play_game = 100000 # number of game play iterations

Iteration One - Play the game from beginning to end. The result is stochastic.

In [4]:
states_and_rewards = play_game(gw, policy, epsilon)

In [5]:
states_and_rewards # see the state path and rewards

[((0, 1), 0.0),
 ((0, 2), 0.0),
 ((0, 1), 0.0),
 ((0, 2), 0.0),
 ((0, 1), 0.0),
 ((0, 2), 0.0),
 ((0, 3), 0.0),
 ((0, 2), 0.0),
 ((0, 1), 0.0),
 ((0, 0), 0.0),
 ((1, 0), 0.0),
 ((0, 0), 0.0),
 ((1, 0), 0.0),
 ((0, 0), 0.0),
 ((0, 1), 0.0),
 ((0, 0), 0.0),
 ((0, 1), 0.0),
 ((0, 2), 0.0),
 ((1, 2), 1.0)]

Update values using TD(0)

In [6]:
for t in range(len(states_and_rewards) - 1 ):
    state, _ = states_and_rewards[t]
    stateprime, reward = states_and_rewards[t+1]
    current_value = gw.get_value(state)
    dest_value = gw.get_value(stateprime)
    new_value = current_value + alpha*(reward + gamma*dest_value - current_value)
    gw.set_value(state, new_value)

In [7]:
gw.print_values()

-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.99 |   0.00 |
-------------------------------------


Iteration Two

In [8]:
states_and_rewards = play_game(gw, policy, epsilon)

In [9]:
states_and_rewards

[((0, 1), 0.0), ((0, 0), 0.0), ((0, 1), 0.0), ((0, 2), 0.0), ((1, 2), 1.0)]

Update values

In [10]:
for t in range(len(states_and_rewards) - 1 ):
    state, _ = states_and_rewards[t]
    stateprime, reward = states_and_rewards[t+1]
    current_value = gw.get_value(state)
    dest_value = gw.get_value(stateprime)
    new_value = current_value + alpha*(reward + gamma*dest_value - current_value)
    gw.set_value(state, new_value)

In [11]:
gw.print_values()

-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.00 |   0.88 |   1.00 |   0.00 |
-------------------------------------


Play the game 1000 times

In [34]:
for _ in range(number_play_game):
    states_and_rewards = play_game(gw, policy, epsilon)
    for t in range(len(states_and_rewards) - 1 ):
        state, _ = states_and_rewards[t]
        stateprime, reward = states_and_rewards[t+1]
        current_value = gw.get_value(state)
        dest_value = gw.get_value(stateprime)
        new_value = current_value + alpha*(reward + gamma*dest_value - current_value)
        gw.set_value(state, new_value)

In [35]:
print("Policy")
gw.print_policy(policy)
print("")
# Print values from TD Learning
print("Values from TD(0) Learning")
gw.print_values()

Policy
-------------------------------------
|  Right |  Right |   Down |   Left |
-------------------------------------
|  Right |  Right |   Down |   Left |
-------------------------------------
|   Down |        |        |        |
-------------------------------------
|  Right |  Right |     Up |   Left |
-------------------------------------

Values from TD(0) Learning
-------------------------------------
|  -0.03 |  -0.24 |  -0.64 |  -0.89 |
-------------------------------------
|   0.03 |  -0.22 |  -0.39 |  -1.00 |
-------------------------------------
|  -0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.01 |  -0.63 |   0.98 |  -1.00 |
-------------------------------------
