In [1]:
from env import create_standard_grid, create_custom_grid_1
import numpy as np
# Seed NumPy's global RNG so the random start states drawn later with
# np.random.randint are the same on every fresh run.
np.random.seed(42)

In [2]:
def play_game(gw, policy, state=(0,0)):
    """Roll out one episode by following `policy` until a terminal state is entered.

    Args:
        gw: grid-world object; only `get_reward_for_action(state, action)` and
            `is_terminal(state)` are used here.
        policy: mapping from a state to the action to take in that state.
        state: state the episode starts from; defaults to (0,0).

    Returns:
        A list of (state, reward) tuples, one per step taken, where `reward`
        is the reward for the action chosen in `state`. The terminal state
        that ends the episode is not itself included in the list.
    """
    trajectory = []
    while True:
        # look up what the policy prescribes for the current state
        chosen_action = policy[state]
        # reward earned by taking that action from the current state
        step_reward = gw.get_reward_for_action(state, chosen_action)
        # where the action leads
        successor = move(state, chosen_action)
        # record the state we acted from together with the reward it produced
        trajectory.append((state, step_reward))
        # the episode is over as soon as the move lands on a terminal state
        if gw.is_terminal(successor):
            return trajectory
        state = successor

def move(state, action): # only valid actions at states are sent to move
    """Return the (row, col) reached by applying `action` to `state`.

    'left'/'right' change the column by -1/+1, 'down'/'up' change the row by
    -1/+1. Any other action (including the empty string) leaves the position
    unchanged. No bounds checking is done here.
    """
    row, col = state
    if action == 'left':
        return (row, col - 1)
    if action == 'right':
        return (row, col + 1)
    if action == 'down':
        return (row - 1, col)
    if action == 'up':
        return (row + 1, col)
    # unrecognised action: stay where we are
    return (row, col)

In [3]:
# Build the grid-world instance that every later cell reads from and writes
# value estimates into.
gw = create_custom_grid_1()

In [4]:
# Deterministic policy, written as one tuple of actions per grid row
# (row index first, then column index). An empty string means no action is
# prescribed for that cell.
action_rows = (
    ('right', 'right', 'up', 'left'),     # row 0
    ('down', '', '', ''),                 # row 1
    ('right', 'right', 'down', 'left'),   # row 2
    ('right', 'right', 'down', 'left'),   # row 3
)
# Flatten the table into a {(row, col): action} mapping, row by row.
policy = {
    (i, j): action
    for i, row_actions in enumerate(action_rows)
    for j, action in enumerate(row_actions)
}

In [59]:
# Discount factor used when folding rewards into returns (G = r + gamma*G).
gamma = 0.5
# Every (row, col) cell of the 4x4 grid, enumerated row by row.
all_states = [(i, j) for i in range(4) for j in range(4)]

In [60]:
# Play a single episode under `policy`, starting from play_game's default
# start state (0,0).
states_and_rewards = play_game(gw, policy)

In [61]:
# Display the (state, reward) pairs recorded for that episode.
states_and_rewards

[((0, 0), 0.0), ((0, 1), 0.0), ((0, 2), 1.0)]

In [62]:
# Turn the episode's (state, reward) pairs into (state, return) pairs.
# The return is accumulated from the end of the episode towards the start,
# discounting what has been accumulated so far by gamma at every step.
G = 0
states_and_returns = []
for s, r in states_and_rewards[::-1]:
    G = gamma*G + r
    states_and_returns.append((s, G))
# The pairs were collected last-to-first; put them back in visiting order.
states_and_returns = states_and_returns[::-1]

In [63]:
# Display the (state, discounted return) pairs in the order the states were visited.
states_and_returns

[((0, 0), 0.25), ((0, 1), 0.5), ((0, 2), 1.0)]

In [64]:
# One (initially empty) list of observed returns for every state.
returns = {s: [] for s in all_states}

# First-visit update for this single episode: only the first occurrence of a
# state in the episode contributes a return; the state's value in the grid is
# then set to the mean of the returns recorded for it.
seen_states = set()
for s, G in states_and_returns:
    if s in seen_states:
        continue
    returns[s].append(G)
    gw.set_value(s, np.mean(returns[s]))
    seen_states.add(s)

In [65]:
# Print the grid's values after the single-episode update above.
gw.print_values()

-------------------------------------
|   0.00 |   0.01 |   0.10 |   0.01 |
-------------------------------------
|   0.01 |   0.10 |   1.00 |   0.10 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.25 |   0.50 |   1.00 |   0.10 |
-------------------------------------


In [66]:
# First-visit Monte Carlo evaluation of `policy` over many episodes, every
# episode starting from play_game's default start state (0,0).
num_episodes = 100

# Returns observed for each state, accumulated across ALL episodes.
# This dictionary must be created once, before the episode loop: re-creating
# it inside the loop would leave a single sample per state, so np.mean would
# just echo the latest return instead of averaging over episodes.
returns = {s: [] for s in all_states}

for t in range(num_episodes):
    states_and_rewards = play_game(gw, policy)

    # Walk the episode backwards, accumulating the discounted return
    # G = r + gamma*G for each visited state, then restore visiting order.
    G = 0
    states_and_returns = []
    for s, r in reversed(states_and_rewards):
        G = r + gamma*G
        states_and_returns.append((s,G))
    states_and_returns.reverse()

    # First-visit update: within one episode only the first occurrence of a
    # state contributes a return; the state's value becomes the mean of every
    # return recorded for it so far.
    seen_states = set()
    for s, G in states_and_returns:
        if s not in seen_states:
            returns[s].append(G)
            gw.set_value(s, np.mean(returns[s]))
            seen_states.add(s)

In [67]:
# Print the grid's values after the 100 episodes that all start from (0,0);
# only states visited in those episodes had their value set.
gw.print_values()

-------------------------------------
|   0.00 |   0.01 |   0.10 |   0.01 |
-------------------------------------
|   0.01 |   0.10 |   1.00 |   0.10 |
-------------------------------------
|   0.00 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.25 |   0.50 |   1.00 |   0.10 |
-------------------------------------


In [68]:
# First-visit Monte Carlo evaluation of `policy` where every episode begins
# from a randomly chosen start state, so that states never reached from (0,0)
# also receive value estimates.
num_episodes = 100

# Returns observed for each state, accumulated across ALL episodes.
# This dictionary must be created once, before the episode loop: re-creating
# it inside the loop would leave a single sample per state, so np.mean would
# just echo the latest return instead of averaging over episodes.
returns = {s: [] for s in all_states}

for t in range(num_episodes):

    # Draw a random start state, re-drawing until it is neither a barrier
    # nor a terminal state.
    while True:
        random_indx = np.random.randint(0,len(all_states))
        state = all_states[random_indx]
        if not (gw.is_barrier(state) or gw.is_terminal(state)):
            break

    # play the game from the selected state
    states_and_rewards = play_game(gw, policy, state=state)

    # Walk the episode backwards, accumulating the discounted return
    # G = r + gamma*G for each visited state, then restore visiting order.
    G = 0
    states_and_returns = []
    for s, r in reversed(states_and_rewards):
        G = r + gamma*G
        states_and_returns.append((s,G))
    states_and_returns.reverse()

    # First-visit update: within one episode only the first occurrence of a
    # state contributes a return; the state's value becomes the mean of every
    # return recorded for it so far.
    seen_states = set()
    for s, G in states_and_returns:
        if s not in seen_states:
            returns[s].append(G)
            gw.set_value(s, np.mean(returns[s]))
            seen_states.add(s)

In [69]:
# Print the grid's values after the episodes that start from randomly chosen states.
gw.print_values()

-------------------------------------
|   0.12 |   0.25 |   0.50 |   0.25 |
-------------------------------------
|   0.25 |   0.50 |   1.00 |   0.50 |
-------------------------------------
|   0.12 |   0.00 |   0.00 |   0.00 |
-------------------------------------
|   0.25 |   0.50 |   1.00 |   0.50 |
-------------------------------------
