In [23]:
ALL_ACTIONS = ("U","D","R","L")
class Grid:
    def __init__(self,dimension,start) -> None:
        rows , cols = dimension
        i , j = start
        self.cols = cols
        self.rows = rows
        self.i = i
        self.j = j
    def set(self,rewards,actions):
        self.rewards = rewards
        self.actions = actions
    def set_state(self,point):
        self.i = point[0]
        self.j = point[1]
    def get_current_state(self):
        return (self.i,self.j)
    def is_end(self,state):
        return state not in self.actions
    def get_next_state(self,state,action):
        (i,j) = state
        if action == 'U':
                i -= 1
        elif action == 'D':
            i += 1
        elif action == 'R':
            j += 1
        elif action == 'L':
            j -= 1
        return (i,j)
    def move(self, action):
    # check if legal move first
        if action in self.actions[(self.i, self.j)]:
            if action == 'U':
                self.i -= 1
            elif action == 'D':
                self.i += 1
            elif action == 'R':
                self.j += 1
            elif action == 'L':
                self.j -= 1
        return self.rewards.get((self.i, self.j), 0)
    def get_all_states(self):
        return set(self.actions.keys()) | set(self.rewards.keys())
    
    

In [24]:
def standard_grid():

    # .  .  .  1
    # .  x  . -1
    # s  .  .  .
    grid = Grid((3, 4), (2, 0))
    rewards = {(0, 3): 1, (1, 3): -1}
    actions = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U'),
        }
    grid.set(rewards,actions)
    return grid

In [25]:
grid  = standard_grid()

In [27]:
import numpy as np
def take_action(policy,state,eps = 0.1):
    if np.random.random() < eps:
        return np.random.choice(ALL_ACTIONS)
    else:
        return policy[state]

In [49]:

def play_game(grid,policy):
    state = (2,0)
    grid.set_state(state)
    state_and_rewards = [(state,0)]
    while not grid.is_end(grid.get_current_state()):
        a = take_action(policy,state)
        rew = grid.move(a)
        next_state = grid.get_current_state()
        state_and_rewards.append((next_state,rew))
    return state_and_rewards

In [60]:
V = {}
ALPHA = 0.1
GAMMA = 0.9
policy = {
(2, 0): 'R',
(1, 0): 'D',
(0, 0): 'R',
(0, 1): 'R',
(0, 2): 'L',
(1, 2): 'D',
(2, 1): 'R',
(2, 2): 'R',
(2, 3): 'U',
}
for i in grid.get_all_states():
    if i in grid.rewards.keys():
        V[i] = grid.rewards[i]
    else:
        V[i] = 0

for i in range (20000):
    state_and_rewards = play_game(grid,policy)
    # print(state_and_rewards)
    for t in range(len(state_and_rewards) - 1):
        state,_ = state_and_rewards[t]
        next_state,rew = state_and_rewards[t+1]
        V[state] += ALPHA * (rew + GAMMA * V[next_state] - V[state])


In [52]:
def print_values(V, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            v = V.get((i, j), 0)
            if v >= 0:
                print(" %.2f|" % v, end="")
            else:
                print("%.2f|" % v, end="")  
        print("")


def print_policy(P, g):
    for i in range(g.rows):
        print("---------------------------")
        for j in range(g.cols):
            a = P.get((i, j), " ")
            print("  %s  |" % a, end="")
        print("")



In [61]:
print_values(V,grid)

---------------------------
 1.40| 1.59| 1.84| 1.00|
---------------------------
 0.28| 0.00|-1.64|-1.00|
---------------------------
-0.34|-0.36|-0.43|-0.46|
