## Gridworld A

We experiment with policy and value incrementing algorithms on the simple gridworld problem

In [347]:
import numpy as np
import random

In [348]:
world_size = 4
terminal_states = (0, world_size**2 - 1)
states = {i: np.array([i // world_size, i % world_size]) for i in range(world_size * world_size)}

In [349]:
actions = {'up':np.array([-1,0]),
           'down':np.array([1,0]),
           'right':np.array([0,1]),
           'left':np.array([0,-1])}

In [388]:
arrows = {'left':"🠈",
          'right':"🠊",
          'up':"🠉",
          'down':"🠋",
          'none':" "}

In [350]:
np.all(np.array([1,2])<np.array([1,6]))

False

In [351]:
def transition(state, action):
    """ Return the new state resulting from applying action to state.
        If action is invalid or if state is terminal, return the original state """
    if state in terminal_states:
        return state
    temp = states[state] + actions[action]
    if (np.all( np.array([0,0]) <= temp) and np.all(temp < np.array([world_size,world_size]))):
        return temp[0]*world_size + temp[1]
    else:
        return state

In [352]:
transition(3,'up'), transition(3,'down'), transition(8,'left')

(3, 7, 8)

In [353]:
def reward(state, action):
    new_state = transition(state, action)
    if new_state in terminal_states:
        return -1
    else:
        return -1

In [354]:
state_actions = dict()
for s in states.keys():
    children = set()
    for a in actions.keys():
        if transition(s,a) != s:
            children.add(a)
    state_actions.update({ s: children } )

In [355]:
state_actions

{0: set(),
 1: {'down', 'left', 'right'},
 2: {'down', 'left', 'right'},
 3: {'down', 'left'},
 4: {'down', 'right', 'up'},
 5: {'down', 'left', 'right', 'up'},
 6: {'down', 'left', 'right', 'up'},
 7: {'down', 'left', 'up'},
 8: {'down', 'right', 'up'},
 9: {'down', 'left', 'right', 'up'},
 10: {'down', 'left', 'right', 'up'},
 11: {'down', 'left', 'up'},
 12: {'right', 'up'},
 13: {'left', 'right', 'up'},
 14: {'left', 'right', 'up'},
 15: set()}

Initialize a uniform policy

In [356]:
pi = dict()
for s in states:
    children = state_actions[s]
    for a in children:
        pi.update( { (s,a) : 1/len(children) } )
        

Initialize transition probabilities

In [357]:
p = dict()

In [358]:
for s in states:
    children = state_actions[s]
    for a in children:
        next_s = transition(s, a)
        r = reward(s, a)
        p.update( { (s, a, next_s, r) : 1 } )

## Iterative Policy Evaluation Algorithm

In [359]:
V = { s:0.0 for s in states }
gamma = 0.99
count = 1
while count < 50:
    delta = 0
    for s in states:
        v = V[s]
        new_v = 0
        for a in state_actions[s]:
            r = reward(s, a)
            new_state = transition(s, a)
            new_v  += pi[(s,a)]*p[(s,a,new_state,r)]*(r + gamma*V[new_state])
        V[s] = new_v
        delta = max(delta, abs(v - V[s]))
    if delta < 0.01:
        break
    count += 1

Now compute the optimal policy based on the child with max V

In [366]:
optimal = dict()
for s in states:
    children = state_actions[s]
    children_states = [transition(s,a) for a in children]
    if (len(children_states) == 0):
        continue
    best_child = max([c for c in children_states], key = lambda x: V[x])
    optimal.update({s : best_child} )
optimal.update({ s: s for s in terminal_states })

In [367]:
for i in range(world_size):
    for j in range(world_size):
        print (f"{V[i*world_size + j]:8.2f}", end=' ')
    print()

    0.00    -9.67   -13.58   -14.44 
   -9.67   -12.72   -14.02   -13.59 
  -13.58   -14.02   -12.73    -9.69 
  -14.44   -13.59    -9.69     0.00 


In [368]:
for i in range(world_size):
    for j in range(world_size):
        print (f"{optimal[i*world_size + j]:4}", end=' ')
    print()

   0    0    1    2 
   0    4    5   11 
   4    5   11   15 
   8   14   15   15 


## Policy Iteration

In [369]:
V = dict( {s : random.random() for s in states} )
V.update ( {s : 0 for s in terminal_states } )
pi = dict( {s : random.choice(list(state_actions[s])) for s in states if len(state_actions[s]) > 0} )
pi.update( { s: None for s in terminal_states} )

In [382]:
count= 0
gamma = 0.4
done = False

while not done:
    while True:
        delta = 0
        for s in states:
            v = V[s]
            if pi[s]:
                next_s = transition(s,pi[s])
                r = reward(s, pi[s])
                if (s == 1):
                    print ("help:", s, v, next_s, r, gamma, V[next_s])
                V[s] = 1 * (r + gamma * V[next_s])
                delta = max(delta, abs(v - V[s]))
                #print(delta, s, V[s])
        if delta < 2:
            break
    done = True
    #print(pi)
    optimal = dict()
    for s in states:
        if s not in terminal_states:
            children = state_actions[s]
            best_child = max([c for c in children], \
                             key = lambda x: (-1+gamma*V[transition(s,x)]))
            optimal.update({s : best_child} )
            if optimal[s] != pi[s]:
                done = False
                pi[s] = optimal[s]
optimal.update({ s: 'none' for s in terminal_states })

help: 1 -1.0 0 -1 0.4 0


In [383]:
terminal_states

(0, 15)

In [389]:
for i in range(world_size):
    for j in range(world_size):
        print (f"{arrows[optimal[i*world_size + j]]:2}", end=' ')
    print()

   🠈  🠈  🠈  
🠉  🠈  🠊  🠋  
🠉  🠊  🠊  🠋  
🠊  🠊  🠊     


In [390]:
for i in range(world_size):
    for j in range(world_size):
        print (f"{V[i*world_size + j]:6}", end=' ')
    print()

     0   -1.0   -1.4  -1.56 
  -1.0   -1.4  -1.56   -1.4 
  -1.4  -1.56   -1.4   -1.0 
 -1.56   -1.4   -1.0      0 


## Jack's Car Rental

In [417]:
max_cars = 20
max_moves = 5
state_ceiling = np.array([max_cars, max_cars])
states = [np.array([i,j]) for i in range(max_cars+1) for j in range(max_cars+1)]

In [466]:
actions = [np.array([-i,i]) for i in range(-max_moves,max_moves+1)]

In [467]:
states[10]+actions[4]

array([1, 9])

In [468]:
a = np.array([1,4])

In [469]:
b = np.array([3,3])

In [470]:
np.min([a,b],axis = 0)

array([1, 3])

In [471]:
def poisson(l, k):
    return l**k/math.factorial(k)*math.exp(-l)

In [529]:
p = dict()
p_rentals = [0]*max_cars
p_returns = [0]*max_cars
p_in = [[poisson(3,a)*poisson(2,b) for b in range(max_cars+1)] for a in range(max_cars+1)]
p_out = [[poisson(3,a)*poisson(4,b) for b in range(max_cars+1)] for a in range(max_cars+1)]

In [530]:
m = max_cars+1
p = dict()
for s in states:
    for a in actions:
        p[(s[0],s[1],a[0],a[1])] = dict()

In [531]:
def truncate(s):
    s = np.min([s, state_ceiling], axis=0)
    s = np.max([s, [0,0]], axis = 0)
    return s

In [532]:
import time

In [533]:
time.time()

1712783675.864478

In [None]:
tic = time.time()
for s in states:
    print(time.time() - tic, s)
    for a in actions:
        new_s = s+a
        if (np.any(new_s < [0,0])):
            continue
        new_s = truncate(s)
        for in_a in range(m):
            for in_b in range(m):
                for out_a in range(m):
                    for out_b in range(m):
                        new_s += np.array([in_a, in_b])
                        new_s = truncate(new_s)
                        r = 10*(min(out_a, new_s[0]) + min(out_b, new_s[1]))
                        new_s -= np.array([min(out_a, new_s[0]), min(out_b, new_s[1])])
                        old_p = p[(s[0],s[1],a[0],a[1])].get((new_s[0],new_s[1],r))
                        if old_p:
                            p[(s[0],s[1],a[0],a[1])][new_s[0],new_s[1],r] += p_in[in_a][in_b]*p_out[out_a][out_b]
                        else:
                            p[(s[0],s[1],a[0],a[1])][new_s[0],new_s[1],r] = p_in[in_a][in_b]*p_out[out_a][out_b]


0.0006382465362548828 [0 0]
2.7935855388641357 [0 1]
7.961071491241455 [0 2]
16.394394159317017 [0 3]
28.057419776916504 [0 4]
41.68884778022766 [0 5]
59.14366865158081 [0 6]
76.17127084732056 [0 7]
93.44247436523438 [0 8]
111.0280692577362 [0 9]
127.97518348693848 [ 0 10]
144.65555357933044 [ 0 11]
162.19644832611084 [ 0 12]
179.43471121788025 [ 0 13]
196.09392046928406 [ 0 14]
212.59214401245117 [ 0 15]
230.48203301429749 [ 0 16]
248.3438320159912 [ 0 17]
265.7030117511749 [ 0 18]
282.06717562675476 [ 0 19]
298.5232357978821 [ 0 20]
315.9177632331848 [1 0]
322.0399787425995 [1 1]
330.9116630554199 [1 2]


In [525]:
p

{(0, 0, 5, -5): {},
 (0, 0, 4, -4): {},
 (0, 0, 3, -3): {},
 (0, 0, 2, -2): {},
 (0, 0, 1, -1): {},
 (0, 0, 0, 0): {(0, 0, 0): 0.00673794698604857,
  (0, 1, 0): 0.00024681960817044885,
  (0, 1, 10): 0.0014143824271734035,
  (0, 0, 20): 0.03543924771035542,
  (0, 0, 10): 0.021926783253882528,
  (0, 2, 0): 0.00024681960817044885,
  (0, 3, 10): 0.0012720144290095344,
  (0, 3, 20): 0.00394418310428177,
  (0, 2, 30): 0.002837927428346732,
  (0, 0, 40): 0.025967555787380754,
  (0, 3, 0): 0.000164546405446966,
  (0, 5, 10): 0.0007151328210534118,
  (0, 6, 20): 0.0013337852609725106,
  (0, 6, 30): 0.004042168959442671,
  (0, 5, 40): 0.0023116421683939577,
  (0, 3, 50): 0.0033363941300064365,
  (0, 0, 60): 0.008857705122860384,
  (0, 0, 30): 0.03496647509022981,
  (0, 4, 0): 8.2273202723483e-05,
  (0, 7, 10): 0.00033451635368112706,
  (0, 9, 20): 0.0008862508318246075,
  (0, 10, 30): 0.0008776097793067388,
  (0, 10, 40): 0.0023961826726048865,
  (0, 9, 50): 0.0012865189899409258,
  (0, 7, 60): 

In [485]:
s[0]

0

In [486]:
(s[0],s[1],actions[0])

(0, 0, array([ 5, -5]))