In [None]:
import numpy as np

# Grid-world definition: a 3x4 grid addressed as (row, column).
ACTION_SPACE = ('U', 'D', 'L', 'R')

# Every grid cell except (1, 1) is a state, listed in row-major order.
States = [(i, j) for i in range(3) for j in range(4) if (i, j) != (1, 1)]

# Reward collected on arriving in a cell; cells not listed here give 0.
REWARDS = {(0, 3): 1, (1, 3): -1}

# For each state, the actions that actually change the agent's position.
actions = {
    (0, 0): ('D', 'R'),
    (0, 1): ('L', 'R'),
    (0, 2): ('L', 'D', 'R'),
    (1, 0): ('U', 'D'),
    (1, 2): ('U', 'D', 'R'),
    (2, 0): ('U', 'R'),
    (2, 1): ('L', 'R'),
    (2, 2): ('L', 'R', 'U'),
    (2, 3): ('L', 'U'),
}
def is_terminal(s):
    """Return True when ``s`` is one of the two end states, (0, 3) or (1, 3)."""
    end_states = ((0, 3), (1, 3))
    return s in end_states
def get_next_state(s, a):
    """Return the state reached by performing action ``a`` in state ``s``.

    The move is applied only when ``a`` is listed for ``s`` in the
    module-level ``actions`` table; any other action leaves the position
    unchanged. A state with no entry in ``actions`` (for example a terminal
    state) is treated as having no moves, so it is returned as-is instead of
    raising ``KeyError``.

    Args:
        s: current position as a (row, column) pair.
        a: action label, one of 'U', 'D', 'L', 'R'.

    Returns:
        The resulting position as a (row, column) tuple.
    """
    i, j = s[0], s[1]
    # (row delta, column delta) for each action label
    step = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}
    # .get(..., ()) makes "state not in the table" mean "no move possible"
    if a in actions.get((i, j), ()):
        di, dj = step.get(a, (0, 0))
        i += di
        j += dj
    return i, j
### Transition model and reward table
# transition_probs[(s, a, s')] = p(s' | s, a); any key that is NOT present is
# considered impossible (i.e. probability 0).
# rewards[(s, a, s')] = r(s, a, s'): rewards are deterministic, which keeps the
# dictionary small. Keying on s' alone would also work, since the reward does
# not actually depend on (s, a).
transition_probs = {}
rewards = {}

for s in States:
    if is_terminal(s):
        continue  # nothing is recorded for moves out of a terminal state
    for a in ACTION_SPACE:
        s2 = get_next_state(s, a)
        key = (s, a, s2)
        transition_probs[key] = 1  # each (s, a) has exactly one successor
        rewards[key] = REWARDS.get(s2, 0)

In [None]:
# Start the value function at 0 for every state
V = dict.fromkeys(States, 0)

In [None]:
# Value iteration: sweep over the states, applying the update
#   V[s] = max[a]{ sum[s',r] { p(s',r|s,a)[r + gamma*V[s']] } }
# until the largest change seen in a sweep falls below SMALL_ENOUGH.
gamma = 0.9
SMALL_ENOUGH = 1e-3
it = 0
converged = False
while not converged:
    biggest_change = 0
    for s in States:
        if is_terminal(s):
            continue  # terminal states are never updated
        previous = V[s]
        best = float('-inf')
        for a in ACTION_SPACE:
            # expected return of taking action a in state s
            q = 0
            for s2 in States:
                # missing keys mean probability 0 and reward 0
                prob = transition_probs.get((s, a, s2), 0)
                q += prob * (rewards.get((s, a, s2), 0) + gamma * V[s2])
            # keep the best action value seen so far
            if q > best:
                best = q
        V[s] = best
        biggest_change = max(biggest_change, np.abs(previous - V[s]))
    it += 1
    print("iter:", it, "biggest_change:", biggest_change)
    converged = biggest_change < SMALL_ENOUGH

In [None]:
# Read the greedy policy off the value function: for each non-terminal state,
# choose the action whose one-step lookahead value is largest (the first such
# action in ACTION_SPACE order wins ties).
policy = {}
for s in States:
    if is_terminal(s):
        continue
    best_a = None
    best_value = float('-inf')
    for a in ACTION_SPACE:
        q = 0
        for s2 in States:
            # missing keys mean probability 0 and reward 0
            prob = transition_probs.get((s, a, s2), 0)
            q += prob * (rewards.get((s, a, s2), 0) + gamma * V[s2])
        if q > best_value:
            best_value, best_a = q, a
    # only record a choice when some action actually beat -inf
    if best_a is not None:
        policy[s] = best_a

In [None]:
def print_values(V, rows,columns):
    """Print the value table as a ``rows`` x ``columns`` text grid.

    Each cell shows ``V[(i, j)]`` with two decimals; positions missing from
    ``V`` are shown as 0. A dashed separator line precedes every row.
    """
    for i in range(rows):
        print("---------------------------")
        for j in range(columns):
            value = V.get((i, j), 0)
            # non-negative numbers get a leading space where a minus sign would be
            template = " %.2f|" if value >= 0 else "%.2f|"
            print(template % value, end="")
        print("")
def print_policy(policy,rows,columns):
    """Print the policy as a ``rows`` x ``columns`` text grid.

    Each cell shows the action stored in ``policy[(i, j)]``; positions with no
    entry are left blank. A dashed separator line precedes every row.
    """
    for i in range(rows):
        print("---------------------------")
        for j in range(columns):
            action = policy.get((i, j), ' ')
            print("  %s  |" % action, end="")
        print("")

In [None]:
# Show the converged state values on the 3x4 grid
print_values(V, rows=3, columns=4)

In [None]:
# Show the greedy action chosen for each cell of the 3x4 grid
print_policy(policy, rows=3, columns=4)