In [1]:
import numpy as np
from copy import deepcopy

In [9]:
def init_chain():
    """Set up the five-state, two-action chain problem.

    Returns
    -------
    table : numpy.ndarray
        Zero-filled array of shape (5, 2), indexed ``[state, action]``.
    transitions : dict
        Maps each state to ``[next_state_for_action_0, next_state_for_action_1]``.
        Action index 0 moves one state up the chain (state 4 stays at 4);
        action index 1 always leads back to state 0.
    R : numpy.ndarray
        Rewards of shape (5, 2), indexed ``[state, action]``. The only
        non-zero entries are 0.2 for action index 1 in state 0 and 1 for
        action index 0 in state 4.
    gamma : float
        Discount factor, 0.9.
    """
    n_states, n_actions = 5, 2
    last_state = n_states - 1

    # value table starts out as all zeros
    table = np.zeros((n_states, n_actions))

    # per state: [successor under action index 0, successor under action index 1]
    transitions = {
        state: [min(state + 1, last_state), 0]
        for state in range(n_states)
    }

    # rewards are zero everywhere except the two entries set below
    R = np.zeros((n_states, n_actions))
    R[0, 1] = 0.2
    R[last_state, 0] = 1

    gamma = 0.9

    return table, transitions, R, gamma

In [8]:
def BK(state, old_table, action, next_state, R, gamma):
    """Bellman backup for a single (state, action) pair.

    Computes ``R[state][action] + gamma * max(old_table[next_state, :])``.

    Parameters
    ----------
    state : int
        Index of the state the action is taken in.
    old_table : numpy.ndarray
        Two-dimensional value table indexed ``[state, action]``; only the
        row belonging to ``next_state`` is read.
    action : int
        Index of the action taken in ``state``.
    next_state : int
        Index of the state reached by taking ``action`` in ``state``.
    R : indexable
        Rewards, read as ``R[state][action]``.
    gamma : float
        Discount factor applied to the successor state's best value.

    Returns
    -------
    The immediate reward plus the discounted maximum of the successor row.
    """
    # values of every action available in the successor state
    successor_values = old_table[next_state, :]

    # reward collected immediately for taking `action` in `state`
    immediate_reward = R[state][action]

    best_successor_value = max(successor_values)
    return immediate_reward + gamma * best_successor_value
    

def tabular_Qlearning(table, transitions, R, gamma, tol=0.0, max_iter=None):
    """Fill ``table`` with Q-values by repeated synchronous Bellman backups.

    Despite the name, nothing is sampled and there is no learning rate: each
    sweep recomputes every ``table[state, action]`` from the known
    ``transitions`` and ``R`` via ``BK``, always reading from a snapshot of
    the table taken at the start of the sweep.

    Parameters
    ----------
    table : numpy.ndarray
        Initial value table indexed ``[state, action]``. It is updated in
        place and also returned.
    transitions : mapping
        ``transitions[state]`` is a sequence holding the next state for each
        action index. Any number of actions is supported as long as it
        matches the number of columns in ``table``.
    R : indexable
        Rewards, read by ``BK`` as ``R[state][action]``.
    gamma : float
        Discount factor passed through to ``BK``.
    tol : float, optional
        Largest absolute per-entry change between two consecutive sweeps that
        still counts as converged. The default ``0.0`` demands that a sweep
        leaves the table exactly unchanged.
    max_iter : int or None, optional
        Upper bound on the number of sweeps. ``None`` (default) means no
        bound, so the loop runs until the convergence test passes.

    Returns
    -------
    numpy.ndarray
        The same ``table`` object, holding the converged values.

    Raises
    ------
    RuntimeError
        If ``max_iter`` sweeps were performed without meeting the
        convergence test.
    """
    sweeps = 0
    converged = False
    while not converged:

        # guard against iterations that never settle (e.g. gamma >= 1)
        if max_iter is not None and sweeps >= max_iter:
            raise RuntimeError(
                "tabular_Qlearning did not converge within "
                "{} sweeps".format(max_iter)
            )

        # snapshot of the previous sweep; every backup below reads from it
        old_table = table.copy()

        # iterate over all states
        for state in range(len(table)):

            # one backup per available action, however many there are
            table[state] = [
                BK(state, old_table, action, next_state, R, gamma)
                for action, next_state in enumerate(transitions[state])
            ]

        sweeps += 1

        # rtol=0 makes this a pure absolute test; with tol=0.0 it reduces to
        # exact element-wise equality
        converged = np.allclose(table, old_table, rtol=0.0, atol=tol)

    return table

# Build the chain problem: a zero-filled value table, the per-state
# transitions, the reward array and the discount factor.
table, transitions, R, gamma = init_chain()

# Iterate until the table stops changing. tabular_Qlearning updates the
# array it is given in place and returns that same array, so `table` holds
# the converged values afterwards.
table = tabular_Qlearning(table, transitions, R, gamma)
print(table)

[[ 6.561   6.1049]
 [ 7.29    5.9049]
 [ 8.1     5.9049]
 [ 9.      5.9049]
 [10.      5.9049]]
