In [1]:
import numpy as np
from numpy.random import randint

In [2]:
class ChainEnv:
    """Deterministic one-dimensional chain of ``N`` states, indexed ``0 .. N-1``.

    The agent starts in state 0 and moves one state left or right per step.
    A move that would leave the chain keeps the agent at the boundary state.
    A reward of 1 is emitted on every step that *starts* in state 1 (the
    reward is computed before the move is applied); every other step yields 0.
    """

    def __init__(self, N=4):
        """Create a chain with ``N`` states, positioned at state 0.

        Args:
            N: number of states in the chain; must be at least 1.

        Raises:
            ValueError: if ``N`` is smaller than 1. Without this check
                ``_go_right`` would clamp to ``N-1``, a negative state that
                silently indexes a Q-table from the end.
        """
        if N < 1:
            raise ValueError(f'N must be at least 1, got {N}')
        self.current_state = 0
        self.N = N

    def _get_reward(self):
        """Return 1 if the agent is currently in state 1, otherwise 0."""
        return 1 if self.current_state == 1 else 0

    def _go_left(self):
        """Move one state to the left, staying at state 0 if already there."""
        self.current_state = max(self.current_state - 1, 0)

    def _go_right(self):
        """Move one state to the right, staying at state ``N-1`` if already there."""
        self.current_state = min(self.current_state + 1, self.N - 1)

    def reset(self):
        """Put the agent back in state 0 and return that state."""
        self.current_state = 0
        return self.current_state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward)``.

        Args:
            action: 0 moves left; any other value moves right.

        Returns:
            A tuple ``(next_state, reward)`` where ``reward`` belongs to the
            state the agent was in *before* moving.
        """
        # The reward is read first, so it reflects the pre-move state.
        r = self._get_reward()
        if action == 0:  # left
            self._go_left()
        else:  # right
            self._go_right()

        return self.current_state, r

In [3]:
# Smoke-test the environment: start from the reset state, take five random
# actions and print each (state, action, reward, next state) transition.
env = ChainEnv()
s = env.reset()
for i in range(5):
    a = randint(2)  # draws 0 (left) or 1 (right)
    sp, r = env.step(a)
    print(f's: {s}, a: {a}, r: {r}, sp: {sp}')
    s = sp  # the next state becomes the current state for the following step

s: 0, a: 1, r: 0, sp: 1
s: 1, a: 1, r: 1, sp: 2
s: 2, a: 1, r: 0, sp: 3
s: 3, a: 0, r: 0, sp: 2
s: 2, a: 1, r: 0, sp: 3


In [4]:
# Coefficients of the Q-learning update used in the cells below.
ALPHA = .3  # weight given to the new target (learning rate)
GAMMA = .9  # weight given to the bootstrapped next-state value (discount)

# Action-value table indexed as Q[state, action]: 4 states x 2 actions, all zero.
Q = np.zeros(shape=(4, 2))
print(Q)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 0.]]


In [5]:
# Put the environment back in state 0 and remember it as the current state.
s = env.reset()

In [12]:
# Take a single step to the right (any non-zero action moves right) and show
# the resulting transition; `s` still holds the state from before the step.
a = 1
sp, r = env.step(a)
print(f's: {s}, a: {a}, r: {r}, sp: {sp}')

s: 0, a: 1, r: 0, sp: 1


In [13]:
# Show the current estimate Q[s,a] next to the one-step bootstrapped target
# r + GAMMA * max_a' Q[sp, a'] for the transition just taken.
print(f'Q[s,a]: {Q[s,a]}')
print(f'r+GAMMA*max_a\' Q[sp, a\']: {r+GAMMA*np.max(Q[sp,:])}')

Q[s,a]: 0.0
r+GAMMA*max_a' Q[sp, a']: 0.27


In [14]:
# One Q-learning update for the transition (s, a, r, sp): blend the old
# estimate with the bootstrapped target, weighted by ALPHA.
td_target = r + GAMMA * Q[sp].max()
Q[s, a] = ALPHA * td_target + (1 - ALPHA) * Q[s, a]

# Advance: the next state becomes the current state.
s = sp
Q

array([[0.   , 0.081],
       [0.3  , 0.   ],
       [0.   , 0.   ],
       [0.   , 0.   ]])

In [15]:
# Train with a uniformly random behaviour policy: 10000 steps, each followed
# by one Q-learning update. Continues from whatever state `s` currently holds.
for i in range(10000):
    a = randint(2)  # 0 = left, 1 = right
    sp, r = env.step(a)
    td_target = r + GAMMA * Q[sp].max()
    Q[s, a] = ALPHA * td_target + (1 - ALPHA) * Q[s, a]
    s = sp

In [16]:
# Display the action-value table after the training loop.
Q

array([[4.26315789, 4.73684211],
       [5.26315789, 5.26315789],
       [4.73684211, 3.83684211],
       [4.26315789, 3.83684211]])