Richard S. Sutton, [Tutorial: Introduction to Reinforcement Learning with Function Approximation](https://www.microsoft.com/en-us/research/video/tutorial-introduction-to-reinforcement-learning-with-function-approximation/)

I didn't get very far through this tutorial before I wanted to try building a state machine and a Q-learning agent myself ...

In [1]:
import numpy as np

In [2]:
# Labels of the environment's states, and a reverse lookup from label to its
# row index in the Q-table.
states = ['A', 'B']
state_to_i = dict(zip(states, range(len(states))))
# Labels of the available actions, and a reverse lookup from label to its
# column index in the Q-table.
actions = ['1', '2']
action_to_i = dict(zip(actions, range(len(actions))))

In [3]:
class Environment:
    """Two-state, two-action toy environment with noisy integer rewards.

    The current state is exposed as ``self.state`` (one of the module-level
    ``states`` labels). ``step`` applies an action, possibly moves to the
    other state, and returns the reward for that transition.

    All randomness (state transitions and reward noise) comes from the
    instance's own generator ``self.rng`` so a run can be reproduced by
    passing ``seed``.
    """

    def __init__(self, seed=None):
        """Start in the first state.

        Args:
            seed: Optional seed forwarded to ``np.random.default_rng``.
                ``None`` (the default) gives a non-reproducible generator.
        """
        self.state = states[0]
        self.rng = np.random.default_rng(seed)

    def change_state(self, probablity_that_we_will_change=None):
        """Flip to the other state, optionally only with some probability.

        Args:
            probablity_that_we_will_change: Chance in ``[0, 1]`` that the
                flip actually happens. ``None`` means "always flip".
                (The parameter's spelling is kept for backward compatibility.)

        Raises:
            ValueError: If the probability lies outside ``[0, 1]``.
        """
        change_probability = probablity_that_we_will_change
        if change_probability is not None:
            if not 0.0 <= change_probability <= 1.0:
                raise ValueError(
                    f'probability must be in [0, 1], got {change_probability!r}'
                )
            # random() is uniform on [0, 1), so this stays put with
            # probability 1 - change_probability.
            if self.rng.random() >= change_probability:
                return  # i.e. don't change state
        # There are exactly two states, so "1 - index" picks the other one.
        self.state = states[1 - state_to_i[self.state]]

    def _noise(self, low, high):
        """Return a uniform integer in ``[low, high)`` as a plain ``int``."""
        return int(self.rng.integers(low, high))

    def step(self, action):
        """Apply ``action`` in the current state and return the reward.

        Args:
            action: One of the module-level ``actions`` labels.

        Returns:
            The integer reward: a base value for the (state, action) pair
            plus uniform integer noise. ``self.state`` is updated in place.

        Raises:
            ValueError: If ``action`` is not a known action label.
        """
        if action not in actions:
            raise ValueError(f'unknown action {action!r}; expected one of {actions}')

        # 00:22:35 state machine
        if self.state == states[0]:                    # A
            if action == actions[0]:                   # 1: stay in A
                return 10 + self._noise(-3, 3)
            self.change_state(0.8)                     # 2: usually move to B
            return -10 + self._noise(0, 15)

        # B: both actions usually move back to A, with different payoffs.
        self.change_state(0.8)
        if action == actions[0]:                       # 1
            return 40 + self._noise(-5, 5)
        return 20 + self._noise(-3, 3)                 # 2

In [4]:
def demo():
    """Run a fixed sequence of actions and print each transition."""
    env = Environment()
    scripted_actions = '1 1 2 2 1 2'.split(' ')
    for action in scripted_actions:
        # Capture the state on both sides of the step so the printed line
        # shows where we were and where we ended up.
        state_before = env.state
        reward = env.step(action)
        state_after = env.state
        print(f'state {state_before} action {action} reward {reward: >3} next state {state_after}')


demo()

state A action 1 reward   7 next state A
state A action 1 reward  12 next state A
state A action 2 reward   0 next state B
state B action 2 reward  19 next state B
state B action 1 reward  37 next state A
state A action 2 reward   3 next state B


In [5]:
class Agent:
    """Tabular Q-learning agent for the two-state toy environment.

    ``self.q_table[s, a]`` holds the current estimate of the value of taking
    action index ``a`` in state index ``s``; it starts at all zeros.
    """

    def __init__(self, seed=None):
        """Create an agent with an all-zero Q-table.

        Args:
            seed: Optional seed forwarded to ``np.random.default_rng`` for
                reproducible exploration. ``None`` (default) is unseeded.
        """
        self.q_table = np.zeros((len(states), len(actions)))
        self.rng = np.random.default_rng(seed)

    def _greedy_action(self, state):
        """Return the action label with the highest Q-value in ``state``.

        Ties are resolved by ``np.argmax``, i.e. the first maximal action.
        """
        return actions[int(np.argmax(self.q_table[state_to_i[state]]))]

    def learn(self, time_steps=100, lr=0.9, gamma=0.95, epsilon=1.0, environment=None):
        """Interact with the environment and update the Q-table in place.

        Args:
            time_steps: Number of environment steps to take.
            lr: Learning rate (step size) of the Q-learning update.
            gamma: Discount factor applied to the next state's best value.
            epsilon: Probability in ``[0, 1]`` of taking a uniformly random
                action instead of the greedy one (explore/exploit balance).
            environment: Object exposing ``state`` and ``step(action)``.
                When omitted, the module-level ``env`` is used, which must
                then already exist.

        Raises:
            ValueError: If ``epsilon`` lies outside ``[0, 1]``.
        """
        if environment is None:
            environment = env  # backward-compatible fallback to the global
        if not 0.0 <= epsilon <= 1.0:
            raise ValueError(f'epsilon must be in [0, 1], got {epsilon!r}')

        q = self.q_table
        for _ in range(time_steps):
            state = environment.state
            if self.rng.random() < epsilon:
                # Explore: uniform over however many actions are defined.
                action = actions[int(self.rng.integers(len(actions)))]
            else:
                action = self._greedy_action(state)
            reward = environment.step(action)
            new_state = environment.state

            # 00:35:32 in Sutton lecture - Q-learning, the simplest RL algo.
            # The target is a one-sample estimate of the Bellman optimality
            # equation  q*(s, a) = E[R + gamma * max_a' q*(S', a') | S=s, A=a].
            s, a, s_next = state_to_i[state], action_to_i[action], state_to_i[new_state]
            td_target = reward + gamma * np.max(q[s_next])
            q[s, a] = q[s, a] + lr * (td_target - q[s, a])

    def exploit(self, time_steps, environment=None):
        """Follow the greedy policy and print every transition.

        Each line printed is ``state action new_state reward``. The Q-table
        is not modified.

        Args:
            time_steps: Number of environment steps to take.
            environment: Object exposing ``state`` and ``step(action)``.
                When omitted, the module-level ``env`` is used, which must
                then already exist.
        """
        if environment is None:
            environment = env  # backward-compatible fallback to the global

        for _ in range(time_steps):
            state = environment.state
            action = self._greedy_action(state)  # greedify the policy
            reward = environment.step(action)
            new_state = environment.state
            print(state, action, new_state, reward)

Without stochastic state changes, the agent learns in ~20 steps.

In [6]:
# Fresh environment and untrained agent. `env` is deliberately module-level:
# the agent's methods read it from there.
env = Environment()
agent = Agent()

# Baseline: greedy behaviour with an all-zero Q-table.
print('Before training')
print(agent.q_table)
agent.exploit(9)

# Two training phases, each followed by a report of the Q-table and a short
# greedy rollout. It might make sense to do cosine annealing of lr, eps, ...
# but stepped decay works - and is sometimes not needed for this problem.
for heading, hyperparams in (
    ('After training part 1', dict(time_steps=50, lr=0.9)),
    ('After training with reduced eps', dict(time_steps=50, lr=0.3, epsilon=0.5)),
):
    agent.learn(**hyperparams)
    print(heading)
    print(agent.q_table)
    agent.exploit(9)

Before training
[[0. 0.]
 [0. 0.]]
A 1 A 11
A 1 A 9
A 1 A 9
A 1 A 11
A 1 A 8
A 1 A 8
A 1 A 9
A 1 A 10
A 1 A 11
After training part 1
[[175.54652721 182.7931265 ]
 [201.44845885 167.8978501 ]]
B 1 A 40
A 2 B -10
B 1 A 43
A 2 B -4
B 1 B 35
B 1 A 44
A 2 B -7
B 1 A 37
A 2 B -1
After training with reduced eps
[[202.54635173 227.95599166]
 [255.14391476 218.12939142]]
A 2 B -4
B 1 A 43
A 2 B -10
B 1 B 38
B 1 A 41
A 2 B -6
B 1 A 40
A 2 A -9
A 2 B -8


How could the agent tell when it has learned enough? If the goal is to maximize long-term reward, then the sooner we stop trying random things, the better.