# Reinforcement Learning

## The Game 
What is a state and what is an action?

In [1]:
import gym
from IPython.display import clear_output
import time

# Play up to 30 uniformly random moves of Taxi-v3, redrawing the board after
# each one, to get a feel for what states, actions and rewards look like.
env = gym.make('Taxi-v3')
env.reset()
score = 0
# A named loop variable is used instead of `_`, because assigning to `_`
# stops IPython from updating its "last output" variable.
for _step in range(30):
    env.render()
    action = env.action_space.sample()  # pick a random action
    print(action)
    observation, reward, done, info = env.step(action)
    score += reward
    print(score, reward)
    print(observation, reward, done, info)
    time.sleep(0.5)
    clear_output(True)
    if done:
        # Calling step() again after the episode has ended is undefined in
        # gym, so stop here instead of continuing to step a finished episode.
        break
env.render()
print(score)
# Query the environment before closing it.
print(env.env.s) # how to get current state
print(env.action_space)
print(env.observation_space)
env.close()

+---------+
|[35m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
-120
8
Discrete(6)
Discrete(500)


## QTable? 

### What does each element represent?

### What is the dimension of QTable?

### If we are at state $s$, how do we pick the next best action?

Closing Note: QTable doesn't really have to be a table. You can think about it as a parametrized function. The discrete one is much easier to implement.

## Q Learning

### Write down the update rule

### What is $\alpha$?
What happens when $\alpha$ is 1, and what happens when $\alpha$ is 0?

### What is $\gamma$?

### What value does the update rule converge to?

### When learning do we always want to use the best action?

### What is $\epsilon$?

### When we evaluate the QTable do we want to always use the best action?

In [2]:
# Number of discrete actions, then the observation (state) space, one per line.
print(env.action_space.n, env.observation_space, sep="\n")

6
Discrete(500)


## Build QLearning to play the taxi game

$$
Q^{new}(s, a) = Q^{old}(s, a) + \alpha \left(R(s, a) + \gamma \max_{a'}Q^{old}(s', a') - Q^{old}(s, a)\right)
$$
where $s \rightarrow s'$ via action $a$

In [3]:
import numpy as np
class QTable:
    """Tabular action-value function trained with epsilon-greedy Q-learning.

    ``qtable[s, a]`` holds the current estimate of the value of taking
    action ``a`` in state ``s``.
    """

    def __init__(self, qtable: np.ndarray):
        """Wrap an existing ``(n_states, n_actions)`` array and print its shape."""
        self.qtable = qtable # qtable[s, a]
        print(self.qtable.shape)

    def update_table(self, s: int, a: int, next_state: int, reward: float, gamma: float, alpha: float):
        """Apply one Q-learning update for the transition ``s --a--> next_state``.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(next_state, a') - Q(s, a))

        Args:
            s: state the action was taken in.
            a: action that was taken.
            next_state: state reached after the action.
            reward: reward received for the transition.
            gamma: discount factor applied to the best next-state value.
            alpha: learning rate (0 keeps the old value, 1 replaces it with the target).
        """
        q = self.qtable
        target = reward + gamma * np.max(q[next_state])
        q[s, a] = q[s, a] + alpha * (target - q[s, a])

    def best_action(self, s: int) -> int:
        """Return the greedy action for state ``s`` (first index on ties)."""
        return int(np.argmax(self.qtable[s]))

    def learn(self, env, epsilon: float, gamma: float, alpha: float, n: int, max_turns: int = 100):
        """Play the game many times and iteratively update the Q-table.

        Args:
            env: environment following the gym API used in this notebook
                (``reset()`` returns the state, ``step(a)`` returns a 4-tuple).
            epsilon: probability of taking a random (exploratory) action;
                0 is fully greedy, 1 is fully random.
            gamma: discount factor passed to ``update_table``.
            alpha: learning rate passed to ``update_table``.
            n: number of episodes to play.
            max_turns: an episode is cut off once more than this many turns
                have been played (so at most ``max_turns + 1`` turns run).
        """
        for i in range(n):
            if i % 1000 == 0:
                print(i)
            # Use the observation returned by the public API rather than
            # reaching into wrapper internals for the current state.
            s = env.reset()
            n_turn = 0
            done = False
            while not done and n_turn <= max_turns:
                # Epsilon-greedy: explore with probability epsilon, otherwise
                # exploit the current table.
                if np.random.rand() < epsilon:
                    a = env.action_space.sample()
                else:
                    a = self.best_action(s)
                new_state, reward, done, info = env.step(a)
                self.update_table(s, a, new_state, reward, gamma, alpha)
                s = new_state
                n_turn += 1

    @classmethod
    def for_env(cls, env):
        """Build an all-zero table with one row per state and one column per action."""
        n_state = env.observation_space.n
        n_action = env.action_space.n
        return cls(np.zeros((n_state, n_action)))

In [4]:
# Fresh Taxi-v3 environment used for training and replay below.
env = gym.make("Taxi-v3")

In [5]:
# Build a zero-initialised table sized for this environment, then train it
# for 10000 episodes.
qtable = QTable.for_env(env)
qtable.learn(env, n=10000, alpha=0.1, gamma=0.5, epsilon=0.5)

(500, 6)
0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [6]:
# Keep training the same table for another 10000 episodes; learn() updates
# the existing Q-values in place rather than starting over.
qtable.learn(env, n=10000, alpha=0.1, gamma=0.5, epsilon=0.5)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [None]:
def animate(qtable, env, delay: float = 0.5):
    """Play one episode greedily with ``qtable`` and redraw the board each step.

    Args:
        qtable: object exposing ``best_action(state)``.
        env: environment following the gym API used in this notebook
            (``reset()`` returns the state, ``step(a)`` returns a 4-tuple).
        delay: seconds to pause between frames.

    Returns:
        The total reward collected over the episode.
    """
    s = env.reset()
    score = 0
    done = False
    # NOTE(review): this loop only ends when the environment reports `done`;
    # a greedy policy that cycles would rely on the environment enforcing a
    # step limit -- confirm that it does.
    while not done:
        env.render()
        action = qtable.best_action(s)  # greedy action, no exploration
        observation, reward, done, info = env.step(action)
        s = observation
        score += reward
        print(score, reward)
        print(observation, reward, done, info)
        time.sleep(delay)
        clear_output(True)
    env.render()
    print(score)
    return score
# Replay 1000 greedy episodes back to back, wiping the output between them.
for _episode in range(1000):
    animate(qtable, env)
    clear_output(True)

+---------+
|R: | : :[35mG[0m|
| : | : : |
|[42m_[0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
-9 -1
237 -1 False {'prob': 1.0}


In [None]:
# Scratch check: draw a single sample from np.random.rand(), which returns a
# float uniformly distributed over [0, 1).
np.random.rand()