# Q-Learning w/ Taxi-v3
[Link](https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Q%20learning/FrozenLake/Q%20Learning%20with%20FrozenLake.ipynb)

### Importing the necessary libraries

In [1]:
import numpy as np
import gym
import random

### Creating the environment

In [2]:
# Build the Taxi-v3 environment through Gym's factory function.
env = gym.make("Taxi-v3")
# Draw the environment's current state (the grid shown in this cell's output).
env.render()

+---------+
|[35mR[0m:[43m [0m| : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



### Create the Q-table and initialize it
The table needs as many columns as there are actions available to the agent, and as many rows as there are possible states.

In [3]:
# Both spaces report how many discrete values they hold through `.n`.
state_size = env.observation_space.n
action_size = env.action_space.n

print("Action size ", action_size)
print("State size ", state_size)

Action size  6
State size  500


In [4]:
# One row per state, one column per action, every entry starting at zero.
qtable = np.zeros(shape=(state_size, action_size))
print(qtable)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


### Creating the hyperparameters

In [5]:
# --- episode budget ---
total_episodes = 50_000      # number of training episodes
total_test_episodes = 100
max_steps = 99               # upper bound on steps taken within one episode

# --- Q-value update ---
learning_rate = 0.7          # weight given to the newly observed error
gamma = 0.618                # discount applied to the best next-state value

# --- exploration hyperparameters ---
max_epsilon = 1.0
min_epsilon = 0.01
epsilon = max_epsilon        # exploration starts at its maximum
decay_rate = 0.01            # rate of the exponential epsilon decay

### The Q-learning algorithm
#### Here we train our model
1. Initialize Q-values arbitrarily for all state-action pairs.
2. For each episode...
- Choose an action in the world state based on Q-value estimates.
- Take an action and observe the outcome state and reward.
- Update the Q table with the Bellman equation.

In [6]:
# Train the Q-table with epsilon-greedy Q-learning.
# Episodes are counted from 1 so the epsilon schedule can use the counter
# directly, without mutating the loop variable inside the body.
for episode in range(1, total_episodes + 1):
    # reset the environment
    state = env.reset()
    done = False

    for step in range(max_steps):
        # Draw a number in [0, 1] to decide between exploiting and exploring.
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > epsilon:
            # exploit: take the action with the highest Q-value in this state
            action = np.argmax(qtable[state, :])
        else:
            # explore: take a random action
            action = env.action_space.sample()

        # take the action and observe the outcome state and reward
        new_state, reward, done, info = env.step(action)

        # Q-learning update:
        #   Q(s, a) <- Q(s, a) + lr * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        td_target = reward + gamma * np.max(qtable[new_state, :])
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        # continue from the state we just reached
        state = new_state

        if done:
            break

    # Reduce epsilon (we need less exploration as training progresses).
    # The decaying term is scaled by (max - min) so that epsilon starts at
    # max_epsilon and approaches min_epsilon without ever exceeding max_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    

# Play taxi!

In [7]:
# for animation
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Replay recorded frames one after another in the notebook output.

    Each element of ``frames`` is a mapping with the keys ``'frame'``,
    ``'state'``, ``'action'`` and ``'reward'``. The previous output is
    cleared before every frame is printed, and the function pauses for
    0.1 seconds after each frame.
    """
    for timestep, snapshot in enumerate(frames, start=1):
        clear_output(wait=True)
        print(snapshot['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {snapshot['state']}")
        print(f"Action: {snapshot['action']}")
        print(f"Total rewards: {snapshot['reward']}")
        sleep(.1)


In [9]:
env.reset()
rewards = []
frames = []

# Run 5 episodes that always follow the greedy action from the Q-table,
# recording one frame per step for the animation.
for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0

    for step in range(max_steps):
        # best action for the current state according to the Q-table
        action = np.argmax(qtable[state])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        # `state` is the state the action was taken from; the rendering
        # and the running reward are captured after the step.
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': total_rewards,
        })

        if not done:
            # episode still running: move on to the next state
            state = new_state
            continue

        # episode finished: record and report its score
        rewards.append(total_rewards)
        print('Score: ', total_rewards)
        break

env.close()
print_frames(frames)

+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 69
State: 97
Action: 5
Total rewards: 3
