<a href="https://colab.research.google.com/github/moizca/Q-learning-and-SARSA-implementation-on-Taxi-v3-gym-environment/blob/main/RL_algo_Q_learn_SARSA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


This code was written with the help of [the GitHub commit with SHA-1 hash 53bd33e8d78311ebdd46450d9956a5f3af30f9a5](https://github.com/PacktPublishing/Reinforcement-Learning-Algorithms-with-Python/commits/master/Chapter04/SARSA%20Q_learning%20Taxi-v2.py).

 

## Please give me feedback on my first reinforcement learning code. Stay tuned, as new amazing reinforcement learning systems will be made soon!!

In [None]:
# Importing libraries.
import numpy as np 
import gym

# Epsilon Greedy action selection function with respect to a given state s.
def eps_greedy(Q, s, eps=0.1):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    One uniform sample is drawn from [0, 1). If it falls below ``eps`` a
    random action index in ``range(Q.shape[1])`` is returned (exploration);
    otherwise the choice is delegated to ``greedy(Q, s)`` (exploitation).
    """
    should_explore = np.random.uniform(0, 1) < eps
    if not should_explore:
        return greedy(Q, s)

    n_actions = Q.shape[1]
    return np.random.randint(n_actions)

# Greedy action selection function with respect to a given state s. The epsilon-greedy action selection function also uses it.
def greedy(Q, s):
    """Return the index of the highest-valued action in row ``s`` of ``Q``.

    ``np.argmax`` resolves ties in favour of the lowest index.
    """
    action_values = Q[s]
    return np.argmax(action_values)

# This function runs test episodes to evaluate the agent, and measures its performance by reporting the mean reward accumulated per episode over the whole testing run.
def run_episodes(env, Q, num_episodes=100, to_print=False):
    """Evaluate the greedy policy derived from ``Q`` and return its mean score.

    The environment is reset once up front and again after every finished
    episode. Within an episode, actions are chosen with ``greedy(Q, state)``
    until ``env.step`` reports ``done``.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        Q: Action-value table indexed by state.
        num_episodes: How many evaluation episodes to play.
        to_print: When true, print the mean score before returning it.

    Returns:
        The mean of the per-episode cumulative rewards.
    """
    episode_returns = []                   # One cumulative reward per finished episode.
    state = env.reset()                    # Initial state of the first episode.

    for _ in range(num_episodes):
        finished = False
        episode_return = 0

        # Play one full episode with the greedy policy.
        while not finished:
            state, reward, finished, _ = env.step(greedy(Q, state))
            episode_return += reward

        # The loop only exits on a terminal step: start the next episode and
        # record the score of the one that just ended.
        state = env.reset()
        episode_returns.append(episode_return)

    mean_score = np.mean(episode_returns)
    if to_print:
        print('Mean score: %.3f of %i games!'%(mean_score, num_episodes))

    return mean_score

def Q_learning(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
               min_eps=0.01, test_every=300, test_episodes=1000):
    """Train a lookup-table Q-learning agent and return the learned Q-table.

    Q-learning is off-policy: actions are chosen with the epsilon-greedy
    behaviour policy (``eps_greedy``), while each update bootstraps from the
    maximum action value of the next state, i.e. from the greedy policy.

    Args:
        env: Environment with discrete ``action_space.n`` and
            ``observation_space.n``, whose ``reset()`` returns a state and
            whose ``step(action)`` returns ``(next_state, reward, done, info)``.
        lr: Learning rate (step size) of the update.
        num_episodes: Number of training episodes.
        eps: Initial exploration rate.
        gamma: Discount factor.
        eps_decay: Amount subtracted from ``eps`` at the start of each episode
            while ``eps`` is above ``min_eps``.
        min_eps: Floor for the decayed exploration rate.
        test_every: Run a greedy evaluation every ``test_every`` training
            episodes (starting with episode 0); a falsy value disables it.
        test_episodes: Number of episodes per evaluation run.

    Returns:
        The learned Q-table, a NumPy array of shape ``(nS, nA)``.
    """
    nA = env.action_space.n                          # Size of the action space.
    nS = env.observation_space.n                     # Size of the state space.

    Q = np.zeros((nS, nA))                           # Q-table starts at zero.

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # Decay exploration once per episode, but never step below the floor:
        # a plain subtraction could undershoot min_eps (or even go negative)
        # when eps_decay is larger than the remaining distance to the floor.
        if eps > min_eps:
            eps = max(eps - eps_decay, min_eps)

        while not done:
            # Behaviour policy: epsilon-greedy with respect to the current Q.
            action = eps_greedy(Q, state, eps)
            next_state, rew, done, _ = env.step(action)

            # Target policy: greedy, so bootstrap from the best next action.
            td_target = rew + gamma * np.max(Q[next_state])
            Q[state, action] += lr * (td_target - Q[state, action])

            state = next_state

        # Periodically report how the current greedy policy performs.
        if test_every and ep % test_every == 0:
            test_rew = run_episodes(env, Q, test_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))

    return Q


def SARSA(env, lr=0.01, num_episodes=10000, eps=0.3, gamma=0.95, eps_decay=0.00005,
          min_eps=0.01, test_every=300, test_episodes=1000):
    """Train a lookup-table SARSA agent and return the learned Q-table.

    SARSA (State, Action, Reward, State, Action) is on-policy: the
    epsilon-greedy policy (``eps_greedy``) both selects the actions that are
    executed and supplies the next action whose value the update bootstraps
    from.

    Args:
        env: Environment with discrete ``action_space.n`` and
            ``observation_space.n``, whose ``reset()`` returns a state and
            whose ``step(action)`` returns ``(next_state, reward, done, info)``.
        lr: Learning rate (step size) of the update.
        num_episodes: Number of training episodes.
        eps: Initial exploration rate.
        gamma: Discount factor.
        eps_decay: Amount subtracted from ``eps`` at the start of each episode
            while ``eps`` is above ``min_eps``.
        min_eps: Floor for the decayed exploration rate.
        test_every: Run a greedy evaluation every ``test_every`` training
            episodes (starting with episode 0); a falsy value disables it.
        test_episodes: Number of episodes per evaluation run.

    Returns:
        The learned Q-table, a NumPy array of shape ``(nS, nA)``.
    """
    nA = env.action_space.n                          # Size of the action space.
    nS = env.observation_space.n                     # Size of the state space.

    Q = np.zeros((nS, nA))                           # Q-table starts at zero.

    for ep in range(num_episodes):
        state = env.reset()
        done = False

        # Decay exploration once per episode, but never step below the floor:
        # a plain subtraction could undershoot min_eps (or even go negative)
        # when eps_decay is larger than the remaining distance to the floor.
        if eps > min_eps:
            eps = max(eps - eps_decay, min_eps)

        # The first action is chosen before the loop; every later action is
        # the "next action" already picked during the previous update.
        action = eps_greedy(Q, state, eps)

        while not done:
            next_state, rew, done, _ = env.step(action)

            # Pick the next action with the same epsilon-greedy policy.
            next_action = eps_greedy(Q, next_state, eps)

            # Bootstrap from the value of the action that will actually be taken.
            td_target = rew + gamma * Q[next_state, next_action]
            Q[state, action] += lr * (td_target - Q[state, action])

            state = next_state
            action = next_action

        # Periodically report how the current greedy policy performs.
        if test_every and ep % test_every == 0:
            test_rew = run_episodes(env, Q, test_episodes)
            print("Episode:{:5d}  Eps:{:2.4f}  Rew:{:2.4f}".format(ep, eps, test_rew))

    return Q


if __name__ == '__main__':
    # Entry point: only executed when this file is run directly, not when it is imported.
    env = gym.make('Taxi-v3')                       # Create the 'Taxi-v3' environment from OpenAI Gym.

    # Both learners are trained on the same environment with identical hyperparameters.
    hyperparams = dict(lr=.1, num_episodes=5000, eps=0.4, gamma=0.95, eps_decay=0.001)

    Q_qlearning = Q_learning(env, **hyperparams)    # Train with Q-learning.

    Q_sarsa = SARSA(env, **hyperparams)             # Train with SARSA.

Episode:    0  Eps:0.3990  Rew:-235.6850
Episode:  300  Eps:0.0990  Rew:-204.4050
Episode:  600  Eps:0.0100  Rew:-178.6070
Episode:  900  Eps:0.0100  Rew:-126.8770
Episode: 1200  Eps:0.0100  Rew:-153.8570
Episode: 1500  Eps:0.0100  Rew:-76.5620
Episode: 1800  Eps:0.0100  Rew:-35.6510
Episode: 2100  Eps:0.0100  Rew:-41.7790
Episode: 2400  Eps:0.0100  Rew:-32.0440
Episode: 2700  Eps:0.0100  Rew:-5.4690
Episode: 3000  Eps:0.0100  Rew:-12.9770
Episode: 3300  Eps:0.0100  Rew:5.6800
Episode: 3600  Eps:0.0100  Rew:6.3220
Episode: 3900  Eps:0.0100  Rew:7.3270
Episode: 4200  Eps:0.0100  Rew:7.8910
Episode: 4500  Eps:0.0100  Rew:7.9700
Episode: 4800  Eps:0.0100  Rew:7.9920
Episode:    0  Eps:0.3990  Rew:-200.0000
Episode:  300  Eps:0.0990  Rew:-210.4260
Episode:  600  Eps:0.0100  Rew:-212.5980
Episode:  900  Eps:0.0100  Rew:-155.3150
Episode: 1200  Eps:0.0100  Rew:-198.9420
Episode: 1500  Eps:0.0100  Rew:-63.7550
Episode: 1800  Eps:0.0100  Rew:-30.9500
Episode: 2100  Eps:0.0100  Rew:-23.9860
Epi