# **Q-Learning**

## **Problem Statement**

Solve the cliff walking environment using Q-learning.

### **Environment**

Cliff walking is a two-dimensional board game.

![Demo 7 Q-learning](https://drive.google.com/uc?id=1rLg6GH1ErppUumPO06NTEMROQjwlMHxw)

* The environment has episodic tasks.

* The agent starts at state **S** and moves through the environment to reach the goal **G**.

* Agent can move up, down, right, and left.

* The agent receives a reward of -1 for every move, except for moves into the cliff or the goal.

* Falling into the cliff incurs a reward of -100, and the agent is sent back to the start state **S**.

### **Import tools and environment**

In [2]:
import gym
import itertools
import matplotlib
import numpy as np
import pandas as pd
import sys
import cliff_walking
from collections import namedtuple
from matplotlib import pyplot as plt
from collections import defaultdict
from cliff_walking import CliffWalkingEnv
matplotlib.style.use('ggplot')

ImportError: cannot import name 'discrete' from 'gym.envs.toy_text' (/usr/local/lib/python3.10/site-packages/gym/envs/toy_text/__init__.py)

In [None]:
# Instantiate the cliff-walking environment (CliffWalkingEnv from the local
# cliff_walking module) that the training cell below passes to q_learning.
env = CliffWalkingEnv()

### **Create the $\epsilon$-greedy policy**

In [None]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """Build an epsilon-greedy policy on top of an action-value table.

    Args:
        Q: Mapping from a state to an array of action values for that state.
        epsilon: Probability of selecting a random action.
        nA: Number of actions available in the environment.

    Returns:
        A function that takes an observation and returns a numpy array of
        length nA holding the probability of selecting each action.
    """

    def policy_fn(observation):
        # Every action gets an equal share of the exploration mass ...
        probabilities = np.full(nA, epsilon, dtype=float) / nA
        # ... and the currently greedy action also receives the remainder.
        greedy_action = np.argmax(Q[observation])
        probabilities[greedy_action] += 1.0 - epsilon
        return probabilities

    return policy_fn

### **Q-Learning**

In [None]:
def q_learning(env, num_episodes, discount_factor=1.0, alpha=0.5, epsilon=0.1):
    """Run tabular Q-learning, an off-policy TD control algorithm.

    Actions are sampled from an epsilon-greedy policy over the current
    estimates, while each update bootstraps from the greedy (maximum) action
    value of the next state.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` returning
            the initial state, and ``step(action)`` returning
            ``(next_state, reward, done, info)``. States are used as
            dictionary keys, so they must be hashable.
        num_episodes: Number of episodes to run.
        discount_factor: Gamma, the discount applied to the next state's value.
        alpha: Learning rate.
        epsilon: Probability of sampling a random action.

    Returns:
        A tuple ``(Q, stats)``:
        Q: defaultdict mapping each visited state to a numpy array of action
            values (one entry per action).
        stats: Named tuple with two numpy arrays of length ``num_episodes``:
            ``episode_lengths`` (number of steps taken in each episode) and
            ``episode_rewards`` (sum of rewards in each episode).
    """
    n_actions = env.action_space.n

    # Action-value table: state -> per-action values, created as zeros the
    # first time a state is looked up.
    Q = defaultdict(lambda: np.zeros(n_actions))

    # Storing the statistics in a named tuple keeps plotting simple.
    EpisodeStats = namedtuple("Stats", ["episode_lengths", "episode_rewards"])
    stats = EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes),
    )

    # Behaviour policy. It reads Q on every call, so it follows the updates
    # made below without having to be rebuilt.
    policy = make_epsilon_greedy_policy(Q, epsilon, n_actions)

    for i_episode in range(num_episodes):
        # Report progress every 100 episodes (useful for debugging).
        if (i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()

        state = env.reset()

        for t in itertools.count():
            # Sample an action from the epsilon-greedy distribution and step.
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)

            stats.episode_rewards[i_episode] += reward
            # t is a zero-based step index, so t + 1 steps have been taken.
            stats.episode_lengths[i_episode] = t + 1

            # TD update towards reward + gamma * max_a Q(next_state, a).
            td_target = reward + discount_factor * np.max(Q[next_state])
            Q[state][action] += alpha * (td_target - Q[state][action])

            if done:
                break

            state = next_state

    return Q, stats

In [None]:
# Train for 500 episodes using q_learning's default hyperparameters
# (discount_factor=1.0, alpha=0.5, epsilon=0.1).
Q, stats = q_learning(env, 500)

### **Plot the stats**

In [None]:
def plot_episode_stats(stats, smoothing_window=10, noshow=False):
    """Plot episode length, smoothed episode reward, and episodes vs. time steps.

    Args:
        stats: Object with ``episode_lengths`` and ``episode_rewards``
            sequences holding one entry per episode.
        smoothing_window: Window size of the rolling mean applied to the
            episode rewards. The first ``smoothing_window - 1`` smoothed
            values are NaN because a full window is required.
        noshow: If true, each figure is closed instead of being displayed.

    Returns:
        Tuple ``(fig1, fig2, fig3)`` of the three matplotlib figures.
    """

    def _show_or_close(fig):
        # pyplot.show() displays the open figures and accepts only the
        # keyword ``block``; a Figure passed positionally is either rejected
        # or misinterpreted, depending on the backend.
        if noshow:
            plt.close(fig)
        else:
            plt.show()

    # Plot the episode length over time
    fig1 = plt.figure(figsize=(10, 5))
    plt.plot(stats.episode_lengths)
    plt.xlabel("Episode")
    plt.ylabel("Episode Length")
    plt.title("Episode Length over Time")
    _show_or_close(fig1)

    # Plot the episode reward over time
    fig2 = plt.figure(figsize=(10, 5))
    rewards_smoothed = (
        pd.Series(stats.episode_rewards)
        .rolling(smoothing_window, min_periods=smoothing_window)
        .mean()
    )
    plt.plot(rewards_smoothed)
    plt.xlabel("Episode")
    plt.ylabel("Episode Reward (Smoothed)")
    plt.title("Episode Reward over Time (Smoothed over window size {})".format(smoothing_window))
    _show_or_close(fig2)

    # Plot time steps and episode number
    fig3 = plt.figure(figsize=(10, 5))
    plt.plot(np.cumsum(stats.episode_lengths), np.arange(len(stats.episode_lengths)))
    plt.xlabel("Time Steps")
    plt.ylabel("Episode")
    plt.title("Episode per time step")
    _show_or_close(fig3)

    return fig1, fig2, fig3

In [None]:
# Plot episode length, smoothed episode reward, and episodes versus time
# steps for the training run.
plot_episode_stats(stats)