# Q-Learning

#### Rendering Gym in Colab
https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t

In [1]:
#!pip install gym==0.20.0
#!pip install torch==1.8.0
#!pip install numpy==1.21.2
#!pip install matplotlib==3.4.3
#!pip install gym[box2d]

In [2]:
from collections import defaultdict, namedtuple
import tqdm
import gym
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
print("gym=={}".format(gym.__version__))

gym==0.25.2


  and should_run_async(code)


# Frozen Lake Environment

https://www.gymlibrary.dev/environments/toy_text/frozen_lake/

In [4]:
# Create the Frozen lake environment
# with 4 columns and 4 rows.
env = gym.make('FrozenLake-v1', 
               desc = None, 
               map_name = "4x4", 
               is_slippery = False)

# There are 4 possible actions the agent can take:
# Move up, move down, move left and move right.
# Each action results in the agent moving to a 
# new x-y position, thereby changing the state
# of the environment.
# As the environment is a 4x4 grid, there are
# 16 possible states.
print("Actions: {}, states: {}.".format(env.action_space, 
                                   env.observation_space))

Actions: Discrete(4), states: Discrete(16).


  deprecation(
  deprecation(


In [5]:
# Reset the environment and get the reset state.
state = env.reset(seed = 123)
print("Reset state: {}.".format(state))

actions = [1, 1, 2, 2, 1, 2]

# Take a series of actions and print the resulting state.
for action in actions:
    # Each action changes the state of the environment.
    next_state, reward, done, info = env.step(action)
    print("Next state: {:2d}, reward: {}, game end: {}.".format(next_state, 
                                                                reward, 
                                                                done))

Reset state: 0.
Next state:  4, reward: 0.0, game end: False.
Next state:  8, reward: 0.0, game end: False.
Next state:  9, reward: 0.0, game end: False.
Next state: 10, reward: 0.0, game end: False.
Next state: 14, reward: 0.0, game end: False.
Next state: 15, reward: 1.0, game end: True.


In [6]:
# Reset the environment and get the reset state.
state = env.reset(seed = 123)
print("Reset state: {}.".format(state))

actions = [1, 2]

# Take a series of actions and print the resulting state.
for action in actions:
    # Each action changes the state of the environment.
    next_state, reward, done, info = env.step(action)
    print("Next state: {:2d}, reward: {}, game end: {}.".format(next_state, 
                                                                reward, 
                                                                done))

Reset state: 0.
Next state:  4, reward: 0.0, game end: False.
Next state:  5, reward: 0.0, game end: True.


# Q-Learning Agent

https://github.com/rasbt/machine-learning-book/blob/main/ch19/ch19.ipynb

In [7]:
class Agent(object):
    def __init__(self, 
                 env,                         # gym env.
                 learning_rate = 0.7,         # Q-learning learning rate.
                 discount_factor = np.log(2), # Q-learning discount factor.
                 epsilon = 1.0,               # Greedy epsilon factor.
                 epsilon_min = 0.01,          # Minimum epsilon value to decay to.
                 epsilon_decay = 0.99,        # Epsilon decay rate.
                 train_mode = True            # Train mode. 
                 ):   
      
        # Q-learning hyperparameters.
        self.learning_rate = learning_rate     # Q-learning learning rate.
        self.discount_factor = discount_factor # Q-learning discount factor.
        self.epsilon = epsilon                 # Greedy epsilon factor.
        self.epsilon_min = epsilon_min         # Minimum epsilon value to decay to.
        self.epsilon_decay = epsilon_decay     # Epsilon decay rate.

        # Environment hyperparameters.
        self.env = env          
        self.nA = env.action_space.n

        # Table of Q-values.
        self.q_table = np.zeros([env.observation_space.n, env.action_space.n])

        # If train mode is False, then epsilon will be set to 0, and the
        # agent will choose the action to be taken solely from the Q-table
        # and not use any randomness.
        self.set_train_mode(train_mode)
 
    def choose_action(self, state):
        if self.train_mode == True:
            action = np.argmax(self.q_table[state])
        else: 
            # Exploration - randomly sample an action.
            # Only applicable for training.
            if np.random.uniform() < self.epsilon:
                action = env.action_space.sample()
            # Exploitation - choose the best (highest Q-value) action given some
            # current state.
            # For non-training, this is the only way the model
            # will choose the next action.
            else:
                action = np.argmax(self.q_table[state])
        return action

    def set_train_mode(self, train_mode):
        self.train_mode = train_mode

    def _learn(self, transition):
        s, a, r, next_s, done = transition
        q_val = self.q_table[s][a]
        if done:
            q_target = r
        else:
            q_target = r + self.discount_factor * np.max(self.q_table[next_s])

        self.q_table[s][a] = self.q_table[s][a] + self.learning_rate * (q_target - q_val)
        self._adjust_epsilon()

    def _adjust_epsilon(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_decay

In [8]:
def run_qlearning(agent, env, num_episodes = 50):
    history = {"states" : [], "actions" : [], "n_moves" : [], "rewards" : []}

    for episode in range(num_episodes):
        state = env.reset(seed = 123)

        previous_state = state

        states = [state]
        actions = []
        step = 0
        while True:
            step = step + 1
            # Agent chooses action to take based on current state.
            action = agent.choose_action(state)

            # Performing an action changes the environment's state.
            next_s, reward, done, _ = env.step(action)

            # If the agent walks into a hole, a penalty is incurred.
            if done == True and reward == 0:
                reward = -10
            # If the agent finds the wrapped gift, a reward in incurred.
            elif done == True and reward == 1:
                reward = 10

            # If the agent returns to the previous state or stays in the same
            # state, a penalty is incurred.
            if next_s == previous_state:
                reward = -10
            if next_s == state:
                reward = -10

            # Q-learning.
            agent._learn((state, action, reward, next_s, done))

            states.append(next_s)
            actions.append(action)

            previous_state = state
            state = next_s

            if done:
                break

            final_reward = reward

        history["states"].append(states)
        history["actions"].append(actions)
        history["n_moves"].append(step)
        history["rewards"].append(final_reward)

    return history

In [9]:
# Create environment and agent.

env = gym.make('FrozenLake-v1', 
               desc = None, 
               map_name = "4x4", 
               is_slippery = False)

agent = Agent(env, train_mode = True)


In [10]:
history = run_qlearning(agent, env, 1000)

In [11]:
Q_table = pd.DataFrame(agent.q_table)

display(Q_table)

Unnamed: 0,0,1,2,3
0,-12.496421,-10.396421,1.600027,-12.496421
1,-11.118125,-9.1,2.308351,-7.0
2,-7.0,3.330247,0.0,0.0
3,0.0,0.0,0.0,0.0
4,-12.496421,-10.396421,-9.1,-7.0
5,0.0,0.0,0.0,0.0
6,-7.0,4.80453,0.0,0.0
7,0.0,0.0,0.0,0.0
8,-12.496421,-9.1,-7.0,-7.0
9,-7.0,-10.396421,-7.0,-7.0


In [12]:
#history

  and should_run_async(code)


In [13]:
"""
plt.figure(figsize = (20, 5))
plt.subplot(1, 2, 1)
plt.plot(history["n_moves"][:, 0])
plt.subplot(1, 2, 2,)
plt.plot(history["rewards"][:, 1])
plt.show()
""";

In [14]:
env = gym.make('FrozenLake-v1', 
               desc = None, 
               map_name = "4x4", 
               is_slippery = False)

state = env.reset(seed = 123)

print(state)

0


  deprecation(
  deprecation(


In [15]:
agent.set_train_mode(False)

In [16]:
done = False

while done == False:
    action = agent.choose_action(state)
    next_s, reward, done, _ = env.step(action)

    print("State: {}, action: {}, next_s: {}, reward: {}, done: {}.".format(state, action, next_s, reward, done))

    state = next_s

State: 0, action: 2, next_s: 1, reward: 0.0, done: False.
State: 1, action: 2, next_s: 2, reward: 0.0, done: False.
State: 2, action: 1, next_s: 6, reward: 0.0, done: False.
State: 6, action: 1, next_s: 10, reward: 0.0, done: False.
State: 10, action: 1, next_s: 14, reward: 0.0, done: False.
State: 14, action: 2, next_s: 15, reward: 1.0, done: True.
