# MountainCar-v0
### In this notebook, you will handle the continuous state space of MountainCar-v0 by discretizing it, and then apply a reinforcement learning algorithm (Q-learning) to the discretized problem.

In [None]:
import sys
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import os

from torch.utils.tensorboard import SummaryWriter

# Clean previous runs: delete the TensorBoard log directory used by this notebook.
# ignore_errors=True keeps this from failing when the directory does not exist yet.
shutil.rmtree('runs/mountaincar_qlearning', ignore_errors=True)

# Create an environment and set random seed
# (env.seed() is the seeding call of the gym API version this notebook is written against)
env = gym.make('MountainCar-v0')
env.seed(505)

def create_uniform_grid(low, high, bins=(10, 10)):
    """Build a uniformly spaced grid of split points for each state dimension.

    Each dimension ``d`` of the range ``[low[d], high[d]]`` is cut into
    ``bins[d]`` equal-width intervals. Only the interior boundaries are
    returned, so a dimension with ``bins[d]`` intervals yields
    ``bins[d] - 1`` split points.

    Works for any number of dimensions. ``low``, ``high`` and ``bins`` are
    walked in lockstep, so the number of dimensions produced is the length
    of the shortest of the three.

    Args:
        low: Sequence of lower bounds, one per dimension.
        high: Sequence of upper bounds, one per dimension.
        bins: Sequence with the number of intervals per dimension.

    Returns:
        A list of 1-D NumPy arrays, one per dimension, each holding that
        dimension's interior split points in ascending order.
    """
    grid = []
    for lower, upper, n_bins in zip(low, high, bins):
        # `* 1.0` forces true (float) division regardless of the input types.
        step = (upper - lower) / (n_bins * 1.0)
        grid.append(np.array([lower + i * step for i in range(1, n_bins)]))
    return grid

def discretize(sample, grid):
    """Map a continuous sample to per-dimension bin indices.

    Works for any number of dimensions. ``sample`` and ``grid`` are walked
    in lockstep, so one index is produced for every dimension that both
    of them provide.

    Args:
        sample: Sequence of continuous values, one per dimension.
        grid: Sequence of 1-D arrays of ascending split points, one per
            dimension (for example the output of a uniform-grid builder).

    Returns:
        A list of integer bin indices, one per dimension. With
        ``right=False`` a value equal to a split point falls into the bin
        to the right of that split, so the index for a dimension with
        ``k`` split points ranges from ``0`` to ``k``.
    """
    return [
        np.digitize(np.array([value]), splits, right=False)[0]
        for value, splits in zip(sample, grid)
    ]

class QLearningAgent:
    """Q-Learning agent that can act on a continuous state space by discretizing it.

    The agent keeps a tabular Q-function with one axis per discretized state
    dimension plus a final axis over actions, and follows an epsilon-greedy
    policy whose epsilon is decayed once per episode.
    """

    def __init__(self, env, state_grid, alpha=0.02, gamma=0.99,
                 epsilon=1.0, epsilon_decay_rate=0.9995, min_epsilon=.01, seed=505):
        """Set up the Q-table and the learning / exploration parameters.

        Args:
            env: Environment whose ``action_space.n`` gives the number of actions.
            state_grid: Per-dimension arrays of split points used to discretize states.
            alpha: Learning rate applied to each TD update.
            gamma: Discount factor.
            epsilon: Initial exploration rate.
            epsilon_decay_rate: Multiplicative decay applied to epsilon at each episode start.
            min_epsilon: Lower bound for epsilon.
            seed: Seed passed to NumPy's global random number generator.
        """
        self.env = env
        self.state_grid = state_grid
        # k split points in a dimension produce k + 1 bins.
        self.state_size = tuple(len(splits) + 1 for splits in self.state_grid)
        self.action_size = self.env.action_space.n
        # np.random.seed() returns None, so seed the global generator first and
        # then remember the seed value itself.
        np.random.seed(seed)
        self.seed = seed
        print("Environment:", self.env)
        print("State space size:", self.state_size)
        print("Action space size:", self.action_size)
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = self.initial_epsilon = epsilon
        self.epsilon_decay_rate = epsilon_decay_rate
        self.min_epsilon = min_epsilon
        self.q_table = np.zeros(shape=(self.state_size + (self.action_size,)))

    def preprocess_state(self, state):
        """Return the discretized state as a tuple usable as a Q-table index."""
        return tuple(discretize(state, self.state_grid))

    def reset_episode(self, state):
        """Start a new episode: decay epsilon and pick the first action.

        The first action is always the greedy one for the initial state.

        Returns:
            The chosen action.
        """
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.min_epsilon)
        self.last_state = self.preprocess_state(state)
        self.last_action = np.argmax(self.q_table[self.last_state])
        return self.last_action

    def reset_exploration(self, epsilon=None):
        """Reset epsilon to the given value, or to its initial value if none is given."""
        self.epsilon = epsilon if epsilon is not None else self.initial_epsilon

    def act(self, state, reward=None, done=None, mode='train'):
        """Pick the next action and, unless in test mode, update the Q-table.

        Args:
            state: Current (continuous) state observed after the last action.
            reward: Reward received for the last action; required unless ``mode == 'test'``.
            done: Accepted for call-site symmetry; not used by this method.
            mode: ``'test'`` acts greedily without learning; any other value
                performs a Q-learning update followed by epsilon-greedy selection.

        Returns:
            The chosen action.
        """
        state = self.preprocess_state(state)
        if mode == 'test':
            action = np.argmax(self.q_table[state])
        else:
            # Q-learning update for the previous (state, action) pair, bootstrapping
            # from the best action value in the state just reached.
            last_sa = self.last_state + (self.last_action,)
            td_target = reward + self.gamma * np.max(self.q_table[state])
            self.q_table[last_sa] += self.alpha * (td_target - self.q_table[last_sa])
            # Epsilon-greedy action selection.
            if np.random.uniform(0, 1) < self.epsilon:
                action = np.random.randint(0, self.action_size)
            else:
                action = np.argmax(self.q_table[state])
        self.last_state = state
        self.last_action = action
        return action
    
def run(agent, env, num_episodes=20000, mode='train', writer=None):
    """Run the agent in the environment for a number of episodes.

    Args:
        agent: Object exposing ``reset_episode``, ``act``, ``preprocess_state``,
            ``q_table``, ``gamma``, ``last_state`` and ``last_action``.
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(state, reward, done, info)``.
        num_episodes: Number of episodes to run.
        mode: Passed through to ``agent.act``; progress is printed only
            when it equals ``'train'``.
        writer: Optional object with ``add_scalar(tag, value, step)``. When
            given, the per-episode total reward and the mean absolute TD
            error are logged.

    Returns:
        List with the total reward of every episode.
    """
    scores = []
    max_avg_score = -np.inf
    for i_episode in range(1, num_episodes + 1):
        state = env.reset()
        action = agent.reset_episode(state)
        total_reward = 0
        done = False
        episode_losses = []
        while not done:
            prev_state = agent.last_state
            prev_action = agent.last_action
            state, reward, done, _ = env.step(action)
            total_reward += reward
            # Measure the TD error BEFORE agent.act() applies its update;
            # afterwards the Q-value has already moved towards the target and
            # the logged error would under-report what drove the update.
            next_state = agent.preprocess_state(state)
            td_target = reward + agent.gamma * np.max(agent.q_table[next_state])
            td_error = td_target - agent.q_table[prev_state + (prev_action,)]
            episode_losses.append(abs(td_error))
            action = agent.act(state, reward, done, mode)
        scores.append(total_reward)
        if writer is not None:
            writer.add_scalar("Reward/Episode", total_reward, i_episode)
            if episode_losses:
                avg_loss = np.mean(episode_losses)
                writer.add_scalar("Loss/Episode", avg_loss, i_episode)
        if mode == 'train':
            # `>=` so the very first complete 100-episode window is counted too;
            # otherwise the progress line at episode 100 would still show -inf.
            if len(scores) >= 100:
                avg_score = np.mean(scores[-100:])
                if avg_score > max_avg_score:
                    max_avg_score = avg_score
            if i_episode % 100 == 0:
                print("\rEpisode {}/{} | Max Average Score: {}".format(i_episode, num_episodes, max_avg_score), end="")
                sys.stdout.flush()
    return scores

#### You can modify the parameters, re-train the agent and observe changes!

In [None]:
# MODIFY the hyperparameters here
alpha = 0.02
gamma = 0.99
epsilon = 1.0
epsilon_decay_rate = 0.9995

# Create discretization grid
state_grid_new = create_uniform_grid(env.observation_space.low, env.observation_space.high, bins=(20, 20))

# Initialize agent
q_agent_new = QLearningAgent(env, state_grid_new, alpha, gamma, epsilon, epsilon_decay_rate)
q_agent_new.scores = []

# The cache file name encodes the hyperparameters above, so every
# hyperparameter combination gets its own saved Q-table.
q_table_filename = f'trained_q_table_a{alpha}_g{gamma}_e{epsilon}_d{epsilon_decay_rate}.npy'

writer = SummaryWriter(log_dir="runs/mountaincar_qlearning")
try:
    # Train agent if Q-table does not exist
    if not os.path.exists(q_table_filename):
        print("Training agent...")
        q_agent_new.scores += run(q_agent_new, env, num_episodes=15000, writer=writer)
        # Only reached when training ran to completion, so an interrupted
        # run never leaves a partially trained table in the cache.
        np.save(q_table_filename, q_agent_new.q_table)
        print("\nTraining complete. Q-table saved.")
    else:
        print("Loading existing Q-table...")
        loaded_q_table = np.load(q_table_filename)
        # The file name does not encode the grid resolution, so a table saved
        # with a different `bins` setting would otherwise be loaded silently.
        if loaded_q_table.shape != q_agent_new.q_table.shape:
            raise ValueError(
                f"Cached Q-table {q_table_filename!r} has shape {loaded_q_table.shape}, "
                f"expected {q_agent_new.q_table.shape}; delete the file to retrain."
            )
        q_agent_new.q_table = loaded_q_table
finally:
    # Close the writer even when training fails or is interrupted, so the
    # events logged so far are still written out.
    writer.close()

env.close()

#### You can watch the replay of your trained agent. Enter 'r' to replay, or 'q' to quit.

In [None]:
# Watch Replay
# Interactive loop: 'r' plays one greedy episode with rendering, 'q' leaves the loop.
while True:
    # Normalize once so stray whitespace or upper case is still accepted.
    user_input = input("\nPress 'r' to watch the agent, or 'q' to quit: ").strip().lower()
    if user_input == 'q':
        print("Exiting.")
        break
    if user_input != 'r':
        print("Invalid input. Please use 'r' or 'q'.")
        continue

    # A fresh environment is created for every replay.
    view_env = gym.make('MountainCar-v0')
    try:
        state = view_env.reset()
        score = 0
        # Cap the replay at 200 steps, stopping earlier if the episode ends.
        for _ in range(200):
            # mode='test' makes the agent act greedily without touching the Q-table.
            action = q_agent_new.act(state, mode='test')
            view_env.render()
            state, reward, done, _ = view_env.step(action)
            score += reward
            if done:
                break
        print(f'Final score: {score}')
    finally:
        # Always release the render window, even if rendering or stepping
        # fails or the replay is interrupted.
        view_env.close()

#### You can start TensorBoard to monitor the agent's training process.

In [None]:
# Restart the kernel every time you run the program with different parameters to see the changes in TensorBoard
!tensorboard --logdir=runs/mountaincar_qlearning

### Open a new tab in the browser and type:
http://localhost:6006