# NEURAL NETWORKS AND DEEP LEARNING

---
A.A. 2021/22 (6 CFU) - Dr. Alberto Testolin, Dr. Umberto Michieli
---


# Homework 3 - Reinforcement Learning

### Author: Michele Guadagnini - Mt.1230663

# Part 2: CartPole-v1 with pixels

In [None]:
### ADDITIONAL LIBRARIES THAT NEED INSTALLATION (uncomment if needed)

#!pip install gym


In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import os
import datetime
import time
import logging
import matplotlib

import torch
from torch import nn
from collections import deque
import gym
import pytorch_lightning as pl

from pytorch_lightning.utilities.seed import seed_everything
### 'seed_everything(seed)' internally calls the followings:
#    random.seed(seed)
#    np.random.seed(seed)
#    torch.manual_seed(seed)
#    torch.cuda.manual_seed_all(seed)

# Seed used by every `seed_everything` call and passed to the agents.
MAGIC_NUM = 23

# Keep Lightning quiet: only messages of level WARNING or above are emitted.
_lightning_logger = logging.getLogger("pytorch_lightning")
_lightning_logger.setLevel(logging.WARNING)

In [None]:
# utility function to plot some statistics about the training
def plot_results(results, x_label="Episode", figsize=(6,4), avg_window=None, show=False, savepath=None):
    """Plot every series of `results` on its own panel of a single-row figure.

    Parameters
    ----------
    results : dict
        Maps a panel name (also used as y-axis label) to a sequence of values.
        The key "Temperature" is drawn as a single curve; every other key is
        drawn as a raw curve plus, when possible, its moving average.
    x_label : str
        Label of the x axis of every panel.
    figsize : tuple
        (width, height) of ONE panel; the figure width is multiplied by the
        number of panels.
    avg_window : int or None
        Length of the moving-average window. With None (or a window longer
        than the series) only the raw curve is drawn.
    show : bool
        If True the figure is displayed with `plt.show()`.
    savepath : str or None
        If not None the figure is saved to this path.

    Returns
    -------
    None
    """
    keys    = list(results)
    n_plots = len(keys)
    if n_plots == 0:
        # nothing to draw (plt.subplots would raise on zero columns)
        return

    full_fig_size = (figsize[0]*n_plots, figsize[1])

    # squeeze=False always returns a 2D array of axes, so the loop below also
    # works when there is a single panel
    fig, axs = plt.subplots(1, n_plots, figsize=full_fig_size, squeeze=False)

    for key, ax in zip(keys, axs[0]):
        values = results[key]

        if key == "Temperature":
            ax.plot(values, label="Temperature profile", color="blue")
        else:
            # the raw curve is always drawn, so the panel is never empty
            ax.plot(values, label=key, color="lightblue")

            # the moving average is defined only when the window fits in the series
            if avg_window is not None and 0 < avg_window <= len(values):
                moving_avg = np.convolve(values, np.ones(avg_window), 'valid') / avg_window

                # shift the smoothed curve by half a window along the x axis
                x_space = np.arange(avg_window/2, len(moving_avg)+avg_window/2)
                ax.plot(x_space, moving_avg, label=key+" (smoothed)", lw=2, color="blue")

        ax.grid()
        ax.set_xlabel(x_label)
        ax.set_ylabel(key)
        ax.legend()

    fig.tight_layout()

    if savepath is not None:
        #save picture
        fig.savefig(savepath)

    if show:
        plt.show()
    # close this figure explicitly to free its memory
    plt.close(fig)

    return

In [None]:
# import the classes implemented to solve this task
from cartpolefrompixels.agent import DQNAgent
from cartpolefrompixels.callbacks import RLResults, MaxEpisodesStop

---
<a name="top-shortcuts"></a>
## Table of contents:

1. [**Train the agent**](#Train-the-agent)
1. [**Test the trained agent**](#Test-the-trained-agent)
1. [**Assistance of a pretrained policy**](#Assistance-of-a-pretrained-policy)
    1. [*Train the agent with assistance*](#Train-the-agent-with-assistance)
    1. [*Test the agent trained with assistance*](#Test-the-agent-trained-with-assistance)

---

## Train the agent
[Table of contents](#top-shortcuts)

In [None]:
# re-seed the random generators so that the training run is reproducible
seed_everything(seed=MAGIC_NUM)

In [None]:
# ---- configuration of the agent and of its training ----

# architecture of the policy network
# NOTE(review): each `conv_config` entry presumably describes one convolutional
# layer (kernel, stride, padding) - confirm against the agent implementation
policy_params = dict(
    conv_channels = [16, 32],
    linear_units  = [512, 128],
    activation    = "relu",
    batch_norm    = True,
    dropout       = 0.,
    conv_config   = [[8, 4, 0], [4, 2, 0]],
)

# exploration (temperature) profile
behaviour_params = dict(
    initial_temperature     = 4.,
    decay_const_in_interval = 8,
)

# remaining hyper-parameters
N_episodes, mem_capacity  = 2000, 15360
batch_size, learning_rate = 128, 0.01
target_sync_rate          = 200      # number of steps between target net updates
penalty_type              = "none"   # "state"  or  "pixels" or "none"


In [None]:
# collect the constructor arguments first, then build the agent from them
agent_config = dict(
    env_name         = "CartPole-v1",
    N_episodes       = N_episodes,
    mem_capacity     = mem_capacity,
    policy_params    = policy_params,
    behaviour_params = behaviour_params,
    target_sync_rate = target_sync_rate,   # number of steps between target net updates
    batch_size       = batch_size,
    gamma            = 0.97,
    optimizer        = "sgd",
    learning_rate    = learning_rate,
    L2_penalty       = 0.,
    momentum         = 0.,
    seed             = MAGIC_NUM,
    penalty_type     = penalty_type,       # "state"  or  "pixels" or "none"
)
agent = DQNAgent(**agent_config)

In [None]:
# fill memory with initial random steps
# 5120 steps correspond to 40 batches of 128 samples, i.e. one third of the
# 15360-slot memory capacity configured for this agent.
# NOTE(review): presumably this runs before training so that the first batches
# can be sampled from a non-empty memory - confirm in DQNAgent.fill_memory
warm_up_steps = 5120
agent.fill_memory(warm_up_steps)

In [None]:
# setup trainer and callbacks
# `results_callback` exposes the history (temperatures, losses, rewards, scores)
# that is read after training to draw the plots
results_callback = RLResults("results")

trainer = pl.Trainer(
    logger = False,
    # we use a callback to stop when completed the required number of episodes,
    # so this is only a very large upper bound; `max_epochs` is documented as an
    # integer, hence the int literal instead of the float 1e6
    max_epochs = 1_000_000,
    callbacks = [MaxEpisodesStop(), results_callback],
    enable_model_summary = False,
    enable_checkpointing = False,
)

In [None]:
# run the training, reporting start/end timestamps and the elapsed time
print( "Training started at:", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") )
fit_begin = time.perf_counter()  # monotonic clock: not affected by system clock adjustments

try:
    trainer.fit(agent) # run the training
    fit_time = time.perf_counter() - fit_begin
finally:
    # release the environment even when the training fails or is interrupted
    agent.env.close()

print( "Training ended at:", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") )
print("Fit time:", str(datetime.timedelta(seconds=fit_time)) )

In [None]:
# save checkpoint on disk
# (the same file is loaded back in the "Test the trained agent" section)
trainer.save_checkpoint("CartPolePixels/BestAgent_none_penalty_sgd_relu_2000.ckpt")

In [None]:
# gather the history recorded by the callback: (panel title, callback attribute)
history_fields = (
    ("Temperature",    "temperatures"),
    ("Loss",           "losses"),
    ("Episode reward", "rewards"),
    ("Score",          "scores"),
)
results = {label: getattr(results_callback, attr) for label, attr in history_fields}

# one panel per entry; non-temperature panels also get a 20-point moving average
plot_results(results,
             avg_window = 20,
             show       = True,
             savepath   = "CartPolePixels/History_none_penalty_sgd_relu_2000.pdf" )

## Test the trained agent
[Table of contents](#top-shortcuts)

In [None]:
# re-seed the random generators so that the test episodes are reproducible
seed_everything(seed=MAGIC_NUM)

In [None]:
# restore the trained agent from the checkpoint saved on disk
# (edit `path` to test a different checkpoint)
path = "CartPolePixels/BestAgent_none_penalty_sgd_relu_2000.ckpt"
agent = DQNAgent.load_from_checkpoint(path)

In [None]:
# run a set of episodes
N_iters = 10
try:
    results = agent.run(N_iters, record=False)
finally:
    # always release the environment, also when an episode raises
    agent.env.close()

In [None]:
# scores of all the test episodes (one entry per episode)
average_score = [it['score'] for it in results]

# per-episode report, left-aligned in fixed-width columns
for it in results:
    print(f"ID: {it['episode_id']: <{4}}"+
          f" - Reward: {it['final_reward']: <{20}}"+
          f" - Score: {it['score']: <{8}}")

# summary: mean and standard deviation of the scores, then the number of
# episodes counted as solved (score equal to 500)
print("")
print(f"Average score: {np.mean(average_score)} +/- {np.std(average_score)}")
print("Solved episodes: ", sum(score == 500 for score in average_score))

In [None]:
# run a set of episodes and record videos
N_iters = 10
try:
    results = agent.run(N_iters, record=True, video_folder="CartPolePixels/Videos_agent_6")
finally:
    # always release the environment, also when an episode raises
    agent.env.close()

## Assistance of a pretrained policy
[Table of contents](#top-shortcuts)

In the following cells we test the use of a pretrained network as a *teacher* for the new agent. At the beginning of the training, the teacher network selects the action to take with a probability of `0.6`. This probability decreases linearly until it reaches 0 once `60%` of the assigned episodes have been completed. <br>
As the teacher we use the network trained on the environment state variables, which can easily solve the task. We import the network class and the weights in the following cells. <br>

In [None]:
# teacher model class
class DQN(nn.Module):
    """Fully connected network mapping a state vector to one output per action.

    The layers are stored in `self.linear` as
    Linear -> activation -> ... -> Linear (no activation after the last layer).

    Parameters
    ----------
    state_space_dim : int
        Number of input features.
    action_space_dim : int
        Number of outputs.
    hidden_units : sequence of int
        Width of each hidden layer (any list or tuple; may be empty).
    activation : str
        "tanh" or "relu".

    Raises
    ------
    ValueError
        If `activation` is not one of the supported names.
    """

    # supported activation names and the corresponding layer classes
    _ACTIVATIONS = {"tanh": nn.Tanh, "relu": nn.ReLU}

    def __init__(self, state_space_dim, action_space_dim,
                 hidden_units = (128,128),
                 activation   = "tanh",
                ):
        super().__init__()

        # activation (fail early with a clear message on unsupported names)
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unsupported activation '{activation}': "
                f"choose one of {sorted(self._ACTIVATIONS)}"
            )
        self.act = self._ACTIVATIONS[activation]

        # layers units (unpacking accepts both lists and tuples)
        units = [state_space_dim, *hidden_units, action_space_dim]

        # hidden layers: every linear layer is followed by the activation
        layers_list = []
        for n_in, n_out in zip(units[:-2], units[1:-1]):
            layers_list.append(nn.Linear(n_in, n_out))
            layers_list.append(self.act())
        # output layer, without activation
        layers_list.append(nn.Linear(units[-2], units[-1]))

        self.linear = nn.Sequential(*layers_list)

    def forward(self, x):
        """Return the network output for the input tensor `x`."""
        return self.linear(x)
    

In [None]:
# load teacher model
# (4 inputs and 2 outputs; the hidden sizes must match the saved weights,
#  otherwise `load_state_dict` fails)
teacher_net = DQN(state_space_dim  = 4, 
                  action_space_dim = 2, 
                  hidden_units     = [256, 64],
                  activation       = "tanh"
                 )
# load model weights from file
# map_location="cpu" lets weights saved from a GPU be loaded on a CPU-only machine;
# the parameters are copied into `teacher_net`, which was created on the CPU.
# NOTE: `torch.load` unpickles the file, so load weights from trusted sources only.
teacher_net.load_state_dict(torch.load("CartPolePixels/TeacherNet_weights", map_location="cpu"))
teacher_net.eval()

### Train the agent with assistance
[Table of contents](#top-shortcuts)

In [None]:
# re-seed the random generators so that the assisted training is reproducible
seed_everything(seed=MAGIC_NUM)

In [None]:
# ---- configuration of the teacher-assisted agent and of its training ----

# architecture of the policy network
# NOTE(review): each `conv_config` entry presumably describes one convolutional
# layer (kernel, stride, padding) - confirm against the agent implementation
policy_params = dict(
    conv_channels = [16, 32],
    linear_units  = [512, 128],
    activation    = "relu",
    batch_norm    = True,
    dropout       = 0.,
    conv_config   = [[8, 4, 0], [4, 2, 0]],
)

# exploration (temperature) profile
behaviour_params = dict(
    initial_temperature     = 4.,
    decay_const_in_interval = 12,   #higher for teacher-assisted training
)

# remaining hyper-parameters
N_episodes, mem_capacity  = 1000, 15360
batch_size, learning_rate = 128, 0.01
target_sync_rate          = 200      # number of steps between target net updates
penalty_type              = "none"   # "state"  or  "pixels" or "none"


In [None]:
# collect the constructor arguments first, then build the agent from them
agent_config = dict(
    env_name         = "CartPole-v1",
    N_episodes       = N_episodes,
    mem_capacity     = mem_capacity,
    policy_params    = policy_params,
    behaviour_params = behaviour_params,
    target_sync_rate = target_sync_rate,   # number of steps between target net updates
    batch_size       = batch_size,
    gamma            = 0.97,
    optimizer        = "sgd",
    learning_rate    = learning_rate,
    L2_penalty       = 0.,
    momentum         = 0.,
    seed             = MAGIC_NUM,
    penalty_type     = penalty_type,       # "state"  or  "pixels" or "none"
)
agent = DQNAgent(**agent_config)

In [None]:
# set teacher net
# The pretrained network is handed to the agent before the memory warm-up and
# the training; it is removed again (`agent.teacher_net = None`) before the
# checkpoint is saved.
agent.set_teacher(teacher_net)

In [None]:
# fill memory with initial random steps
# 5120 steps correspond to 40 batches of 128 samples, i.e. one third of the
# 15360-slot memory capacity configured for this agent.
# NOTE(review): presumably this runs before training so that the first batches
# can be sampled from a non-empty memory - confirm in DQNAgent.fill_memory
warm_up_steps = 5120
agent.fill_memory(warm_up_steps)

In [None]:
# setup trainer and callbacks
# `results_callback` exposes the history (temperatures, losses, rewards, scores)
# that is read after training to draw the plots
results_callback = RLResults("results")

trainer = pl.Trainer(
    logger = False,
    # we use a callback to stop when completed the required number of episodes,
    # so this is only a very large upper bound; `max_epochs` is documented as an
    # integer, hence the int literal instead of the float 1e6
    max_epochs = 1_000_000,
    callbacks = [MaxEpisodesStop(), results_callback],
    enable_model_summary = False,
    enable_checkpointing = False,
)

In [None]:
# run the assisted training, reporting start/end timestamps and the elapsed time
print( "Training started at:", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") )
fit_begin = time.perf_counter()  # monotonic clock: not affected by system clock adjustments

try:
    trainer.fit(agent) # run the training
    fit_time = time.perf_counter() - fit_begin
finally:
    # release the environment even when the training fails or is interrupted
    agent.env.close()

print( "Training ended at:", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") )
print("Fit time:", str(datetime.timedelta(seconds=fit_time)) )

In [None]:
# save checkpoint on disk
# The teacher is detached first: it must stay before `save_checkpoint`,
# because we don't need to checkpoint it together with the agent.
agent.teacher_net = None #remove teacher as we don't need to checkpoint it
# (the same file is loaded back in the "Test the agent trained with assistance" section)
trainer.save_checkpoint("CartPolePixels/BestAgent_none_penalty_sgd_relu_teacher.ckpt")

In [None]:
# gather the history recorded by the callback: (panel title, callback attribute)
history_fields = (
    ("Temperature",    "temperatures"),
    ("Loss",           "losses"),
    ("Episode reward", "rewards"),
    ("Score",          "scores"),
)
results = {label: getattr(results_callback, attr) for label, attr in history_fields}

# one panel per entry; non-temperature panels also get a 20-point moving average
plot_results(results,
             avg_window = 20,
             show       = True,
             savepath   = "CartPolePixels/History_none_penalty_sgd_relu_teacher.pdf" )

### Test the agent trained with assistance
[Table of contents](#top-shortcuts)

In [None]:
# re-seed the random generators so that the test episodes are reproducible
seed_everything(seed=MAGIC_NUM)

In [None]:
# restore the agent trained with assistance from the checkpoint saved on disk
# (edit `path` to test a different checkpoint)
path = "CartPolePixels/BestAgent_none_penalty_sgd_relu_teacher.ckpt"
agent = DQNAgent.load_from_checkpoint(path)

In [None]:
# run a set of episodes
N_iters = 10
try:
    results = agent.run(N_iters, record=False)
finally:
    # always release the environment, also when an episode raises
    agent.env.close()

In [None]:
# scores of all the test episodes (one entry per episode)
average_score = [it['score'] for it in results]

# per-episode report, left-aligned in fixed-width columns
for it in results:
    print(f"ID: {it['episode_id']: <{4}}"+
          f" - Reward: {it['final_reward']: <{20}}"+
          f" - Score: {it['score']: <{8}}")

# summary: mean and standard deviation of the scores, then the number of
# episodes counted as solved (score equal to 500)
print("")
print(f"Average score: {np.mean(average_score)} +/- {np.std(average_score)}")
print("Solved episodes: ", sum(score == 500 for score in average_score))

In [None]:
# run a set of episodes and record videos
N_iters = 10
try:
    results = agent.run(N_iters, record=True, video_folder="CartPolePixels/Videos_teacher_8")
finally:
    # always release the environment, also when an episode raises
    agent.env.close()