# NEURAL NETWORKS AND DEEP LEARNING

---
A.A. 2021/22 (6 CFU) - Dr. Alberto Testolin, Dr. Umberto Michieli
---


# Homework 3 - Reinforcement Learning

### Author: Michele Guadagnini - Mt.1230663

# Part 1: CartPole-v1

This part of the exercise is based on the notebook of `LAB 07` about Reinforcement Learning and part of the code used here is taken or adapted from it. 

In [None]:
### ADDITIONAL LIBRARIES THAT NEED INSTALLATION (uncomment if needed)

#!pip install gym
#!pip install optuna

### the following are required to plot and save figures about the optuna study
#!pip install plotly
#!pip install kaleido

In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import os
import datetime
import logging

import torch
from torch import nn
from collections import deque # this python module implements exactly what we need for the replay memory
import gym
import optuna

MAGIC_NUM = 23   # seed given to the torch / numpy / random generators and to the Gym environments

---
<a name="top-shortcuts"></a>
## Table of contents:

1. [**Model and tools implementation**](#Model-and-tools-implementation)
1. [**Impact of exploration profile**](#Impact-of-exploration-profile)
    1. [Exploration profiles with softmax and $\epsilon$-greedy behaviours](#Exploration-profiles-with-softmax-and-$\epsilon$-greedy-behaviours)
    1. [Comparison of different profiles results](#Comparison-of-different-profiles-results)

1. [**Tuning model hyper-parameters and reward function**](#Tuning-model-hyper-parameters-and-reward-function)

---

## Model and tools implementation
[Table of contents](#top-shortcuts)

In [None]:
class ReplayMemory(object):
    """Bounded FIFO buffer of (state, action, next_state, reward) transitions."""

    def __init__(self, capacity):
        # A deque with `maxlen` drops its oldest element when a new one is
        # appended to a full buffer
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, next_state, reward):
        """Append one transition to the buffer."""
        transition = (state, action, next_state, reward)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return up to `batch_size` transitions drawn at random without replacement."""
        # Fall back to everything that is stored when fewer than `batch_size`
        # transitions are available
        n_stored = len(self.memory)
        n_samples = n_stored if n_stored < batch_size else batch_size
        return random.sample(self.memory, n_samples)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
class DQN(nn.Module):
    """
    Fully connected network mapping a state vector to one value per action.

    Args:
        state_space_dim:  size of the input (state) vector.
        action_space_dim: size of the output (one value per action).
        hidden_units:     sequence with the number of units of each hidden layer.
        activation:       activation placed after every hidden layer,
                          either "tanh" or "relu".

    Raises:
        ValueError: if `activation` is not one of the supported names.
    """

    # supported activation names -> module class
    _ACTIVATIONS = {"tanh": nn.Tanh, "relu": nn.ReLU}

    def __init__(self, state_space_dim, action_space_dim, 
                 hidden_units = (128,128),
                 activation   = "tanh",
                ):
        super().__init__()
        
        # activation (fail early instead of leaving `self.act` undefined)
        if activation not in self._ACTIVATIONS:
            raise ValueError(
                f"Unknown activation: {activation!r}. "
                f"Supported values: {sorted(self._ACTIVATIONS)}."
            )
        self.act = self._ACTIVATIONS[activation]
        
        # layers units (list() lets callers pass either a list or a tuple)
        units = [state_space_dim] + list(hidden_units) + [action_space_dim]
        
        # every hidden layer is followed by the activation, the output layer is linear
        layers_list = []
        for n_in, n_out in zip(units[:-2], units[1:-1]):
            layers_list.append(nn.Linear(n_in, n_out))
            layers_list.append(self.act())
        layers_list.append(nn.Linear(units[-2], units[-1]))

        self.linear = nn.Sequential(*layers_list)

    def forward(self, x):
        """Return the network output for the input tensor `x`."""
        return self.linear(x)

In [None]:
class ActionChoice(object):
    """
    Class that implements both epsilon-greedy and softmax behaviours.

    The behaviour parameter (epsilon for "eps-greedy", temperature for
    "softmax") is read from `exploration_profile` at the position of the
    current iteration when a profile is given; otherwise the constant
    `behaviour_param` is used.

    Raises:
        ValueError: if `behaviour` is neither "eps-greedy" nor "softmax".
    """
    def __init__(self, behaviour="eps-greedy", exploration_profile=None, behaviour_param=0.05):
        
        if behaviour == "eps-greedy":
            self._choice_func = self._eps_greedy_choice
        elif behaviour == "softmax":
            self._choice_func = self._softmax_choice
        else:
            raise ValueError(f"Unknown behaviour type: {behaviour}.")
        
        self.behaviour           = behaviour
        self.exploration_profile = exploration_profile
        self.behaviour_param     = behaviour_param    # used only if 'exploration_profile' is None 
    
    def choose_action(self, net, state, iter_id):
        """
        Choose an action for `state` following the configured behaviour.

        Returns:
            (action, net_out): the chosen action as a Python int and the
            network output as a numpy array.
        """
        # evaluate network output
        net_out = self._evaluate_net_output(net, state)
        # get parameter (eps or temperature) based on profile
        param  = self._get_behaviour_param(iter_id)
        # apply choice
        action = self._choice_func(net_out, param)
        
        return action, net_out.numpy()   
    
    def choose_optimal_action(self, net, state):
        """Return the argmax action (as int) and the network output as a numpy array."""
        # evaluate network output
        net_out = self._evaluate_net_output(net, state)
        # apply choice
        action = int(net_out.argmax())
        
        return action, net_out.numpy()  
    
    def _get_behaviour_param(self, iter_id):
        """Return the behaviour parameter to use at iteration `iter_id`."""
        if self.exploration_profile is not None:
            return self.exploration_profile[int(iter_id)]
        return self.behaviour_param  
    
    def _evaluate_net_output(self, net, state):
        """Evaluate `net` on `state` without tracking gradients (puts `net` in eval mode)."""
        with torch.no_grad():
            net.eval()
            state = torch.tensor(state, dtype=torch.float32) # Convert the state to tensor
            net_out = net(state)
        return net_out
          
    def _eps_greedy_choice(self, net_out, epsilon):
        """
        Pick a non-optimal action with probability `epsilon`, the best one otherwise.

        Raises:
            ValueError: if `epsilon` is outside [0, 1].
        """
        if epsilon > 1 or epsilon < 0:
            raise ValueError('The epsilon value must be between 0 and 1')
        
        # Get the best action (argmax of the network output)
        best_action = int(net_out.argmax())
        
        # Get the number of possible actions
        action_space_dim = net_out.shape[-1]
        
        # Select a non optimal action with probability epsilon, otherwise choose the best action
        if random.random() < epsilon:
            non_optimal_actions = [a for a in range(action_space_dim) if a != best_action]
            # With a single available action there is nothing to explore:
            # fall through to the best action instead of sampling an empty list
            if non_optimal_actions:
                return random.choice(non_optimal_actions)

        return best_action
         
    def _softmax_choice(self, net_out, temperature):
        """
        Sample an action from the softmax of `net_out / temperature`.

        A temperature of exactly 0 selects the best action deterministically.

        Raises:
            ValueError: if `temperature` is negative.
        """
        if temperature < 0:
            raise ValueError('The temperature value must be greater than or equal to 0 ')

        # If the temperature is 0, just select the best action
        if temperature == 0.:
            return int(net_out.argmax())

        # Apply softmax with temp
        temperature = max(temperature, 1e-8) # set a minimum to the temperature for numerical stability
        softmax_out = nn.functional.softmax(net_out / temperature, dim=0).numpy()

        # Sample the action using softmax output as mass pdf
        all_possible_actions = np.arange(0, softmax_out.shape[-1])
        action = np.random.choice(all_possible_actions, p=softmax_out) 

        # Convert the numpy integer to a plain int, as the other choice paths return
        return int(action)
    

In [None]:
def exponential_profile(initial_value=5., num_iterations=1000, k=6):
    """
    Build an exponentially decaying profile y(t) = N * exp(-t / tau).

    N   = `initial_value`
    tau = `num_iterations / k`, i.e. `k` characteristic lengths fit in the interval
    t   = step index, from 0 to `num_iterations - 1`

    Returns a list with one value per step.
    """
    tau = num_iterations / k

    exploration_profile = []
    for step in range(num_iterations):
        exploration_profile.append(initial_value * np.exp(-step/tau))

    return exploration_profile


def linear_profile(initial_value=1., num_iterations=1000, decay_frac = 0.5, bottom=0.):
    """
    Build a profile that decays linearly from `initial_value` towards `bottom`
    during the first `decay_frac` fraction of the iterations and then stays
    at `bottom`.

    Returns a list with one value per iteration.
    """
    n_decay = int(num_iterations*decay_frac)
    
    # linear ramp
    exploration_profile = []
    for step in range(n_decay):
        exploration_profile.append(bottom+(initial_value-bottom)*(n_decay - step)/n_decay)
    
    # flat tail for the remaining iterations
    exploration_profile.extend([bottom] * (num_iterations - n_decay))
    
    return exploration_profile


def noisy_profile(initial_value=1., num_iterations=1000, noise_frac=0.6, clip_value=None):
    """
    Build a noisy profile: every point is the absolute value of a standard
    normal sample (half-normal noise) scaled by `initial_value` and by a
    linearly decaying envelope that reaches zero after `noise_frac` of the
    iterations. Points are capped at `clip_value` when it is given.

    Returns a list with one value per iteration.
    """
    envelope = linear_profile(1., num_iterations, noise_frac)
    
    exploration_profile = []
    for scale in envelope[:num_iterations]:
        half_normal = np.abs( np.random.randn() )
        value = initial_value*half_normal*scale
        if clip_value is not None and clip_value < value:
            value = clip_value
        exploration_profile.append( value )
    
    return exploration_profile   


In [None]:
def update_step(policy_net, target_net, replay_mem, gamma, optimizer, loss_fn, batch_size):
    """
    Run one optimization step of `policy_net` on a batch sampled from `replay_mem`.

    Args:
        policy_net: network being trained.
        target_net: network used to evaluate the value of the next states.
        replay_mem: object whose `sample(batch_size)` returns a list of
                    (state, action, next_state, reward) tuples; `next_state`
                    is None for terminal transitions.
        gamma:      discount factor applied to the next-state value.
        optimizer:  optimizer updating the parameters of `policy_net`.
        loss_fn:    loss between predicted and expected Q values.
        batch_size: number of transitions requested from the replay memory.

    Returns:
        The loss value of the step as a Python float.

    Raises:
        ValueError: if the replay memory returns an empty batch.
    """
    # Sample the data from the replay memory
    batch = replay_mem.sample(batch_size)
    batch_size = len(batch)
    if batch_size == 0:
        raise ValueError("Cannot run an update step on an empty batch.")

    # Create tensors for each element of the batch
    # (np.array first: building a tensor from a list of ndarrays is very slow)
    states      = torch.tensor(np.array([s[0] for s in batch]), dtype=torch.float32)
    actions     = torch.tensor([s[1] for s in batch], dtype=torch.int64)
    rewards     = torch.tensor([s[3] for s in batch], dtype=torch.float32)

    # Mask of non-final states (the next state is None if the game has ended)
    non_final_mask = torch.tensor([s[2] is not None for s in batch], dtype=torch.bool)

    # Compute all the Q values (forward pass)
    policy_net.train()
    q_values = policy_net(states)
    # Select the proper Q value for the corresponding action taken Q(s_t, a)
    state_action_values = q_values.gather(1, actions.unsqueeze(1))

    # Compute the value function of the next states using the target network
    # V(s_{t+1}) = max_a( Q_target(s_{t+1}, a) ); terminal transitions keep value 0
    target_net.eval()
    next_state_max_q_values = torch.zeros(batch_size)
    if non_final_mask.any():
        # Skip the forward pass when every sampled transition is terminal:
        # an empty input tensor would make the target network fail
        non_final_next_states = torch.tensor(
            np.array([s[2] for s in batch if s[2] is not None]), dtype=torch.float32
        )
        with torch.no_grad():
            q_values_target = target_net(non_final_next_states)
        next_state_max_q_values[non_final_mask] = q_values_target.max(dim=1)[0]

    # Compute the expected Q values
    expected_state_action_values = rewards + (next_state_max_q_values * gamma)
    expected_state_action_values = expected_state_action_values.unsqueeze(1) # Set the required tensor shape

    # Compute the loss
    loss = loss_fn(state_action_values, expected_state_action_values)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Gradient clipping for training stability: rescale the gradients so that
    # their total norm does not exceed 2
    nn.utils.clip_grad_norm_(policy_net.parameters(), 2)
    optimizer.step()
    
    return loss.item()


In [None]:
def reward_correction(reward, state, pos_weight=1., angle_weight=0.):
    """
    Subtract from `reward` the weighted absolute values of two state components:
    `state[0]` (position penalty, scaled by `pos_weight`) and `state[2]`
    (angle penalty, scaled by `angle_weight`).

    Returns the corrected reward.
    """
    penalties = ((pos_weight, 0), (angle_weight, 2))  # (weight, state component)
    for weight, component in penalties:
        reward -= weight * np.abs(state[component])
    
    return reward

In [None]:
def train_loop(behaviour, env, policy_net, target_net, loss_fn, optimizer, 
               replay_mem, gamma, batch_size, seed=MAGIC_NUM, pos_weight=1., angle_weight=0.,
               render=False, verbose=False, bad_state_penalty=0., target_net_update_steps=10,
              ):
    """
    Train `policy_net` for one episode per entry of `behaviour.exploration_profile`.

    Args:
        behaviour:   action-selection object providing `choose_action` and
                     `exploration_profile` (its length sets the number of episodes).
        env:         Gym environment; it is closed before returning.
        policy_net:  network being trained.
        target_net:  network used for the next-state values, synchronised with
                     `policy_net` every `target_net_update_steps` episodes.
        loss_fn, optimizer, replay_mem, gamma, batch_size: forwarded to `update_step`.
        seed:        not used inside the loop (kept in the signature for the callers).
        pos_weight:   weight of the position penalty passed to `reward_correction`.
        angle_weight: weight of the angle penalty passed to `reward_correction`.
        render:      if True, render the environment at every step.
        verbose:     if True, print target updates and episode scores.
        bad_state_penalty: value added to the reward of the step that ends the episode.
        target_net_update_steps: number of episodes between target network updates.

    Note:
        Training starts only once the replay memory holds more than
        `min_samples_for_training` transitions; that threshold is read from
        the module-level variable of the same name.

    Returns:
        (policy_net, final_scores, final_losses, final_rewards), where the
        three lists hold one entry per episode: number of steps, last loss
        value and cumulative corrected reward.
    """
    final_scores  = []  # final score of each episode
    final_losses  = []  # final loss of each episode
    final_rewards = []  # final cumulative rewards
    
    N_iters = len(behaviour.exploration_profile)  # number of iterations    
    for episode_num in tqdm(range(N_iters)):

        # Reset the environment and get the initial state
        state = env.reset()
        # Reset the score. The final score will be the total amount of steps before the episode ends
        score = 0
        loss  = 0.        
        cumulative_reward = 0.
        
        done  = False

        # Go on until the episode ends
        while not done:
            # Choose the action following the policy and the behaviour profile
            action, q_values = behaviour.choose_action(policy_net, state, iter_id=episode_num)
            # Apply the action and get the next state, the reward and a flag "done" that is 
            #      True if the game is ended
            next_state, reward, done, info = env.step(action)

            # Update the final score (+1 for each step)
            score += 1           

            # Terminal step: add the configured value and mark the next state as missing
            if done:
                reward += bad_state_penalty
                next_state = None
            # reward correction, using the weights received by this function
            # (they used to be overridden by hard-coded 1. and 0.)
            reward = reward_correction(reward, state, pos_weight=pos_weight, angle_weight=angle_weight)
            cumulative_reward += reward

            # Update the replay memory
            replay_mem.push(state, action, next_state, reward)
            # Update the network
            if len(replay_mem) > min_samples_for_training: # we enable the training only if we have enough
                # samples in the replay memory, otherwise the training will use the same samples too often
                loss = update_step(policy_net, target_net, replay_mem, gamma, optimizer, loss_fn, batch_size)

            # Visually render the environment (disable to speed up the training)
            if render:
                env.render()

            # Set the current state for the next iteration
            state = next_state

        # Update the target network every target_net_update_steps episodes
        if episode_num % target_net_update_steps == 0:
            if verbose:
                print('Updating target network...')
            # This will copy the weights of the policy network to the target network
            target_net.load_state_dict(policy_net.state_dict())

        # Print the final score of the episode
        if verbose:
            print(f"EPISODE: {episode_num + 1} - FINAL SCORE: {score}") # Print the final score
            
        # store episode results
        final_scores.append(score)
        final_losses.append(loss)
        final_rewards.append(cumulative_reward)   
        
    # close environment
    env.close()
            
    return policy_net, final_scores, final_losses, final_rewards

In [None]:
def test_loop(path_to_model, n_episodes=10, seed=23, video_folder="Videos",
              name_prefix="rl", model_hypers=None, render=False,
             ):
    """
    Load a trained DQN from `path_to_model` and run it greedily on CartPole-v1.

    Args:
        path_to_model: file holding the state dict of the policy network.
        n_episodes:    number of episodes to play; only the last one is recorded.
        seed:          seed given to the environment.
        video_folder:  folder where the video of the last episode is stored.
        name_prefix:   prefix of the video file name.
        model_hypers:  optional dict with the keys "DQN_units", "DQN_activ" and
                       "angle_weight"; with None the DQN defaults and no angle
                       penalty are used.
        render:        if True, render the environment at every step.

    Prints score and corrected cumulative reward of every episode.
    """
    # Build and seed the environment
    env = gym.make('CartPole-v1') 
    env.seed(seed)
    
    # Sizes of the state vector and of the action set
    n_state_features = env.observation_space.shape[0]
    n_actions        = env.action_space.n   

    # Build the policy network with default or tuned hyper-parameters
    if model_hypers is not None:
        policy_net = DQN(n_state_features, n_actions, 
                         model_hypers["DQN_units"], model_hypers["DQN_activ"],
                        )
        angle_penalty = model_hypers["angle_weight"]
    else:
        policy_net = DQN(n_state_features, n_actions)
        angle_penalty = 0.

    # Restore the trained weights
    policy_net.load_state_dict(torch.load(path_to_model))
    policy_net.eval()
    
    # Only `choose_optimal_action` is used below, so the default behaviour is enough
    behaviour = ActionChoice()

    last_episode = n_episodes - 1
    for num_episode in range(n_episodes): 
        
        if num_episode == last_episode:
            # Wrap the environment so that the last episode is saved as a video
            os.makedirs(video_folder, exist_ok=True)
            env = gym.wrappers.RecordVideo(env, video_folder=video_folder, 
                                           name_prefix=name_prefix,
                                           episode_trigger=lambda idx: True,
                                          )        
        
        state = env.reset()
        score = 0              # sum of the raw environment rewards
        cumulated_reward = 0   # sum of the corrected rewards
        done = False
        while not done:
            # Greedy action from the network output
            action, _ = behaviour.choose_optimal_action(policy_net, state)
            next_state, reward, done, _ = env.step(action)
            score += reward 
            # Same correction as in training, with the default position weight
            cumulated_reward += reward_correction(reward, state, angle_weight=angle_penalty)
            state = next_state
            
            if render:
                env.render()

        print(f"EPISODE {num_episode + 1} - FINAL SCORE: {score} " + 
              f"- FINAL REWARD (with position and/or angle penalty): {cumulated_reward}") 
    env.close()

## Impact of exploration profile
[Table of contents](#top-shortcuts)

In [None]:
# Give the torch, numpy and built-in `random` generators the same seed
for seed_generator in (torch.manual_seed, np.random.seed, random.seed):
    seed_generator(MAGIC_NUM)

In [None]:
# Folder where the trained policy networks of the profile study are stored;
# it is created here if it does not exist yet
save_folder = "ProfileStudy/Models"
os.makedirs(save_folder, exist_ok=True)

In [None]:
### PARAMETERS (module-level settings of the exploration profile study)
gamma                    = 0.97  # Discount factor applied to the value of the next state
replay_memory_capacity   = 10000 # Maximum number of transitions kept in the replay memory
lr                       = 1e-2  # Optimizer learning rate
target_net_update_steps  = 10    # Number of episodes to wait before updating the target network
batch_size               = 128   # Number of samples to take from the replay memory for each update
bad_state_penalty        = 0     # Value associated with the step that ends the episode
                                 #     (in this case when the pole falls down) 
min_samples_for_training = 1000  # Minimum samples in the replay memory to enable the training
                                 #     (train_loop reads this name as a global variable)

N_episodes = 1000                # Number of episodes, i.e. length of every exploration profile

### Exploration profiles with softmax and $\epsilon$-greedy behaviours

In the cell below we define the `exploration profiles` to be tested.

In [None]:
# One entry per experiment: the first four use softmax, the last four eps-greedy,
# and each group goes through the same four decay shapes
decay_profiles  = ["exp","linear","noisy","const"]*2
behaviour_types = ["softmax"]*4 + ["eps-greedy"]*4 

temp_init_value = 5.   # starting temperature of the softmax profiles
eps_init_value  = 1.   # starting eps probability of the eps-greedy profiles
k = 10                 # number of exponential decay characteristic lengths included in N_episodes

# The "const" profiles are linear profiles whose start and bottom values coincide
profiles = [
    # softmax (parameter is temperature)
    exponential_profile(initial_value=temp_init_value, num_iterations=N_episodes, k=k),
    linear_profile(initial_value=temp_init_value, num_iterations=N_episodes, decay_frac=0.4),
    noisy_profile(initial_value=temp_init_value, num_iterations=N_episodes),
    linear_profile(initial_value=temp_init_value/10., num_iterations=N_episodes,
                   bottom=temp_init_value/10.),
    # eps-greedy (parameter is eps probability)
    exponential_profile(initial_value=eps_init_value, num_iterations=N_episodes, k=k),
    linear_profile(initial_value=eps_init_value, num_iterations=N_episodes, decay_frac=0.4),
    noisy_profile(initial_value=eps_init_value, num_iterations=N_episodes, clip_value=1.),
    linear_profile(initial_value=eps_init_value/10., num_iterations=N_episodes,
                   bottom=eps_init_value/10.),
]

In [None]:
# Show the first profile of the list as an example
fig, ax = plt.subplots(figsize=(7,4))
ax.plot(profiles[0])
ax.grid()
ax.set_xlabel('Iteration')
ax.set_ylabel('Exploration profile')
plt.show()

In [None]:
# Train one DQN for every (behaviour type, decay profile) pair, collect the
# per-episode metrics and save the trained policy network

results = []
for idx, profile in enumerate(profiles):
    btype = behaviour_types[idx]
    dtype = decay_profiles[idx]
    
    # Restart every random generator from the same seed before each run
    torch.manual_seed(MAGIC_NUM)
    np.random.seed(MAGIC_NUM)
    random.seed(MAGIC_NUM)
    
    print(f"Starting iteration: {idx+1} with '{btype}' behaviour and '{dtype}' parameter decay...")
    
    ### INITIALIZATION ### -----------------------------------------------------------------
    # Gym environment, seeded like the generators above
    env = gym.make('CartPole-v1') 
    env.seed(MAGIC_NUM)
    
    # Sizes of the state space (observation_space) and of the action space (action_space)
    state_space_dim  = env.observation_space.shape[0]
    action_space_dim = env.action_space.n
    
    # Replay memory
    replay_mem = ReplayMemory(replay_memory_capacity)    

    # Policy network, and target network starting from the same weights
    policy_net = DQN(state_space_dim, action_space_dim)
    target_net = DQN(state_space_dim, action_space_dim)
    target_net.load_state_dict(policy_net.state_dict())

    # The optimizer updates ONLY the parameters of the policy network
    optimizer = torch.optim.SGD(policy_net.parameters(), lr=lr) 

    # Huber loss
    loss_fn = nn.SmoothL1Loss()
    
    # Behaviour following the current exploration profile
    behaviour = ActionChoice(behaviour=btype, exploration_profile=profile)
    
    ### TRAIN LOOP ### ----------------------------------------------------------------------
    # The module-level `bad_state_penalty` and `target_net_update_steps` are
    # passed explicitly: they used to be ignored in favour of the defaults of train_loop
    policy_net, scores, losses, rewards = train_loop(behaviour, env, policy_net, target_net, loss_fn, 
                                                     optimizer, replay_mem, gamma, batch_size, 
                                                     seed=MAGIC_NUM, render=False, verbose=False,
                                                     bad_state_penalty=bad_state_penalty,
                                                     target_net_update_steps=target_net_update_steps,
                                                    )
    # store results and metrics
    results.append({"decay_type": dtype,
                    "behaviour" : btype,
                    "profile":profile,
                    "scores" :scores,
                    "losses" :losses,
                    "rewards":rewards,  #different from score since it includes also the position penalty
                   })
    # save DQN model
    torch.save(policy_net.state_dict(), save_folder+"/DQN-profile_"+btype+"-decay_"+dtype)


### Comparison of different profiles results

In [None]:
# compare softmax with eps-greedy
def plot_compare_results(softmax_r, eps_greedy_r, to_plot, y_labels, 
                         x_label="Episode", figsize=(8,6), folder="Models", avg_window=None,
                        ):
    
    Nplots = len(to_plot)
    full_fig_size = (figsize[0]*Nplots, figsize[1])
    
    fig, axs = plt.subplots(1, Nplots, figsize=full_fig_size)
    
    for idx, ax in enumerate(axs):
        if to_plot[idx] == "profile":
            ax.plot(softmax_r[to_plot[idx]], label="softmax", color="blue")
            ax.plot(eps_greedy_r[to_plot[idx]], label="eps-greedy", color="red")
        
        if to_plot[idx] != "profile" and avg_window is not None:
            ax.plot(softmax_r[to_plot[idx]], label="softmax", color="lightblue")    
            # compute and plot moving average of score
            softmax_avg = np.convolve(softmax_r[to_plot[idx]], np.ones(avg_window), 'valid') / avg_window            
            x_space = np.arange(avg_window/2,len(softmax_avg)+avg_window/2)
            ax.plot(x_space, softmax_avg, label="softmax (smoothed)", lw=2, color="blue")
            
            ax.plot(eps_greedy_r[to_plot[idx]], label="eps-greedy", color="orange")
            # compute and plot moving average of score
            eps_greedy_avg = np.convolve(eps_greedy_r[to_plot[idx]], np.ones(avg_window), 'valid') / avg_window            
            ax.plot(x_space, eps_greedy_avg, label="eps-greedy (smoothed)", lw=2, color="red")
        
        ax.grid()
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_labels[idx])
        ax.legend()
    
    plt.tight_layout()   
    
    #save picture
    tt = softmax_r["decay_type"]
    plt.savefig(folder+f"/policy_compare-{tt}_decay.pdf")
    plt.show()
    return

In [None]:
to_plot  = ["profile","losses","scores","rewards"]
y_labels = ['Exploration profile', 'Loss', 'Score', 'Episode cumulated reward']

# The first half of `results` holds the softmax runs and the second half the
# eps-greedy ones, in the same decay order: compare them pairwise
n_pairs = len(results)//2
for idx in range(n_pairs):
    plot_compare_results(softmax_r    = results[idx], 
                         eps_greedy_r = results[idx+n_pairs], 
                         to_plot      = to_plot, 
                         y_labels     = y_labels, 
                         folder       = save_folder,
                         avg_window   = 20,
                        )

In [None]:
behaviour_types = ["softmax"]*4 + ["eps-greedy"]*4 
decay_profiles  = ["exp","linear","noisy","const"]*2

# Test every model saved by the profile study
for btype, dtype in zip(behaviour_types, decay_profiles):
    # Restart every random generator from the same seed before each test
    torch.manual_seed(MAGIC_NUM)
    np.random.seed(MAGIC_NUM)
    random.seed(MAGIC_NUM)
    
    print(f"### MODEL: {btype} with {dtype} decay: ###")
    
    path_to_model = save_folder+"/DQN-profile_"+btype+"-decay_"+dtype
    
    # TEST (the comma after `video_folder` was missing, which made the cell a SyntaxError)
    test_loop(path_to_model, 
              n_episodes=10, 
              seed=MAGIC_NUM, 
              video_folder="ProfileStudy/Videos",
              name_prefix="profile_"+btype+"-decay_"+dtype,
             )

## Tuning model hyper-parameters and reward function
[Table of contents](#top-shortcuts)

In [None]:
# Give the torch, numpy and built-in `random` generators the same seed
for seed_generator in (torch.manual_seed, np.random.seed, random.seed):
    seed_generator(MAGIC_NUM)

In [None]:
# optuna settings: name of the folder used for the study and name of the study itself
OPTUNA_DIR = "OptunaStudy"
study_name = "CartPole-v1_optuna_study"

In the cells below we define the hyper-parameter space and the *Objective* function to be used in the Optuna study.

In [None]:
# fixed hyper-parameters (read as global variables by the Optuna objective and by train_loop)
replay_memory_capacity   = 10000  # Maximum number of transitions kept in the replay memory
min_samples_for_training = 1000   # Minimum samples in the replay memory to enable the training
batch_size = 128                  # Number of samples taken from the replay memory for each update

N_episodes = 1000  # Number of episodes (length of the exploration profile of every trial)

# define possible hyper-parameters values 
### list : categorical sampling
### tuple: uniform sampling between the two bounds (low, high)
hyperparameters_space = {"optim"       : ["sgd", "adam"],     # optimizer
                         "lr"          : (1e-4, 1e-1),        # learning rate
                         "DQN_units"   : [[128,128],[256,64],[128,32],[64,64]],  # hidden layers units
                         "DQN_activ"   : ["tanh", "relu"],    # activation function
                         "softmax_init": (2, 10),             # initial value of the exponential temperature profile
                         "softmax_k"   : (4, 10),             # `k` of the exponential temperature profile
                         "gamma"       : (0.9, 0.99),         # discount parameter
                         "target_net_update_steps": [2, 5, 10],  # episodes between target network updates
                         # NOTE(review): train_loop ADDS this value to the reward of the
                         # terminal step, so positive values raise it -- confirm the intended sign
                         "bad_state_penalty": [0, 1, 2],
                         "angle_weight": (0., 1.),            # penalty weight for pole angle 
                        }

In [None]:
class Objective(object):
    
    def __init__(self, model_class, memory_class, hp_space, env_name, env_seed=0, folder="OptunaTrials"):  
        self.model_class  = model_class
        self.memory_class = memory_class
        self.hp_space     = hp_space 
        self.env_name     = env_name
        self.env_seed     = env_seed
        
        self.folder = folder
        self.best_solved_episodes = 0
        
    def keep_best_model(self, net, scores, trial_id):
        
        counter = 0
        for idx, score in enumerate(scores):
            if score >= 500:
                counter += 1
                
        if (counter >= self.best_solved_episodes) and (counter > 0):
            # obtained a new best model
            self.best_solved_episodes = counter
            torch.save(net.state_dict(), self.folder + f"/BestDQN-TrialID_{trial_id}-Solved_{counter}")  
                
        return counter
        
    def _sample_param(self, trial, param_name, param_space):
        if type(param_space) is list:
            if param_name == "DQN_units":
                param_id = trial.suggest_categorical(param_name+"_ID", list(range(len(param_space))))
                param = param_space[param_id]
            else:                
                param = trial.suggest_categorical(param_name, param_space)
            
        elif type(param_space) is tuple:
            param = trial.suggest_uniform(param_name, param_space[0], param_space[1])
            
        return param
    

    def plot_trial_results(self, trial_id, results, x_label="Episode", figsize=(8,6), 
                           avg_window=None, show=False,
                          ):
        """
        Plot every entry of `results` in its own panel and save the figure as
        `<self.folder>/TrialsPlots/Results-Trial_<trial_id>.pdf`.

        Args:
            trial_id:   identifier used in the name of the saved file.
            results:    dict mapping a panel name to the series to plot; the
                        "Profile" entry is drawn without smoothing.
            x_label:    x-axis label shared by all panels.
            figsize:    size of a single panel; the figure width scales with the panels.
            avg_window: window of the moving average added on top of the raw
                        curves; with None only the raw curves are drawn.
            show:       if True, display the figure before closing it.
        """
        keys   = list(results.keys())
        Nplots = len(keys)
        full_fig_size = (figsize[0]*Nplots, figsize[1])

        # squeeze=False always returns a 2D array of axes, even for a single panel
        fig, axs = plt.subplots(1, Nplots, figsize=full_fig_size, squeeze=False)

        for ax, key in zip(axs[0], keys):
            series = results[key]
            if key == "Profile":
                ax.plot(series, label="Temperature profile", color="blue")
            else:
                # raw curve, drawn also when no smoothing is requested
                ax.plot(series, label=key, color="lightblue")

                if avg_window is not None:
                    # moving average, centred on its window
                    moving_avg = np.convolve(series, np.ones(avg_window), 'valid') / avg_window
                    x_space = np.arange(avg_window/2,len(moving_avg)+avg_window/2)
                    ax.plot(x_space, moving_avg, label=key+" (smoothed)", lw=2, color="blue")

            ax.grid()
            ax.set_xlabel(x_label)
            ax.set_ylabel(key)
            ax.legend()

        plt.tight_layout()   

        #save picture
        full_path = self.folder+"/TrialsPlots"
        os.makedirs(full_path, exist_ok=True)
        plt.savefig(full_path+f"/Results-Trial_{trial_id}.pdf")
        
        if show:
            plt.show()
        plt.close()
            
        return
    
            
    def __call__(self, trial):
        
        print(f"Trial [{trial.number}] started at:", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    
        # sample hyper-parameters
        params_dict = {}
        for key in self.hp_space:
            value = self._sample_param(trial, key, self.hp_space[key])
            params_dict.update({key:value})            
            
        # initialize environment
        env = gym.make(self.env_name)
        env.seed(self.env_seed)
        
        # Get the shapes of the state space (observation_space) and action space (action_space)
        state_space_dim  = env.observation_space.shape[0]
        action_space_dim = env.action_space.n

        # Initialize the replay memory
        replay_mem = self.memory_class(replay_memory_capacity)    

        # Initialize the policy network
        policy_net = self.model_class(state_space_dim, 
                                      action_space_dim, 
                                      hidden_units = params_dict["DQN_units"],
                                      activation   = params_dict["DQN_activ"],
                                     )

        # Initialize the target network with the same weights of the policy network
        target_net = self.model_class(state_space_dim, 
                                      action_space_dim, 
                                      hidden_units = params_dict["DQN_units"],
                                      activation   = params_dict["DQN_activ"],
                                     )
        target_net.load_state_dict(policy_net.state_dict()) # copy weights from policy network to target network

        # Initialize the optimizer
        if params_dict["optim"] == "sgd":
            optimizer = torch.optim.SGD(policy_net.parameters(), lr=params_dict["lr"]) 
        elif params_dict["optim"] == "adam":
            optimizer = torch.optim.Adam(policy_net.parameters(), lr=params_dict["lr"]) 
        # NB: the optimizer will update ONLY the parameters of the policy network

        # Initialize the loss function (Huber loss)
        loss_fn = nn.SmoothL1Loss()

        # define behaviour (fixed to softmax with exponential temperature profile)
        exploration_profile = exponential_profile(params_dict["softmax_init"], 
                                                  num_iterations = N_episodes,
                                                  k = params_dict["softmax_k"],
                                                 )
        behaviour = ActionChoice(behaviour="softmax", exploration_profile=exploration_profile)
        
        ### TRAIN LOOP ### ----------------------------------------------------------------------
        policy_net, scores, losses, rewards = train_loop(behaviour, env, policy_net, target_net, loss_fn, 
                                                         optimizer, replay_mem, params_dict["gamma"],
                                                         batch_size, seed=self.env_seed, 
                                                         render=False, verbose=False,
                                                         bad_state_penalty=params_dict["bad_state_penalty"],
                                                         target_net_update_steps=params_dict["target_net_update_steps"],
                                                         angle_weight=params_dict["angle_weight"],
                                                        )
        
        # build results dictionary
        results = {"Profile"                 : exploration_profile,
                   "Loss (Huber)"            : losses,
                   "Score"                   : scores,
                   "Episode cumulated reward": rewards,
                  } 
        
        # plot and save trial results
        self.plot_trial_results(trial_id=trial.number, results=results, avg_window=20)
        
        # check if it is the best model and store it
        solved_episodes = self.keep_best_model(policy_net, scores, trial.number)
        
        # adding user-attributes
        trial.set_user_attr("hypers" , params_dict)
        trial.set_user_attr("results", results)
        
        print(f"Trial [{trial.number}] ended at:", datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
        print(f"    Solved episodes: {solved_episodes}\n")
        
        return solved_episodes
        

In [None]:
# callable objective evaluated by optuna at every trial
objective = Objective(
    model_class=DQN,
    memory_class=ReplayMemory,
    hp_space=hyperparameters_space,
    env_name='CartPole-v1',
    env_seed=MAGIC_NUM,
    folder=OPTUNA_DIR,
)

# seeded TPE sampler, so that the default sampler behaves in a deterministic way
sampler = optuna.samplers.TPESampler(
    seed=MAGIC_NUM,
    n_startup_trials=25,  # use random sampling at beginning
)

### create study
# the output folder must exist, since the SQLite database of the study lives in it
os.makedirs(OPTUNA_DIR, exist_ok=True)

study = optuna.create_study(
    storage=f"sqlite:///{OPTUNA_DIR}/{study_name}.db",
    study_name=study_name,
    sampler=sampler,
    pruner=None,
    direction="maximize",
    load_if_exists=True,
)

In [None]:
### run optimization
Ntrials = 40
MaxTime = None

print("Starting study '"+study.study_name+f"' with n_trials={Ntrials} and timeout={MaxTime}")

# Route warnings through the logging system only while the study is running.
# The try/finally restores the default behaviour even if the optimization raises
# or the cell is interrupted, so later cells are not left with captured warnings.
logging.captureWarnings(True)
try:
    study.optimize(objective, 
                   n_trials       = Ntrials, 
                   timeout        = MaxTime, # timeout in seconds
                   gc_after_trial = True,    # run garbage collection 
                  ) 
finally:
    logging.captureWarnings(False)

### Study results analysis

In [None]:
# load the study database 
# NB: arguments are passed by keyword, since recent optuna versions declare
#     them as keyword-only in `load_study`
study = optuna.load_study(study_name = study_name, 
                          storage    = "sqlite:///"+OPTUNA_DIR+"/"+study_name+".db",
                         )

In [None]:
# study results dataframe: remove the two user-attribute columns and
# order the trials from the highest to the lowest objective value
study_df = study.trials_dataframe()
study_df = study_df.drop(columns=["user_attrs_hypers", "user_attrs_results"])
study_df = study_df.sort_values(by="value", ascending=False)

# show the top-K trials
K = 10
study_df.head(K)

In [None]:
# additional libraries
import optuna
import plotly.express as px
import optuna.visualization as ov
import json

# factory of decorators that register the decorated function into a dictionary
def make_decorator(dictionary):
    """
    Return a decorator factory bound to 'dictionary'.
    Calling the factory (with no arguments) gives a decorator that stores the decorated 
    function in 'dictionary' under its own name and returns the function unchanged.
    """
    def registering_factory():
        def register(target):
            dictionary[target.__name__] = target
            return target
        return register
    return registering_factory
    
class OptimizationInspector(object):
    """
    This class provides some plotting functions to analyze the outcome of an optuna study.
    
    The plotting methods decorated with '_plot_dict_member()' are registered by name in the 
    class-level dictionary 'plot_dict', in definition order; 'plot_all' runs them in that order.
    """
    # dictionary of plotting functions {method name: function}, filled by the decorator below
    plot_dict = {}
    _plot_dict_member = make_decorator(plot_dict)
    
    def __init__(self, study, save_path="Results_test", figsize=(1024,600), fmt=".pdf"):
        """
         - study     : optuna study to analyze
         - save_path : folder where the pictures are saved (created if missing)
         - figsize   : (width, height) used as default size of plotly express figures
         - fmt       : extension appended to the name of the saved pictures
        """
        self.study     = study
        self.save_path = save_path + "/"
        self.data_dict = None   # parameters sets, filled only while 'plot_all' is running
        self.fmt       = fmt
        
        # set figsize (NB: this changes the plotly express defaults, not only this object)
        px.defaults.width  = figsize[0]
        px.defaults.height = figsize[1]
        
        # ensure folder existence
        os.makedirs(self.save_path, exist_ok=True)
        
    def save_best_hypers_json(self, best_hypers_file):
        """
        Save the "hypers" user attribute of the best trial to the json file 'best_hypers_file'.
        """
        best_hypers = self.study.best_trial.user_attrs["hypers"]
        with open(best_hypers_file, 'w') as fp:
            json.dump(best_hypers, fp)
        print("Best hyper-parameters saved to: '"+best_hypers_file+"'.")
    
    def _handle_image(self, fig, show, name, save):
        """
        Show and/or save a figure.
         - show : the figure is displayed only if equal to the string "1"
         - name : file name without folder and extension (nothing is saved if None)
         - save : if to save the picture on disk (bool)
        """
        if show == "1":
            fig.show()
        if (save) and (name is not None):
            full_path = self.save_path + name + self.fmt
            fig.write_image(full_path)   
            print("New image saved: ", full_path)
            
    
    def print_summary(self):
        """
        Print the number of attempted/completed/pruned trials, the best trial with its value 
        and the hyper-parameters stored in the "hypers" user attribute of the best trial.
        """
        print("Summary of the Optuna study: ", self.study.study_name)
        print("   Attempted trials: ", len(self.study.trials) )
        study_df  = self.study.trials_dataframe()
        completed = len(study_df[study_df["state"]=="COMPLETE"])
        pruned    = len(study_df[study_df["state"]=="PRUNED"  ])
        print("   Completed trials: ", completed)
        print("   Pruned trials   : ", pruned   )
        print("   Best Trial ID   : ", self.study.best_trial.number)
        print("   Best value      : ", self.study.best_value )
        
        best_hypers = self.study.best_trial.user_attrs["hypers"]       
        print("\nBest set of hyper-parameters:")
        # string width when printing param name ('default' avoids an error on empty dictionaries)
        width = max((len(tt) for tt in best_hypers), default=0)
        for key,var in best_hypers.items():
            if key == "params":
                # nested dictionary: print it as an indented sub-list
                ww = max((len(tt) for tt in var), default=0)
                print("    Model parameters:")
                for kk,vv in var.items():
                    print(f"        {kk: <{ww}}: {vv}")
            else:
                print(f"    {key: <{width}}: {var}")
        print("")
    
    
    def plot_all(self, parallel_sets = None, contour_sets = None, slice_sets = None, importance_params = None,
                 show = "100011000", save = True,
                ):
        """
        Produce all the plots registered in 'plot_dict'. Showing is controlled by the variable 'show'.
        It can also save all the plotted pictures. Files names are fixed to some default value.
         - parallel_sets     : list of lists [filename suffix, param, param, ...], one per parallel plot
         - contour_sets      : list of lists of parameters names, one per contour plot
         - slice_sets        : list of lists of parameters names, one per slice plot
         - importance_params : parameters names for the importance plot (None or empty: optuna default)
         - show : binary string with one character per plot function, in registration order 
                  ('1' to show image, '0' to not show). Extra characters are ignored and 
                  missing ones are treated as '0'.
         - save : if to save the pictures on disk (bool)
        """
        self.data_dict = {"parallel"         : [] if parallel_sets     is None else parallel_sets    ,
                          "contour"          : [] if contour_sets      is None else contour_sets     ,
                          "slice"            : [] if slice_sets        is None else slice_sets       ,
                          "importance_params": [] if importance_params is None else importance_params,
                         }
        
        try:
            for idx,(key,plot_func) in enumerate(self.plot_dict.items()):
                # a 'show' string shorter than the number of plots must not raise an IndexError
                flag = show[idx] if idx < len(show) else "0"
                if (flag == "0") and not save: #skip plots that are not showed or saved
                    print("   Skipping plot function:", key)
                    continue
                plot_func(self, show=flag, save=save)
        finally:
            # always reset, otherwise a failed plot would leave the sets active and they
            # would silently override the arguments of later direct calls to the plot methods
            self.data_dict = None
    
    @_plot_dict_member()
    def optimization_history(self, show="1", name="optimization_history", save=False):
        """Plot the optimization history of the study."""
        fig = ov.plot_optimization_history(self.study)
        #fig.update_yaxes(type="log")
        self._handle_image(fig, show, name, save)
    
    # NB: not registered in 'plot_dict', so it is not called by 'plot_all'
    def intermediate_values(self, show="1", name="intermediate_values", save=False):
        """Plot the intermediate values reported by the trials of the study."""
        fig = ov.plot_intermediate_values(self.study)
        self._handle_image(fig, show, name, save)
    
    @_plot_dict_member()
    def importances(self, params=None, show="1", name="importances", save=False):
        """
        Plot the hyper-parameters importances.
         - params : names of the parameters to assess (None or empty: optuna default).
                    Ignored when called through 'plot_all', that provides its own list.
        """
        if self.data_dict is not None:
            params = self.data_dict["importance_params"]
        if not params:
            # an empty selection is treated as "no selection" instead of producing an empty plot
            params = None
        fig = ov.plot_param_importances(self.study, params=params)
        self._handle_image(fig, show, name, save)
    
    @_plot_dict_member()
    def time_vs_value(self, show="1", name="time_vs_value", save=False):  
        """Scatter plot of the objective value against the trial duration (in minutes)."""
        study_df = self.study.trials_dataframe()
        
        # compute time in minutes and the name for the hover functionality
        study_df["time"] = study_df.apply(lambda row: row['duration'].total_seconds()/60, axis=1)
        study_df["name"] = study_df.apply(lambda row: "Trial "+str(row['number']), axis=1)

        # plot picture
        fig = px.scatter(study_df, 
                         x="time", y="value",
                         labels     = {"time":"Training Time [min]", "value":"Objective Value"},
                         color      = "state",
                         symbol     = "state",
                         hover_name = "name", 
                         hover_data = {"time":True,"value":True,"state":False},
                         log_y      = True,
                        )
        fig.update_traces(marker={'size': 8})

        # plot and save image
        self._handle_image(fig, show, name, save)
    
    @_plot_dict_member()
    def parallel_plots(self, parallel_sets=None, show="1", name="parallel", save=False):
        """
        Produce one parallel coordinate plot for each list in 'parallel_sets'.
        The first element of each list is the suffix of the file name, the others are the parameters.
        The argument is ignored when called through 'plot_all', that provides its own sets.
        """
        if self.data_dict is not None:
            parallel_sets = self.data_dict["parallel"]
        for conf in (parallel_sets or []):        
            # build suffix 
            suffix = "_" + conf[0]   # first is the suffix for the filename
            
            fig = ov.plot_parallel_coordinate(self.study, params=conf[1:])
            self._handle_image(fig, show, name + suffix, save)
    
    @_plot_dict_member() 
    def contour_plots(self, contour_sets=None, show="1", name="contour", save=False):
        """
        Produce one contour plot for each list of parameters in 'contour_sets'.
        The argument is ignored when called through 'plot_all', that provides its own sets.
        """
        if self.data_dict is not None:
            contour_sets = self.data_dict["contour"]
        for conf in (contour_sets or []):        
            # build suffix based on passed parameters
            suffix = "_" + "_".join(conf)
            
            fig = ov.plot_contour(self.study, params=conf)
            self._handle_image(fig, show, name + suffix, save)
    
    @_plot_dict_member()
    def slice_plots(self, slice_sets=None, show="1", name="slice", save=False):
        """
        Produce one slice plot for each list of parameters in 'slice_sets'.
        The argument is ignored when called through 'plot_all', that provides its own sets.
        """
        if self.data_dict is not None:
            slice_sets = self.data_dict["slice"]
        for conf in (slice_sets or []):        
            # build suffix based on passed parameters
            suffix = "_" + "_".join(conf)
            
            fig = ov.plot_slice(self.study, params=conf)
            self._handle_image(fig, show, name + suffix, save)

In [None]:
# inspector of the loaded study: pictures are saved in the study folder
optuna_inspector = OptimizationInspector(
    study=study,
    save_path=OPTUNA_DIR,
    figsize=(900, 500),
)

In [None]:
# parameters sets for parallel plots (the first entry of each set is the file name suffix)
parallel_sets = [
    [
        "architecture",
        "DQN_units_ID",
        "DQN_activ",
        "target_net_update_steps",
        "softmax_init",
        "softmax_k",
    ],
    [
        "optimization",
        "optim",
        "lr",
        "gamma",
        "bad_state_penalty",
        "angle_weight",
    ],
]

# parameters sets for contour plots (one plot per pair)
contour_sets = [
    ["DQN_units_ID", "DQN_activ"],
    ["lr", "softmax_init"],
    ["lr", "gamma"],
    ["DQN_units_ID", "angle_weight"],
]

# parameters sets for slice plots: a single set with all the parameters
# of the parallel plots, in the same order (name suffixes excluded)
slice_sets = [[param for group in parallel_sets for param in group[1:]]]

# parameters to use for importance plot
importance_params = [
    "target_net_update_steps",
    "softmax_init",
    "softmax_k",
    "optim",
    "lr",
    "gamma",
    "bad_state_penalty",
    "angle_weight",
    "DQN_units_ID",
]

In [None]:
# run every registered plot function, showing and saving all the pictures
optuna_inspector.plot_all(
    show="1111111",  # show options (alternative: "1100010")
    save=True,
    parallel_sets=parallel_sets,
    contour_sets=contour_sets,
    slice_sets=slice_sets,
    importance_params=importance_params,
)

In [None]:
# print trials counts, best trial and best set of hyper-parameters of the study
optuna_inspector.print_summary()
# uncomment to export the best hyper-parameters to a json file
#optuna_inspector.save_best_hypers_json(OPTUNA_DIR+"/CartPole-v1_best_hypers.json")

### Test the best agent

In [None]:
# information about the best trial, needed to locate and rebuild the stored model
hypers   = study.best_trial.user_attrs["hypers"]
trial_id = study.best_trial.number
value    = int(study.best_trial.value)

# path of the best model file inside the study folder
path_to_model = f"{OPTUNA_DIR}/BestDQN-TrialID_{trial_id}-Solved_{value}"

# run test loop and save a video
n_episodes = 10
test_loop(
    path_to_model,
    model_hypers=hypers,
    n_episodes=n_episodes,
    seed=MAGIC_NUM,
    render=True,
    video_folder=OPTUNA_DIR,
    name_prefix=f"BestDQN-TrialID_{trial_id}",
)