Install and load all dependencies (first time only) \
NOTE: you may need to restart the runtime afterwards (CTRL+M .).

In [None]:
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf

!pip install gym
!pip install free-mujoco-py
!pip install stable_baselines3[extra]

Set up the custom Hopper environment and provided util functions



1.   Upload the zip archive containing the custom Hopper environment to the current session's file storage:


*   `our_hopper.zip`


2.   Un-zip it by running cell below


In [None]:
#!unzip custom_hopper.zip
!unzip our_hopper.zip

In [None]:
import gym
from env.customHopper_dr import *
import numpy as np

Instantiate the environments needed for the agent. The parameters printed by the UDR environment are drawn from a uniform distribution, with a value range of **[-1, 1]** for the different components of our environment.

In [None]:
# Create the three Hopper variants, in order: source, target, domain-randomized.
# Values noted next to the original source/target definitions:
#   source: [2.53429174 3.92699082 2.71433605 5.0893801 ]
#   target: [3.53429174 3.92699082 2.71433605 5.0893801 ]
source_env, target_env, udr_env = (
    gym.make(env_id)
    for env_id in (
        'CustomHopper-source-v0',
        'CustomHopper-target-v0',
        'CustomHopper-domain_randomization-v0',
    )
)

# Spaces of the source environment.
print('State space:', source_env.observation_space)
print('Action space:', source_env.action_space)

# Dynamics parameters (link masses of the Hopper) for each variant.
print('Dynamics parameters:', source_env.get_parameters())
print('Dynamics parameters target:', target_env.get_parameters())
print('Dynamics parameters DR:', udr_env.get_parameters())


We take the rendered images as input and preprocess them before sending them to the feature extractor:
1.  Stack the images into a single array along a new leading axis.
2.  Convert the array into a `float32` tensor.
3.  Move the tensor to the appropriate device (GPU if available, otherwise CPU).



In [None]:
import numpy as np
import torch
from gym.wrappers.pixel_observation import PixelObservationWrapper



def stack_pixel_observations(pixel_obs: list):
    """
    Join every observation in `pixel_obs` into one array along a new first axis.

    However many frames the list holds are stacked, so the same helper serves
    both the single frame gathered on 'reset' and a sequence cut short because
    'done' became True.
    """
    frames = pixel_obs[:]
    return np.stack(frames)



def stacked_observations_to_tensor(stacked_observations):
    """
    Convert a stack of observations into a channels-first image tensor.

    Args:
        stacked_observations: numpy array whose first axis indexes the frames.
            A channels-last stack of shape (n, H, W, 3) is transposed to
            (n, 3, H, W). Any other layout is reinterpreted as
            (n, 3, 224, 224), exactly as before.

    Returns:
        torch.Tensor with shape (n, 3, H, W).
    """
    tensor = torch.from_numpy(stacked_observations)
    if tensor.dim() == 4 and tensor.shape[-1] == 3:
        # A plain .view() would only reinterpret the memory and interleave the
        # colour values across the image; the channel axis has to be moved.
        return tensor.permute(0, 3, 1, 2).contiguous()
    n = len(stacked_observations)  # number of observations, stacked on axis=0
    return tensor.view(n, 3, 224, 224)


def preprocess_n_observations(pixel_observations):
    """
    Turn the frames obtained from 'reset' or 'step' into the tensor that is fed
    to AlexNet: stack them, convert to a float32 tensor, and place the result
    on the globally configured `device`.
    """
    batch = stacked_observations_to_tensor(
        stack_pixel_observations(pixel_observations)
    )
    batch = batch.float()  # float32
    return batch.to(torch.device(device))  # same device as the model


Import `AlexNet`, which will be used as a feature extractor.

In [None]:
# Build the pretrained AlexNet that is used as a fixed image feature extractor.
import torchvision.models as models

# Instantiate AlexNet with its default pretrained weights.
alexnet = models.alexnet(weights="DEFAULT")

# Move the model to the GPU if available, otherwise keep it on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
alexnet = alexnet.to(device)

# Switch to evaluation mode: modules are created in training mode, in which
# AlexNet's dropout layers randomly zero activations, so the same image would
# yield different features on every call.
alexnet.eval()

# The extractor is never trained here, so no gradients are needed for its weights.
alexnet.requires_grad_(False)

Implementation of the feature-extractor environment: it takes the preprocessed images as input and uses the features extracted from them as observations, instead of the angles, velocities and dimensions of the Hopper.

In [None]:
from gym import spaces

class FeatureExtractorEnv(gym.Wrapper):
    """
    Gym wrapper that replaces the wrapped environment's observations with
    features computed from rendered frames.

    Each call to `step` repeats the action for up to `frames_per_step` inner
    steps, renders a frame after every inner step, passes the frames through
    `feature_extractor` and sums the per-frame feature vectors.

    Args:
        env: environment to wrap; frames are rendered from this environment.
        feature_extractor: callable mapping the tensor produced by
            `preprocess_n_observations` to one 1000-dimensional feature
            vector per frame.
        frames_per_step: maximum number of inner steps (and frames) per call
            to `step`. Defaults to 4.

    Raises:
        ValueError: if `frames_per_step` is smaller than 1.
    """

    def __init__(self, env, feature_extractor, frames_per_step=4):
        super().__init__(env)
        if frames_per_step < 1:
            raise ValueError("frames_per_step must be at least 1")
        self.feature_extractor = feature_extractor
        self.frames_per_step = frames_per_step
        # The features are unbounded real numbers, so the space has to be an
        # unbounded float box; a uint8 box in [0, 1] does not describe them.
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(1000,), dtype=np.float32)
        # History of every feature vector returned by `step`.
        # NOTE(review): grows without bound over long training runs.
        self.feature_list = []

    def _render_frame(self):
        """Render the wrapped environment as a 224x224 RGB frame scaled to [0, 1]."""
        return self.env.render(mode="rgb_array", width=224, height=224) / 255

    def _extract_features(self, frames):
        """Run the frames through the extractor and sum the per-frame features."""
        with torch.no_grad():  # inference only, no autograd graph needed
            features = self.feature_extractor(preprocess_n_observations(frames))
        per_frame = features.detach().cpu().numpy()
        return np.sum(per_frame, axis=0).astype(np.float32)

    def reset(self, **kwargs):
        """
        Reset the wrapped environment and return the features of the first frame.

        Returns:
            numpy array of shape (1000,), matching `observation_space`.
        """
        self.env.reset(**kwargs)  # the state-vector observation is discarded
        return self._extract_features([self._render_frame()])

    def step(self, action):
        """
        Apply `action` for up to `frames_per_step` inner steps.

        Returns:
            Tuple (features, total_reward, done, info): the summed features of
            the rendered frames, the sum of the inner-step rewards, the final
            `done` flag and the `info` of the last inner step.
        """
        done = False
        frames = []
        tot_reward = 0
        # Repeat the action until the frame budget is used up or the episode ends.
        while not done and len(frames) < self.frames_per_step:
            _, reward, done, info = self.env.step(action)
            frames.append(self._render_frame())
            tot_reward += reward
        observation = self._extract_features(frames)
        self.feature_list.append(observation)
        return observation, tot_reward, done, info




Create instances of the feature-extractor environment, one without and one
with ***Domain Randomization***.

In [None]:
import tqdm
from stable_baselines3 import SAC

# Feature-extractor view of the source environment (no domain randomization).
new_env = FeatureExtractorEnv(env=source_env, feature_extractor=alexnet)

# Feature-extractor view of the UDR environment (with domain randomization).
new_udr_env = FeatureExtractorEnv(env=udr_env, feature_extractor=alexnet)


***`EvalCallback`***: Periodically evaluates the performance of an agent. It saves the best model if a `best_model_save_path` folder is specified, and saves the evaluation results in a numpy archive (`evaluations.npz`) if a `log_path` folder is specified.

In [None]:
from stable_baselines3.common.callbacks import EvalCallback
# Periodic evaluation on the source feature environment with deterministic
# actions; the best model and the evaluation log both go to ./logs/.
eval_callback = EvalCallback(
    new_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=3_000,
    deterministic=True,
    render=False,
)

***Soft Actor Critic (SAC)***: Off-Policy Maximum Entropy Deep Reinforcement Learning with a Stochastic Actor.

In [None]:
# SAC agent with an MLP policy, created on the domain-randomized feature environment.
model = SAC(
    "MlpPolicy",
    new_udr_env,
    buffer_size=10_000,
    device="auto",
    verbose=0,
)

# Train for 50k timesteps; eval_callback is invoked during training.
model.learn(
    total_timesteps=50_000,
    callback=eval_callback,
    log_interval=4,
    progress_bar=True,
)

In [None]:
# Load the evaluation log to display the improvement during training.
# The path is relative, matching log_path="./logs/" used for the EvalCallback,
# so it no longer depends on the working directory being /content.
# The context manager closes the archive once its arrays are copied into the dict.
with np.load('./logs/evaluations.npz') as evaluations:
    data_dict = dict(evaluations)
timesteps = data_dict['timesteps']
rewards = data_dict['results']
ep_length = data_dict['ep_lengths']

***`evaluate_policy`***: Runs the policy for `n_eval_episodes` episodes on the chosen environment and returns the mean reward together with its standard deviation.

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# The trained agent is bound to `model`; the name `source` used here before is
# not defined anywhere in this notebook and raised a NameError.

# Trained agent evaluated on the source feature environment.
mean_reward, std_reward = evaluate_policy(model, new_env, n_eval_episodes=50)
print(f"mean_reward new_source-new_source:{mean_reward:.2f} +/- {std_reward:.2f}")


# Trained agent evaluated on the domain-randomized feature environment.
# NOTE(review): the printed label says "source-target" but the environment is
# new_udr_env, not one built from target_env; confirm which was intended.
mean_reward, std_reward = evaluate_policy(model, new_udr_env, n_eval_episodes=50)
print(f"mean_reward source-target:{mean_reward:.2f} +/- {std_reward:.2f}")


In [None]:
import matplotlib.pyplot as plt
import matplotlib.style as style
import matplotlib as mpl

# Back to default settings.
mpl.rcParams.update(mpl.rcParamsDefault)

# Use a theme. Matplotlib 3.6 renamed the bundled seaborn styles with a
# 'seaborn-v0_8-' prefix and later releases dropped the old names, so use the
# new name when it exists and fall back to the old one on older versions.
plot_style = ('seaborn-v0_8-notebook'
              if 'seaborn-v0_8-notebook' in style.available
              else 'seaborn-notebook')
style.use(plot_style)

# One point per evaluation round: statistics across that round's episodes.
x = timesteps/1000
means = np.mean(rewards, axis=1)
stds = np.std(rewards, axis=1)
ep_length_mean = np.mean(ep_length, axis=1)

# Reward on the left axis, episode length on a twin right axis.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()

lns1 = ax1.plot(x, means, label='Mean reward')
lns2 = ax1.fill_between(x, means - stds, means + stds, alpha=0.3, label='Deviation')
lns3 = ax2.plot(x, ep_length_mean, 'k-', alpha=0.5, label='Mean episode length')

# Collect the artists from both axes so they share a single legend.
lns = lns1+[lns2]+lns3
labs = [line.get_label() for line in lns]
ax1.legend(lns, labs, loc='lower right')

ax1.grid()
plt.title('Training rewards for uniform weight perturbations', fontsize=15)
ax1.set_xlabel('Timesteps (thousands)', weight='bold')
ax1.set_ylabel('Reward', weight='bold')
ax2.set_ylabel('Episode length', weight='bold')

plt.show()