# **OpenAI Gym LunarLanderContinuous-v2 PPO**

This Colab showcases the training and evaluation of a PPO model on the LunarLanderContinuous-v2 environment. For more information, visit the [Git repo](https://github.com/nextgrid/deep-learning-labs-openAI)



### **Links**
[Nextgrid](https://nextgrid.ai)  
[Deep Learning Labs](https://nextgrid.ai/dll)  

----

[![Nextgrid Artificial Intelligence](https://storage.googleapis.com/nextgrid_github_repo_visuals/Github%20Graphics%20/big-banner.jpg)](https://nextgrid.ai)

 
### **Nextgrid** - _The **Superlative** destination for AI-first startups & talent_




▪️️️️️️️▪️️️️️️️▪️️️️️️️▪️️️️️️️▪️️️️️️️▪️️️️️️️▪️️️️️️️▪️️️️️️️  
*Notebook by Mathias*  
*I would love your feedback,*  
*or discuss your DL/DRL startup/business idea.*   
*find me on* _[twitter](https://twitter.com/mathiiias123)_ or _[linkedin](https://www.linkedin.com/in/imathias)_


## Install system-wide packages
Install Linux system packages using `apt-get` and Python packages using `pip`

In [None]:
!sudo apt-get update
!apt-get install swig cmake python3-dev libopenmpi-dev zlib1g-dev xvfb x11-utils ffmpeg #remove -qq for full output

%load_ext tensorboard

!pip install torch stable-baselines3[extra,tests,docs]>=0.10.0 box2d box2d-kengz pyvirtualdisplay pyglet==1.5.0 --quiet #remove --quiet for full output 

## Dependencies


In [None]:
import gym
import imageio
import time
import numpy as np
import base64
import IPython
import PIL.Image
import pyvirtualdisplay
import torch
import torch.nn as nn


# Video 
from pathlib import Path
from IPython import display as ipythondisplay

# Stable baselines
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

# Stable baselines 3
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.vec_env import VecVideoRecorder, SubprocVecEnv, DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CallbackList, BaseCallback, CheckpointCallback, EveryNTimesteps, \
    EvalCallback, StopTrainingOnRewardThreshold, StopTrainingOnMaxEpisodes

In [None]:
### Variables
import os  # required by os.makedirs below; the dependencies cell does not import it

env_id = 'LunarLanderContinuous-v2'
# NOTE(review): absolute path at the filesystem root, while record_video() and
# show_videos() use the relative 'videos' folder — confirm which is intended.
video_folder = '/videos'
video_length = 5000
logs_base_dir = './runs' # Log DIR
steps_total= 0 # Keep track of total steps
time_steps = 2000000  # Upper bound on training steps passed to model.learn()
reward_threshold = 200  # RewardCallback stops training once the mean reward exceeds this
episodes_threshold = 1000  # RewardCallback stops training once this many episodes are exceeded
episodes = 0  # Updated by RewardCallback (global)
mean_reward = 0  # Updated by RewardCallback (global)

### Set log dir
# Must exist before Monitor is created so the monitor log is written inside it.
os.makedirs(logs_base_dir, exist_ok=True)


### Environment
env = gym.make(env_id)
# Monitor writes per-episode statistics to logs_base_dir; RewardCallback reads
# them back with load_results().
env = Monitor(env, logs_base_dir)
score = 0
log_interval = 10          # Print avg reward after interval


### Hyperparameters
# NOTE(review): 'activation_fn' and 'net_arch' are NOT read by the PPO call
# below, which hard-codes nn.LeakyReLU and a 128x128 network instead.
hp = {'activation_fn': 'relu', 'batch_size': 8, 'clip_range': 0.4,
      'ent_coef': 2.89108e-05, 'gae_lambda': 0.92, 'gamma': 0.99, 'log_std_init': -0.00775684,
      'lr': 0.000242873, 'max_grad_norm': 0.3, 'net_arch': 'medium',
      'n_epochs': 10, 'n_steps': 1024, 'ortho_init': True, 'sde_sample_freq': 8, 'vf_coef': 0.856625}


model = PPO(
    MlpPolicy,
    env,
    n_steps=hp["n_steps"],
    batch_size=hp["batch_size"],
    gamma=hp["gamma"],
    learning_rate=hp["lr"],
    ent_coef=hp["ent_coef"],
    clip_range=hp["clip_range"],
    n_epochs=hp["n_epochs"],
    gae_lambda=hp["gae_lambda"],
    max_grad_norm=hp["max_grad_norm"],
    vf_coef=hp["vf_coef"],
    # NOTE(review): use_sde is not passed, so this presumably has no effect —
    # confirm against the Stable-Baselines3 PPO documentation.
    sde_sample_freq=hp["sde_sample_freq"],
    policy_kwargs=dict(
        log_std_init=hp["log_std_init"],
        # Separate 128x128 networks for the policy (pi) and value function (vf).
        net_arch=[dict(pi=[128, 128], vf=[128, 128])],
        activation_fn=nn.LeakyReLU,
        ortho_init=hp["ortho_init"],
    ),
    verbose=0
)


## Record & display video





In [None]:
### Record & Display Video

import os

# Launch an Xvfb virtual X server on display :1 in the background, then point
# this process at that display (presumably so rendering works headlessly).
xvfb_launch_cmd = "Xvfb :1 -screen 0 1024x768x24 &"
os.system(xvfb_launch_cmd)
os.environ.update({'DISPLAY': ':1'})

# Record video
def record_video(env_id, model, video_length=500, prefix='', video_folder='videos/'):
    """
    Run ``model`` in a fresh ``env_id`` environment and record the rollout.

    :param env_id: (str) Environment id passed to ``gym.make``.
    :param model: (RL model) Object whose ``predict(obs)`` returns ``(action, state)``.
    :param video_length: (int) Number of steps to run and record.
    :param prefix: (str) Name prefix passed to the video recorder.
    :param video_folder: (str) Output folder passed to the video recorder.
    """
    env = DummyVecEnv([lambda: gym.make(env_id)])
    # Start recording at step 0 and keep recording for ``video_length`` steps.
    env = VecVideoRecorder(
        env,
        video_folder=video_folder,
        record_video_trigger=lambda step: step == 0,
        video_length=video_length,
        name_prefix=prefix,
    )

    try:
        obs = env.reset()
        for _ in range(video_length):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)
    finally:
        # Always close the recorder/environment, even if the rollout raises,
        # so they are not leaked.
        env.close()


## Display video
def show_videos(video_path='', prefix=''):
    """
    Display inline every ``.mp4`` in ``video_path`` whose name starts with ``prefix``.

    Each file is read, base64-encoded and embedded in its own ``<video>`` tag;
    the tags are joined with ``<br>`` and shown through IPython's display.

    :param video_path: (str) Folder searched for videos.
    :param prefix: (str) File-name prefix used to filter the videos.
    """
    # The template whitespace is kept exactly as emitted in the HTML output.
    video_tag = '''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''
    pattern = "{}*.mp4".format(prefix)
    tags = [
        video_tag.format(clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
        for clip in Path(video_path).glob(pattern)
    ]
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

# RewardCallback class
Monitors the training reward during training and stops training once a threshold is reached

In [None]:
class RewardCallback(BaseCallback):
    """
    Callback that tracks the training reward and stops training early.

    Every ``check_freq`` calls it reads the ``Monitor`` log found in
    ``log_dir``, publishes the episode count and the (rounded) mean reward of
    the last 10 episodes to the module-level globals ``episodes`` and
    ``mean_reward``, and stops training when either the module-level
    ``episodes_threshold`` or ``reward_threshold`` is exceeded. When the
    reward threshold is exceeded the model is saved before stopping.

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Folder containing the file created by the
        ``Monitor`` wrapper. A ``best_model`` sub-folder is created in it.
    :param verbose: (int) Progress is printed when greater than 0.
    """

    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create the folder if needed.
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Check the training reward and decide whether training continues.

        :return: (bool) ``False`` to stop training, ``True`` to continue.
        """
        global episodes
        global mean_reward

        # Only evaluate every ``check_freq`` calls.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve the training rewards logged by the Monitor wrapper.
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # No finished episode has been logged yet.
            return True

        episodes = len(y)
        # Mean over the last 10 episodes, rounded to a whole number; the
        # rounded value is what gets compared against the threshold below.
        mean_reward = round(np.mean(y[-10:]), 0)
        self.best_mean_reward = max(self.best_mean_reward, mean_reward)

        if self.verbose > 0:
            print(f"Episodes: {episodes}")
            print(f"Num steps: {self.num_timesteps}")
            print(f"Mean reward: {mean_reward:.2f} ")
            print("=========== NEXTGRID.AI ================")

        # Give up once the episode budget is exhausted.
        if episodes > episodes_threshold:
            print("Episode threshold reached")
            return False

        # Reward target reached: save the model being trained and stop.
        if mean_reward > reward_threshold:
            # Use the model attached to this callback rather than a global.
            # NOTE: the file is written to the working directory, not to
            # ``self.save_path``.
            self.model.save("ppo_lunarlandercontinues")
            print("Model saved")
            return False

        return True

In [None]:
# Check the training reward every 1000 callback calls; learn() runs for at
# most `time_steps` steps unless the callback stops it earlier.
callback = RewardCallback(log_dir=logs_base_dir, check_freq=1000)
model.learn(callback=callback, total_timesteps=int(time_steps))

# Evaluation
OpenAI Gym scores are generally measured over 100 episodes. Use the code below to measure your average score over 100 episodes.

In [None]:
# Run 100 deterministic evaluation episodes. With return_episode_rewards=True
# the first element of the result holds the per-episode rewards.
evals = evaluate_policy(
    model,
    env,
    n_eval_episodes=100,
    deterministic=True,
    render=False,
    callback=None,
    reward_threshold=None,
    return_episode_rewards=True,
)
episode_rewards = evals[0]
print("Score over 100 episodes", episode_rewards)
print(np.mean(episode_rewards))

In [None]:
# Record 4000 steps of the model, then embed the resulting clips inline.
clip_prefix = "name"
record_video(env_id, model, prefix=clip_prefix, video_length=4000)
show_videos('videos', prefix=clip_prefix)