# Works only with Python 3.7

#### Overview:
    1.Stable Baselines
    2.Training RL model using ACER
    3.Running and evaluating a Stable Baselines RL Model

#### The lunar lander example tries to land a spaceship on the surface of the moon
The goal is to direct the jets on the spaceship so that it is able to land in between the flags

In [1]:
# Query the current working directory, then switch to the notebook's project folder.
# NOTE(review): this absolute path is machine-specific; later cells save and load the
# model with a relative filename, so they resolve against this directory.
%pwd
%cd /home/acr/Documents/Jupyter Notebook

/home/acr/Documents/Jupyter Notebook


# 0. Install and Import Dependencies

In [2]:
!pip install tensorflow==1.15 
!pip install tensorflow-gpu==1.15.0 
!pip install stable_baselines 
!pip install gym
!conda install swig -y # needed to build Box2D in the pip install
!pip install box2d-py # a repackaged version of pybox2d



Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [12]:
import gym
import time
from stable_baselines import ACER
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy

In [13]:
# Gym environment ID passed to the gym.make calls in this notebook
environment_name = "LunarLander-v2"

# 1. Test Random Environment

In [14]:
# Create the environment instance used by the random-action loop below
env= gym.make(environment_name)

In [21]:
# Baseline: play a few episodes with uniformly random actions and print each
# episode's total reward, so there is something to compare the trained agent to.
episodes = 10
frame_delay = 0.009  # seconds to pause after each rendered frame to slow the animation

try:
    for episode in range(1, episodes + 1):
        state = env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            time.sleep(frame_delay)
            # Sample a random action. LunarLander-v2 has four discrete actions:
            # do nothing, fire left engine, fire main engine, fire right engine.
            action = env.action_space.sample()
            n_state, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score: {}'.format(episode, score))
finally:
    # Always release the render window, even if the loop is interrupted or fails.
    env.close()

Episode:1 Score: -422.84819252014483
Episode:2 Score: -164.69699791550406
Episode:3 Score: -357.3116728070553
Episode:4 Score: -103.42492749717834
Episode:5 Score: -131.75169804037608
Episode:6 Score: -158.62803100321838
Episode:7 Score: -378.9386237445764
Episode:8 Score: -79.46534265696158
Episode:9 Score: -212.40220932075215
Episode:10 Score: -139.0497339989762


In [20]:
# Close the environment explicitly. The loop cell above already ends with
# env.close(); presumably this standalone call is for when that cell was
# interrupted before reaching it.
env.close()

# 2. Build and Train the Model

#### We'll build the model and train an RL agent using one of Stable Baselines' algorithms

In [22]:
# Stable Baselines models expect a vectorized environment, so the Gym environment
# is wrapped in a DummyVecEnv holding a single sub-environment.
# The factory creates the environment itself instead of closing over the `env`
# name, which is rebound to the wrapper on this very line; a lambda capturing
# `env` would only work as long as the factory is called before the rebinding.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# 'MlpPolicy' selects a multi-layer perceptron as the policy network;
# verbose=1 prints the training statistics tables during learning.
model = ACER('MlpPolicy', env, verbose=1)

In [57]:
# Train the agent. The objective is high explained variance and high mean episode reward.
# A callback function allows the training to stop when the optimal level is reached.
model.learn(total_timesteps=100_000)

----------------------------------
| avg_norm_adj        | 8.65e-12 |
| avg_norm_g          | 5.27     |
| avg_norm_grads_f    | 5.27     |
| avg_norm_k          | 3.79e+05 |
| avg_norm_k_dot_g    | 1.98     |
| entropy             | 3.31     |
| explained_variance  | -2.41    |
| fps                 | 0        |
| loss                | 6.71     |
| loss_bc             | -0       |
| loss_f              | -0.355   |
| loss_policy         | -0.355   |
| loss_q              | 14.2     |
| mean_episode_length | 0        |
| mean_episode_reward | 0        |
| norm_grads          | 77.2     |
| norm_grads_policy   | 4.46     |
| norm_grads_q        | 77.1     |
| total_timesteps     | 20       |
----------------------------------
----------------------------------
| avg_norm_adj        | 0.267    |
| avg_norm_g          | 4.91     |
| avg_norm_grads_f    | 4.6      |
| avg_norm_k          | 26.1     |
| avg_norm_k_dot_g    | 4.46     |
| entropy             | 12.4     |
| explained_variance

----------------------------------
| avg_norm_adj        | 0.84     |
| avg_norm_g          | 10.4     |
| avg_norm_grads_f    | 9.86     |
| avg_norm_k          | 3.52     |
| avg_norm_k_dot_g    | 10.7     |
| entropy             | 8.56     |
| explained_variance  | 0.321    |
| fps                 | 252      |
| loss                | 0.576    |
| loss_bc             | -0       |
| loss_f              | -1.25    |
| loss_policy         | -1.25    |
| loss_q              | 3.83     |
| mean_episode_length | 520      |
| mean_episode_reward | -102     |
| norm_grads          | 74.5     |
| norm_grads_policy   | 14.4     |
| norm_grads_q        | 73.1     |
| total_timesteps     | 24020    |
----------------------------------
----------------------------------
| avg_norm_adj        | 0.0935   |
| avg_norm_g          | 8.36     |
| avg_norm_grads_f    | 8.24     |
| avg_norm_k          | 5.56     |
| avg_norm_k_dot_g    | 37.9     |
| entropy             | 7.77     |
| explained_variance

----------------------------------
| avg_norm_adj        | 0.465    |
| avg_norm_g          | 3.16     |
| avg_norm_grads_f    | 2.89     |
| avg_norm_k          | 1.81     |
| avg_norm_k_dot_g    | 2.93     |
| entropy             | 8.75     |
| explained_variance  | 0.741    |
| fps                 | 254      |
| loss                | -0.354   |
| loss_bc             | -0       |
| loss_f              | -0.677   |
| loss_policy         | -0.677   |
| loss_q              | 0.821    |
| mean_episode_length | 679      |
| mean_episode_reward | -94.5    |
| norm_grads          | 17.1     |
| norm_grads_policy   | 7.68     |
| norm_grads_q        | 15.3     |
| total_timesteps     | 48020    |
----------------------------------
----------------------------------
| avg_norm_adj        | 0.0887   |
| avg_norm_g          | 10.7     |
| avg_norm_grads_f    | 10.6     |
| avg_norm_k          | 3.75     |
| avg_norm_k_dot_g    | 11.2     |
| entropy             | 9.45     |
| explained_variance

----------------------------------
| avg_norm_adj        | 0        |
| avg_norm_g          | 6.25     |
| avg_norm_grads_f    | 6.25     |
| avg_norm_k          | 1.99     |
| avg_norm_k_dot_g    | 7.23     |
| entropy             | 8.73     |
| explained_variance  | 0.206    |
| fps                 | 246      |
| loss                | 7.09     |
| loss_bc             | -0       |
| loss_f              | 0.934    |
| loss_policy         | 0.934    |
| loss_q              | 12.5     |
| mean_episode_length | 632      |
| mean_episode_reward | -129     |
| norm_grads          | 94.7     |
| norm_grads_policy   | 15.2     |
| norm_grads_q        | 93.5     |
| total_timesteps     | 72020    |
----------------------------------
----------------------------------
| avg_norm_adj        | 0        |
| avg_norm_g          | 3.49     |
| avg_norm_grads_f    | 3.49     |
| avg_norm_k          | 2.35     |
| avg_norm_k_dot_g    | 3.1      |
| entropy             | 7.15     |
| explained_variance

----------------------------------
| avg_norm_adj        | 0        |
| avg_norm_g          | 3.75     |
| avg_norm_grads_f    | 3.75     |
| avg_norm_k          | 1.87     |
| avg_norm_k_dot_g    | 3.77     |
| entropy             | 6.59     |
| explained_variance  | 0.0819   |
| fps                 | 244      |
| loss                | 1.58     |
| loss_bc             | -0       |
| loss_f              | 0.918    |
| loss_policy         | 0.918    |
| loss_q              | 1.45     |
| mean_episode_length | 571      |
| mean_episode_reward | -129     |
| norm_grads          | 26.5     |
| norm_grads_policy   | 18.9     |
| norm_grads_q        | 18.6     |
| total_timesteps     | 96020    |
----------------------------------
----------------------------------
| avg_norm_adj        | 0.888    |
| avg_norm_g          | 4.3      |
| avg_norm_grads_f    | 3.74     |
| avg_norm_k          | 1.9      |
| avg_norm_k_dot_g    | 4.31     |
| entropy             | 8.74     |
| explained_variance

<stable_baselines.acer.acer_simple.ACER at 0x7f70ec202990>

In [47]:
# # Make a list or tuple that stores the models, then use the model with the highest mean episode reward
# total_timesteps=100000
# for i in range(10):
#     model.learn(total_timesteps=10000)

In [48]:
# from stable_baselines.bench import Monitor
# Monitor(env, None, allow_early_resets=True)

In [49]:
# Monitor.get_episode_rewards(model)

In [50]:
# Import Monitor in this cell so it also runs on a fresh kernel; no other active
# cell imports it (the only import elsewhere is commented out).
from stable_baselines.bench import Monitor

# Show the documentation of the Monitor wrapper
help(Monitor)

Help on class Monitor in module stable_baselines.bench.monitor:

class Monitor(gym.core.Wrapper)
 |  Monitor(env: gym.core.Env, filename: Union[str, NoneType], allow_early_resets: bool = True, reset_keywords=(), info_keywords=())
 |  
 |  A monitor wrapper for Gym environments, it is used to know the episode reward, length, time and other data.
 |  
 |  :param env: (gym.Env) The environment
 |  :param filename: (Optional[str]) the location to save a log file, can be None for no log
 |  :param allow_early_resets: (bool) allows the reset of the environment before it is done
 |  :param reset_keywords: (tuple) extra keywords for the reset call, if extra parameters are needed at reset
 |  :param info_keywords: (tuple) extra information to log, from the information return of environment.step
 |  
 |  Method resolution order:
 |      Monitor
 |      gym.core.Wrapper
 |      gym.core.Env
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, env: gym.core.Env, filename:

# 3. Save and Test the model

#### Just for good practice

In [58]:
# Evaluate the trained agent over 10 rendered episodes.
# evaluate_policy returns the mean and the standard deviation of the per-episode
# reward; keep and report them instead of discarding the result.
try:
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
finally:
    # Close the render window even if the evaluation fails or is interrupted.
    env.close()
print('Mean reward: {:.2f} +/- {:.2f}'.format(mean_reward, std_reward))

In [59]:
# Persist the trained model under this name (a relative path, so it is resolved
# against the current working directory)
model.save("ACER_LunarLander_model")

In [68]:
# Drop the in-memory model so that the next cell has to restore it from the saved file
del model

In [69]:
# Restore the model from the file saved above and attach the existing environment to it
model = ACER.load("ACER_LunarLander_model", env=env)

In [83]:
# Watch the loaded agent play and print each episode's total reward, in the same
# format as the random-action baseline, so the two runs can be compared.
episodes = 10
frame_delay = 0.00003  # seconds to pause after each rendered frame

try:
    for episode in range(1, episodes + 1):
        obs = env.reset()
        done = False
        score = 0
        while not done:
            # Ask the trained policy for the action to take given the current observation
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            # env is a vectorized wrapper around a single environment, so the
            # step results are length-1 arrays; take the only entry.
            score += reward[0]
            env.render()
            time.sleep(frame_delay)
        print('Episode:{} Score: {}'.format(episode, score))
finally:
    # Always release the render window, even if the loop is interrupted or fails.
    env.close()