In [3]:
#!/usr/bin/env python3
import numpy as np
import gym
import os
import multiprocessing
from pathlib import Path

import matplotlib.pyplot as plt
import time
from IPython import display
%matplotlib notebook

from stable_baselines.common.cmd_util import mujoco_arg_parser
from stable_baselines import bench, logger
from stable_baselines.common import set_global_seeds
from stable_baselines.common.vec_env.vec_normalize import VecNormalize
from stable_baselines.ppo2 import PPO2
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from stable_baselines.results_plotter import load_results, ts2xy


def train(env_id, num_timesteps, seed):
    """
    Train a PPO2 agent with a CNN policy on an Atari environment.

    Relies on two names from the notebook's global namespace, which must be
    defined before this function is called:

    - ``log_dir`` (str): directory used for TensorBoard logs; periodic
      checkpoints are written to ``<log_dir>/models/`` (that sub-directory is
      expected to exist already).
    - ``n_steps`` (int): running count of callback invocations; it is
      incremented here and drives the checkpoint cadence.

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the environments and the global random generators.
    :return: (PPO2, VecFrameStack) the trained model and the frame-stacked vectorized env
    """

    n_cpu = multiprocessing.cpu_count()

    print("CPUs:", n_cpu)

    # Leave two cores free for the notebook / OS, but never ask for fewer
    # than one environment (machines with <= 2 CPUs would otherwise get <= 0).
    num_env = max(1, n_cpu - 2)

    # Seed the environments with the caller's seed rather than a fixed 0,
    # so that different `seed` values actually produce different rollouts.
    env_out = make_atari_env(env_id, num_env=num_env, seed=seed)
    env = VecFrameStack(env_out, n_stack=4)

    def callback(_locals, _globals):
        """Print progress and checkpoint the model every 10th invocation."""
        # NOTE: this global `n_steps` is the callback-invocation counter, not
        # the PPO2 `n_steps` rollout-length hyperparameter passed below.
        global n_steps
        print("Step:", n_steps)

        if (n_steps + 1) % 10 == 0:
            print("Saving new model")
            _locals['self'].save(os.path.join(log_dir, "models", "model_{}".format(n_steps)))
        n_steps += 1
        # stable-baselines aborts training when a callback returns False,
        # so return True to keep learning.
        return True

    set_global_seeds(seed)
    policy = CnnPolicy
    model = PPO2(policy=policy, env=env, n_steps=512, nminibatches=1, lam=0.95, gamma=0.99, noptepochs=10,
                 ent_coef=0.0, learning_rate=3e-4, cliprange=0.2, verbose=1, tensorboard_log=log_dir)
    model.learn(total_timesteps=num_timesteps, callback=callback)
    # Final model is saved relative to the current working directory.
    model.save("model_{}".format(env_id))

    return model, env

In [None]:
# --- Run configuration -------------------------------------------------------
env_id = 'BreakoutNoFrameskip-v4'
num_timesteps = 20000000
seed = 343
# Module-level state read (and, for n_steps, updated) by the callback inside train().
best_mean_reward, n_steps = -np.inf, 0

# --- Per-run log directory ---------------------------------------------------
base_dir = os.path.join(str(Path.home()), 'ppo_logs')
os.makedirs(base_dir, exist_ok=True)

# Start from the number of existing entries for this env (the usual next index),
# then skip forward past any directory that already exists. Counting alone can
# collide with an earlier run when a previous run directory was deleted or an
# unrelated entry matches the env id, which would silently mix checkpoints and
# TensorBoard logs from different runs.
prev = [f for f in os.listdir(base_dir) if env_id in f]
run_index = len(prev)
while os.path.exists(os.path.join(base_dir, '{}-{}'.format(env_id, run_index))):
    run_index += 1

# Kept as a plain str because train() builds checkpoint paths from it.
log_dir = os.path.join(base_dir, '{}-{}'.format(env_id, run_index))
os.makedirs(log_dir, exist_ok=True)
os.makedirs(os.path.join(log_dir, 'models'), exist_ok=True)

print('Logging to {}'.format(log_dir))

# --- Train -------------------------------------------------------------------
logger.configure()
model, env = train(env_id, num_timesteps, seed)

Logging to /home/nathan/ppo_logs/BreakoutNoFrameskip-v4-4
Logging to /tmp/openai-2018-11-06-10-33-07-648341
CPUs: 4
Step: 0
--------------------------------------
| approxkl           | 0.00087102334 |
| clipfrac           | 0.0           |
| ep_rewmean         | 1.75          |
| eplenmean          | 197           |
| explained_variance | -0.0322       |
| fps                | 100           |
| nupdates           | 1             |
| policy_entropy     | 1.3853492     |
| policy_loss        | -0.002792877  |
| serial_timesteps   | 512           |
| time_elapsed       | 3.58e-06      |
| total_timesteps    | 1024          |
| value_loss         | 0.06465333    |
--------------------------------------
Step: 1
--------------------------------------
| approxkl           | 0.00035615417 |
| clipfrac           | 0.0           |
| ep_rewmean         | 1.7           |
| eplenmean          | 194           |
| explained_variance | 0.00489       |
| fps                | 128           |
| nupdates

Step: 14
--------------------------------------
| approxkl           | 0.0020823434  |
| clipfrac           | 0.0           |
| ep_rewmean         | 1.63          |
| eplenmean          | 194           |
| explained_variance | -0.00702      |
| fps                | 130           |
| nupdates           | 15            |
| policy_entropy     | 1.3707808     |
| policy_loss        | -0.0016089424 |
| serial_timesteps   | 7680          |
| time_elapsed       | 113           |
| total_timesteps    | 15360         |
| value_loss         | 0.043764018   |
--------------------------------------
Step: 15
--------------------------------------
| approxkl           | 0.00088722596 |
| clipfrac           | 0.0           |
| ep_rewmean         | 1.78          |
| eplenmean          | 198           |
| explained_variance | -0.00206      |
| fps                | 129           |
| nupdates           | 16            |
| policy_entropy     | 1.3733294     |
| policy_loss        | -0.0015856426 |
| seria

Step: 28
-------------------------------------
| approxkl           | 0.00608859   |
| clipfrac           | 0.082128905  |
| ep_rewmean         | 1.84         |
| eplenmean          | 196          |
| explained_variance | -0.00285     |
| fps                | 130          |
| nupdates           | 29           |
| policy_entropy     | 1.2032009    |
| policy_loss        | -0.004976895 |
| serial_timesteps   | 14848        |
| time_elapsed       | 224          |
| total_timesteps    | 29696        |
| value_loss         | 0.04756987   |
-------------------------------------
Step: 29
Saving new model
---------------------------------------
| approxkl           | 0.0011055422   |
| clipfrac           | 0.005078125    |
| ep_rewmean         | 1.73           |
| eplenmean          | 191            |
| explained_variance | -0.00879       |
| fps                | 130            |
| nupdates           | 30             |
| policy_entropy     | 1.2341485      |
| policy_loss        | -0.000412400

Step: 42
--------------------------------------
| approxkl           | 0.0026600636  |
| clipfrac           | 0.009570313   |
| ep_rewmean         | 1.7           |
| eplenmean          | 190           |
| explained_variance | 0.0107        |
| fps                | 129           |
| nupdates           | 43            |
| policy_entropy     | 1.050259      |
| policy_loss        | -0.0007099133 |
| serial_timesteps   | 22016         |
| time_elapsed       | 334           |
| total_timesteps    | 44032         |
| value_loss         | 0.039387587   |
--------------------------------------
Step: 43
--------------------------------------
| approxkl           | 0.008408783   |
| clipfrac           | 0.10205078    |
| ep_rewmean         | 1.85          |
| eplenmean          | 195           |
| explained_variance | -0.00676      |
| fps                | 131           |
| nupdates           | 44            |
| policy_entropy     | 0.8870047     |
| policy_loss        | -0.0035146729 |
| seria

Step: 56
--------------------------------------
| approxkl           | 0.002309019   |
| clipfrac           | 0.031835936   |
| ep_rewmean         | 2.52          |
| eplenmean          | 212           |
| explained_variance | 0.0092        |
| fps                | 130           |
| nupdates           | 57            |
| policy_entropy     | 0.62644756    |
| policy_loss        | 0.00019901882 |
| serial_timesteps   | 29184         |
| time_elapsed       | 444           |
| total_timesteps    | 58368         |
| value_loss         | 0.041483678   |
--------------------------------------
Step: 57
---------------------------------------
| approxkl           | 0.0006940568   |
| clipfrac           | 0.0033203126   |
| ep_rewmean         | 2.59           |
| eplenmean          | 215            |
| explained_variance | 0.0116         |
| fps                | 130            |
| nupdates           | 58             |
| policy_entropy     | 0.59003377     |
| policy_loss        | -5.7714107e-05

Step: 70
---------------------------------------
| approxkl           | 0.0025121118   |
| clipfrac           | 0.03173828     |
| ep_rewmean         | 2.62           |
| eplenmean          | 214            |
| explained_variance | 0.00538        |
| fps                | 129            |
| nupdates           | 71             |
| policy_entropy     | 0.5449337      |
| policy_loss        | -0.00084435364 |
| serial_timesteps   | 36352          |
| time_elapsed       | 554            |
| total_timesteps    | 72704          |
| value_loss         | 0.03374991     |
---------------------------------------
Step: 71
--------------------------------------
| approxkl           | 0.004913012   |
| clipfrac           | 0.06064453    |
| ep_rewmean         | 2.53          |
| eplenmean          | 212           |
| explained_variance | 0.00689       |
| fps                | 128           |
| nupdates           | 72            |
| policy_entropy     | 0.5628331     |
| policy_loss        | -0.00187

Step: 84
--------------------------------------
| approxkl           | 0.0051247217  |
| clipfrac           | 0.043554686   |
| ep_rewmean         | 2.3           |
| eplenmean          | 203           |
| explained_variance | 0.0347        |
| fps                | 128           |
| nupdates           | 85            |
| policy_entropy     | 0.8008501     |
| policy_loss        | -0.0020232268 |
| serial_timesteps   | 43520         |
| time_elapsed       | 665           |
| total_timesteps    | 87040         |
| value_loss         | 0.030494604   |
--------------------------------------
Step: 85
--------------------------------------
| approxkl           | 0.002940774   |
| clipfrac           | 0.027929688   |
| ep_rewmean         | 2.26          |
| eplenmean          | 202           |
| explained_variance | 0.0159        |
| fps                | 130           |
| nupdates           | 86            |
| policy_entropy     | 0.71936524    |
| policy_loss        | -0.0009436311 |
| seria

Step: 98
--------------------------------------
| approxkl           | 0.0017959621  |
| clipfrac           | 0.033496093   |
| ep_rewmean         | 2.33          |
| eplenmean          | 205           |
| explained_variance | 0.0276        |
| fps                | 129           |
| nupdates           | 99            |
| policy_entropy     | 0.6093819     |
| policy_loss        | -5.471846e-05 |
| serial_timesteps   | 50688         |
| time_elapsed       | 775           |
| total_timesteps    | 101376        |
| value_loss         | 0.03503204    |
--------------------------------------
Step: 99
Saving new model
--------------------------------------
| approxkl           | 0.0013425747  |
| clipfrac           | 0.028222656   |
| ep_rewmean         | 2.03          |
| eplenmean          | 195           |
| explained_variance | 0.00292       |
| fps                | 127           |
| nupdates           | 100           |
| policy_entropy     | 0.6100383     |
| policy_loss        | -0.001

Step: 112
--------------------------------------
| approxkl           | 0.0010304635  |
| clipfrac           | 0.011230469   |
| ep_rewmean         | 2.24          |
| eplenmean          | 200           |
| explained_variance | 0.0316        |
| fps                | 130           |
| nupdates           | 113           |
| policy_entropy     | 0.51882315    |
| policy_loss        | -0.0018107407 |
| serial_timesteps   | 57856         |
| time_elapsed       | 886           |
| total_timesteps    | 115712        |
| value_loss         | 0.04907758    |
--------------------------------------
Step: 113
--------------------------------------
| approxkl           | 0.0015687566  |
| clipfrac           | 0.023535157   |
| ep_rewmean         | 2.27          |
| eplenmean          | 202           |
| explained_variance | 0.0431        |
| fps                | 131           |
| nupdates           | 114           |
| policy_entropy     | 0.5078462     |
| policy_loss        | -0.0021498357 |
| ser

Step: 126
---------------------------------------
| approxkl           | 0.00096116215  |
| clipfrac           | 0.0068359375   |
| ep_rewmean         | 2.35           |
| eplenmean          | 204            |
| explained_variance | 0.00424        |
| fps                | 129            |
| nupdates           | 127            |
| policy_entropy     | 0.4362494      |
| policy_loss        | -0.00080256833 |
| serial_timesteps   | 65024          |
| time_elapsed       | 996            |
| total_timesteps    | 130048         |
| value_loss         | 0.029672882    |
---------------------------------------
Step: 127
--------------------------------------
| approxkl           | 0.0015067807  |
| clipfrac           | 0.019726563   |
| ep_rewmean         | 2.46          |
| eplenmean          | 206           |
| explained_variance | 0.00676       |
| fps                | 130           |
| nupdates           | 128           |
| policy_entropy     | 0.24982996    |
| policy_loss        | -0.000

Step: 140
--------------------------------------
| approxkl           | 0.0044907136  |
| clipfrac           | 0.04121094    |
| ep_rewmean         | 2.22          |
| eplenmean          | 198           |
| explained_variance | -0.00563      |
| fps                | 130           |
| nupdates           | 141           |
| policy_entropy     | 0.17708007    |
| policy_loss        | -0.0016289115 |
| serial_timesteps   | 72192         |
| time_elapsed       | 1.11e+03      |
| total_timesteps    | 144384        |
| value_loss         | 0.045601435   |
--------------------------------------
Step: 141
---------------------------------------
| approxkl           | 0.0009569755   |
| clipfrac           | 0.017089844    |
| ep_rewmean         | 2.31           |
| eplenmean          | 201            |
| explained_variance | 0.0324         |
| fps                | 131            |
| nupdates           | 142            |
| policy_entropy     | 0.23294416     |
| policy_loss        | -0.000726442

Step: 154
---------------------------------------
| approxkl           | 0.0011327831   |
| clipfrac           | 0.01875        |
| ep_rewmean         | 2.6            |
| eplenmean          | 209            |
| explained_variance | 0.0439         |
| fps                | 130            |
| nupdates           | 155            |
| policy_entropy     | 0.3607021      |
| policy_loss        | -0.00091385597 |
| serial_timesteps   | 79360          |
| time_elapsed       | 1.22e+03       |
| total_timesteps    | 158720         |
| value_loss         | 0.052927572    |
---------------------------------------
Step: 155
---------------------------------------
| approxkl           | 0.00076102035  |
| clipfrac           | 0.011230469    |
| ep_rewmean         | 2.4            |
| eplenmean          | 203            |
| explained_variance | 0.0387         |
| fps                | 129            |
| nupdates           | 156            |
| policy_entropy     | 0.40948683     |
| policy_loss       

Step: 168
--------------------------------------
| approxkl           | 0.0016860508  |
| clipfrac           | 0.016601562   |
| ep_rewmean         | 2.88          |
| eplenmean          | 218           |
| explained_variance | 0.0223        |
| fps                | 128           |
| nupdates           | 169           |
| policy_entropy     | 0.21554601    |
| policy_loss        | -0.0011206665 |
| serial_timesteps   | 86528         |
| time_elapsed       | 1.33e+03      |
| total_timesteps    | 173056        |
| value_loss         | 0.050122093   |
--------------------------------------
Step: 169
Saving new model
--------------------------------------
| approxkl           | 0.0016609166  |
| clipfrac           | 0.031933594   |
| ep_rewmean         | 2.79          |
| eplenmean          | 215           |
| explained_variance | -0.0129       |
| fps                | 127           |
| nupdates           | 170           |
| policy_entropy     | 0.3756787     |
| policy_loss        | -0.0

Step: 182
--------------------------------------
| approxkl           | 0.0020376265  |
| clipfrac           | 0.019921875   |
| ep_rewmean         | 2.34          |
| eplenmean          | 200           |
| explained_variance | 0.115         |
| fps                | 127           |
| nupdates           | 183           |
| policy_entropy     | 0.4152736     |
| policy_loss        | -0.0012534804 |
| serial_timesteps   | 93696         |
| time_elapsed       | 1.44e+03      |
| total_timesteps    | 187392        |
| value_loss         | 0.040535364   |
--------------------------------------
Step: 183
-------------------------------------
| approxkl           | 0.000173326  |
| clipfrac           | 0.0012695312 |
| ep_rewmean         | 2.43         |
| eplenmean          | 203          |
| explained_variance | -0.0398      |
| fps                | 129          |
| nupdates           | 184          |
| policy_entropy     | 0.21887882   |
| policy_loss        | -0.000937799 |
| serial_timest

Step: 196
--------------------------------------
| approxkl           | 0.0010343601  |
| clipfrac           | 0.015820313   |
| ep_rewmean         | 2.9           |
| eplenmean          | 218           |
| explained_variance | 0.106         |
| fps                | 129           |
| nupdates           | 197           |
| policy_entropy     | 0.33719808    |
| policy_loss        | -0.0021466757 |
| serial_timesteps   | 100864        |
| time_elapsed       | 1.55e+03      |
| total_timesteps    | 201728        |
| value_loss         | 0.05439409    |
--------------------------------------
Step: 197
--------------------------------------
| approxkl           | 0.0015661456  |
| clipfrac           | 0.019726563   |
| ep_rewmean         | 2.9           |
| eplenmean          | 218           |
| explained_variance | 0.0666        |
| fps                | 126           |
| nupdates           | 198           |
| policy_entropy     | 0.54520833    |
| policy_loss        | -0.0008730218 |
| ser