In [1]:
from baselines.common.vec_env import VecEnvWrapper, SubprocVecEnv
from pred_learn.envs import *
from a2c_ppo_acktr.envs import VecPyTorch

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


In [2]:
from a2c_ppo_acktr import algo
from a2c_ppo_acktr.arguments import get_args
from a2c_ppo_acktr.envs import make_vec_envs
from a2c_ppo_acktr.model import Policy
from a2c_ppo_acktr.storage import RolloutStorage
from a2c_ppo_acktr.utils import get_vec_normalize, update_linear_schedule
from a2c_ppo_acktr.visualize import visdom_plot
from collections import deque
import copy
import glob
import os
import time
from collections import deque

In [3]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import clear_output

In [4]:
import torch
# from skimage.transform import resize
import os
import numpy as np

In [5]:
# Seed shared by the environment factories used further down.
SEED = 0

# Step budgets for this notebook.
MAX_STEPS = 500
TOTAL_STEPS = 10000

# Environment ids this notebook can be pointed at; `ENV_ID` selects the active one.
ENVS = [
    "CarRacing-v0", "Snake-ple-v0", "TetrisA-v2",
    "PuckWorld-ple-v0", "WaterWorld-ple-v0", "PixelCopter-ple-v0",
    "CubeCrash-v0", "Catcher-ple-v0", "Pong-ple-v0",
]
ENV_ID = "PixelCopter-ple-v0"

# RECORD_DIR = "recorded/{}/".format(ENV_ID)
# FILE_NO = 1


# try:
#     os.makedirs(RECORD_DIR)
# except FileExistsError:
#     pass

In [6]:
# Build one environment for the selected id and seed.
# NOTE(review): `make_env` presumably arrives via the `pred_learn.envs` star import — confirm.
env = make_env(ENV_ID, SEED)
# Bare last expression: the notebook echoes the wrapped `_env` object so its concrete type is visible.
env._env



<gym_ple.ple_env.PLEEnv at 0x7f1adebaaa20>

In [7]:
class Object(object):
    """Bare attribute container used in place of the argparse namespace from `get_args`."""
    pass


def _prepare_log_dir(path):
    """Create `path` if it is missing and remove any `*.monitor.csv` files already in it."""
    os.makedirs(path, exist_ok=True)
    for stale_monitor in glob.glob(os.path.join(path, '*.monitor.csv')):
        os.remove(stale_monitor)


args = Object()

# --- runtime -----------------------------------------------------------------
# Use the GPU only when one is actually present, so the notebook also runs on CPU-only machines.
args.cuda = torch.cuda.is_available()
args.num_processes = 4
args.seed = SEED

args.env_name = ENV_ID

# --- visdom plotting (all of these are read only when `args.vis` is True) ----
args.vis = False
args.port = 8097          # visdom's default server port
args.vis_interval = 100   # plot every N updates

# --- logging / evaluation / checkpointing ------------------------------------
args.gamma = 0.99
args.log_dir = "tests"
args.log_interval = 1
args.eval_interval = 10
eval_log_dir = args.log_dir + "_eval"
args.save_interval = 1000
args.save_dir = "save_dir"

# --- environment / policy options --------------------------------------------
args.add_timestep = False
args.add_padding = False
args.recurrent_policy = False

# --- algorithm hyper-parameters ----------------------------------------------
args.algo = "a2c"
args.value_loss_coef = 0.5
args.entropy_coef = 0.01
args.lr = 7e-4
args.eps = 1e-5
args.alpha = 0.99
args.max_grad_norm = 0.5
args.num_steps = 15
args.use_linear_lr_decay = False
args.use_gae = False
args.tau = 0.95
args.clip_param = 0.2
args.ppo_epoch = 4
args.num_mini_batch = 32
args.use_linear_clip_decay = False

num_updates = 100000
# Total environment steps implied by the update count; the visdom plot call reads this attribute.
args.num_env_steps = num_updates * args.num_steps * args.num_processes

# Start from clean monitor logs for both the training and the evaluation directory.
_prepare_log_dir(args.log_dir)
_prepare_log_dir(eval_log_dir)


In [8]:
# Keep torch to a single intra-op thread in this process.
torch.set_num_threads(1)
device = torch.device("cuda:0" if args.cuda else "cpu")

if args.vis:
    from visdom import Visdom
    # `args` is hand-built in this notebook and may not define `port`;
    # fall back to visdom's default port instead of raising AttributeError.
    viz = Visdom(port=getattr(args, 'port', 8097))
    win = None

# Vectorised training environments; their spaces size the policy and the rollout buffer below.
envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                     args.gamma, args.log_dir, args.add_timestep, device, True,
                     padding_type=args.add_padding)

# envs = make_rl_envs(args.env_name, args.num_processes, args.seed, device)

actor_critic = Policy(envs.observation_space.shape, envs.action_space,
    base_kwargs={'recurrent': args.recurrent_policy})
actor_critic.to(device)

# Pick the update rule named by `args.algo`.
if args.algo == 'a2c':
    agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                           args.entropy_coef, lr=args.lr,
                           eps=args.eps, alpha=args.alpha,
                           max_grad_norm=args.max_grad_norm)
elif args.algo == 'ppo':
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)
elif args.algo == 'acktr':
    agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef,
                           args.entropy_coef, acktr=True)
else:
    # Fail here with a clear message rather than with a NameError on `agent` in a later cell.
    raise ValueError("Unknown args.algo {!r}; expected 'a2c', 'ppo' or 'acktr'".format(args.algo))

# Buffer holding `num_steps` transitions for each of the `num_processes` environments.
rollouts = RolloutStorage(args.num_steps, args.num_processes,
                    envs.observation_space.shape, envs.action_space,
                    actor_critic.recurrent_hidden_state_size)

In [9]:
# some_model = torch.load("../../pytorch-a2c-ppo-acktr/trained_models/ppo/CarRacing-v0.pt")
# actor_critic, _ = some_model
# actor_critic = actor_critic.cuda()

In [10]:
# `envs` already refers to the vectorised env built when the policy was set up (or to the one
# from a previous run of this cell). Close it explicitly before replacing it so its workers
# and monitor files are released now instead of whenever the old object is garbage-collected.
# NOTE(review): assumes the wrapper's close() is safe to call on an already-closed env — confirm.
envs.close()

envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                     args.gamma, args.log_dir, args.add_timestep, device, True,
                     padding_type=args.add_padding)

# envs = make_rl_envs(args.env_name, args.num_processes, args.seed, device)


# Seed the rollout buffer with the first observation, then move the buffer to the device.
obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

# Sliding window over the 10 most recently finished training episodes.
episode_rewards = deque(maxlen=10)

# One entry is appended to every list each time the training loop logs.
training_stats = {
    'updates': [],
    'num_timesteps': [],
    'mean_reward': [],
    'median_reward': [],
    'min_reward': [],
    'max_reward': []
}

# Main training loop: collect `num_steps` transitions per environment, update the agent,
# then checkpoint, log, evaluate and (optionally) plot.
start = time.time()
for j in range(num_updates):

    if args.use_linear_lr_decay:
        # decrease learning rate linearly
        if args.algo == "acktr":
            # use optimizer's learning rate since it's hard-coded in kfac.py
            update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr)
        else:
            update_linear_schedule(agent.optimizer, j, num_updates, args.lr)

    if args.algo == 'ppo' and args.use_linear_clip_decay:
        agent.clip_param = args.clip_param * (1 - j / float(num_updates))

    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

        # Observe reward and next obs
        obs, reward, done, infos = envs.step(action)

        # An 'episode' entry in info marks a finished episode; keep its return `r`.
        for info in infos:
            if 'episode' in info:
                episode_rewards.append(info['episode']['r'])

        # If done then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

    # Bootstrap value for the state following the last collected step.
    with torch.no_grad():
        next_value = actor_critic.get_value(rollouts.obs[-1],
                                            rollouts.recurrent_hidden_states[-1],
                                            rollouts.masks[-1]).detach()

    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

    value_loss, action_loss, dist_entropy = agent.update(rollouts)

    rollouts.after_update()

    # save for every interval-th update or for the last update
    if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        # An existing directory is fine; any other failure should surface here, not in torch.save.
        os.makedirs(save_path, exist_ok=True)

        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if args.cuda:
            save_model = copy.deepcopy(actor_critic).cpu()

        # Stored as [policy, observation-normalisation stats or None].
        save_model = [save_model,
                      getattr(get_vec_normalize(envs), 'ob_rms', None)]

        torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

    total_num_steps = (j + 1) * args.num_processes * args.num_steps

    # Logs on every update once at least two episodes have finished
    # (the `j % args.log_interval == 0` gate is intentionally not applied).
    if len(episode_rewards) > 1:
        end = time.time()
        # Compute each statistic once and reuse it for both the printout and the history.
        mean_reward = np.mean(episode_rewards)
        median_reward = np.median(episode_rewards)
        min_reward = np.min(episode_rewards)
        max_reward = np.max(episode_rewards)

        # The original call also passed dist_entropy/value_loss/action_loss, but the template
        # has no placeholders for them, so str.format silently dropped them; they are omitted.
        print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".
            format(j, total_num_steps,
                   int(total_num_steps / (end - start)),
                   len(episode_rewards),
                   mean_reward,
                   median_reward,
                   min_reward,
                   max_reward))

        training_stats['updates'].append(j)
        training_stats['num_timesteps'].append(total_num_steps)
        training_stats['mean_reward'].append(mean_reward)
        training_stats['median_reward'].append(median_reward)
        training_stats['min_reward'].append(min_reward)
        training_stats['max_reward'].append(max_reward)

    if (args.eval_interval is not None
            and len(episode_rewards) > 1
            and j % args.eval_interval == 0):
        # Build the evaluation envs with the same padding setting as the training envs so the
        # policy sees observations produced the same way.
        eval_envs = make_vec_envs(
            args.env_name, args.seed + args.num_processes, args.num_processes,
            args.gamma, eval_log_dir, args.add_timestep, device, True,
            padding_type=args.add_padding)

        # try/finally: release the evaluation envs even if the cell is interrupted mid-evaluation.
        try:
            vec_norm = get_vec_normalize(eval_envs)
            if vec_norm is not None:
                # Reuse the training envs' observation statistics for evaluation.
                vec_norm.eval()
                vec_norm.ob_rms = get_vec_normalize(envs).ob_rms

            eval_episode_rewards = []

            obs = eval_envs.reset()
            eval_recurrent_hidden_states = torch.zeros(args.num_processes,
                            actor_critic.recurrent_hidden_state_size, device=device)
            eval_masks = torch.zeros(args.num_processes, 1, device=device)

            # Step until at least 10 evaluation episodes have finished.
            while len(eval_episode_rewards) < 10:
                with torch.no_grad():
                    _, action, _, eval_recurrent_hidden_states = actor_critic.act(
                        obs, eval_recurrent_hidden_states, eval_masks, deterministic=True)

                # Observe reward and next obs
                obs, reward, done, infos = eval_envs.step(action)

                eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                                for done_ in done]).to(device)
                for info in infos:
                    if info is not None and 'episode' in info:
                        eval_episode_rewards.append(info['episode']['r'])
        finally:
            eval_envs.close()

        print(" Evaluation using {} episodes: mean reward {:.5f}\n".
            format(len(eval_episode_rewards),
                   np.mean(eval_episode_rewards)))

    # `args` is hand-built in this notebook and may lack the plotting attributes; fall back to
    # a 100-update interval and to the step count implied by `num_updates`.
    if args.vis and j % getattr(args, 'vis_interval', 100) == 0:
        try:
            # Sometimes monitor doesn't properly flush the outputs
            win = visdom_plot(viz, win, args.log_dir, args.env_name,
                              args.algo,
                              getattr(args, 'num_env_steps',
                                      num_updates * args.num_steps * args.num_processes))
        except IOError:
            pass


Updates 0, num timesteps 60, FPS 279 
 Last 6 training episodes: mean/median reward -5.0/-5.0, min/max reward -5.0/-5.0

 Evaluation using 10 episodes: mean reward -3.00000

Updates 1, num timesteps 120, FPS 28 
 Last 7 training episodes: mean/median reward -5.0/-5.0, min/max reward -5.0/-5.0

Updates 2, num timesteps 180, FPS 42 
 Last 10 training episodes: mean/median reward -4.1/-5.0, min/max reward -5.0/-2.0

Updates 3, num timesteps 240, FPS 55 
 Last 10 training episodes: mean/median reward -3.6/-4.5, min/max reward -5.0/-1.0

Updates 4, num timesteps 300, FPS 67 
 Last 10 training episodes: mean/median reward -3.9/-4.0, min/max reward -5.0/-1.0

Updates 5, num timesteps 360, FPS 79 
 Last 10 training episodes: mean/median reward -4.3/-4.0, min/max reward -5.0/-3.0

Updates 6, num timesteps 420, FPS 91 
 Last 10 training episodes: mean/median reward -3.6/-4.0, min/max reward -5.0/1.0

Updates 7, num timesteps 480, FPS 103 
 Last 10 training episodes: mean/median reward -3.6/-4.0,

Updates 64, num timesteps 3900, FPS 154 
 Last 10 training episodes: mean/median reward -1.5/-1.5, min/max reward -5.0/2.0

Updates 65, num timesteps 3960, FPS 156 
 Last 10 training episodes: mean/median reward -1.4/-1.5, min/max reward -5.0/2.0

Updates 66, num timesteps 4020, FPS 158 
 Last 10 training episodes: mean/median reward -1.7/-2.0, min/max reward -5.0/2.0

Updates 67, num timesteps 4080, FPS 160 
 Last 10 training episodes: mean/median reward -1.6/-2.5, min/max reward -5.0/5.0

Updates 68, num timesteps 4140, FPS 162 
 Last 10 training episodes: mean/median reward -2.1/-3.0, min/max reward -5.0/5.0

Updates 69, num timesteps 4200, FPS 164 
 Last 10 training episodes: mean/median reward -2.4/-3.0, min/max reward -5.0/5.0

Updates 70, num timesteps 4260, FPS 166 
 Last 10 training episodes: mean/median reward -2.8/-3.0, min/max reward -3.0/-2.0

 Evaluation using 10 episodes: mean reward -3.00000

Updates 71, num timesteps 4320, FPS 151 
 Last 10 training episodes: mean/medi

Updates 128, num timesteps 7740, FPS 167 
 Last 10 training episodes: mean/median reward 4.4/2.0, min/max reward -3.0/21.0

Updates 129, num timesteps 7800, FPS 168 
 Last 10 training episodes: mean/median reward 4.4/2.0, min/max reward -3.0/21.0

Updates 130, num timesteps 7860, FPS 169 
 Last 10 training episodes: mean/median reward 3.6/3.0, min/max reward 1.0/12.0

 Evaluation using 10 episodes: mean reward 1.60000

Updates 131, num timesteps 7920, FPS 159 
 Last 10 training episodes: mean/median reward 3.6/3.0, min/max reward 1.0/12.0

Updates 132, num timesteps 7980, FPS 160 
 Last 10 training episodes: mean/median reward 3.6/3.0, min/max reward 1.0/12.0

Updates 133, num timesteps 8040, FPS 161 
 Last 10 training episodes: mean/median reward 3.6/3.0, min/max reward 1.0/12.0

Updates 134, num timesteps 8100, FPS 162 
 Last 10 training episodes: mean/median reward 3.5/3.0, min/max reward 1.0/12.0

Updates 135, num timesteps 8160, FPS 163 
 Last 10 training episodes: mean/median rew

Updates 193, num timesteps 11640, FPS 151 
 Last 10 training episodes: mean/median reward 2.8/3.0, min/max reward -3.0/11.0

Updates 194, num timesteps 11700, FPS 152 
 Last 10 training episodes: mean/median reward 2.8/3.0, min/max reward -3.0/11.0

Updates 195, num timesteps 11760, FPS 153 
 Last 10 training episodes: mean/median reward 2.8/3.0, min/max reward -3.0/11.0

Updates 196, num timesteps 11820, FPS 153 
 Last 10 training episodes: mean/median reward 2.7/3.0, min/max reward -3.0/11.0

Updates 197, num timesteps 11880, FPS 154 
 Last 10 training episodes: mean/median reward 5.5/4.0, min/max reward -2.0/25.0

Updates 198, num timesteps 11940, FPS 155 
 Last 10 training episodes: mean/median reward 5.6/4.5, min/max reward -2.0/25.0

Updates 199, num timesteps 12000, FPS 155 
 Last 10 training episodes: mean/median reward 5.6/4.5, min/max reward -2.0/25.0

Updates 200, num timesteps 12060, FPS 156 
 Last 10 training episodes: mean/median reward 6.1/4.5, min/max reward -2.0/25.0



Updates 257, num timesteps 15480, FPS 150 
 Last 10 training episodes: mean/median reward 3.0/3.5, min/max reward -2.0/6.0

Updates 258, num timesteps 15540, FPS 150 
 Last 10 training episodes: mean/median reward 3.0/3.5, min/max reward -2.0/6.0

Updates 259, num timesteps 15600, FPS 151 
 Last 10 training episodes: mean/median reward 3.0/3.5, min/max reward -2.0/6.0

Updates 260, num timesteps 15660, FPS 151 
 Last 10 training episodes: mean/median reward 3.0/3.5, min/max reward -2.0/6.0

 Evaluation using 10 episodes: mean reward 5.60000

Updates 261, num timesteps 15720, FPS 147 
 Last 10 training episodes: mean/median reward 2.9/3.0, min/max reward -2.0/6.0

Updates 262, num timesteps 15780, FPS 147 
 Last 10 training episodes: mean/median reward 3.5/3.0, min/max reward -2.0/11.0

Updates 263, num timesteps 15840, FPS 148 
 Last 10 training episodes: mean/median reward 4.7/3.5, min/max reward -2.0/14.0

Updates 264, num timesteps 15900, FPS 148 
 Last 10 training episodes: mean/me

 Evaluation using 10 episodes: mean reward -2.60000

Updates 321, num timesteps 19320, FPS 131 
 Last 10 training episodes: mean/median reward 6.9/5.5, min/max reward 1.0/16.0

Updates 322, num timesteps 19380, FPS 131 
 Last 10 training episodes: mean/median reward 6.9/5.5, min/max reward 1.0/16.0

Updates 323, num timesteps 19440, FPS 132 
 Last 10 training episodes: mean/median reward 6.4/5.5, min/max reward -3.0/16.0

Updates 324, num timesteps 19500, FPS 132 
 Last 10 training episodes: mean/median reward 5.3/2.0, min/max reward -3.0/16.0

Updates 325, num timesteps 19560, FPS 132 
 Last 10 training episodes: mean/median reward 5.3/2.0, min/max reward -3.0/16.0

Updates 326, num timesteps 19620, FPS 132 
 Last 10 training episodes: mean/median reward 4.2/1.0, min/max reward -3.0/16.0

Updates 327, num timesteps 19680, FPS 133 
 Last 10 training episodes: mean/median reward 4.4/2.0, min/max reward -3.0/16.0

Updates 328, num timesteps 19740, FPS 133 
 Last 10 training episodes: mea

Updates 384, num timesteps 23100, FPS 118 
 Last 10 training episodes: mean/median reward 5.1/4.0, min/max reward 1.0/15.0

Updates 385, num timesteps 23160, FPS 119 
 Last 10 training episodes: mean/median reward 5.1/4.0, min/max reward 1.0/15.0

Updates 386, num timesteps 23220, FPS 119 
 Last 10 training episodes: mean/median reward 5.2/4.0, min/max reward 2.0/15.0

Updates 387, num timesteps 23280, FPS 119 
 Last 10 training episodes: mean/median reward 5.5/4.0, min/max reward 2.0/15.0

Updates 388, num timesteps 23340, FPS 119 
 Last 10 training episodes: mean/median reward 4.4/4.0, min/max reward 0.0/15.0

Updates 389, num timesteps 23400, FPS 120 
 Last 10 training episodes: mean/median reward 5.1/4.0, min/max reward 0.0/15.0

Updates 390, num timesteps 23460, FPS 120 
 Last 10 training episodes: mean/median reward 5.1/4.0, min/max reward 0.0/15.0

 Evaluation using 10 episodes: mean reward 4.60000

Updates 391, num timesteps 23520, FPS 116 
 Last 10 training episodes: mean/medi

Updates 448, num timesteps 26940, FPS 114 
 Last 10 training episodes: mean/median reward 5.6/4.5, min/max reward -1.0/16.0

Updates 449, num timesteps 27000, FPS 114 
 Last 10 training episodes: mean/median reward 5.6/4.5, min/max reward -1.0/16.0

Updates 450, num timesteps 27060, FPS 114 
 Last 10 training episodes: mean/median reward 5.6/4.5, min/max reward -1.0/16.0

 Evaluation using 10 episodes: mean reward 6.30000

Updates 451, num timesteps 27120, FPS 111 
 Last 10 training episodes: mean/median reward 7.9/5.0, min/max reward -1.0/26.0

Updates 452, num timesteps 27180, FPS 111 
 Last 10 training episodes: mean/median reward 8.0/5.0, min/max reward -1.0/26.0

Updates 453, num timesteps 27240, FPS 111 
 Last 10 training episodes: mean/median reward 8.0/5.0, min/max reward -1.0/26.0

Updates 454, num timesteps 27300, FPS 111 
 Last 10 training episodes: mean/median reward 8.0/5.0, min/max reward -1.0/26.0

Updates 455, num timesteps 27360, FPS 111 
 Last 10 training episodes: me

Updates 513, num timesteps 30840, FPS 103 
 Last 10 training episodes: mean/median reward 8.0/6.0, min/max reward 1.0/25.0

Updates 514, num timesteps 30900, FPS 103 
 Last 10 training episodes: mean/median reward 6.7/6.0, min/max reward 1.0/12.0

Updates 515, num timesteps 30960, FPS 104 
 Last 10 training episodes: mean/median reward 7.2/6.0, min/max reward 2.0/12.0

Updates 516, num timesteps 31020, FPS 104 
 Last 10 training episodes: mean/median reward 7.2/6.0, min/max reward 2.0/12.0

Updates 517, num timesteps 31080, FPS 104 
 Last 10 training episodes: mean/median reward 7.7/6.5, min/max reward 3.0/12.0

Updates 518, num timesteps 31140, FPS 104 
 Last 10 training episodes: mean/median reward 7.6/6.5, min/max reward 2.0/12.0

Updates 519, num timesteps 31200, FPS 104 
 Last 10 training episodes: mean/median reward 7.6/6.5, min/max reward 2.0/12.0

Updates 520, num timesteps 31260, FPS 104 
 Last 10 training episodes: mean/median reward 7.6/6.5, min/max reward 2.0/12.0

 Evaluat

Updates 578, num timesteps 34740, FPS 103 
 Last 10 training episodes: mean/median reward 6.3/5.0, min/max reward 1.0/22.0

Updates 579, num timesteps 34800, FPS 103 
 Last 10 training episodes: mean/median reward 6.3/5.0, min/max reward 1.0/22.0

Updates 580, num timesteps 34860, FPS 103 
 Last 10 training episodes: mean/median reward 6.3/5.0, min/max reward 1.0/22.0

 Evaluation using 10 episodes: mean reward 7.70000

Updates 581, num timesteps 34920, FPS 101 
 Last 10 training episodes: mean/median reward 5.3/5.0, min/max reward 1.0/12.0

Updates 582, num timesteps 34980, FPS 102 
 Last 10 training episodes: mean/median reward 5.4/5.0, min/max reward 1.0/12.0

Updates 583, num timesteps 35040, FPS 102 
 Last 10 training episodes: mean/median reward 5.4/5.0, min/max reward 1.0/12.0

Updates 584, num timesteps 35100, FPS 102 
 Last 10 training episodes: mean/median reward 5.4/5.0, min/max reward 1.0/12.0

Updates 585, num timesteps 35160, FPS 102 
 Last 10 training episodes: mean/medi

Updates 643, num timesteps 38640, FPS 99 
 Last 10 training episodes: mean/median reward 5.8/5.0, min/max reward 1.0/13.0

Updates 644, num timesteps 38700, FPS 99 
 Last 10 training episodes: mean/median reward 10.0/5.0, min/max reward 2.0/50.0

Updates 645, num timesteps 38760, FPS 99 
 Last 10 training episodes: mean/median reward 10.0/5.0, min/max reward 2.0/50.0

Updates 646, num timesteps 38820, FPS 99 
 Last 10 training episodes: mean/median reward 10.0/5.0, min/max reward 2.0/50.0

Updates 647, num timesteps 38880, FPS 99 
 Last 10 training episodes: mean/median reward 9.9/5.0, min/max reward 1.0/50.0

Updates 648, num timesteps 38940, FPS 99 
 Last 10 training episodes: mean/median reward 10.6/6.5, min/max reward 1.0/50.0

Updates 649, num timesteps 39000, FPS 100 
 Last 10 training episodes: mean/median reward 10.6/6.5, min/max reward 1.0/50.0

Updates 650, num timesteps 39060, FPS 100 
 Last 10 training episodes: mean/median reward 10.6/6.5, min/max reward 1.0/50.0

 Evaluat

Updates 708, num timesteps 42540, FPS 100 
 Last 10 training episodes: mean/median reward 7.0/5.5, min/max reward -4.0/24.0

Updates 709, num timesteps 42600, FPS 100 
 Last 10 training episodes: mean/median reward 6.3/5.5, min/max reward -4.0/24.0

Updates 710, num timesteps 42660, FPS 101 
 Last 10 training episodes: mean/median reward 6.3/5.5, min/max reward -4.0/24.0

 Evaluation using 10 episodes: mean reward 8.40000

Updates 711, num timesteps 42720, FPS 100 
 Last 10 training episodes: mean/median reward 4.0/3.5, min/max reward -4.0/15.0

Updates 712, num timesteps 42780, FPS 100 
 Last 10 training episodes: mean/median reward 4.0/3.5, min/max reward -4.0/15.0

Updates 713, num timesteps 42840, FPS 100 
 Last 10 training episodes: mean/median reward 5.6/5.5, min/max reward 0.0/15.0

Updates 714, num timesteps 42900, FPS 100 
 Last 10 training episodes: mean/median reward 5.6/5.5, min/max reward 0.0/15.0

Updates 715, num timesteps 42960, FPS 100 
 Last 10 training episodes: mean

Updates 774, num timesteps 46500, FPS 102 
 Last 10 training episodes: mean/median reward 7.5/3.5, min/max reward -3.0/28.0

Updates 775, num timesteps 46560, FPS 103 
 Last 10 training episodes: mean/median reward 5.8/3.5, min/max reward -3.0/16.0

Updates 776, num timesteps 46620, FPS 103 
 Last 10 training episodes: mean/median reward 5.8/3.5, min/max reward -3.0/16.0

Updates 777, num timesteps 46680, FPS 103 
 Last 10 training episodes: mean/median reward 5.8/3.5, min/max reward -3.0/16.0

Updates 778, num timesteps 46740, FPS 103 
 Last 10 training episodes: mean/median reward 6.2/3.5, min/max reward -3.0/16.0

Updates 779, num timesteps 46800, FPS 103 
 Last 10 training episodes: mean/median reward 6.7/2.5, min/max reward -3.0/20.0

Updates 780, num timesteps 46860, FPS 103 
 Last 10 training episodes: mean/median reward 6.7/2.5, min/max reward -3.0/20.0

 Evaluation using 10 episodes: mean reward 4.30000

Updates 781, num timesteps 46920, FPS 103 
 Last 10 training episodes: me

Updates 837, num timesteps 50280, FPS 105 
 Last 10 training episodes: mean/median reward 8.6/5.0, min/max reward 3.0/21.0

Updates 838, num timesteps 50340, FPS 105 
 Last 10 training episodes: mean/median reward 8.6/5.0, min/max reward 3.0/21.0

Updates 839, num timesteps 50400, FPS 105 
 Last 10 training episodes: mean/median reward 7.7/5.0, min/max reward 3.0/21.0

Updates 840, num timesteps 50460, FPS 105 
 Last 10 training episodes: mean/median reward 7.7/5.0, min/max reward 3.0/21.0

 Evaluation using 10 episodes: mean reward 12.70000

Updates 841, num timesteps 50520, FPS 105 
 Last 10 training episodes: mean/median reward 7.0/4.5, min/max reward 3.0/21.0

Updates 842, num timesteps 50580, FPS 105 
 Last 10 training episodes: mean/median reward 7.0/4.5, min/max reward 3.0/21.0

Updates 843, num timesteps 50640, FPS 105 
 Last 10 training episodes: mean/median reward 7.0/4.5, min/max reward 3.0/21.0

Updates 844, num timesteps 50700, FPS 105 
 Last 10 training episodes: mean/med

 Evaluation using 10 episodes: mean reward 11.00000

Updates 901, num timesteps 54120, FPS 106 
 Last 10 training episodes: mean/median reward 8.2/5.5, min/max reward 0.0/29.0

Updates 902, num timesteps 54180, FPS 106 
 Last 10 training episodes: mean/median reward 7.7/5.0, min/max reward 0.0/29.0

Updates 903, num timesteps 54240, FPS 107 
 Last 10 training episodes: mean/median reward 7.7/5.0, min/max reward 0.0/29.0

Updates 904, num timesteps 54300, FPS 107 
 Last 10 training episodes: mean/median reward 7.7/5.0, min/max reward 0.0/29.0

Updates 905, num timesteps 54360, FPS 107 
 Last 10 training episodes: mean/median reward 10.6/5.5, min/max reward 0.0/34.0

Updates 906, num timesteps 54420, FPS 107 
 Last 10 training episodes: mean/median reward 16.0/5.5, min/max reward 1.0/62.0

Updates 907, num timesteps 54480, FPS 107 
 Last 10 training episodes: mean/median reward 16.0/5.5, min/max reward 1.0/62.0

Updates 908, num timesteps 54540, FPS 107 
 Last 10 training episodes: mean/

Updates 964, num timesteps 57900, FPS 108 
 Last 10 training episodes: mean/median reward 11.6/6.5, min/max reward 3.0/32.0

Updates 965, num timesteps 57960, FPS 108 
 Last 10 training episodes: mean/median reward 8.5/4.0, min/max reward 1.0/23.0

Updates 966, num timesteps 58020, FPS 109 
 Last 10 training episodes: mean/median reward 8.5/4.0, min/max reward 1.0/23.0

Updates 967, num timesteps 58080, FPS 109 
 Last 10 training episodes: mean/median reward 9.0/6.0, min/max reward 1.0/23.0

Updates 968, num timesteps 58140, FPS 109 
 Last 10 training episodes: mean/median reward 9.1/6.0, min/max reward 1.0/23.0

Updates 969, num timesteps 58200, FPS 109 
 Last 10 training episodes: mean/median reward 9.1/6.0, min/max reward 1.0/23.0

Updates 970, num timesteps 58260, FPS 109 
 Last 10 training episodes: mean/median reward 9.1/6.0, min/max reward 1.0/23.0

 Evaluation using 10 episodes: mean reward 6.00000

Updates 971, num timesteps 58320, FPS 108 
 Last 10 training episodes: mean/med

Updates 1028, num timesteps 61740, FPS 110 
 Last 10 training episodes: mean/median reward 12.7/6.5, min/max reward 0.0/50.0

Updates 1029, num timesteps 61800, FPS 110 
 Last 10 training episodes: mean/median reward 7.5/5.0, min/max reward -1.0/30.0

Updates 1030, num timesteps 61860, FPS 110 
 Last 10 training episodes: mean/median reward 7.5/5.0, min/max reward -1.0/30.0

 Evaluation using 10 episodes: mean reward 5.50000

Updates 1031, num timesteps 61920, FPS 110 
 Last 10 training episodes: mean/median reward 7.5/5.0, min/max reward -1.0/30.0

Updates 1032, num timesteps 61980, FPS 110 
 Last 10 training episodes: mean/median reward 7.5/5.0, min/max reward -1.0/30.0

Updates 1033, num timesteps 62040, FPS 110 
 Last 10 training episodes: mean/median reward 7.5/5.0, min/max reward -1.0/30.0

Updates 1034, num timesteps 62100, FPS 110 
 Last 10 training episodes: mean/median reward 7.5/5.0, min/max reward -1.0/30.0

Updates 1035, num timesteps 62160, FPS 110 
 Last 10 training epis

 Evaluation using 10 episodes: mean reward 5.00000

Updates 1091, num timesteps 65520, FPS 111 
 Last 10 training episodes: mean/median reward 4.8/3.5, min/max reward 1.0/12.0

Updates 1092, num timesteps 65580, FPS 111 
 Last 10 training episodes: mean/median reward 6.0/3.5, min/max reward 1.0/24.0

Updates 1093, num timesteps 65640, FPS 112 
 Last 10 training episodes: mean/median reward 6.2/3.5, min/max reward 1.0/24.0

Updates 1094, num timesteps 65700, FPS 112 
 Last 10 training episodes: mean/median reward 6.3/3.5, min/max reward 1.0/24.0

Updates 1095, num timesteps 65760, FPS 112 
 Last 10 training episodes: mean/median reward 6.3/3.5, min/max reward 1.0/24.0

Updates 1096, num timesteps 65820, FPS 112 
 Last 10 training episodes: mean/median reward 6.0/3.0, min/max reward 1.0/24.0

Updates 1097, num timesteps 65880, FPS 112 
 Last 10 training episodes: mean/median reward 5.4/3.0, min/max reward 0.0/24.0

Updates 1098, num timesteps 65940, FPS 112 
 Last 10 training episodes: m

Updates 1155, num timesteps 69360, FPS 113 
 Last 10 training episodes: mean/median reward 13.1/8.5, min/max reward 1.0/37.0

Updates 1156, num timesteps 69420, FPS 113 
 Last 10 training episodes: mean/median reward 13.3/8.5, min/max reward 1.0/37.0

Updates 1157, num timesteps 69480, FPS 113 
 Last 10 training episodes: mean/median reward 13.3/8.5, min/max reward 1.0/37.0

Updates 1158, num timesteps 69540, FPS 113 
 Last 10 training episodes: mean/median reward 13.3/8.5, min/max reward 1.0/37.0

Updates 1159, num timesteps 69600, FPS 113 
 Last 10 training episodes: mean/median reward 13.3/8.5, min/max reward 1.0/37.0

Updates 1160, num timesteps 69660, FPS 114 
 Last 10 training episodes: mean/median reward 12.9/8.5, min/max reward 1.0/37.0

 Evaluation using 10 episodes: mean reward 4.30000

Updates 1161, num timesteps 69720, FPS 113 
 Last 10 training episodes: mean/median reward 12.9/8.5, min/max reward 1.0/37.0

Updates 1162, num timesteps 69780, FPS 113 
 Last 10 training epis

Updates 1218, num timesteps 73140, FPS 114 
 Last 10 training episodes: mean/median reward 15.9/16.0, min/max reward 5.0/25.0

Updates 1219, num timesteps 73200, FPS 115 
 Last 10 training episodes: mean/median reward 15.9/16.0, min/max reward 5.0/25.0

Updates 1220, num timesteps 73260, FPS 115 
 Last 10 training episodes: mean/median reward 15.9/16.0, min/max reward 5.0/25.0

 Evaluation using 10 episodes: mean reward 7.20000

Updates 1221, num timesteps 73320, FPS 114 
 Last 10 training episodes: mean/median reward 16.8/17.0, min/max reward 5.0/25.0

Updates 1222, num timesteps 73380, FPS 114 
 Last 10 training episodes: mean/median reward 16.8/17.0, min/max reward 5.0/25.0

Updates 1223, num timesteps 73440, FPS 114 
 Last 10 training episodes: mean/median reward 16.8/17.0, min/max reward 5.0/25.0

Updates 1224, num timesteps 73500, FPS 114 
 Last 10 training episodes: mean/median reward 16.8/17.0, min/max reward 5.0/25.0

Updates 1225, num timesteps 73560, FPS 114 
 Last 10 traini

 Evaluation using 10 episodes: mean reward 5.00000

Updates 1281, num timesteps 76920, FPS 112 
 Last 10 training episodes: mean/median reward 10.6/7.5, min/max reward 0.0/32.0

Updates 1282, num timesteps 76980, FPS 112 
 Last 10 training episodes: mean/median reward 10.9/7.5, min/max reward 0.0/32.0

Updates 1283, num timesteps 77040, FPS 112 
 Last 10 training episodes: mean/median reward 10.9/7.5, min/max reward 0.0/32.0

Updates 1284, num timesteps 77100, FPS 112 
 Last 10 training episodes: mean/median reward 10.9/7.5, min/max reward 0.0/32.0

Updates 1285, num timesteps 77160, FPS 112 
 Last 10 training episodes: mean/median reward 10.9/7.5, min/max reward 0.0/32.0

Updates 1286, num timesteps 77220, FPS 112 
 Last 10 training episodes: mean/median reward 10.7/6.0, min/max reward 0.0/32.0

Updates 1287, num timesteps 77280, FPS 112 
 Last 10 training episodes: mean/median reward 10.9/7.0, min/max reward 0.0/32.0

Updates 1288, num timesteps 77340, FPS 112 
 Last 10 training epis

Updates 1344, num timesteps 80700, FPS 108 
 Last 10 training episodes: mean/median reward 11.0/9.0, min/max reward 3.0/24.0

Updates 1345, num timesteps 80760, FPS 108 
 Last 10 training episodes: mean/median reward 11.0/9.0, min/max reward 3.0/24.0

Updates 1346, num timesteps 80820, FPS 108 
 Last 10 training episodes: mean/median reward 11.0/9.0, min/max reward 3.0/24.0

Updates 1347, num timesteps 80880, FPS 109 
 Last 10 training episodes: mean/median reward 12.4/9.0, min/max reward 3.0/27.0

Updates 1348, num timesteps 80940, FPS 109 
 Last 10 training episodes: mean/median reward 12.4/9.0, min/max reward 3.0/27.0

Updates 1349, num timesteps 81000, FPS 109 
 Last 10 training episodes: mean/median reward 17.8/11.0, min/max reward 3.0/64.0

Updates 1350, num timesteps 81060, FPS 109 
 Last 10 training episodes: mean/median reward 18.6/15.0, min/max reward 3.0/64.0

 Evaluation using 10 episodes: mean reward 11.10000

Updates 1351, num timesteps 81120, FPS 108 
 Last 10 training e

Updates 1408, num timesteps 84540, FPS 106 
 Last 10 training episodes: mean/median reward 12.7/8.5, min/max reward 2.0/40.0

Updates 1409, num timesteps 84600, FPS 106 
 Last 10 training episodes: mean/median reward 12.6/8.5, min/max reward 2.0/40.0

Updates 1410, num timesteps 84660, FPS 106 
 Last 10 training episodes: mean/median reward 12.6/8.5, min/max reward 2.0/40.0

 Evaluation using 10 episodes: mean reward 13.00000

Updates 1411, num timesteps 84720, FPS 105 
 Last 10 training episodes: mean/median reward 12.6/8.5, min/max reward 2.0/40.0

Updates 1412, num timesteps 84780, FPS 105 
 Last 10 training episodes: mean/median reward 16.4/17.0, min/max reward 2.0/40.0

Updates 1413, num timesteps 84840, FPS 105 
 Last 10 training episodes: mean/median reward 16.4/17.0, min/max reward 2.0/40.0

Updates 1414, num timesteps 84900, FPS 105 
 Last 10 training episodes: mean/median reward 16.4/17.0, min/max reward 2.0/40.0

Updates 1415, num timesteps 84960, FPS 105 
 Last 10 training 

 Evaluation using 10 episodes: mean reward 5.70000

Updates 1471, num timesteps 88320, FPS 103 
 Last 10 training episodes: mean/median reward 11.2/5.0, min/max reward 1.0/42.0

Updates 1472, num timesteps 88380, FPS 103 
 Last 10 training episodes: mean/median reward 9.1/5.0, min/max reward 1.0/28.0

Updates 1473, num timesteps 88440, FPS 103 
 Last 10 training episodes: mean/median reward 12.1/8.0, min/max reward 1.0/35.0

Updates 1474, num timesteps 88500, FPS 103 
 Last 10 training episodes: mean/median reward 12.1/8.0, min/max reward 1.0/35.0

Updates 1475, num timesteps 88560, FPS 103 
 Last 10 training episodes: mean/median reward 12.1/8.0, min/max reward 1.0/35.0

Updates 1476, num timesteps 88620, FPS 103 
 Last 10 training episodes: mean/median reward 12.1/8.0, min/max reward 1.0/35.0

Updates 1477, num timesteps 88680, FPS 103 
 Last 10 training episodes: mean/median reward 16.1/12.5, min/max reward 1.0/37.0

Updates 1478, num timesteps 88740, FPS 103 
 Last 10 training epis

Updates 1534, num timesteps 92100, FPS 103 
 Last 10 training episodes: mean/median reward 20.9/19.5, min/max reward 1.0/45.0

Updates 1535, num timesteps 92160, FPS 103 
 Last 10 training episodes: mean/median reward 19.7/13.5, min/max reward 1.0/45.0

Updates 1536, num timesteps 92220, FPS 103 
 Last 10 training episodes: mean/median reward 19.7/13.5, min/max reward 1.0/45.0

Updates 1537, num timesteps 92280, FPS 104 
 Last 10 training episodes: mean/median reward 19.7/13.5, min/max reward 1.0/45.0

Updates 1538, num timesteps 92340, FPS 104 
 Last 10 training episodes: mean/median reward 19.7/13.5, min/max reward 1.0/45.0

Updates 1539, num timesteps 92400, FPS 104 
 Last 10 training episodes: mean/median reward 16.2/12.5, min/max reward 1.0/45.0

Updates 1540, num timesteps 92460, FPS 104 
 Last 10 training episodes: mean/median reward 16.2/12.5, min/max reward 1.0/45.0

 Evaluation using 10 episodes: mean reward 11.10000

Updates 1541, num timesteps 92520, FPS 103 
 Last 10 train

Updates 1597, num timesteps 95880, FPS 104 
 Last 10 training episodes: mean/median reward 12.6/11.0, min/max reward 2.0/25.0

Updates 1598, num timesteps 95940, FPS 104 
 Last 10 training episodes: mean/median reward 12.6/11.0, min/max reward 2.0/25.0

Updates 1599, num timesteps 96000, FPS 104 
 Last 10 training episodes: mean/median reward 13.2/12.5, min/max reward 2.0/25.0

Updates 1600, num timesteps 96060, FPS 105 
 Last 10 training episodes: mean/median reward 13.2/12.5, min/max reward 2.0/25.0

 Evaluation using 10 episodes: mean reward 18.70000

Updates 1601, num timesteps 96120, FPS 104 
 Last 10 training episodes: mean/median reward 13.2/12.5, min/max reward 2.0/25.0

Updates 1602, num timesteps 96180, FPS 104 
 Last 10 training episodes: mean/median reward 13.2/12.5, min/max reward 2.0/25.0

Updates 1603, num timesteps 96240, FPS 104 
 Last 10 training episodes: mean/median reward 13.2/12.5, min/max reward 2.0/25.0

Updates 1604, num timesteps 96300, FPS 104 
 Last 10 train

 Evaluation using 10 episodes: mean reward 10.20000

Updates 1661, num timesteps 99720, FPS 105 
 Last 10 training episodes: mean/median reward 17.0/11.0, min/max reward 2.0/45.0

Updates 1662, num timesteps 99780, FPS 105 
 Last 10 training episodes: mean/median reward 17.0/11.0, min/max reward 2.0/45.0

Updates 1663, num timesteps 99840, FPS 105 
 Last 10 training episodes: mean/median reward 17.0/11.0, min/max reward 2.0/45.0

Updates 1664, num timesteps 99900, FPS 105 
 Last 10 training episodes: mean/median reward 17.1/11.0, min/max reward 2.0/45.0

Updates 1665, num timesteps 99960, FPS 105 
 Last 10 training episodes: mean/median reward 17.1/11.0, min/max reward 2.0/45.0

Updates 1666, num timesteps 100020, FPS 105 
 Last 10 training episodes: mean/median reward 17.1/11.0, min/max reward 2.0/45.0

Updates 1667, num timesteps 100080, FPS 105 
 Last 10 training episodes: mean/median reward 17.1/11.0, min/max reward 2.0/45.0

Updates 1668, num timesteps 100140, FPS 105 
 Last 10 tr

Updates 1723, num timesteps 103440, FPS 105 
 Last 10 training episodes: mean/median reward 14.5/11.0, min/max reward 1.0/49.0

Updates 1724, num timesteps 103500, FPS 105 
 Last 10 training episodes: mean/median reward 14.5/11.0, min/max reward 1.0/49.0

Updates 1725, num timesteps 103560, FPS 105 
 Last 10 training episodes: mean/median reward 14.5/11.0, min/max reward 1.0/49.0

Updates 1726, num timesteps 103620, FPS 105 
 Last 10 training episodes: mean/median reward 14.5/11.0, min/max reward 1.0/49.0

Updates 1727, num timesteps 103680, FPS 105 
 Last 10 training episodes: mean/median reward 14.5/11.0, min/max reward 1.0/49.0

Updates 1728, num timesteps 103740, FPS 105 
 Last 10 training episodes: mean/median reward 13.7/8.0, min/max reward 1.0/49.0

Updates 1729, num timesteps 103800, FPS 105 
 Last 10 training episodes: mean/median reward 12.6/8.0, min/max reward 1.0/49.0

Updates 1730, num timesteps 103860, FPS 105 
 Last 10 training episodes: mean/median reward 14.3/11.0, min

Updates 1788, num timesteps 107340, FPS 106 
 Last 10 training episodes: mean/median reward 25.1/17.0, min/max reward 2.0/83.0

Updates 1789, num timesteps 107400, FPS 106 
 Last 10 training episodes: mean/median reward 25.1/17.0, min/max reward 2.0/83.0

Updates 1790, num timesteps 107460, FPS 106 
 Last 10 training episodes: mean/median reward 29.8/17.0, min/max reward 2.0/130.0

 Evaluation using 10 episodes: mean reward 26.10000

Updates 1791, num timesteps 107520, FPS 106 
 Last 10 training episodes: mean/median reward 29.8/17.0, min/max reward 2.0/130.0

Updates 1792, num timesteps 107580, FPS 106 
 Last 10 training episodes: mean/median reward 29.8/17.0, min/max reward 2.0/130.0

Updates 1793, num timesteps 107640, FPS 106 
 Last 10 training episodes: mean/median reward 30.1/17.0, min/max reward 5.0/130.0

Updates 1794, num timesteps 107700, FPS 106 
 Last 10 training episodes: mean/median reward 30.1/17.0, min/max reward 5.0/130.0

Updates 1795, num timesteps 107760, FPS 106 
 

 Evaluation using 10 episodes: mean reward 10.80000

Updates 1851, num timesteps 111120, FPS 106 
 Last 10 training episodes: mean/median reward 18.3/9.5, min/max reward 1.0/61.0

Updates 1852, num timesteps 111180, FPS 106 
 Last 10 training episodes: mean/median reward 18.3/9.5, min/max reward 1.0/61.0

Updates 1853, num timesteps 111240, FPS 106 
 Last 10 training episodes: mean/median reward 18.3/9.5, min/max reward 1.0/61.0

Updates 1854, num timesteps 111300, FPS 106 
 Last 10 training episodes: mean/median reward 18.3/9.5, min/max reward 1.0/61.0

Updates 1855, num timesteps 111360, FPS 106 
 Last 10 training episodes: mean/median reward 18.3/9.5, min/max reward 1.0/61.0

Updates 1856, num timesteps 111420, FPS 106 
 Last 10 training episodes: mean/median reward 18.3/9.5, min/max reward 1.0/61.0

Updates 1857, num timesteps 111480, FPS 107 
 Last 10 training episodes: mean/median reward 22.4/11.5, min/max reward 1.0/61.0

Updates 1858, num timesteps 111540, FPS 107 
 Last 10 tra

Updates 1914, num timesteps 114900, FPS 107 
 Last 10 training episodes: mean/median reward 17.3/9.0, min/max reward -1.0/84.0

Updates 1915, num timesteps 114960, FPS 107 
 Last 10 training episodes: mean/median reward 21.0/14.0, min/max reward -1.0/84.0

Updates 1916, num timesteps 115020, FPS 107 
 Last 10 training episodes: mean/median reward 21.0/14.0, min/max reward -1.0/84.0

Updates 1917, num timesteps 115080, FPS 107 
 Last 10 training episodes: mean/median reward 21.0/14.0, min/max reward -1.0/84.0

Updates 1918, num timesteps 115140, FPS 107 
 Last 10 training episodes: mean/median reward 21.0/14.0, min/max reward -1.0/84.0

Updates 1919, num timesteps 115200, FPS 107 
 Last 10 training episodes: mean/median reward 19.4/9.0, min/max reward -1.0/84.0

Updates 1920, num timesteps 115260, FPS 107 
 Last 10 training episodes: mean/median reward 19.4/9.0, min/max reward -1.0/84.0

 Evaluation using 10 episodes: mean reward 14.90000

Updates 1921, num timesteps 115320, FPS 107 
 L

Updates 1978, num timesteps 118740, FPS 108 
 Last 10 training episodes: mean/median reward 16.2/12.0, min/max reward 1.0/44.0

Updates 1979, num timesteps 118800, FPS 108 
 Last 10 training episodes: mean/median reward 13.3/11.0, min/max reward 1.0/44.0

Updates 1980, num timesteps 118860, FPS 108 
 Last 10 training episodes: mean/median reward 13.3/11.0, min/max reward 1.0/44.0

 Evaluation using 10 episodes: mean reward 12.60000

Updates 1981, num timesteps 118920, FPS 107 
 Last 10 training episodes: mean/median reward 13.3/11.0, min/max reward 1.0/44.0

Updates 1982, num timesteps 118980, FPS 107 
 Last 10 training episodes: mean/median reward 13.3/11.0, min/max reward 1.0/44.0

Updates 1983, num timesteps 119040, FPS 107 
 Last 10 training episodes: mean/median reward 13.3/11.0, min/max reward 1.0/44.0

Updates 1984, num timesteps 119100, FPS 107 
 Last 10 training episodes: mean/median reward 14.4/11.0, min/max reward 1.0/44.0

Updates 1985, num timesteps 119160, FPS 107 
 Last 

 Evaluation using 10 episodes: mean reward 8.10000

Updates 2041, num timesteps 122520, FPS 108 
 Last 10 training episodes: mean/median reward 16.7/14.0, min/max reward 6.0/28.0

Updates 2042, num timesteps 122580, FPS 108 
 Last 10 training episodes: mean/median reward 19.1/17.0, min/max reward 6.0/38.0

Updates 2043, num timesteps 122640, FPS 108 
 Last 10 training episodes: mean/median reward 19.1/17.0, min/max reward 6.0/38.0

Updates 2044, num timesteps 122700, FPS 108 
 Last 10 training episodes: mean/median reward 19.1/17.0, min/max reward 6.0/38.0

Updates 2045, num timesteps 122760, FPS 108 
 Last 10 training episodes: mean/median reward 19.1/17.0, min/max reward 6.0/38.0

Updates 2046, num timesteps 122820, FPS 108 
 Last 10 training episodes: mean/median reward 20.2/20.5, min/max reward 6.0/38.0

Updates 2047, num timesteps 122880, FPS 108 
 Last 10 training episodes: mean/median reward 20.1/20.5, min/max reward 5.0/38.0

Updates 2048, num timesteps 122940, FPS 108 
 Last 1

Updates 2105, num timesteps 126360, FPS 108 
 Last 10 training episodes: mean/median reward 9.3/4.0, min/max reward 0.0/35.0

Updates 2106, num timesteps 126420, FPS 108 
 Last 10 training episodes: mean/median reward 9.7/5.5, min/max reward 0.0/35.0

Updates 2107, num timesteps 126480, FPS 108 
 Last 10 training episodes: mean/median reward 12.3/6.0, min/max reward 0.0/35.0

Updates 2108, num timesteps 126540, FPS 108 
 Last 10 training episodes: mean/median reward 12.3/6.0, min/max reward 0.0/35.0

Updates 2109, num timesteps 126600, FPS 108 
 Last 10 training episodes: mean/median reward 12.3/6.0, min/max reward 0.0/35.0

Updates 2110, num timesteps 126660, FPS 108 
 Last 10 training episodes: mean/median reward 12.3/6.0, min/max reward 0.0/35.0

 Evaluation using 10 episodes: mean reward 15.10000

Updates 2111, num timesteps 126720, FPS 108 
 Last 10 training episodes: mean/median reward 12.3/6.0, min/max reward 0.0/35.0

Updates 2112, num timesteps 126780, FPS 108 
 Last 10 traini

Updates 2168, num timesteps 130140, FPS 109 
 Last 10 training episodes: mean/median reward 6.0/4.5, min/max reward -1.0/12.0

Updates 2169, num timesteps 130200, FPS 109 
 Last 10 training episodes: mean/median reward 6.0/4.5, min/max reward -1.0/12.0

Updates 2170, num timesteps 130260, FPS 109 
 Last 10 training episodes: mean/median reward 9.2/7.0, min/max reward -1.0/35.0

 Evaluation using 10 episodes: mean reward 8.90000

Updates 2171, num timesteps 130320, FPS 109 
 Last 10 training episodes: mean/median reward 9.2/7.0, min/max reward -1.0/35.0

Updates 2172, num timesteps 130380, FPS 109 
 Last 10 training episodes: mean/median reward 9.2/7.0, min/max reward -1.0/35.0

Updates 2173, num timesteps 130440, FPS 109 
 Last 10 training episodes: mean/median reward 9.2/7.0, min/max reward -1.0/35.0

Updates 2174, num timesteps 130500, FPS 109 
 Last 10 training episodes: mean/median reward 9.2/7.0, min/max reward -1.0/35.0

Updates 2175, num timesteps 130560, FPS 109 
 Last 10 train

 Evaluation using 10 episodes: mean reward 15.60000

Updates 2231, num timesteps 133920, FPS 109 
 Last 10 training episodes: mean/median reward 10.7/6.0, min/max reward 2.0/37.0

Updates 2232, num timesteps 133980, FPS 109 
 Last 10 training episodes: mean/median reward 12.7/8.5, min/max reward 2.0/37.0

Updates 2233, num timesteps 134040, FPS 109 
 Last 10 training episodes: mean/median reward 12.7/8.5, min/max reward 2.0/37.0

Updates 2234, num timesteps 134100, FPS 109 
 Last 10 training episodes: mean/median reward 12.7/8.5, min/max reward 2.0/37.0

Updates 2235, num timesteps 134160, FPS 109 
 Last 10 training episodes: mean/median reward 11.1/6.0, min/max reward 2.0/27.0

Updates 2236, num timesteps 134220, FPS 109 
 Last 10 training episodes: mean/median reward 11.1/6.0, min/max reward 2.0/27.0

Updates 2237, num timesteps 134280, FPS 109 
 Last 10 training episodes: mean/median reward 11.1/6.0, min/max reward 2.0/27.0

Updates 2238, num timesteps 134340, FPS 109 
 Last 10 trai

Updates 2294, num timesteps 137700, FPS 109 
 Last 10 training episodes: mean/median reward 18.6/7.5, min/max reward 2.0/63.0

Updates 2295, num timesteps 137760, FPS 109 
 Last 10 training episodes: mean/median reward 19.5/11.5, min/max reward 2.0/63.0

Updates 2296, num timesteps 137820, FPS 109 
 Last 10 training episodes: mean/median reward 19.5/11.5, min/max reward 2.0/63.0

Updates 2297, num timesteps 137880, FPS 109 
 Last 10 training episodes: mean/median reward 19.5/11.5, min/max reward 2.0/63.0

Updates 2298, num timesteps 137940, FPS 109 
 Last 10 training episodes: mean/median reward 20.5/14.0, min/max reward 2.0/63.0

Updates 2299, num timesteps 138000, FPS 109 
 Last 10 training episodes: mean/median reward 23.9/18.5, min/max reward 4.0/63.0

Updates 2300, num timesteps 138060, FPS 109 
 Last 10 training episodes: mean/median reward 22.0/14.0, min/max reward 4.0/63.0

 Evaluation using 10 episodes: mean reward 14.80000

Updates 2301, num timesteps 138120, FPS 109 
 Last 1

Updates 2356, num timesteps 141420, FPS 108 
 Last 10 training episodes: mean/median reward 13.1/9.5, min/max reward -2.0/44.0

Updates 2357, num timesteps 141480, FPS 108 
 Last 10 training episodes: mean/median reward 13.1/9.5, min/max reward -2.0/44.0

Updates 2358, num timesteps 141540, FPS 108 
 Last 10 training episodes: mean/median reward 13.1/9.5, min/max reward -2.0/44.0

Updates 2359, num timesteps 141600, FPS 108 
 Last 10 training episodes: mean/median reward 13.1/9.5, min/max reward -2.0/44.0

Updates 2360, num timesteps 141660, FPS 108 
 Last 10 training episodes: mean/median reward 15.0/13.0, min/max reward -2.0/44.0

 Evaluation using 10 episodes: mean reward 10.30000

Updates 2361, num timesteps 141720, FPS 107 
 Last 10 training episodes: mean/median reward 15.0/13.0, min/max reward -2.0/44.0

Updates 2362, num timesteps 141780, FPS 107 
 Last 10 training episodes: mean/median reward 15.0/13.0, min/max reward -2.0/44.0

Updates 2363, num timesteps 141840, FPS 107 
 La

Updates 2418, num timesteps 145140, FPS 105 
 Last 10 training episodes: mean/median reward 27.5/21.5, min/max reward 3.0/61.0

Updates 2419, num timesteps 145200, FPS 105 
 Last 10 training episodes: mean/median reward 27.5/21.5, min/max reward 3.0/61.0

Updates 2420, num timesteps 145260, FPS 105 
 Last 10 training episodes: mean/median reward 26.9/18.5, min/max reward 3.0/61.0

 Evaluation using 10 episodes: mean reward 12.30000

Updates 2421, num timesteps 145320, FPS 105 
 Last 10 training episodes: mean/median reward 26.9/18.5, min/max reward 3.0/61.0

Updates 2422, num timesteps 145380, FPS 105 
 Last 10 training episodes: mean/median reward 26.9/18.5, min/max reward 3.0/61.0

Updates 2423, num timesteps 145440, FPS 105 
 Last 10 training episodes: mean/median reward 26.9/18.5, min/max reward 3.0/61.0

Updates 2424, num timesteps 145500, FPS 105 
 Last 10 training episodes: mean/median reward 21.1/14.0, min/max reward 3.0/56.0

Updates 2425, num timesteps 145560, FPS 105 
 Last 

Updates 2480, num timesteps 148860, FPS 103 
 Last 10 training episodes: mean/median reward 8.6/8.0, min/max reward 1.0/22.0

 Evaluation using 10 episodes: mean reward 22.80000

Updates 2481, num timesteps 148920, FPS 102 
 Last 10 training episodes: mean/median reward 8.6/8.0, min/max reward 1.0/22.0

Updates 2482, num timesteps 148980, FPS 102 
 Last 10 training episodes: mean/median reward 10.5/12.0, min/max reward 1.0/22.0

Updates 2483, num timesteps 149040, FPS 103 
 Last 10 training episodes: mean/median reward 10.5/12.0, min/max reward 1.0/22.0

Updates 2484, num timesteps 149100, FPS 103 
 Last 10 training episodes: mean/median reward 10.5/12.0, min/max reward 1.0/22.0

Updates 2485, num timesteps 149160, FPS 103 
 Last 10 training episodes: mean/median reward 10.5/12.0, min/max reward 1.0/22.0

Updates 2486, num timesteps 149220, FPS 103 
 Last 10 training episodes: mean/median reward 9.4/8.0, min/max reward 1.0/22.0

Updates 2487, num timesteps 149280, FPS 103 
 Last 10 tra

KeyboardInterrupt: 

In [None]:
# Inspect the current value of the loop counter `j`
# (presumably left over from the interrupted loop in the cell above — verify).
j

In [None]:
# Saliency visualisation.
#
# Rolls the current policy forward in freshly created environments and, after
# every environment step, shows for environment 0:
#   * left:  the observation, with its channel axis split into 4 tiles laid
#            side by side (presumably 4 stacked frames — verify against the
#            environment wrappers),
#   * right: |d value / d observation| summed over the channel axis.
#
# Relies on `args`, `device`, `rollouts` and `actor_critic` from earlier cells.
envs = make_rl_envs(args.env_name, args.num_processes, args.seed,  device)

obs = envs.reset()
rollouts.obs[0].copy_(obs)
rollouts.to(device)

episode_rewards = deque(maxlen=10)

training_stats = {
    'updates': [],
    'num_timesteps': [],
    'mean_reward': [],
    'median_reward': [],
    'min_reward': [],
    'max_reward': []
}

# Policy inputs are carried in local variables instead of being read back from
# `rollouts` by step index, so they always describe the environments' current
# state — including at the first step of every outer iteration.
policy_obs = rollouts.obs[0]
policy_hidden_states = rollouts.recurrent_hidden_states[0]
policy_masks = rollouts.masks[0]

start = time.time()
for j in range(50):
    for step in range(args.num_steps):
        # Sample actions
        with torch.no_grad():
            value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    policy_obs,
                    policy_hidden_states,
                    policy_masks)

        # Observe reward and next obs
        obs, reward, done, infos = envs.step(action)

        for info in infos:
            if 'episode' in info.keys():
                episode_rewards.append(info['episode']['r'])

        # If done then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                   for done_ in done])
        # Still recorded in the storage as before; the policy inputs no longer
        # depend on where the storage happens to write them.
        rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        # Inputs for the next call to `act`.
        policy_obs = obs
        policy_hidden_states = recurrent_hidden_states
        policy_masks = masks.to(device)

        # Gradient-enabled leaf copy of the new observation, kept on `device`
        # (no host round trip, no hard-coded CUDA).
        obs = obs.detach().clone().float().to(device).requires_grad_(True)
        next_value = actor_critic.get_value(obs,
                                            policy_hidden_states,
                                            policy_masks)
        # Back-propagate environment 0's value estimate down to the input pixels.
        next_value[0, ...].mean().backward()

        # (C, H, W) -> (H, W, C), then split the channel axis into 4 tiles
        # and place them next to each other along the width.
        frames_hwc = obs[0, ...].detach().cpu().numpy().transpose([1, 2, 0]).astype('uint8')
        im_display = np.concatenate(np.split(frames_hwc, 4, axis=2), axis=1)
        saliency = obs.grad[0, ...].sum(dim=0).abs().cpu().numpy()

        # Clear first: with wait=True the clear is applied when the next output
        # arrives, so anything printed before this call would be wiped.
        clear_output(wait=True)
        print("reward", reward[0])
        print("action", action[0])

        fig, (ax_obs, ax_saliency) = plt.subplots(1, 2, figsize=(10, 10))
        ax_obs.imshow(im_display, cmap="gray")
        ax_obs.set_title("observation (env 0)")
        ax_saliency.imshow(saliency, cmap="gray")
        ax_saliency.set_title("|d value / d observation|")

        plt.pause(0.05)
        # No-op if the backend already closed the figure when showing it.
        plt.close(fig)


In [None]:
# Inspect the current `obs` tensor (after the saliency cell above has run, this is
# the gradient-enabled copy of the most recent observation batch).
obs

In [None]:
# Render the first environment's observation as a single image:
# move the leading axis last, cut that axis into 4 equal tiles
# (presumably the stacked frames — verify) and lay them out side by side.
first_env_obs = obs[0, ...].detach().cpu().numpy()
first_env_hwc = first_env_obs.transpose([1, 2, 0]).astype('uint8')
channel_tiles = np.split(first_env_hwc, 4, axis=2)
im_display = np.concatenate(channel_tiles, axis=1)
plt.imshow(im_display)