# Simple q-learning agent with experience replay

We re-write q-learning algorithm using _agentnet_ - a helper for lasagne that implements some RL techniques.

In [1]:
# ! pip install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
%env THEANO_FLAGS='floatX=float32'

#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

env: THEANO_FLAGS='floatX=float32'


### Experiment setup
* Here we simply load the game and check that it works

In [3]:
import gym
make_env = lambda: gym.make("BipedalWalker-v2")

env=make_env()
env.reset()

state_shape = env.observation_space.shape
n_actions = env.action_space.n

ImportError: /home/anatoly/anaconda3/lib/python3.6/site-packages/Box2D/_Box2D.cpython-36m-x86_64-linux-gnu.so: undefined symbol: _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm

In [5]:
# plt.imshow(env.render("rgb_array"))
# del env

# Neural Network body

In [6]:
import lasagne
from lasagne.layers import *
from lasagne.nonlinearities import elu


#image observation at current tick goes here, shape = (sample_i,x,y,color)
observation_layer = InputLayer((None,)+state_shape)

nn = DenseLayer(observation_layer, 256, nonlinearity=elu)
nn = DenseLayer(nn, 64, nonlinearity=elu)

#a layer that predicts Qvalues
qvalues_layer = DenseLayer(nn, num_units=n_actions,
                           nonlinearity=None,name="q-values")

ERROR (theano.gpuarray): pygpu was configured but could not be imported or is too old (version 0.6 or higher required)
NoneType: None


Picking actions is done by yet another layer, that implements $ \epsilon$ -greedy policy

In [7]:
from agentnet.resolver import EpsilonGreedyResolver
action_layer = EpsilonGreedyResolver(qvalues_layer)

#set starting epsilon
action_layer.epsilon.set_value(np.float32(0.05))


### Agent

We define an agent entirely composed of a lasagne network:
* Observations as InputLayer(s)
* Actions as intermediate Layer(s)
* `policy_estimators` is "whatever else you want to keep track of"

Each parameter can be either one layer or a list of layers

In [8]:
from agentnet.agent import Agent
agent = Agent(observation_layers=observation_layer,
              action_layers=action_layer,
              policy_estimators=qvalues_layer,)


In [9]:
#Since it's a single lasagne network, one can get it's weights, output, etc
weights = lasagne.layers.get_all_params(action_layer,trainable=True)
weights

[W, b, W, b, q-values.W, q-values.b]

# Create and manage a pool of atari sessions to play with

* To make training more stable, we shall have an entire batch of game sessions each happening independent of others
* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf
* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

In [10]:
from agentnet.experiments.openai_gym.pool import EnvPool
pool = EnvPool(agent,make_env,n_games=10,max_size=10000)

In [11]:
%%time
#interact for 7 ticks
obs_log,action_log,reward_log,_,_,_  = pool.interact(5)


print('actions:',action_log)
print('rewards:',reward_log)

actions: [[1 1 1 1 0]
 [1 1 1 1 1]
 [2 1 1 1 0]
 [1 1 1 1 1]
 [1 1 1 1 1]
 [1 1 1 2 2]
 [1 1 1 1 0]
 [1 0 1 1 2]
 [1 1 1 1 0]
 [1 1 1 1 0]]
rewards: [[-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]
 [-1. -1. -1. -1.  0.]]
CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 16.9 ms


In [12]:
#we'll train on rollouts of 10 steps (required by n-step algorithms and rnns later)
SEQ_LENGTH=10

#load first sessions (this function calls interact and stores sessions in the pool)

for _ in range(100):
    pool.update(SEQ_LENGTH)

# q-learning

We shall now define a function that replays recent game sessions and updates network weights

In [13]:
#get agent's Qvalues obtained via experience replay
replay = pool.experience_replay.sample_session_batch(100)
qvalues_seq = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)[-1]



In [14]:
#loss for Qlearning = (Q(s,a) - (r+gamma*Q(s',a_max)))^2, like you implemented before in lasagne.

from agentnet.learning import qlearning
elwise_mse_loss = qlearning.get_elementwise_objective(qvalues_seq,
                                                      replay.actions[0],
                                                      replay.rewards,
                                                      replay.is_alive,
                                                      gamma_or_gammas=0.99,
                                                      n_steps=1,)

#compute mean loss over "alive" fragments
loss = elwise_mse_loss.sum() / replay.is_alive.sum()

In [15]:
#get weight updates
updates = lasagne.updates.adam(loss,weights,learning_rate=1e-4)

#compile train function
import theano
train_step = theano.function([],loss,updates=updates)

# Demo run

Play full session with an untrained agent

In [16]:
#for MountainCar-v0 evaluation session is cropped to 200 ticks
untrained_reward = pool.evaluate(save_path="./records",record_video=True)

Episode finished after 166 timesteps with reward=-165.0


In [18]:
#show video
from IPython.display import HTML
import os

video_names = list(filter(lambda s:s.endswith(".mp4"),os.listdir("./records/")))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./records/"+video_names[-1])) #this may or may not be _last_ video. Try other indices

# Training loop

In [19]:
epoch_counter = 1 #starting epoch
rewards = {} #full game rewards
target_score = -90

In [20]:
from tqdm import trange

for i in trange(10000):    
    
    #play
    for _ in range(5):
        pool.update(SEQ_LENGTH,append=True)
    
    #train
    train_step()
    
    #update epsilon
    epsilon = 0.05 + 0.95*np.exp(-epoch_counter/1000.)
    action_layer.epsilon.set_value(np.float32(epsilon))
    
    #play a few games for evaluation
    if epoch_counter%100==0:
        rewards[epoch_counter] = np.mean(pool.evaluate(n_games=3,record_video=False))
        print("iter=%i\tepsilon=%.3f"%(epoch_counter,action_layer.epsilon.get_value(),))
        print("Current score(mean over %i) = %.3f"%(3,np.mean(rewards[epoch_counter])))
    
        if rewards[epoch_counter] >= target_score:
            print("You win!")
            break

    
    epoch_counter  +=1

    
# Time to drink some coffee!

  1%|          | 99/10000 [00:25<42:56,  3.84it/s]

Episode finished after 500 timesteps with reward=-500.0
Episode finished after 500 timesteps with reward=-500.0


  1%|          | 100/10000 [00:27<45:32,  3.62it/s]

Episode finished after 500 timesteps with reward=-500.0
iter=100	epsilon=0.910
Current score(mean over 3) = -500.000


  2%|▏         | 199/10000 [00:53<44:12,  3.69it/s]

Episode finished after 500 timesteps with reward=-500.0
Episode finished after 500 timesteps with reward=-500.0


  2%|▏         | 200/10000 [00:55<45:08,  3.62it/s]

Episode finished after 500 timesteps with reward=-500.0
iter=200	epsilon=0.828
Current score(mean over 3) = -500.000


  3%|▎         | 299/10000 [01:22<44:21,  3.64it/s]

Episode finished after 500 timesteps with reward=-500.0
Episode finished after 500 timesteps with reward=-500.0


  3%|▎         | 300/10000 [01:23<44:58,  3.59it/s]

Episode finished after 500 timesteps with reward=-500.0
iter=300	epsilon=0.754
Current score(mean over 3) = -500.000


  4%|▍         | 399/10000 [01:50<44:28,  3.60it/s]

Episode finished after 474 timesteps with reward=-473.0
Episode finished after 500 timesteps with reward=-500.0


  4%|▍         | 400/10000 [01:52<44:54,  3.56it/s]

Episode finished after 500 timesteps with reward=-500.0
iter=400	epsilon=0.687
Current score(mean over 3) = -491.000


  5%|▍         | 499/10000 [02:20<44:25,  3.56it/s]

Episode finished after 327 timesteps with reward=-326.0
Episode finished after 500 timesteps with reward=-500.0


  5%|▌         | 500/10000 [02:21<44:43,  3.54it/s]

Episode finished after 485 timesteps with reward=-484.0
iter=500	epsilon=0.626
Current score(mean over 3) = -436.667


  6%|▌         | 599/10000 [02:47<43:48,  3.58it/s]

Episode finished after 500 timesteps with reward=-500.0
Episode finished after 490 timesteps with reward=-489.0


  6%|▌         | 600/10000 [02:48<44:05,  3.55it/s]

Episode finished after 500 timesteps with reward=-500.0
iter=600	epsilon=0.571
Current score(mean over 3) = -496.333


  7%|▋         | 699/10000 [03:15<43:26,  3.57it/s]

Episode finished after 288 timesteps with reward=-287.0
Episode finished after 276 timesteps with reward=-275.0


  7%|▋         | 700/10000 [03:16<43:33,  3.56it/s]

Episode finished after 224 timesteps with reward=-223.0
iter=700	epsilon=0.522
Current score(mean over 3) = -261.667


  8%|▊         | 799/10000 [03:44<43:04,  3.56it/s]

Episode finished after 218 timesteps with reward=-217.0
Episode finished after 192 timesteps with reward=-191.0


  8%|▊         | 800/10000 [03:45<43:09,  3.55it/s]

Episode finished after 174 timesteps with reward=-173.0
iter=800	epsilon=0.477
Current score(mean over 3) = -193.667


  9%|▉         | 899/10000 [04:12<42:40,  3.55it/s]

Episode finished after 144 timesteps with reward=-143.0
Episode finished after 183 timesteps with reward=-182.0


  9%|▉         | 900/10000 [04:13<42:43,  3.55it/s]

Episode finished after 126 timesteps with reward=-125.0
iter=900	epsilon=0.436
Current score(mean over 3) = -150.000


 10%|▉         | 999/10000 [04:40<42:07,  3.56it/s]

Episode finished after 144 timesteps with reward=-143.0
Episode finished after 162 timesteps with reward=-161.0


 10%|█         | 1000/10000 [04:41<42:10,  3.56it/s]

Episode finished after 126 timesteps with reward=-125.0
iter=1000	epsilon=0.399
Current score(mean over 3) = -143.000


 11%|█         | 1099/10000 [05:07<41:30,  3.57it/s]

Episode finished after 157 timesteps with reward=-156.0
Episode finished after 235 timesteps with reward=-234.0


 11%|█         | 1100/10000 [05:08<41:33,  3.57it/s]

Episode finished after 248 timesteps with reward=-247.0
iter=1100	epsilon=0.366
Current score(mean over 3) = -212.333


 12%|█▏        | 1200/10000 [05:35<40:58,  3.58it/s]

Episode finished after 115 timesteps with reward=-114.0
Episode finished after 97 timesteps with reward=-96.0
Episode finished after 167 timesteps with reward=-166.0
iter=1200	epsilon=0.336
Current score(mean over 3) = -125.333


 13%|█▎        | 1299/10000 [06:01<40:24,  3.59it/s]

Episode finished after 121 timesteps with reward=-120.0
Episode finished after 122 timesteps with reward=-121.0


 13%|█▎        | 1300/10000 [06:02<40:26,  3.59it/s]

Episode finished after 245 timesteps with reward=-244.0
iter=1300	epsilon=0.309
Current score(mean over 3) = -161.667


 14%|█▍        | 1399/10000 [06:29<39:51,  3.60it/s]

Episode finished after 168 timesteps with reward=-167.0
Episode finished after 162 timesteps with reward=-161.0


 14%|█▍        | 1400/10000 [06:29<39:53,  3.59it/s]

Episode finished after 154 timesteps with reward=-153.0
iter=1400	epsilon=0.284
Current score(mean over 3) = -160.333


 15%|█▍        | 1499/10000 [06:56<39:21,  3.60it/s]

Episode finished after 212 timesteps with reward=-211.0
Episode finished after 199 timesteps with reward=-198.0


 15%|█▌        | 1500/10000 [06:57<39:24,  3.60it/s]

Episode finished after 280 timesteps with reward=-279.0
iter=1500	epsilon=0.262
Current score(mean over 3) = -229.333


 16%|█▌        | 1599/10000 [07:24<38:55,  3.60it/s]

Episode finished after 156 timesteps with reward=-155.0
Episode finished after 263 timesteps with reward=-262.0


 16%|█▌        | 1600/10000 [07:25<38:58,  3.59it/s]

Episode finished after 206 timesteps with reward=-205.0
iter=1600	epsilon=0.242
Current score(mean over 3) = -207.333


 17%|█▋        | 1700/10000 [07:53<38:31,  3.59it/s]

Episode finished after 133 timesteps with reward=-132.0
Episode finished after 129 timesteps with reward=-128.0
Episode finished after 145 timesteps with reward=-144.0
iter=1700	epsilon=0.224
Current score(mean over 3) = -134.667


 18%|█▊        | 1799/10000 [08:21<38:04,  3.59it/s]

Episode finished after 114 timesteps with reward=-113.0
Episode finished after 174 timesteps with reward=-173.0


 18%|█▊        | 1800/10000 [08:21<38:06,  3.59it/s]

Episode finished after 161 timesteps with reward=-160.0
iter=1800	epsilon=0.207
Current score(mean over 3) = -148.667


 19%|█▉        | 1899/10000 [08:48<37:35,  3.59it/s]

Episode finished after 191 timesteps with reward=-190.0
Episode finished after 125 timesteps with reward=-124.0


 19%|█▉        | 1900/10000 [08:49<37:36,  3.59it/s]

Episode finished after 143 timesteps with reward=-142.0
iter=1900	epsilon=0.192
Current score(mean over 3) = -152.000


 20%|██        | 2000/10000 [09:16<37:05,  3.60it/s]

Episode finished after 146 timesteps with reward=-145.0
Episode finished after 129 timesteps with reward=-128.0
Episode finished after 135 timesteps with reward=-134.0
iter=2000	epsilon=0.179
Current score(mean over 3) = -135.667


 21%|██        | 2099/10000 [09:43<36:36,  3.60it/s]

Episode finished after 109 timesteps with reward=-108.0
Episode finished after 172 timesteps with reward=-171.0


 21%|██        | 2100/10000 [09:44<36:37,  3.59it/s]

Episode finished after 223 timesteps with reward=-222.0
iter=2100	epsilon=0.166
Current score(mean over 3) = -167.000


 22%|██▏       | 2199/10000 [10:12<36:12,  3.59it/s]

Episode finished after 289 timesteps with reward=-288.0
Episode finished after 500 timesteps with reward=-500.0


 22%|██▏       | 2200/10000 [10:13<36:16,  3.58it/s]

Episode finished after 295 timesteps with reward=-294.0
iter=2200	epsilon=0.155
Current score(mean over 3) = -360.667


 23%|██▎       | 2299/10000 [10:42<35:51,  3.58it/s]

Episode finished after 307 timesteps with reward=-306.0
Episode finished after 230 timesteps with reward=-229.0


 23%|██▎       | 2300/10000 [10:43<35:54,  3.57it/s]

Episode finished after 153 timesteps with reward=-152.0
iter=2300	epsilon=0.145
Current score(mean over 3) = -229.000


 24%|██▍       | 2400/10000 [11:11<35:25,  3.58it/s]

Episode finished after 159 timesteps with reward=-158.0
Episode finished after 117 timesteps with reward=-116.0
Episode finished after 151 timesteps with reward=-150.0
iter=2400	epsilon=0.136
Current score(mean over 3) = -141.333


 25%|██▌       | 2500/10000 [11:41<35:05,  3.56it/s]

Episode finished after 126 timesteps with reward=-125.0
Episode finished after 140 timesteps with reward=-139.0
Episode finished after 109 timesteps with reward=-108.0
iter=2500	epsilon=0.128
Current score(mean over 3) = -124.000


 26%|██▌       | 2599/10000 [12:08<34:34,  3.57it/s]

Episode finished after 259 timesteps with reward=-258.0
Episode finished after 216 timesteps with reward=-215.0


 26%|██▌       | 2600/10000 [12:09<34:36,  3.56it/s]

Episode finished after 468 timesteps with reward=-467.0
iter=2600	epsilon=0.121
Current score(mean over 3) = -313.333


 27%|██▋       | 2699/10000 [12:35<34:04,  3.57it/s]

Episode finished after 237 timesteps with reward=-236.0
Episode finished after 146 timesteps with reward=-145.0


 27%|██▋       | 2700/10000 [12:36<34:05,  3.57it/s]

Episode finished after 217 timesteps with reward=-216.0
iter=2700	epsilon=0.114
Current score(mean over 3) = -199.000


 28%|██▊       | 2800/10000 [13:03<33:34,  3.57it/s]

Episode finished after 101 timesteps with reward=-100.0
Episode finished after 103 timesteps with reward=-102.0
Episode finished after 93 timesteps with reward=-92.0
iter=2800	epsilon=0.108
Current score(mean over 3) = -98.000


 29%|██▉       | 2900/10000 [13:30<33:03,  3.58it/s]

Episode finished after 97 timesteps with reward=-96.0
Episode finished after 98 timesteps with reward=-97.0
Episode finished after 98 timesteps with reward=-97.0
iter=2900	epsilon=0.102
Current score(mean over 3) = -96.667


 30%|██▉       | 2999/10000 [13:56<32:32,  3.59it/s]

Episode finished after 156 timesteps with reward=-155.0


 30%|███       | 3000/10000 [13:57<32:33,  3.58it/s]

Episode finished after 224 timesteps with reward=-223.0
Episode finished after 120 timesteps with reward=-119.0
iter=3000	epsilon=0.097
Current score(mean over 3) = -165.667


 31%|███       | 3100/10000 [14:24<32:03,  3.59it/s]

Episode finished after 139 timesteps with reward=-138.0
Episode finished after 109 timesteps with reward=-108.0
Episode finished after 136 timesteps with reward=-135.0
iter=3100	epsilon=0.093
Current score(mean over 3) = -127.000


 32%|███▏      | 3200/10000 [14:51<31:33,  3.59it/s]

Episode finished after 181 timesteps with reward=-180.0
Episode finished after 105 timesteps with reward=-104.0
Episode finished after 143 timesteps with reward=-142.0
iter=3200	epsilon=0.089
Current score(mean over 3) = -142.000


 33%|███▎      | 3300/10000 [15:17<31:03,  3.59it/s]

Episode finished after 183 timesteps with reward=-182.0
Episode finished after 122 timesteps with reward=-121.0
Episode finished after 112 timesteps with reward=-111.0
iter=3300	epsilon=0.085
Current score(mean over 3) = -138.000


 34%|███▍      | 3400/10000 [15:44<30:33,  3.60it/s]

Episode finished after 154 timesteps with reward=-153.0
Episode finished after 121 timesteps with reward=-120.0
Episode finished after 101 timesteps with reward=-100.0
iter=3400	epsilon=0.082
Current score(mean over 3) = -124.333


 35%|███▌      | 3500/10000 [16:11<30:04,  3.60it/s]

Episode finished after 92 timesteps with reward=-91.0
Episode finished after 157 timesteps with reward=-156.0
Episode finished after 103 timesteps with reward=-102.0
iter=3500	epsilon=0.079
Current score(mean over 3) = -116.333


 36%|███▌      | 3600/10000 [16:38<29:35,  3.60it/s]

Episode finished after 110 timesteps with reward=-109.0
Episode finished after 119 timesteps with reward=-118.0
Episode finished after 113 timesteps with reward=-112.0
iter=3600	epsilon=0.076
Current score(mean over 3) = -113.000


 37%|███▋      | 3700/10000 [17:05<29:06,  3.61it/s]

Episode finished after 185 timesteps with reward=-184.0
Episode finished after 138 timesteps with reward=-137.0
Episode finished after 124 timesteps with reward=-123.0
iter=3700	epsilon=0.073
Current score(mean over 3) = -148.000


 38%|███▊      | 3799/10000 [17:32<28:37,  3.61it/s]

Episode finished after 105 timesteps with reward=-104.0


 38%|███▊      | 3800/10000 [17:32<28:38,  3.61it/s]

Episode finished after 500 timesteps with reward=-500.0
Episode finished after 112 timesteps with reward=-111.0
iter=3800	epsilon=0.071
Current score(mean over 3) = -238.333


 39%|███▉      | 3899/10000 [17:59<28:09,  3.61it/s]

Episode finished after 137 timesteps with reward=-136.0
Episode finished after 94 timesteps with reward=-93.0


 39%|███▉      | 3900/10000 [18:00<28:09,  3.61it/s]

Episode finished after 152 timesteps with reward=-151.0
iter=3900	epsilon=0.069
Current score(mean over 3) = -126.667


 40%|████      | 4000/10000 [18:27<27:40,  3.61it/s]

Episode finished after 143 timesteps with reward=-142.0
Episode finished after 113 timesteps with reward=-112.0
Episode finished after 100 timesteps with reward=-99.0
iter=4000	epsilon=0.067
Current score(mean over 3) = -117.667


 41%|████      | 4100/10000 [18:53<27:11,  3.62it/s]

Episode finished after 104 timesteps with reward=-103.0
Episode finished after 102 timesteps with reward=-101.0
Episode finished after 169 timesteps with reward=-168.0
iter=4100	epsilon=0.066
Current score(mean over 3) = -124.000


 42%|████▏     | 4199/10000 [19:20<26:42,  3.62it/s]

Episode finished after 294 timesteps with reward=-293.0
Episode finished after 121 timesteps with reward=-120.0


 42%|████▏     | 4200/10000 [19:20<26:43,  3.62it/s]

Episode finished after 205 timesteps with reward=-204.0
iter=4200	epsilon=0.064
Current score(mean over 3) = -205.667


 43%|████▎     | 4300/10000 [19:47<26:14,  3.62it/s]

Episode finished after 90 timesteps with reward=-89.0
Episode finished after 119 timesteps with reward=-118.0
Episode finished after 100 timesteps with reward=-99.0
iter=4300	epsilon=0.063
Current score(mean over 3) = -102.000


 44%|████▍     | 4400/10000 [20:14<25:45,  3.62it/s]

Episode finished after 111 timesteps with reward=-110.0
Episode finished after 94 timesteps with reward=-93.0
Episode finished after 90 timesteps with reward=-89.0
iter=4400	epsilon=0.062
Current score(mean over 3) = -97.333


 45%|████▌     | 4500/10000 [20:41<25:17,  3.62it/s]

Episode finished after 128 timesteps with reward=-127.0
Episode finished after 119 timesteps with reward=-118.0
Episode finished after 113 timesteps with reward=-112.0
iter=4500	epsilon=0.061
Current score(mean over 3) = -119.000


 46%|████▌     | 4599/10000 [21:07<24:48,  3.63it/s]

Episode finished after 248 timesteps with reward=-247.0
Episode finished after 167 timesteps with reward=-166.0


 46%|████▌     | 4600/10000 [21:08<24:49,  3.63it/s]

Episode finished after 386 timesteps with reward=-385.0
iter=4600	epsilon=0.060
Current score(mean over 3) = -266.000


 47%|████▋     | 4699/10000 [21:34<24:20,  3.63it/s]

Episode finished after 132 timesteps with reward=-131.0
Episode finished after 158 timesteps with reward=-157.0


 47%|████▋     | 4700/10000 [21:35<24:20,  3.63it/s]

Episode finished after 201 timesteps with reward=-200.0
iter=4700	epsilon=0.059
Current score(mean over 3) = -162.667


 48%|████▊     | 4800/10000 [22:02<23:52,  3.63it/s]

Episode finished after 135 timesteps with reward=-134.0
Episode finished after 155 timesteps with reward=-154.0
Episode finished after 123 timesteps with reward=-122.0
iter=4800	epsilon=0.058
Current score(mean over 3) = -136.667


 49%|████▉     | 4900/10000 [22:29<23:24,  3.63it/s]

Episode finished after 164 timesteps with reward=-163.0
Episode finished after 97 timesteps with reward=-96.0
Episode finished after 126 timesteps with reward=-125.0
iter=4900	epsilon=0.057
Current score(mean over 3) = -128.000


 50%|█████     | 5000/10000 [22:55<22:55,  3.63it/s]

Episode finished after 109 timesteps with reward=-108.0
Episode finished after 114 timesteps with reward=-113.0
Episode finished after 94 timesteps with reward=-93.0
iter=5000	epsilon=0.056
Current score(mean over 3) = -104.667


 51%|█████     | 5100/10000 [23:24<22:29,  3.63it/s]

Episode finished after 105 timesteps with reward=-104.0
Episode finished after 106 timesteps with reward=-105.0
Episode finished after 89 timesteps with reward=-88.0
iter=5100	epsilon=0.056
Current score(mean over 3) = -99.000


 52%|█████▏    | 5199/10000 [23:50<22:01,  3.63it/s]

Episode finished after 196 timesteps with reward=-195.0
Episode finished after 137 timesteps with reward=-136.0


 52%|█████▏    | 5200/10000 [23:51<22:01,  3.63it/s]

Episode finished after 191 timesteps with reward=-190.0
iter=5200	epsilon=0.055
Current score(mean over 3) = -173.667


 53%|█████▎    | 5300/10000 [24:18<21:33,  3.63it/s]

Episode finished after 128 timesteps with reward=-127.0
Episode finished after 132 timesteps with reward=-131.0
Episode finished after 110 timesteps with reward=-109.0
iter=5300	epsilon=0.055
Current score(mean over 3) = -122.333


 54%|█████▍    | 5400/10000 [24:45<21:05,  3.64it/s]

Episode finished after 141 timesteps with reward=-140.0
Episode finished after 158 timesteps with reward=-157.0
Episode finished after 99 timesteps with reward=-98.0
iter=5400	epsilon=0.054
Current score(mean over 3) = -131.667


 55%|█████▌    | 5500/10000 [25:11<20:36,  3.64it/s]

Episode finished after 104 timesteps with reward=-103.0
Episode finished after 153 timesteps with reward=-152.0
Episode finished after 96 timesteps with reward=-95.0
iter=5500	epsilon=0.054
Current score(mean over 3) = -116.667


 56%|█████▌    | 5600/10000 [25:38<20:08,  3.64it/s]

Episode finished after 117 timesteps with reward=-116.0
Episode finished after 127 timesteps with reward=-126.0
Episode finished after 147 timesteps with reward=-146.0
iter=5600	epsilon=0.054
Current score(mean over 3) = -129.333


 57%|█████▋    | 5700/10000 [26:06<19:41,  3.64it/s]

Episode finished after 106 timesteps with reward=-105.0
Episode finished after 106 timesteps with reward=-105.0
Episode finished after 98 timesteps with reward=-97.0
iter=5700	epsilon=0.053
Current score(mean over 3) = -102.333


 58%|█████▊    | 5800/10000 [26:32<19:13,  3.64it/s]

Episode finished after 87 timesteps with reward=-86.0
Episode finished after 97 timesteps with reward=-96.0
Episode finished after 112 timesteps with reward=-111.0
iter=5800	epsilon=0.053
Current score(mean over 3) = -97.667


 59%|█████▉    | 5900/10000 [27:00<18:45,  3.64it/s]

Episode finished after 500 timesteps with reward=-500.0
Episode finished after 87 timesteps with reward=-86.0
Episode finished after 113 timesteps with reward=-112.0
iter=5900	epsilon=0.053
Current score(mean over 3) = -232.667


 60%|██████    | 6000/10000 [27:26<18:17,  3.64it/s]

Episode finished after 90 timesteps with reward=-89.0
Episode finished after 103 timesteps with reward=-102.0
Episode finished after 89 timesteps with reward=-88.0
iter=6000	epsilon=0.052
Current score(mean over 3) = -93.000


 61%|██████    | 6100/10000 [27:53<17:49,  3.65it/s]

Episode finished after 162 timesteps with reward=-161.0
Episode finished after 103 timesteps with reward=-102.0
Episode finished after 134 timesteps with reward=-133.0
iter=6100	epsilon=0.052
Current score(mean over 3) = -132.000


 62%|██████▏   | 6199/10000 [28:19<17:22,  3.65it/s]

Episode finished after 207 timesteps with reward=-206.0
Episode finished after 262 timesteps with reward=-261.0


 62%|██████▏   | 6200/10000 [28:20<17:22,  3.65it/s]

Episode finished after 335 timesteps with reward=-334.0
iter=6200	epsilon=0.052
Current score(mean over 3) = -267.000


 63%|██████▎   | 6299/10000 [28:47<16:54,  3.65it/s]

Episode finished after 275 timesteps with reward=-274.0


 63%|██████▎   | 6300/10000 [28:48<16:54,  3.65it/s]

Episode finished after 358 timesteps with reward=-357.0
Episode finished after 183 timesteps with reward=-182.0
iter=6300	epsilon=0.052
Current score(mean over 3) = -271.000


 64%|██████▍   | 6399/10000 [29:14<16:27,  3.65it/s]

Episode finished after 447 timesteps with reward=-446.0
Episode finished after 232 timesteps with reward=-231.0


 64%|██████▍   | 6400/10000 [29:15<16:27,  3.65it/s]

Episode finished after 220 timesteps with reward=-219.0
iter=6400	epsilon=0.052
Current score(mean over 3) = -298.667


 65%|██████▍   | 6499/10000 [29:41<15:59,  3.65it/s]

Episode finished after 168 timesteps with reward=-167.0


 65%|██████▌   | 6500/10000 [29:42<15:59,  3.65it/s]

Episode finished after 281 timesteps with reward=-280.0
Episode finished after 188 timesteps with reward=-187.0
iter=6500	epsilon=0.051
Current score(mean over 3) = -211.333


 66%|██████▌   | 6599/10000 [30:08<15:32,  3.65it/s]

Episode finished after 244 timesteps with reward=-243.0
Episode finished after 212 timesteps with reward=-211.0


 66%|██████▌   | 6600/10000 [30:09<15:32,  3.65it/s]

Episode finished after 336 timesteps with reward=-335.0
iter=6600	epsilon=0.051
Current score(mean over 3) = -263.000


 67%|██████▋   | 6699/10000 [30:36<15:04,  3.65it/s]

Episode finished after 342 timesteps with reward=-341.0
Episode finished after 364 timesteps with reward=-363.0


 67%|██████▋   | 6700/10000 [30:37<15:04,  3.65it/s]

Episode finished after 426 timesteps with reward=-425.0
iter=6700	epsilon=0.051
Current score(mean over 3) = -376.333


 68%|██████▊   | 6799/10000 [31:03<14:37,  3.65it/s]

Episode finished after 206 timesteps with reward=-205.0
Episode finished after 125 timesteps with reward=-124.0


 68%|██████▊   | 6800/10000 [31:04<14:37,  3.65it/s]

Episode finished after 372 timesteps with reward=-371.0
iter=6800	epsilon=0.051
Current score(mean over 3) = -233.333


 69%|██████▉   | 6899/10000 [31:31<14:10,  3.65it/s]

Episode finished after 259 timesteps with reward=-258.0
Episode finished after 165 timesteps with reward=-164.0


 69%|██████▉   | 6900/10000 [31:31<14:09,  3.65it/s]

Episode finished after 141 timesteps with reward=-140.0
iter=6900	epsilon=0.051
Current score(mean over 3) = -187.333


 70%|██████▉   | 6999/10000 [31:58<13:42,  3.65it/s]

Episode finished after 231 timesteps with reward=-230.0
Episode finished after 259 timesteps with reward=-258.0


 70%|███████   | 7000/10000 [31:58<13:42,  3.65it/s]

Episode finished after 231 timesteps with reward=-230.0
iter=7000	epsilon=0.051
Current score(mean over 3) = -239.333


 71%|███████   | 7099/10000 [32:25<13:14,  3.65it/s]

Episode finished after 354 timesteps with reward=-353.0
Episode finished after 207 timesteps with reward=-206.0


 71%|███████   | 7100/10000 [32:26<13:14,  3.65it/s]

Episode finished after 230 timesteps with reward=-229.0
iter=7100	epsilon=0.051
Current score(mean over 3) = -262.667


 72%|███████▏  | 7199/10000 [32:52<12:47,  3.65it/s]

Episode finished after 202 timesteps with reward=-201.0
Episode finished after 268 timesteps with reward=-267.0


 72%|███████▏  | 7200/10000 [32:53<12:47,  3.65it/s]

Episode finished after 156 timesteps with reward=-155.0
iter=7200	epsilon=0.051
Current score(mean over 3) = -207.667


 73%|███████▎  | 7299/10000 [33:19<12:19,  3.65it/s]

Episode finished after 282 timesteps with reward=-281.0
Episode finished after 248 timesteps with reward=-247.0


 73%|███████▎  | 7300/10000 [33:20<12:19,  3.65it/s]

Episode finished after 203 timesteps with reward=-202.0
iter=7300	epsilon=0.051
Current score(mean over 3) = -243.333


 73%|███████▎  | 7346/10000 [33:33<12:07,  3.65it/s]

KeyboardInterrupt: 

In [None]:
from pandas import ewma
iters,session_rewards=zip(*sorted(rewards.items(),key=lambda (k,v):k))
plt.plot(iters,ewma(np.array(session_rewards),span=10))

In [None]:
final_reward = pool.evaluate(n_games=10,save_path="./records",record_video=True)

print("average reward:",final_reward)

video_names = list(filter(lambda s:s.endswith(".mp4"),os.listdir("./records/")))

for video_name in video_names:
    HTML("""
    <video width="640" height="480" controls>
      <source src="{}" type="video/mp4">
    </video>
    """.format("./records/"+video_name)) #this may or may not be _last_ video. Try other indices