In [1]:
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing
# NOTE(review): this assigns an attribute on the imported gym module object; nothing
# visible in this notebook reads it, and the loops below unpack env.step() as a
# 4-tuple -- confirm whether this line has any effect or can be removed.
gym.new_step_api=True
# Environment instance that is passed to train_model in the next cell.
env = gym.make('Pong-v0')

# Network dimensions. prepro() produces a flat vector whose length is expected to equal D.
H = 200 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
# Policy weights: W1 has shape (H, D), W2 has shape (H,). policy_forward and
# policy_backward read this dict as a module-level global.
# NOTE(review): no random seed is set in this cell, so the initial weights will
# presumably differ from run to run.
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H)
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-2 # multiplier applied to the RMSProp-scaled gradient step in train_model
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dicts mirror the keys and shapes of `model` and start at zero; train_model
# updates them in place as module-level globals.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
  """Logistic sigmoid: squash ``x`` elementwise into the interval [0, 1].

  The value is built from ``exp(-|x|)``, which never exceeds 1, so inputs of
  large magnitude cannot overflow the exponential (the naive
  ``1 / (1 + exp(-x))`` overflows for very negative ``x``).

  Args:
    x: real scalar or array-like.

  Returns:
    A NumPy float scalar for scalar input, otherwise an array shaped like ``x``.
  """
  x = np.asarray(x)
  decay = np.exp(-np.abs(x)) # always within [0, 1]
  # 1/(1+e^-x) for x >= 0, and the algebraically equal e^x/(1+e^x) for x < 0.
  out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
  return out[()] # unwraps a 0-d result to a scalar; n-d arrays pass through

def prepro(I):
  """Turn a raw frame into a flat binary float vector.

  Rows 35..194 are kept, every second row and column is sampled, and only
  channel 0 is used. Pixels equal to 144 or 109 (the two background values)
  or 0 map to 0.0; every other pixel (paddles, ball) maps to 1.0.

  The result is built from a boolean mask rather than by assigning into
  slices of the frame, so the caller's array is never modified.

  Args:
    I: frame indexable as [row, column, channel]; e.g. a 210x160x3 frame
      yields 80 * 80 = 6400 values.

  Returns:
    1-D float array containing only 0.0 and 1.0.
  """
  board = np.asarray(I)[35:195:2, ::2, 0] # crop + downsample by 2; read-only use of the view
  foreground = (board != 0) & (board != 144) & (board != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns over a reward sequence, walking backwards in time.

  Any non-zero reward is treated as a game boundary (Pong specific): the
  running sum is reset there, so returns never leak across points.

  Args:
    r: array of per-step rewards, 1-D or a column of shape (T, 1).
    discount: per-step discount factor; defaults to the module-level ``gamma``.

  Returns:
    Floating-point array with the same shape as ``r``.
  """
  r = np.asarray(r)
  if discount is None:
    discount = gamma
  # Accumulate in floating point: an integer reward array would otherwise
  # truncate every discounted value on assignment.
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else float
  discounted_r = np.zeros_like(r, dtype=out_dtype)
  running_add = 0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Run the two-layer policy network on one input vector.

  The weights are read from the module-level ``model`` dict ('W1', 'W2').

  Returns:
    (p, hidden): ``p`` is the sigmoid of the output unit, i.e. the probability
    of taking action 2; ``hidden`` holds the hidden activations after the ReLU.
  """
  first_layer, second_layer = model['W1'], model['W2']
  pre_activation = np.dot(first_layer, x)
  hidden = np.where(pre_activation < 0, 0, pre_activation) # ReLU nonlinearity
  prob_action_2 = sigmoid(np.dot(second_layer, hidden))
  return prob_action_2, hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns gradients keyed like ``model``.

  As called from train_model, ``epx`` stacks the inputs, ``eph`` the hidden
  states and ``epdlogp`` the (advantage-scaled) output gradients, one row per
  time step. 'W2' is read from the module-level ``model`` dict.
  """
  output_weights = model['W2']
  # Push the output gradient back into the hidden layer, then zero it wherever
  # the ReLU was inactive (hidden state <= 0).
  hidden_grad = np.where(eph <= 0, 0, np.outer(epdlogp, output_weights))
  return {
    'W1': np.dot(hidden_grad.T, epx),
    'W2': np.dot(eph.T, epdlogp).ravel(),
  }

def model_step(model, observation, prev_x):
  """Pick the action for one frame and return it with the new previous frame.

  The network input is the difference between the current and the previous
  preprocessed frame (all zeros when there is no previous frame). The action
  is chosen deterministically: 2 when the policy probability is at least 0.5,
  otherwise 3.

  Note: the ``model`` argument is not used here; policy_forward reads the
  module-level weights.

  Returns:
    (action, cur_x) where ``cur_x`` is the preprocessed current frame to pass
    back in as ``prev_x`` on the next call.
  """
  cur_x = prepro(observation)
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # forward the policy network and threshold the returned probability
  aprob, _hidden = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model, max_steps = 1000):
  """Play one episode with the greedy policy, record frames and show them.

  Each step renders the environment as an RGB array, asks model_step for an
  action and advances the environment. When the environment reports ``done``
  the episode summary is printed; the "without success" message is printed
  only if the step limit is reached first (previously it was printed in both
  cases). The environment is closed afterwards and the collected frames are
  handed to display_frames_as_gif (defined outside this block).

  Args:
    env: environment exposing reset(), render(mode=...), step(action) and close().
    model: forwarded to model_step.
    max_steps: maximum number of environment steps to play.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(max_steps):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # only reached when the loop ran out of steps without `done`
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients and RMSProp for a number of episodes.

  Every step samples an action from the policy and records the input, hidden
  state, log-probability gradient and reward. At the end of each episode the
  rewards are discounted and standardized, the gradient is back-propagated
  and added to ``grad_buffer``; every ``batch_size`` episodes the weights are
  updated with RMSProp.

  ``model`` must be the same dict that policy_forward / policy_backward read
  (the module-level ``model``), because those helpers do not take it as an
  argument. ``grad_buffer`` and ``rmsprop_cache`` are module-level state that
  is updated in place.

  Args:
    env: environment exposing reset() and step(action) returning
      (observation, reward, done, info).
    model: dict of weight arrays, updated in place.
    total_episodes: number of episodes to run; values <= 0 return immediately.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per episode.
  """
  hist = []
  if total_episodes <= 0:
    return hist # nothing to train; the equality check below could never fire

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        discounted_epr /= reward_std
      # else: all returns are equal, so the centred values are already zero;
      # dividing by zero would inject NaNs into the gradient and the weights.

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping: running_reward is an exponential moving average of episode totals
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      # >= rather than == so a non-integer target still terminates
      if episode_number >= total_episodes:
        return hist

   
    

  logger.warn(
  deprecation(
  deprecation(


In [2]:
# Train for 7000 episodes; train_model returns the per-episode history
# (episode number, episode reward, running mean), kept here as hist1.
# %time reports how long the call took.
%time hist1 = train_model(env, model, total_episodes=7000)

  logger.deprecation(


episode 1.000000, reward total was -19.000000. running mean: -19.000000
episode 2.000000, reward total was -21.000000. running mean: -19.020000
episode 3.000000, reward total was -21.000000. running mean: -19.039800
episode 4.000000, reward total was -20.000000. running mean: -19.049402
episode 5.000000, reward total was -21.000000. running mean: -19.068908
episode 6.000000, reward total was -21.000000. running mean: -19.088219
episode 7.000000, reward total was -21.000000. running mean: -19.107337
episode 8.000000, reward total was -21.000000. running mean: -19.126263
episode 9.000000, reward total was -21.000000. running mean: -19.145001
episode 10.000000, reward total was -21.000000. running mean: -19.163551
episode 11.000000, reward total was -21.000000. running mean: -19.181915
episode 12.000000, reward total was -21.000000. running mean: -19.200096
episode 13.000000, reward total was -21.000000. running mean: -19.218095
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -20.000000. running mean: -20.101851
episode 115.000000, reward total was -20.000000. running mean: -20.100833
episode 116.000000, reward total was -21.000000. running mean: -20.109824
episode 117.000000, reward total was -21.000000. running mean: -20.118726
episode 118.000000, reward total was -20.000000. running mean: -20.117539
episode 119.000000, reward total was -21.000000. running mean: -20.126363
episode 120.000000, reward total was -20.000000. running mean: -20.125100
episode 121.000000, reward total was -21.000000. running mean: -20.133849
episode 122.000000, reward total was -21.000000. running mean: -20.142510
episode 123.000000, reward total was -21.000000. running mean: -20.151085
episode 124.000000, reward total was -21.000000. running mean: -20.159574
episode 125.000000, reward total was -21.000000. running mean: -20.167979
episode 126.000000, reward total was -21.000000. running mean: -20.176299
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.182362
episode 226.000000, reward total was -20.000000. running mean: -20.180538
episode 227.000000, reward total was -21.000000. running mean: -20.188733
episode 228.000000, reward total was -20.000000. running mean: -20.186846
episode 229.000000, reward total was -20.000000. running mean: -20.184977
episode 230.000000, reward total was -19.000000. running mean: -20.173127
episode 231.000000, reward total was -21.000000. running mean: -20.181396
episode 232.000000, reward total was -19.000000. running mean: -20.169582
episode 233.000000, reward total was -20.000000. running mean: -20.167886
episode 234.000000, reward total was -21.000000. running mean: -20.176207
episode 235.000000, reward total was -21.000000. running mean: -20.184445
episode 236.000000, reward total was -21.000000. running mean: -20.192601
episode 237.000000, reward total was -21.000000. running mean: -20.200675
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -20.185232
episode 337.000000, reward total was -20.000000. running mean: -20.183379
episode 338.000000, reward total was -20.000000. running mean: -20.181546
episode 339.000000, reward total was -19.000000. running mean: -20.169730
episode 340.000000, reward total was -20.000000. running mean: -20.168033
episode 341.000000, reward total was -21.000000. running mean: -20.176352
episode 342.000000, reward total was -21.000000. running mean: -20.184589
episode 343.000000, reward total was -21.000000. running mean: -20.192743
episode 344.000000, reward total was -21.000000. running mean: -20.200816
episode 345.000000, reward total was -21.000000. running mean: -20.208807
episode 346.000000, reward total was -21.000000. running mean: -20.216719
episode 347.000000, reward total was -20.000000. running mean: -20.214552
episode 348.000000, reward total was -20.000000. running mean: -20.212407
episode 349.000000, reward total was -

episode 447.000000, reward total was -20.000000. running mean: -20.255579
episode 448.000000, reward total was -21.000000. running mean: -20.263023
episode 449.000000, reward total was -19.000000. running mean: -20.250393
episode 450.000000, reward total was -20.000000. running mean: -20.247889
episode 451.000000, reward total was -20.000000. running mean: -20.245410
episode 452.000000, reward total was -21.000000. running mean: -20.252956
episode 453.000000, reward total was -20.000000. running mean: -20.250426
episode 454.000000, reward total was -21.000000. running mean: -20.257922
episode 455.000000, reward total was -21.000000. running mean: -20.265343
episode 456.000000, reward total was -20.000000. running mean: -20.262689
episode 457.000000, reward total was -21.000000. running mean: -20.270062
episode 458.000000, reward total was -21.000000. running mean: -20.277362
episode 459.000000, reward total was -21.000000. running mean: -20.284588
episode 460.000000, reward total was -

episode 558.000000, reward total was -20.000000. running mean: -19.970513
episode 559.000000, reward total was -20.000000. running mean: -19.970807
episode 560.000000, reward total was -19.000000. running mean: -19.961099
episode 561.000000, reward total was -21.000000. running mean: -19.971488
episode 562.000000, reward total was -21.000000. running mean: -19.981773
episode 563.000000, reward total was -20.000000. running mean: -19.981956
episode 564.000000, reward total was -20.000000. running mean: -19.982136
episode 565.000000, reward total was -20.000000. running mean: -19.982315
episode 566.000000, reward total was -20.000000. running mean: -19.982492
episode 567.000000, reward total was -19.000000. running mean: -19.972667
episode 568.000000, reward total was -20.000000. running mean: -19.972940
episode 569.000000, reward total was -19.000000. running mean: -19.963211
episode 570.000000, reward total was -20.000000. running mean: -19.963579
episode 571.000000, reward total was -

episode 669.000000, reward total was -20.000000. running mean: -19.902225
episode 670.000000, reward total was -20.000000. running mean: -19.903203
episode 671.000000, reward total was -20.000000. running mean: -19.904171
episode 672.000000, reward total was -20.000000. running mean: -19.905129
episode 673.000000, reward total was -18.000000. running mean: -19.886078
episode 674.000000, reward total was -20.000000. running mean: -19.887217
episode 675.000000, reward total was -20.000000. running mean: -19.888345
episode 676.000000, reward total was -21.000000. running mean: -19.899462
episode 677.000000, reward total was -19.000000. running mean: -19.890467
episode 678.000000, reward total was -20.000000. running mean: -19.891562
episode 679.000000, reward total was -21.000000. running mean: -19.902647
episode 680.000000, reward total was -18.000000. running mean: -19.883620
episode 681.000000, reward total was -18.000000. running mean: -19.864784
episode 682.000000, reward total was -

episode 780.000000, reward total was -14.000000. running mean: -19.702125
episode 781.000000, reward total was -20.000000. running mean: -19.705104
episode 782.000000, reward total was -19.000000. running mean: -19.698053
episode 783.000000, reward total was -18.000000. running mean: -19.681072
episode 784.000000, reward total was -21.000000. running mean: -19.694262
episode 785.000000, reward total was -20.000000. running mean: -19.697319
episode 786.000000, reward total was -20.000000. running mean: -19.700346
episode 787.000000, reward total was -19.000000. running mean: -19.693342
episode 788.000000, reward total was -17.000000. running mean: -19.666409
episode 789.000000, reward total was -21.000000. running mean: -19.679745
episode 790.000000, reward total was -20.000000. running mean: -19.682947
episode 791.000000, reward total was -20.000000. running mean: -19.686118
episode 792.000000, reward total was -21.000000. running mean: -19.699257
episode 793.000000, reward total was -

episode 891.000000, reward total was -20.000000. running mean: -19.518529
episode 892.000000, reward total was -19.000000. running mean: -19.513344
episode 893.000000, reward total was -18.000000. running mean: -19.498210
episode 894.000000, reward total was -21.000000. running mean: -19.513228
episode 895.000000, reward total was -20.000000. running mean: -19.518096
episode 896.000000, reward total was -20.000000. running mean: -19.522915
episode 897.000000, reward total was -15.000000. running mean: -19.477686
episode 898.000000, reward total was -17.000000. running mean: -19.452909
episode 899.000000, reward total was -19.000000. running mean: -19.448380
episode 900.000000, reward total was -21.000000. running mean: -19.463896
episode 901.000000, reward total was -21.000000. running mean: -19.479257
episode 902.000000, reward total was -18.000000. running mean: -19.464464
episode 903.000000, reward total was -21.000000. running mean: -19.479820
episode 904.000000, reward total was -

episode 1002.000000, reward total was -21.000000. running mean: -19.401625
episode 1003.000000, reward total was -17.000000. running mean: -19.377609
episode 1004.000000, reward total was -18.000000. running mean: -19.363832
episode 1005.000000, reward total was -20.000000. running mean: -19.370194
episode 1006.000000, reward total was -20.000000. running mean: -19.376492
episode 1007.000000, reward total was -20.000000. running mean: -19.382727
episode 1008.000000, reward total was -20.000000. running mean: -19.388900
episode 1009.000000, reward total was -17.000000. running mean: -19.365011
episode 1010.000000, reward total was -21.000000. running mean: -19.381361
episode 1011.000000, reward total was -19.000000. running mean: -19.377547
episode 1012.000000, reward total was -21.000000. running mean: -19.393772
episode 1013.000000, reward total was -20.000000. running mean: -19.399834
episode 1014.000000, reward total was -19.000000. running mean: -19.395836
episode 1015.000000, rewa

episode 1112.000000, reward total was -18.000000. running mean: -19.737347
episode 1113.000000, reward total was -18.000000. running mean: -19.719973
episode 1114.000000, reward total was -19.000000. running mean: -19.712773
episode 1115.000000, reward total was -20.000000. running mean: -19.715646
episode 1116.000000, reward total was -20.000000. running mean: -19.718489
episode 1117.000000, reward total was -18.000000. running mean: -19.701304
episode 1118.000000, reward total was -21.000000. running mean: -19.714291
episode 1119.000000, reward total was -21.000000. running mean: -19.727148
episode 1120.000000, reward total was -18.000000. running mean: -19.709877
episode 1121.000000, reward total was -20.000000. running mean: -19.712778
episode 1122.000000, reward total was -18.000000. running mean: -19.695650
episode 1123.000000, reward total was -21.000000. running mean: -19.708694
episode 1124.000000, reward total was -20.000000. running mean: -19.711607
episode 1125.000000, rewa

episode 1222.000000, reward total was -19.000000. running mean: -19.276486
episode 1223.000000, reward total was -19.000000. running mean: -19.273721
episode 1224.000000, reward total was -19.000000. running mean: -19.270984
episode 1225.000000, reward total was -18.000000. running mean: -19.258274
episode 1226.000000, reward total was -20.000000. running mean: -19.265691
episode 1227.000000, reward total was -18.000000. running mean: -19.253034
episode 1228.000000, reward total was -20.000000. running mean: -19.260504
episode 1229.000000, reward total was -21.000000. running mean: -19.277899
episode 1230.000000, reward total was -20.000000. running mean: -19.285120
episode 1231.000000, reward total was -20.000000. running mean: -19.292268
episode 1232.000000, reward total was -19.000000. running mean: -19.289346
episode 1233.000000, reward total was -20.000000. running mean: -19.296452
episode 1234.000000, reward total was -18.000000. running mean: -19.283488
episode 1235.000000, rewa

episode 1332.000000, reward total was -19.000000. running mean: -19.248457
episode 1333.000000, reward total was -21.000000. running mean: -19.265973
episode 1334.000000, reward total was -20.000000. running mean: -19.273313
episode 1335.000000, reward total was -21.000000. running mean: -19.290580
episode 1336.000000, reward total was -20.000000. running mean: -19.297674
episode 1337.000000, reward total was -20.000000. running mean: -19.304697
episode 1338.000000, reward total was -17.000000. running mean: -19.281650
episode 1339.000000, reward total was -19.000000. running mean: -19.278834
episode 1340.000000, reward total was -19.000000. running mean: -19.276045
episode 1341.000000, reward total was -20.000000. running mean: -19.283285
episode 1342.000000, reward total was -19.000000. running mean: -19.280452
episode 1343.000000, reward total was -18.000000. running mean: -19.267648
episode 1344.000000, reward total was -20.000000. running mean: -19.274971
episode 1345.000000, rewa

episode 1442.000000, reward total was -21.000000. running mean: -19.056485
episode 1443.000000, reward total was -21.000000. running mean: -19.075921
episode 1444.000000, reward total was -20.000000. running mean: -19.085161
episode 1445.000000, reward total was -20.000000. running mean: -19.094310
episode 1446.000000, reward total was -15.000000. running mean: -19.053367
episode 1447.000000, reward total was -20.000000. running mean: -19.062833
episode 1448.000000, reward total was -18.000000. running mean: -19.052205
episode 1449.000000, reward total was -19.000000. running mean: -19.051683
episode 1450.000000, reward total was -21.000000. running mean: -19.071166
episode 1451.000000, reward total was -20.000000. running mean: -19.080454
episode 1452.000000, reward total was -14.000000. running mean: -19.029650
episode 1453.000000, reward total was -21.000000. running mean: -19.049353
episode 1454.000000, reward total was -18.000000. running mean: -19.038860
episode 1455.000000, rewa

episode 1552.000000, reward total was -21.000000. running mean: -19.068865
episode 1553.000000, reward total was -18.000000. running mean: -19.058176
episode 1554.000000, reward total was -20.000000. running mean: -19.067595
episode 1555.000000, reward total was -15.000000. running mean: -19.026919
episode 1556.000000, reward total was -21.000000. running mean: -19.046650
episode 1557.000000, reward total was -20.000000. running mean: -19.056183
episode 1558.000000, reward total was -20.000000. running mean: -19.065621
episode 1559.000000, reward total was -18.000000. running mean: -19.054965
episode 1560.000000, reward total was -20.000000. running mean: -19.064415
episode 1561.000000, reward total was -20.000000. running mean: -19.073771
episode 1562.000000, reward total was -19.000000. running mean: -19.073034
episode 1563.000000, reward total was -20.000000. running mean: -19.082303
episode 1564.000000, reward total was -20.000000. running mean: -19.091480
episode 1565.000000, rewa

episode 1662.000000, reward total was -20.000000. running mean: -19.021834
episode 1663.000000, reward total was -20.000000. running mean: -19.031616
episode 1664.000000, reward total was -20.000000. running mean: -19.041300
episode 1665.000000, reward total was -18.000000. running mean: -19.030887
episode 1666.000000, reward total was -18.000000. running mean: -19.020578
episode 1667.000000, reward total was -19.000000. running mean: -19.020372
episode 1668.000000, reward total was -19.000000. running mean: -19.020169
episode 1669.000000, reward total was -18.000000. running mean: -19.009967
episode 1670.000000, reward total was -18.000000. running mean: -18.999867
episode 1671.000000, reward total was -20.000000. running mean: -19.009869
episode 1672.000000, reward total was -19.000000. running mean: -19.009770
episode 1673.000000, reward total was -21.000000. running mean: -19.029672
episode 1674.000000, reward total was -21.000000. running mean: -19.049375
episode 1675.000000, rewa

episode 1772.000000, reward total was -16.000000. running mean: -18.630488
episode 1773.000000, reward total was -17.000000. running mean: -18.614183
episode 1774.000000, reward total was -19.000000. running mean: -18.618041
episode 1775.000000, reward total was -19.000000. running mean: -18.621861
episode 1776.000000, reward total was -20.000000. running mean: -18.635642
episode 1777.000000, reward total was -21.000000. running mean: -18.659286
episode 1778.000000, reward total was -21.000000. running mean: -18.682693
episode 1779.000000, reward total was -20.000000. running mean: -18.695866
episode 1780.000000, reward total was -18.000000. running mean: -18.688907
episode 1781.000000, reward total was -18.000000. running mean: -18.682018
episode 1782.000000, reward total was -14.000000. running mean: -18.635198
episode 1783.000000, reward total was -17.000000. running mean: -18.618846
episode 1784.000000, reward total was -18.000000. running mean: -18.612657
episode 1785.000000, rewa

episode 1882.000000, reward total was -17.000000. running mean: -18.152753
episode 1883.000000, reward total was -20.000000. running mean: -18.171226
episode 1884.000000, reward total was -20.000000. running mean: -18.189514
episode 1885.000000, reward total was -19.000000. running mean: -18.197618
episode 1886.000000, reward total was -18.000000. running mean: -18.195642
episode 1887.000000, reward total was -19.000000. running mean: -18.203686
episode 1888.000000, reward total was -16.000000. running mean: -18.181649
episode 1889.000000, reward total was -15.000000. running mean: -18.149833
episode 1890.000000, reward total was -20.000000. running mean: -18.168334
episode 1891.000000, reward total was -18.000000. running mean: -18.166651
episode 1892.000000, reward total was -19.000000. running mean: -18.174984
episode 1893.000000, reward total was -18.000000. running mean: -18.173234
episode 1894.000000, reward total was -20.000000. running mean: -18.191502
episode 1895.000000, rewa

episode 1992.000000, reward total was -20.000000. running mean: -18.044554
episode 1993.000000, reward total was -18.000000. running mean: -18.044108
episode 1994.000000, reward total was -14.000000. running mean: -18.003667
episode 1995.000000, reward total was -16.000000. running mean: -17.983631
episode 1996.000000, reward total was -14.000000. running mean: -17.943794
episode 1997.000000, reward total was -17.000000. running mean: -17.934356
episode 1998.000000, reward total was -16.000000. running mean: -17.915013
episode 1999.000000, reward total was -16.000000. running mean: -17.895863
episode 2000.000000, reward total was -18.000000. running mean: -17.896904
episode 2001.000000, reward total was -18.000000. running mean: -17.897935
episode 2002.000000, reward total was -12.000000. running mean: -17.838956
episode 2003.000000, reward total was -17.000000. running mean: -17.830566
episode 2004.000000, reward total was -15.000000. running mean: -17.802261
episode 2005.000000, rewa

episode 2102.000000, reward total was -17.000000. running mean: -17.453213
episode 2103.000000, reward total was -17.000000. running mean: -17.448681
episode 2104.000000, reward total was -20.000000. running mean: -17.474194
episode 2105.000000, reward total was -15.000000. running mean: -17.449452
episode 2106.000000, reward total was -19.000000. running mean: -17.464958
episode 2107.000000, reward total was -21.000000. running mean: -17.500308
episode 2108.000000, reward total was -14.000000. running mean: -17.465305
episode 2109.000000, reward total was -18.000000. running mean: -17.470652
episode 2110.000000, reward total was -19.000000. running mean: -17.485946
episode 2111.000000, reward total was -17.000000. running mean: -17.481086
episode 2112.000000, reward total was -21.000000. running mean: -17.516275
episode 2113.000000, reward total was -19.000000. running mean: -17.531112
episode 2114.000000, reward total was -17.000000. running mean: -17.525801
episode 2115.000000, rewa

episode 2212.000000, reward total was -18.000000. running mean: -17.356164
episode 2213.000000, reward total was -19.000000. running mean: -17.372602
episode 2214.000000, reward total was -16.000000. running mean: -17.358876
episode 2215.000000, reward total was -19.000000. running mean: -17.375287
episode 2216.000000, reward total was -16.000000. running mean: -17.361534
episode 2217.000000, reward total was -21.000000. running mean: -17.397919
episode 2218.000000, reward total was -12.000000. running mean: -17.343940
episode 2219.000000, reward total was -14.000000. running mean: -17.310501
episode 2220.000000, reward total was -18.000000. running mean: -17.317396
episode 2221.000000, reward total was -19.000000. running mean: -17.334222
episode 2222.000000, reward total was -19.000000. running mean: -17.350879
episode 2223.000000, reward total was -16.000000. running mean: -17.337371
episode 2224.000000, reward total was -18.000000. running mean: -17.343997
episode 2225.000000, rewa

episode 2322.000000, reward total was -20.000000. running mean: -17.356287
episode 2323.000000, reward total was -16.000000. running mean: -17.342724
episode 2324.000000, reward total was -17.000000. running mean: -17.339297
episode 2325.000000, reward total was -18.000000. running mean: -17.345904
episode 2326.000000, reward total was -16.000000. running mean: -17.332445
episode 2327.000000, reward total was -16.000000. running mean: -17.319120
episode 2328.000000, reward total was -17.000000. running mean: -17.315929
episode 2329.000000, reward total was -16.000000. running mean: -17.302770
episode 2330.000000, reward total was -14.000000. running mean: -17.269742
episode 2331.000000, reward total was -16.000000. running mean: -17.257045
episode 2332.000000, reward total was -21.000000. running mean: -17.294474
episode 2333.000000, reward total was -16.000000. running mean: -17.281529
episode 2334.000000, reward total was -18.000000. running mean: -17.288714
episode 2335.000000, rewa

episode 2432.000000, reward total was -17.000000. running mean: -17.380417
episode 2433.000000, reward total was -17.000000. running mean: -17.376613
episode 2434.000000, reward total was -15.000000. running mean: -17.352847
episode 2435.000000, reward total was -15.000000. running mean: -17.329318
episode 2436.000000, reward total was -18.000000. running mean: -17.336025
episode 2437.000000, reward total was -14.000000. running mean: -17.302665
episode 2438.000000, reward total was -18.000000. running mean: -17.309638
episode 2439.000000, reward total was -18.000000. running mean: -17.316542
episode 2440.000000, reward total was -17.000000. running mean: -17.313376
episode 2441.000000, reward total was -19.000000. running mean: -17.330243
episode 2442.000000, reward total was -10.000000. running mean: -17.256940
episode 2443.000000, reward total was -14.000000. running mean: -17.224371
episode 2444.000000, reward total was -16.000000. running mean: -17.212127
episode 2445.000000, rewa

episode 2542.000000, reward total was -13.000000. running mean: -17.017912
episode 2543.000000, reward total was -17.000000. running mean: -17.017733
episode 2544.000000, reward total was -15.000000. running mean: -16.997556
episode 2545.000000, reward total was -18.000000. running mean: -17.007580
episode 2546.000000, reward total was -18.000000. running mean: -17.017504
episode 2547.000000, reward total was -17.000000. running mean: -17.017329
episode 2548.000000, reward total was -12.000000. running mean: -16.967156
episode 2549.000000, reward total was -14.000000. running mean: -16.937484
episode 2550.000000, reward total was -17.000000. running mean: -16.938110
episode 2551.000000, reward total was -19.000000. running mean: -16.958728
episode 2552.000000, reward total was -14.000000. running mean: -16.929141
episode 2553.000000, reward total was -18.000000. running mean: -16.939850
episode 2554.000000, reward total was -17.000000. running mean: -16.940451
episode 2555.000000, rewa

episode 2652.000000, reward total was -15.000000. running mean: -16.639101
episode 2653.000000, reward total was -17.000000. running mean: -16.642710
episode 2654.000000, reward total was -17.000000. running mean: -16.646283
episode 2655.000000, reward total was -12.000000. running mean: -16.599820
episode 2656.000000, reward total was -19.000000. running mean: -16.623822
episode 2657.000000, reward total was -12.000000. running mean: -16.577584
episode 2658.000000, reward total was -18.000000. running mean: -16.591808
episode 2659.000000, reward total was -17.000000. running mean: -16.595890
episode 2660.000000, reward total was -17.000000. running mean: -16.599931
episode 2661.000000, reward total was -16.000000. running mean: -16.593932
episode 2662.000000, reward total was -16.000000. running mean: -16.587992
episode 2663.000000, reward total was -17.000000. running mean: -16.592112
episode 2664.000000, reward total was -19.000000. running mean: -16.616191
episode 2665.000000, rewa

episode 2762.000000, reward total was -10.000000. running mean: -16.356855
episode 2763.000000, reward total was -17.000000. running mean: -16.363286
episode 2764.000000, reward total was -20.000000. running mean: -16.399653
episode 2765.000000, reward total was -16.000000. running mean: -16.395657
episode 2766.000000, reward total was -20.000000. running mean: -16.431700
episode 2767.000000, reward total was -18.000000. running mean: -16.447383
episode 2768.000000, reward total was -12.000000. running mean: -16.402910
episode 2769.000000, reward total was -15.000000. running mean: -16.388880
episode 2770.000000, reward total was -15.000000. running mean: -16.374992
episode 2771.000000, reward total was -13.000000. running mean: -16.341242
episode 2772.000000, reward total was -19.000000. running mean: -16.367829
episode 2773.000000, reward total was -19.000000. running mean: -16.394151
episode 2774.000000, reward total was -18.000000. running mean: -16.410209
episode 2775.000000, rewa

episode 2872.000000, reward total was -20.000000. running mean: -16.075125
episode 2873.000000, reward total was -17.000000. running mean: -16.084374
episode 2874.000000, reward total was -14.000000. running mean: -16.063530
episode 2875.000000, reward total was -14.000000. running mean: -16.042895
episode 2876.000000, reward total was -18.000000. running mean: -16.062466
episode 2877.000000, reward total was -17.000000. running mean: -16.071841
episode 2878.000000, reward total was -17.000000. running mean: -16.081123
episode 2879.000000, reward total was -18.000000. running mean: -16.100311
episode 2880.000000, reward total was -13.000000. running mean: -16.069308
episode 2881.000000, reward total was -14.000000. running mean: -16.048615
episode 2882.000000, reward total was -12.000000. running mean: -16.008129
episode 2883.000000, reward total was -11.000000. running mean: -15.958048
episode 2884.000000, reward total was -9.000000. running mean: -15.888467
episode 2885.000000, rewar

episode 2982.000000, reward total was -18.000000. running mean: -16.330729
episode 2983.000000, reward total was -15.000000. running mean: -16.317422
episode 2984.000000, reward total was -14.000000. running mean: -16.294248
episode 2985.000000, reward total was -18.000000. running mean: -16.311305
episode 2986.000000, reward total was -17.000000. running mean: -16.318192
episode 2987.000000, reward total was -12.000000. running mean: -16.275010
episode 2988.000000, reward total was -20.000000. running mean: -16.312260
episode 2989.000000, reward total was -11.000000. running mean: -16.259138
episode 2990.000000, reward total was -14.000000. running mean: -16.236546
episode 2991.000000, reward total was -15.000000. running mean: -16.224181
episode 2992.000000, reward total was -19.000000. running mean: -16.251939
episode 2993.000000, reward total was -13.000000. running mean: -16.219420
episode 2994.000000, reward total was -12.000000. running mean: -16.177226
episode 2995.000000, rewa

episode 3092.000000, reward total was -15.000000. running mean: -15.132534
episode 3093.000000, reward total was -19.000000. running mean: -15.171209
episode 3094.000000, reward total was -19.000000. running mean: -15.209497
episode 3095.000000, reward total was -12.000000. running mean: -15.177402
episode 3096.000000, reward total was -18.000000. running mean: -15.205628
episode 3097.000000, reward total was -14.000000. running mean: -15.193572
episode 3098.000000, reward total was -19.000000. running mean: -15.231636
episode 3099.000000, reward total was -17.000000. running mean: -15.249319
episode 3100.000000, reward total was -15.000000. running mean: -15.246826
episode 3101.000000, reward total was -21.000000. running mean: -15.304358
episode 3102.000000, reward total was -14.000000. running mean: -15.291314
episode 3103.000000, reward total was -9.000000. running mean: -15.228401
episode 3104.000000, reward total was -8.000000. running mean: -15.156117
episode 3105.000000, reward

episode 3202.000000, reward total was -17.000000. running mean: -15.120786
episode 3203.000000, reward total was -18.000000. running mean: -15.149578
episode 3204.000000, reward total was -15.000000. running mean: -15.148082
episode 3205.000000, reward total was -15.000000. running mean: -15.146602
episode 3206.000000, reward total was -15.000000. running mean: -15.145136
episode 3207.000000, reward total was -16.000000. running mean: -15.153684
episode 3208.000000, reward total was -15.000000. running mean: -15.152147
episode 3209.000000, reward total was -17.000000. running mean: -15.170626
episode 3210.000000, reward total was -15.000000. running mean: -15.168920
episode 3211.000000, reward total was -12.000000. running mean: -15.137231
episode 3212.000000, reward total was -17.000000. running mean: -15.155858
episode 3213.000000, reward total was -13.000000. running mean: -15.134300
episode 3214.000000, reward total was -16.000000. running mean: -15.142957
episode 3215.000000, rewa

episode 3312.000000, reward total was -9.000000. running mean: -14.535338
episode 3313.000000, reward total was -12.000000. running mean: -14.509985
episode 3314.000000, reward total was -9.000000. running mean: -14.454885
episode 3315.000000, reward total was -18.000000. running mean: -14.490336
episode 3316.000000, reward total was -17.000000. running mean: -14.515432
episode 3317.000000, reward total was -15.000000. running mean: -14.520278
episode 3318.000000, reward total was -15.000000. running mean: -14.525075
episode 3319.000000, reward total was -9.000000. running mean: -14.469825
episode 3320.000000, reward total was -15.000000. running mean: -14.475126
episode 3321.000000, reward total was -15.000000. running mean: -14.480375
episode 3322.000000, reward total was -16.000000. running mean: -14.495571
episode 3323.000000, reward total was -17.000000. running mean: -14.520616
episode 3324.000000, reward total was -18.000000. running mean: -14.555409
episode 3325.000000, reward 

episode 3422.000000, reward total was -16.000000. running mean: -13.891361
episode 3423.000000, reward total was -11.000000. running mean: -13.862447
episode 3424.000000, reward total was -15.000000. running mean: -13.873823
episode 3425.000000, reward total was -3.000000. running mean: -13.765085
episode 3426.000000, reward total was -9.000000. running mean: -13.717434
episode 3427.000000, reward total was -16.000000. running mean: -13.740259
episode 3428.000000, reward total was -6.000000. running mean: -13.662857
episode 3429.000000, reward total was -16.000000. running mean: -13.686228
episode 3430.000000, reward total was -11.000000. running mean: -13.659366
episode 3431.000000, reward total was -16.000000. running mean: -13.682772
episode 3432.000000, reward total was -15.000000. running mean: -13.695945
episode 3433.000000, reward total was -13.000000. running mean: -13.688985
episode 3434.000000, reward total was -11.000000. running mean: -13.662095
episode 3435.000000, reward 

episode 3532.000000, reward total was -17.000000. running mean: -14.884081
episode 3533.000000, reward total was -14.000000. running mean: -14.875240
episode 3534.000000, reward total was -16.000000. running mean: -14.886488
episode 3535.000000, reward total was -21.000000. running mean: -14.947623
episode 3536.000000, reward total was -18.000000. running mean: -14.978146
episode 3537.000000, reward total was -15.000000. running mean: -14.978365
episode 3538.000000, reward total was -15.000000. running mean: -14.978581
episode 3539.000000, reward total was -14.000000. running mean: -14.968796
episode 3540.000000, reward total was -17.000000. running mean: -14.989108
episode 3541.000000, reward total was -16.000000. running mean: -14.999216
episode 3542.000000, reward total was -19.000000. running mean: -15.039224
episode 3543.000000, reward total was -15.000000. running mean: -15.038832
episode 3544.000000, reward total was -14.000000. running mean: -15.028444
episode 3545.000000, rewa

episode 3642.000000, reward total was -18.000000. running mean: -13.349899
episode 3643.000000, reward total was -16.000000. running mean: -13.376400
episode 3644.000000, reward total was -15.000000. running mean: -13.392636
episode 3645.000000, reward total was -19.000000. running mean: -13.448710
episode 3646.000000, reward total was -19.000000. running mean: -13.504223
episode 3647.000000, reward total was -16.000000. running mean: -13.529181
episode 3648.000000, reward total was -16.000000. running mean: -13.553889
episode 3649.000000, reward total was -19.000000. running mean: -13.608350
episode 3650.000000, reward total was -19.000000. running mean: -13.662266
episode 3651.000000, reward total was -18.000000. running mean: -13.705644
episode 3652.000000, reward total was -16.000000. running mean: -13.728587
episode 3653.000000, reward total was -15.000000. running mean: -13.741301
episode 3654.000000, reward total was -8.000000. running mean: -13.683888
episode 3655.000000, rewar

episode 3752.000000, reward total was -14.000000. running mean: -13.240858
episode 3753.000000, reward total was -12.000000. running mean: -13.228450
episode 3754.000000, reward total was -12.000000. running mean: -13.216165
episode 3755.000000, reward total was 2.000000. running mean: -13.064004
episode 3756.000000, reward total was -12.000000. running mean: -13.053364
episode 3757.000000, reward total was -9.000000. running mean: -13.012830
episode 3758.000000, reward total was -8.000000. running mean: -12.962702
episode 3759.000000, reward total was -13.000000. running mean: -12.963075
episode 3760.000000, reward total was -18.000000. running mean: -13.013444
episode 3761.000000, reward total was -17.000000. running mean: -13.053310
episode 3762.000000, reward total was -17.000000. running mean: -13.092776
episode 3763.000000, reward total was -21.000000. running mean: -13.171849
episode 3764.000000, reward total was -18.000000. running mean: -13.220130
episode 3765.000000, reward t

episode 3862.000000, reward total was -13.000000. running mean: -12.660599
episode 3863.000000, reward total was -9.000000. running mean: -12.623993
episode 3864.000000, reward total was -17.000000. running mean: -12.667753
episode 3865.000000, reward total was -15.000000. running mean: -12.691075
episode 3866.000000, reward total was -8.000000. running mean: -12.644165
episode 3867.000000, reward total was -15.000000. running mean: -12.667723
episode 3868.000000, reward total was -13.000000. running mean: -12.671046
episode 3869.000000, reward total was -17.000000. running mean: -12.714335
episode 3870.000000, reward total was -14.000000. running mean: -12.727192
episode 3871.000000, reward total was -9.000000. running mean: -12.689920
episode 3872.000000, reward total was -13.000000. running mean: -12.693021
episode 3873.000000, reward total was -16.000000. running mean: -12.726091
episode 3874.000000, reward total was -13.000000. running mean: -12.728830
episode 3875.000000, reward 

episode 3972.000000, reward total was -17.000000. running mean: -12.950420
episode 3973.000000, reward total was -12.000000. running mean: -12.940916
episode 3974.000000, reward total was -4.000000. running mean: -12.851507
episode 3975.000000, reward total was -13.000000. running mean: -12.852992
episode 3976.000000, reward total was -16.000000. running mean: -12.884462
episode 3977.000000, reward total was -14.000000. running mean: -12.895617
episode 3978.000000, reward total was -15.000000. running mean: -12.916661
episode 3979.000000, reward total was -15.000000. running mean: -12.937494
episode 3980.000000, reward total was -14.000000. running mean: -12.948119
episode 3981.000000, reward total was -12.000000. running mean: -12.938638
episode 3982.000000, reward total was -13.000000. running mean: -12.939252
episode 3983.000000, reward total was -15.000000. running mean: -12.959859
episode 3984.000000, reward total was -16.000000. running mean: -12.990261
episode 3985.000000, rewar

episode 4082.000000, reward total was -16.000000. running mean: -13.066702
episode 4083.000000, reward total was -16.000000. running mean: -13.096035
episode 4084.000000, reward total was -13.000000. running mean: -13.095075
episode 4085.000000, reward total was -10.000000. running mean: -13.064124
episode 4086.000000, reward total was -10.000000. running mean: -13.033483
episode 4087.000000, reward total was -20.000000. running mean: -13.103148
episode 4088.000000, reward total was -6.000000. running mean: -13.032116
episode 4089.000000, reward total was -7.000000. running mean: -12.971795
episode 4090.000000, reward total was -15.000000. running mean: -12.992077
episode 4091.000000, reward total was -14.000000. running mean: -13.002156
episode 4092.000000, reward total was -16.000000. running mean: -13.032135
episode 4093.000000, reward total was -14.000000. running mean: -13.041814
episode 4094.000000, reward total was -17.000000. running mean: -13.081395
episode 4095.000000, reward

episode 4192.000000, reward total was -2.000000. running mean: -12.402113
episode 4193.000000, reward total was -13.000000. running mean: -12.408092
episode 4194.000000, reward total was -12.000000. running mean: -12.404011
episode 4195.000000, reward total was -8.000000. running mean: -12.359971
episode 4196.000000, reward total was -13.000000. running mean: -12.366371
episode 4197.000000, reward total was -14.000000. running mean: -12.382708
episode 4198.000000, reward total was -15.000000. running mean: -12.408880
episode 4199.000000, reward total was -9.000000. running mean: -12.374792
episode 4200.000000, reward total was -14.000000. running mean: -12.391044
episode 4201.000000, reward total was -12.000000. running mean: -12.387133
episode 4202.000000, reward total was -10.000000. running mean: -12.363262
episode 4203.000000, reward total was -8.000000. running mean: -12.319629
episode 4204.000000, reward total was -12.000000. running mean: -12.316433
episode 4205.000000, reward t

episode 4302.000000, reward total was -10.000000. running mean: -12.355865
episode 4303.000000, reward total was -9.000000. running mean: -12.322307
episode 4304.000000, reward total was -3.000000. running mean: -12.229084
episode 4305.000000, reward total was -20.000000. running mean: -12.306793
episode 4306.000000, reward total was -11.000000. running mean: -12.293725
episode 4307.000000, reward total was -5.000000. running mean: -12.220788
episode 4308.000000, reward total was -9.000000. running mean: -12.188580
episode 4309.000000, reward total was -17.000000. running mean: -12.236694
episode 4310.000000, reward total was -10.000000. running mean: -12.214327
episode 4311.000000, reward total was -16.000000. running mean: -12.252184
episode 4312.000000, reward total was -10.000000. running mean: -12.229662
episode 4313.000000, reward total was -18.000000. running mean: -12.287365
episode 4314.000000, reward total was -11.000000. running mean: -12.274492
episode 4315.000000, reward t

episode 4412.000000, reward total was -14.000000. running mean: -11.355292
episode 4413.000000, reward total was -17.000000. running mean: -11.411739
episode 4414.000000, reward total was -5.000000. running mean: -11.347622
episode 4415.000000, reward total was -11.000000. running mean: -11.344146
episode 4416.000000, reward total was -10.000000. running mean: -11.330704
episode 4417.000000, reward total was -8.000000. running mean: -11.297397
episode 4418.000000, reward total was -10.000000. running mean: -11.284423
episode 4419.000000, reward total was -13.000000. running mean: -11.301579
episode 4420.000000, reward total was -12.000000. running mean: -11.308563
episode 4421.000000, reward total was -14.000000. running mean: -11.335478
episode 4422.000000, reward total was -14.000000. running mean: -11.362123
episode 4423.000000, reward total was -12.000000. running mean: -11.368502
episode 4424.000000, reward total was -14.000000. running mean: -11.394817
episode 4425.000000, reward

episode 4522.000000, reward total was -9.000000. running mean: -10.582485
episode 4523.000000, reward total was -6.000000. running mean: -10.536660
episode 4524.000000, reward total was -10.000000. running mean: -10.531293
episode 4525.000000, reward total was -10.000000. running mean: -10.525980
episode 4526.000000, reward total was -7.000000. running mean: -10.490720
episode 4527.000000, reward total was -3.000000. running mean: -10.415813
episode 4528.000000, reward total was -12.000000. running mean: -10.431655
episode 4529.000000, reward total was -12.000000. running mean: -10.447338
episode 4530.000000, reward total was -3.000000. running mean: -10.372865
episode 4531.000000, reward total was -8.000000. running mean: -10.349136
episode 4532.000000, reward total was -12.000000. running mean: -10.365645
episode 4533.000000, reward total was -9.000000. running mean: -10.351989
episode 4534.000000, reward total was -12.000000. running mean: -10.368469
episode 4535.000000, reward tota

episode 4632.000000, reward total was -6.000000. running mean: -10.330060
episode 4633.000000, reward total was -17.000000. running mean: -10.396759
episode 4634.000000, reward total was -11.000000. running mean: -10.402792
episode 4635.000000, reward total was -13.000000. running mean: -10.428764
episode 4636.000000, reward total was -14.000000. running mean: -10.464476
episode 4637.000000, reward total was -15.000000. running mean: -10.509831
episode 4638.000000, reward total was -12.000000. running mean: -10.524733
episode 4639.000000, reward total was -14.000000. running mean: -10.559486
episode 4640.000000, reward total was -8.000000. running mean: -10.533891
episode 4641.000000, reward total was -15.000000. running mean: -10.578552
episode 4642.000000, reward total was -15.000000. running mean: -10.622767
episode 4643.000000, reward total was -9.000000. running mean: -10.606539
episode 4644.000000, reward total was -13.000000. running mean: -10.630473
episode 4645.000000, reward 

episode 4742.000000, reward total was -17.000000. running mean: -10.886923
episode 4743.000000, reward total was 2.000000. running mean: -10.758054
episode 4744.000000, reward total was -14.000000. running mean: -10.790473
episode 4745.000000, reward total was -18.000000. running mean: -10.862569
episode 4746.000000, reward total was -10.000000. running mean: -10.853943
episode 4747.000000, reward total was -15.000000. running mean: -10.895404
episode 4748.000000, reward total was 1.000000. running mean: -10.776450
episode 4749.000000, reward total was -7.000000. running mean: -10.738685
episode 4750.000000, reward total was -13.000000. running mean: -10.761298
episode 4751.000000, reward total was -17.000000. running mean: -10.823685
episode 4752.000000, reward total was -5.000000. running mean: -10.765448
episode 4753.000000, reward total was -12.000000. running mean: -10.777794
episode 4754.000000, reward total was -16.000000. running mean: -10.830016
episode 4755.000000, reward tot

episode 4853.000000, reward total was -15.000000. running mean: -10.196360
episode 4854.000000, reward total was -6.000000. running mean: -10.154396
episode 4855.000000, reward total was -13.000000. running mean: -10.182852
episode 4856.000000, reward total was -10.000000. running mean: -10.181024
episode 4857.000000, reward total was -13.000000. running mean: -10.209214
episode 4858.000000, reward total was -12.000000. running mean: -10.227121
episode 4859.000000, reward total was -6.000000. running mean: -10.184850
episode 4860.000000, reward total was -3.000000. running mean: -10.113002
episode 4861.000000, reward total was -8.000000. running mean: -10.091872
episode 4862.000000, reward total was -13.000000. running mean: -10.120953
episode 4863.000000, reward total was -11.000000. running mean: -10.129743
episode 4864.000000, reward total was -9.000000. running mean: -10.118446
episode 4865.000000, reward total was -5.000000. running mean: -10.067262
episode 4866.000000, reward tot

episode 4964.000000, reward total was -14.000000. running mean: -9.703245
episode 4965.000000, reward total was -12.000000. running mean: -9.726212
episode 4966.000000, reward total was -17.000000. running mean: -9.798950
episode 4967.000000, reward total was -15.000000. running mean: -9.850961
episode 4968.000000, reward total was -11.000000. running mean: -9.862451
episode 4969.000000, reward total was -11.000000. running mean: -9.873827
episode 4970.000000, reward total was -5.000000. running mean: -9.825088
episode 4971.000000, reward total was -15.000000. running mean: -9.876837
episode 4972.000000, reward total was -5.000000. running mean: -9.828069
episode 4973.000000, reward total was -14.000000. running mean: -9.869788
episode 4974.000000, reward total was 2.000000. running mean: -9.751091
episode 4975.000000, reward total was -4.000000. running mean: -9.693580
episode 4976.000000, reward total was -12.000000. running mean: -9.716644
episode 4977.000000, reward total was -6.00

episode 5075.000000, reward total was -12.000000. running mean: -10.356478
episode 5076.000000, reward total was -15.000000. running mean: -10.402913
episode 5077.000000, reward total was -12.000000. running mean: -10.418884
episode 5078.000000, reward total was -7.000000. running mean: -10.384695
episode 5079.000000, reward total was -7.000000. running mean: -10.350849
episode 5080.000000, reward total was -12.000000. running mean: -10.367340
episode 5081.000000, reward total was -7.000000. running mean: -10.333667
episode 5082.000000, reward total was -11.000000. running mean: -10.340330
episode 5083.000000, reward total was -11.000000. running mean: -10.346927
episode 5084.000000, reward total was -4.000000. running mean: -10.283457
episode 5085.000000, reward total was -13.000000. running mean: -10.310623
episode 5086.000000, reward total was -5.000000. running mean: -10.257517
episode 5087.000000, reward total was 2.000000. running mean: -10.134941
episode 5088.000000, reward tota

episode 5185.000000, reward total was -12.000000. running mean: -11.602779
episode 5186.000000, reward total was -8.000000. running mean: -11.566751
episode 5187.000000, reward total was -9.000000. running mean: -11.541083
episode 5188.000000, reward total was -2.000000. running mean: -11.445672
episode 5189.000000, reward total was -15.000000. running mean: -11.481216
episode 5190.000000, reward total was -3.000000. running mean: -11.396404
episode 5191.000000, reward total was -13.000000. running mean: -11.412439
episode 5192.000000, reward total was -4.000000. running mean: -11.338315
episode 5193.000000, reward total was -12.000000. running mean: -11.344932
episode 5194.000000, reward total was -8.000000. running mean: -11.311483
episode 5195.000000, reward total was -17.000000. running mean: -11.368368
episode 5196.000000, reward total was -9.000000. running mean: -11.344684
episode 5197.000000, reward total was -18.000000. running mean: -11.411237
episode 5198.000000, reward tota

episode 5295.000000, reward total was -9.000000. running mean: -11.003675
episode 5296.000000, reward total was -15.000000. running mean: -11.043638
episode 5297.000000, reward total was -7.000000. running mean: -11.003201
episode 5298.000000, reward total was -7.000000. running mean: -10.963169
episode 5299.000000, reward total was -12.000000. running mean: -10.973538
episode 5300.000000, reward total was -11.000000. running mean: -10.973802
episode 5301.000000, reward total was -12.000000. running mean: -10.984064
episode 5302.000000, reward total was -12.000000. running mean: -10.994224
episode 5303.000000, reward total was -6.000000. running mean: -10.944281
episode 5304.000000, reward total was -8.000000. running mean: -10.914839
episode 5305.000000, reward total was -5.000000. running mean: -10.855690
episode 5306.000000, reward total was -5.000000. running mean: -10.797133
episode 5307.000000, reward total was -9.000000. running mean: -10.779162
episode 5308.000000, reward total

episode 5405.000000, reward total was -4.000000. running mean: -10.434793
episode 5406.000000, reward total was -17.000000. running mean: -10.500445
episode 5407.000000, reward total was -17.000000. running mean: -10.565441
episode 5408.000000, reward total was -13.000000. running mean: -10.589786
episode 5409.000000, reward total was -11.000000. running mean: -10.593888
episode 5410.000000, reward total was -5.000000. running mean: -10.537950
episode 5411.000000, reward total was -7.000000. running mean: -10.502570
episode 5412.000000, reward total was -10.000000. running mean: -10.497544
episode 5413.000000, reward total was -11.000000. running mean: -10.502569
episode 5414.000000, reward total was -15.000000. running mean: -10.547543
episode 5415.000000, reward total was -10.000000. running mean: -10.542068
episode 5416.000000, reward total was -6.000000. running mean: -10.496647
episode 5417.000000, reward total was -8.000000. running mean: -10.471681
episode 5418.000000, reward to

episode 5515.000000, reward total was -15.000000. running mean: -10.464752
episode 5516.000000, reward total was -7.000000. running mean: -10.430105
episode 5517.000000, reward total was -14.000000. running mean: -10.465804
episode 5518.000000, reward total was -17.000000. running mean: -10.531146
episode 5519.000000, reward total was -13.000000. running mean: -10.555834
episode 5520.000000, reward total was -17.000000. running mean: -10.620276
episode 5521.000000, reward total was -10.000000. running mean: -10.614073
episode 5522.000000, reward total was -17.000000. running mean: -10.677933
episode 5523.000000, reward total was -8.000000. running mean: -10.651153
episode 5524.000000, reward total was -14.000000. running mean: -10.684642
episode 5525.000000, reward total was -11.000000. running mean: -10.687795
episode 5526.000000, reward total was -15.000000. running mean: -10.730917
episode 5527.000000, reward total was -17.000000. running mean: -10.793608
episode 5528.000000, reward

episode 5625.000000, reward total was -10.000000. running mean: -11.213193
episode 5626.000000, reward total was -4.000000. running mean: -11.141061
episode 5627.000000, reward total was -12.000000. running mean: -11.149651
episode 5628.000000, reward total was -17.000000. running mean: -11.208154
episode 5629.000000, reward total was -15.000000. running mean: -11.246073
episode 5630.000000, reward total was -16.000000. running mean: -11.293612
episode 5631.000000, reward total was -13.000000. running mean: -11.310676
episode 5632.000000, reward total was -12.000000. running mean: -11.317569
episode 5633.000000, reward total was -11.000000. running mean: -11.314393
episode 5634.000000, reward total was -12.000000. running mean: -11.321249
episode 5635.000000, reward total was -16.000000. running mean: -11.368037
episode 5636.000000, reward total was -13.000000. running mean: -11.384357
episode 5637.000000, reward total was -17.000000. running mean: -11.440513
episode 5638.000000, rewar

episode 5735.000000, reward total was -15.000000. running mean: -10.710799
episode 5736.000000, reward total was -12.000000. running mean: -10.723691
episode 5737.000000, reward total was -8.000000. running mean: -10.696454
episode 5738.000000, reward total was -10.000000. running mean: -10.689489
episode 5739.000000, reward total was -13.000000. running mean: -10.712594
episode 5740.000000, reward total was -8.000000. running mean: -10.685468
episode 5741.000000, reward total was -2.000000. running mean: -10.598614
episode 5742.000000, reward total was -10.000000. running mean: -10.592628
episode 5743.000000, reward total was -15.000000. running mean: -10.636701
episode 5744.000000, reward total was -12.000000. running mean: -10.650334
episode 5745.000000, reward total was -9.000000. running mean: -10.633831
episode 5746.000000, reward total was -1.000000. running mean: -10.537493
episode 5747.000000, reward total was -9.000000. running mean: -10.522118
episode 5748.000000, reward tot

episode 5846.000000, reward total was -13.000000. running mean: -9.781345
episode 5847.000000, reward total was -11.000000. running mean: -9.793532
episode 5848.000000, reward total was -12.000000. running mean: -9.815597
episode 5849.000000, reward total was -9.000000. running mean: -9.807441
episode 5850.000000, reward total was -6.000000. running mean: -9.769366
episode 5851.000000, reward total was -5.000000. running mean: -9.721672
episode 5852.000000, reward total was -9.000000. running mean: -9.714456
episode 5853.000000, reward total was -13.000000. running mean: -9.747311
episode 5854.000000, reward total was -6.000000. running mean: -9.709838
episode 5855.000000, reward total was -2.000000. running mean: -9.632740
episode 5856.000000, reward total was -14.000000. running mean: -9.676412
episode 5857.000000, reward total was -11.000000. running mean: -9.689648
episode 5858.000000, reward total was 4.000000. running mean: -9.552752
episode 5859.000000, reward total was -7.00000

episode 5958.000000, reward total was -8.000000. running mean: -8.892868
episode 5959.000000, reward total was -15.000000. running mean: -8.953940
episode 5960.000000, reward total was -14.000000. running mean: -9.004400
episode 5961.000000, reward total was -9.000000. running mean: -9.004356
episode 5962.000000, reward total was 1.000000. running mean: -8.904313
episode 5963.000000, reward total was -6.000000. running mean: -8.875270
episode 5964.000000, reward total was -4.000000. running mean: -8.826517
episode 5965.000000, reward total was -7.000000. running mean: -8.808252
episode 5966.000000, reward total was -8.000000. running mean: -8.800169
episode 5967.000000, reward total was -13.000000. running mean: -8.842168
episode 5968.000000, reward total was 16.000000. running mean: -8.593746
episode 5969.000000, reward total was -6.000000. running mean: -8.567808
episode 5970.000000, reward total was -7.000000. running mean: -8.552130
episode 5971.000000, reward total was -5.000000. 

episode 6070.000000, reward total was -12.000000. running mean: -8.490574
episode 6071.000000, reward total was 7.000000. running mean: -8.335668
episode 6072.000000, reward total was -5.000000. running mean: -8.302311
episode 6073.000000, reward total was -7.000000. running mean: -8.289288
episode 6074.000000, reward total was -12.000000. running mean: -8.326395
episode 6075.000000, reward total was -12.000000. running mean: -8.363132
episode 6076.000000, reward total was -12.000000. running mean: -8.399500
episode 6077.000000, reward total was -8.000000. running mean: -8.395505
episode 6078.000000, reward total was -8.000000. running mean: -8.391550
episode 6079.000000, reward total was -13.000000. running mean: -8.437635
episode 6080.000000, reward total was -7.000000. running mean: -8.423258
episode 6081.000000, reward total was 9.000000. running mean: -8.249026
episode 6082.000000, reward total was -12.000000. running mean: -8.286535
episode 6083.000000, reward total was -7.000000

episode 6182.000000, reward total was -20.000000. running mean: -8.780800
episode 6183.000000, reward total was -2.000000. running mean: -8.712992
episode 6184.000000, reward total was -2.000000. running mean: -8.645862
episode 6185.000000, reward total was -1.000000. running mean: -8.569403
episode 6186.000000, reward total was -10.000000. running mean: -8.583709
episode 6187.000000, reward total was -5.000000. running mean: -8.547872
episode 6188.000000, reward total was -2.000000. running mean: -8.482393
episode 6189.000000, reward total was -9.000000. running mean: -8.487570
episode 6190.000000, reward total was -12.000000. running mean: -8.522694
episode 6191.000000, reward total was -5.000000. running mean: -8.487467
episode 6192.000000, reward total was -10.000000. running mean: -8.502592
episode 6193.000000, reward total was -6.000000. running mean: -8.477566
episode 6194.000000, reward total was -9.000000. running mean: -8.482791
episode 6195.000000, reward total was -13.00000

episode 6294.000000, reward total was 2.000000. running mean: -7.507235
episode 6295.000000, reward total was -8.000000. running mean: -7.512163
episode 6296.000000, reward total was -8.000000. running mean: -7.517041
episode 6297.000000, reward total was -4.000000. running mean: -7.481871
episode 6298.000000, reward total was -7.000000. running mean: -7.477052
episode 6299.000000, reward total was 7.000000. running mean: -7.332281
episode 6300.000000, reward total was -10.000000. running mean: -7.358958
episode 6301.000000, reward total was -6.000000. running mean: -7.345369
episode 6302.000000, reward total was -7.000000. running mean: -7.341915
episode 6303.000000, reward total was -12.000000. running mean: -7.388496
episode 6304.000000, reward total was -4.000000. running mean: -7.354611
episode 6305.000000, reward total was -11.000000. running mean: -7.391065
episode 6306.000000, reward total was -14.000000. running mean: -7.457154
episode 6307.000000, reward total was -14.000000.

episode 6406.000000, reward total was -7.000000. running mean: -8.481071
episode 6407.000000, reward total was -5.000000. running mean: -8.446260
episode 6408.000000, reward total was -6.000000. running mean: -8.421797
episode 6409.000000, reward total was -3.000000. running mean: -8.367579
episode 6410.000000, reward total was -7.000000. running mean: -8.353904
episode 6411.000000, reward total was -5.000000. running mean: -8.320365
episode 6412.000000, reward total was -5.000000. running mean: -8.287161
episode 6413.000000, reward total was -10.000000. running mean: -8.304289
episode 6414.000000, reward total was -5.000000. running mean: -8.271246
episode 6415.000000, reward total was -7.000000. running mean: -8.258534
episode 6416.000000, reward total was -8.000000. running mean: -8.255949
episode 6417.000000, reward total was -17.000000. running mean: -8.343389
episode 6418.000000, reward total was -2.000000. running mean: -8.279955
episode 6419.000000, reward total was -9.000000. 

episode 6518.000000, reward total was -14.000000. running mean: -7.503562
episode 6519.000000, reward total was -14.000000. running mean: -7.568526
episode 6520.000000, reward total was -9.000000. running mean: -7.582841
episode 6521.000000, reward total was -10.000000. running mean: -7.607012
episode 6522.000000, reward total was -3.000000. running mean: -7.560942
episode 6523.000000, reward total was -7.000000. running mean: -7.555333
episode 6524.000000, reward total was -8.000000. running mean: -7.559780
episode 6525.000000, reward total was 2.000000. running mean: -7.464182
episode 6526.000000, reward total was -2.000000. running mean: -7.409540
episode 6527.000000, reward total was -11.000000. running mean: -7.445445
episode 6528.000000, reward total was -15.000000. running mean: -7.520990
episode 6529.000000, reward total was -13.000000. running mean: -7.575780
episode 6530.000000, reward total was -9.000000. running mean: -7.590022
episode 6531.000000, reward total was -11.0000

episode 6630.000000, reward total was -5.000000. running mean: -7.945917
episode 6631.000000, reward total was -1.000000. running mean: -7.876458
episode 6632.000000, reward total was -9.000000. running mean: -7.887693
episode 6633.000000, reward total was -10.000000. running mean: -7.908816
episode 6634.000000, reward total was -7.000000. running mean: -7.899728
episode 6635.000000, reward total was -11.000000. running mean: -7.930731
episode 6636.000000, reward total was -11.000000. running mean: -7.961424
episode 6637.000000, reward total was -10.000000. running mean: -7.981809
episode 6638.000000, reward total was -4.000000. running mean: -7.941991
episode 6639.000000, reward total was -5.000000. running mean: -7.912571
episode 6640.000000, reward total was -3.000000. running mean: -7.863446
episode 6641.000000, reward total was -14.000000. running mean: -7.924811
episode 6642.000000, reward total was -13.000000. running mean: -7.975563
episode 6643.000000, reward total was -15.000

episode 6743.000000, reward total was -13.000000. running mean: -6.629602
episode 6744.000000, reward total was -11.000000. running mean: -6.673306
episode 6745.000000, reward total was -9.000000. running mean: -6.696573
episode 6746.000000, reward total was 10.000000. running mean: -6.529607
episode 6747.000000, reward total was -4.000000. running mean: -6.504311
episode 6748.000000, reward total was -9.000000. running mean: -6.529268
episode 6749.000000, reward total was 2.000000. running mean: -6.443976
episode 6750.000000, reward total was -1.000000. running mean: -6.389536
episode 6751.000000, reward total was -10.000000. running mean: -6.425640
episode 6752.000000, reward total was -5.000000. running mean: -6.411384
episode 6753.000000, reward total was -11.000000. running mean: -6.457270
episode 6754.000000, reward total was -8.000000. running mean: -6.472697
episode 6755.000000, reward total was -12.000000. running mean: -6.527970
episode 6756.000000, reward total was 2.000000.

episode 6855.000000, reward total was -12.000000. running mean: -6.365372
episode 6856.000000, reward total was -15.000000. running mean: -6.451718
episode 6857.000000, reward total was 5.000000. running mean: -6.337201
episode 6858.000000, reward total was -17.000000. running mean: -6.443829
episode 6859.000000, reward total was -15.000000. running mean: -6.529391
episode 6860.000000, reward total was -10.000000. running mean: -6.564097
episode 6861.000000, reward total was 9.000000. running mean: -6.408456
episode 6862.000000, reward total was -7.000000. running mean: -6.414371
episode 6863.000000, reward total was -3.000000. running mean: -6.380228
episode 6864.000000, reward total was -5.000000. running mean: -6.366425
episode 6865.000000, reward total was -7.000000. running mean: -6.372761
episode 6866.000000, reward total was 13.000000. running mean: -6.179034
episode 6867.000000, reward total was -9.000000. running mean: -6.207243
episode 6868.000000, reward total was -9.000000.

episode 6967.000000, reward total was -10.000000. running mean: -7.271538
episode 6968.000000, reward total was -10.000000. running mean: -7.298822
episode 6969.000000, reward total was -13.000000. running mean: -7.355834
episode 6970.000000, reward total was 2.000000. running mean: -7.262276
episode 6971.000000, reward total was -12.000000. running mean: -7.309653
episode 6972.000000, reward total was 3.000000. running mean: -7.206556
episode 6973.000000, reward total was -15.000000. running mean: -7.284491
episode 6974.000000, reward total was -15.000000. running mean: -7.361646
episode 6975.000000, reward total was -10.000000. running mean: -7.388029
episode 6976.000000, reward total was -9.000000. running mean: -7.404149
episode 6977.000000, reward total was -16.000000. running mean: -7.490108
episode 6978.000000, reward total was -10.000000. running mean: -7.515207
episode 6979.000000, reward total was -9.000000. running mean: -7.530055
episode 6980.000000, reward total was -7.000