In [1]:
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute called `new_step_api` on the gym
# module object. The code below still unpacks four values from env.step(), so
# confirm whether this line has any effect with the installed gym version.
gym.new_step_api=True
# Pong environment instance; it is handed to train_model as the `env` argument.
env = gym.make('Pong-v0')

# Network size. D must match the length of the vector returned by prepro().
H = 200 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
# Policy-network weights keyed by layer name. The training loop updates these
# arrays in place. NOTE(review): no random seed is set in the visible code, so
# the initial weights differ from run to run.
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization; shape (H, D), input -> hidden
model['W2'] = np.random.randn(H) / np.sqrt(H) # shape (H,), hidden -> single output unit
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-3
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dicts mirror the keys and shapes of `model` and start at zero.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x): 
  """Logistic function: squash x (scalar or NumPy array, element-wise) into [0,1]."""
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """Turn a raw frame into a flat binary float vector.

  Rows 35..194 are kept, then every second row and column of channel 0 is
  sampled. A sampled pixel becomes 1.0 unless its value is 0, 144 or 109
  (the two background colours), in which case it becomes 0.0.

  The input is never modified: masking by assignment on a slice would write
  through the view into the caller's frame, so a boolean mask is built instead.

  Args:
    I: array-like indexable as [row, col, channel].
       Presumably a 210x160x3 Atari frame, which yields 80*80 values -- TODO confirm.

  Returns:
    1-D float array of 0.0/1.0 values (length 6400 for a 210x160 frame).
  """
  frame = np.asarray(I)
  channel = frame[35:195:2, ::2, 0] # crop + downsample by factor of 2, first channel only
  # foreground = anything that is neither background colour nor already zero
  foreground = (channel != 0) & (channel != 144) & (channel != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns, walking the reward array backwards.

  The running sum is reset whenever a non-zero reward is met, because in Pong
  a non-zero reward marks the end of a rally (game boundary).

  Args:
    r: NumPy array of rewards indexed by time step along its first axis
       (either 1-D, or a column of shape (T, 1) as built with np.vstack).
    discount: per-step discount factor. When omitted, the module-level
       `gamma` hyperparameter is used.

  Returns:
    Array of the same shape as r holding the discounted returns. The result
    is always floating point, so integer reward arrays are not truncated.
  """
  if discount is None:
    discount = gamma # fall back to the notebook-wide hyperparameter
  r = np.asarray(r)
  # zeros_like(r) alone would inherit an integer dtype and truncate the returns
  out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
  discounted_r = np.zeros_like(r, dtype=out_dtype)
  running_add = 0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network on one input vector.

  Weights are read from the module-level `model` dict ('W1' then 'W2').

  Returns:
    (p, h): p is the sigmoid of the output unit, which callers treat as the
    probability of taking action 2; h is the hidden activation after ReLU.
  """
  pre_activation = model['W1'].dot(x)
  h = np.where(pre_activation < 0, 0.0, pre_activation) # ReLU nonlinearity
  p = sigmoid(model['W2'].dot(h))
  return p, h

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode, returning gradients for both weight arrays.

  Args:
    epx: stacked network inputs, one row per time step.
    eph: stacked hidden states (post-ReLU), one row per time step.
    epdlogp: stacked output-gradient terms, one row per time step.

  Returns:
    dict with keys 'W1' and 'W2' holding the gradients for the matching
    entries of the module-level `model` (whose 'W2' is read here).
  """
  grad_W2 = eph.T.dot(epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, model['W2'])
  # backprop through ReLU: no gradient flows where the hidden unit was inactive
  grad_hidden = np.where(eph <= 0, 0.0, grad_hidden)
  grad_W1 = grad_hidden.T.dot(epx)
  return {'W1':grad_W1, 'W2':grad_W2}

def model_step(model, observation, prev_x):
  """Choose an action for one frame without sampling (greedy policy).

  Args:
    model: accepted for interface symmetry but not read in this function;
       policy_forward uses the module-level weights.
    observation: raw frame, preprocessed here with prepro().
    prev_x: preprocessed previous frame, or None on the first step.

  Returns:
    (action, cur_x): action is 2 when the policy output is >= 0.5, else 3;
    cur_x is this frame's preprocessed vector, to be passed back as prev_x.
  """
  cur_x = prepro(observation)
  if prev_x is None:
    x = np.zeros(D) # first frame: nothing to difference against
  else:
    x = cur_x - prev_x # difference frame

  # forward the policy network and take the more likely action (no randomness)
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model):
  """Play at most 1000 steps with the greedy policy, then show the frames.

  Every step renders an RGB frame, picks an action via model_step() and
  advances the environment. One summary line is printed: the "finished after"
  message when the environment reports `done`, or the "without success"
  message only when the step budget runs out first. The environment is
  closed afterwards and the collected frames are handed to
  display_frames_as_gif (not defined in the visible part of this notebook).

  Args:
    env: environment exposing reset(), render(mode=...), step() and close().
    model: forwarded to model_step().
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(1000):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
          print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
          break
  else:
      # reached only when the loop was not left via `break`, i.e. the episode never finished
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy network with policy gradients and RMSProp.

  Each step preprocesses the frame, feeds the difference frame through
  policy_forward(), samples action 2 or 3 from the returned probability and
  records what backprop needs. When an episode ends, the discounted and
  standardised returns modulate the recorded gradients, which are added to the
  module-level `grad_buffer`. Every `batch_size` episodes an RMSProp step is
  applied to `model` in place, using the module-level `rmsprop_cache`.

  Note: policy_forward() and policy_backward() read the module-level `model`,
  so the `model` argument should be that same dict.

  Args:
    env: environment exposing reset() and step(action).
    model: dict of weight arrays, updated in place.
    total_episodes: number of episodes to run. A non-positive value returns
       immediately without touching the environment.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per episode.
  """
  hist = []
  if total_episodes <= 0:
    # the episode counter starts at 1, so the loop below could never terminate
    return hist
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        # with identical returns the std is 0; dividing would give 0/0 = NaN and
        # permanently poison the gradient buffer, the RMSProp cache and the weights
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5) # gradient ascent step
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      # `>=` instead of `==` so a non-integer episode budget still terminates
      if episode_number >= total_episodes: 
        return hist

   
    

  logger.warn(
  deprecation(
  deprecation(


In [2]:
# Train for 7000 episodes, timing the call with %time. The per-episode history
# returned by train_model is kept in hist1.
%time hist1 = train_model(env, model, total_episodes=7000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -21.000000. running mean: -21.000000
episode 3.000000, reward total was -20.000000. running mean: -20.990000
episode 4.000000, reward total was -21.000000. running mean: -20.990100
episode 5.000000, reward total was -21.000000. running mean: -20.990199
episode 6.000000, reward total was -21.000000. running mean: -20.990297
episode 7.000000, reward total was -20.000000. running mean: -20.980394
episode 8.000000, reward total was -21.000000. running mean: -20.980590
episode 9.000000, reward total was -21.000000. running mean: -20.980784
episode 10.000000, reward total was -20.000000. running mean: -20.970976
episode 11.000000, reward total was -20.000000. running mean: -20.961267
episode 12.000000, reward total was -20.000000. running mean: -20.951654
episode 13.000000, reward total was -21.000000. running mean: -20.952137
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -20.000000. running mean: -20.526205
episode 115.000000, reward total was -19.000000. running mean: -20.510943
episode 116.000000, reward total was -20.000000. running mean: -20.505833
episode 117.000000, reward total was -19.000000. running mean: -20.490775
episode 118.000000, reward total was -21.000000. running mean: -20.495867
episode 119.000000, reward total was -21.000000. running mean: -20.500909
episode 120.000000, reward total was -21.000000. running mean: -20.505900
episode 121.000000, reward total was -20.000000. running mean: -20.500841
episode 122.000000, reward total was -21.000000. running mean: -20.505832
episode 123.000000, reward total was -21.000000. running mean: -20.510774
episode 124.000000, reward total was -20.000000. running mean: -20.505666
episode 125.000000, reward total was -21.000000. running mean: -20.510609
episode 126.000000, reward total was -21.000000. running mean: -20.515503
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.266297
episode 226.000000, reward total was -20.000000. running mean: -20.263634
episode 227.000000, reward total was -21.000000. running mean: -20.270997
episode 228.000000, reward total was -21.000000. running mean: -20.278287
episode 229.000000, reward total was -21.000000. running mean: -20.285504
episode 230.000000, reward total was -21.000000. running mean: -20.292649
episode 231.000000, reward total was -19.000000. running mean: -20.279723
episode 232.000000, reward total was -21.000000. running mean: -20.286926
episode 233.000000, reward total was -20.000000. running mean: -20.284056
episode 234.000000, reward total was -19.000000. running mean: -20.271216
episode 235.000000, reward total was -21.000000. running mean: -20.278504
episode 236.000000, reward total was -21.000000. running mean: -20.285719
episode 237.000000, reward total was -21.000000. running mean: -20.292861
episode 238.000000, reward total was -

episode 336.000000, reward total was -19.000000. running mean: -20.053859
episode 337.000000, reward total was -21.000000. running mean: -20.063320
episode 338.000000, reward total was -21.000000. running mean: -20.072687
episode 339.000000, reward total was -19.000000. running mean: -20.061960
episode 340.000000, reward total was -21.000000. running mean: -20.071340
episode 341.000000, reward total was -18.000000. running mean: -20.050627
episode 342.000000, reward total was -20.000000. running mean: -20.050121
episode 343.000000, reward total was -16.000000. running mean: -20.009620
episode 344.000000, reward total was -21.000000. running mean: -20.019523
episode 345.000000, reward total was -21.000000. running mean: -20.029328
episode 346.000000, reward total was -20.000000. running mean: -20.029035
episode 347.000000, reward total was -21.000000. running mean: -20.038745
episode 348.000000, reward total was -19.000000. running mean: -20.028357
episode 349.000000, reward total was -

episode 447.000000, reward total was -17.000000. running mean: -19.738268
episode 448.000000, reward total was -17.000000. running mean: -19.710885
episode 449.000000, reward total was -19.000000. running mean: -19.703777
episode 450.000000, reward total was -21.000000. running mean: -19.716739
episode 451.000000, reward total was -21.000000. running mean: -19.729571
episode 452.000000, reward total was -21.000000. running mean: -19.742276
episode 453.000000, reward total was -19.000000. running mean: -19.734853
episode 454.000000, reward total was -19.000000. running mean: -19.727504
episode 455.000000, reward total was -21.000000. running mean: -19.740229
episode 456.000000, reward total was -17.000000. running mean: -19.712827
episode 457.000000, reward total was -21.000000. running mean: -19.725699
episode 458.000000, reward total was -19.000000. running mean: -19.718442
episode 459.000000, reward total was -19.000000. running mean: -19.711257
episode 460.000000, reward total was -

episode 558.000000, reward total was -21.000000. running mean: -19.369571
episode 559.000000, reward total was -20.000000. running mean: -19.375875
episode 560.000000, reward total was -21.000000. running mean: -19.392117
episode 561.000000, reward total was -17.000000. running mean: -19.368195
episode 562.000000, reward total was -17.000000. running mean: -19.344513
episode 563.000000, reward total was -16.000000. running mean: -19.311068
episode 564.000000, reward total was -21.000000. running mean: -19.327958
episode 565.000000, reward total was -18.000000. running mean: -19.314678
episode 566.000000, reward total was -12.000000. running mean: -19.241531
episode 567.000000, reward total was -21.000000. running mean: -19.259116
episode 568.000000, reward total was -17.000000. running mean: -19.236525
episode 569.000000, reward total was -20.000000. running mean: -19.244159
episode 570.000000, reward total was -19.000000. running mean: -19.241718
episode 571.000000, reward total was -

episode 669.000000, reward total was -19.000000. running mean: -18.804415
episode 670.000000, reward total was -17.000000. running mean: -18.786371
episode 671.000000, reward total was -18.000000. running mean: -18.778507
episode 672.000000, reward total was -21.000000. running mean: -18.800722
episode 673.000000, reward total was -17.000000. running mean: -18.782715
episode 674.000000, reward total was -17.000000. running mean: -18.764887
episode 675.000000, reward total was -20.000000. running mean: -18.777239
episode 676.000000, reward total was -20.000000. running mean: -18.789466
episode 677.000000, reward total was -17.000000. running mean: -18.771572
episode 678.000000, reward total was -17.000000. running mean: -18.753856
episode 679.000000, reward total was -18.000000. running mean: -18.746317
episode 680.000000, reward total was -18.000000. running mean: -18.738854
episode 681.000000, reward total was -21.000000. running mean: -18.761466
episode 682.000000, reward total was -

episode 780.000000, reward total was -19.000000. running mean: -18.135275
episode 781.000000, reward total was -18.000000. running mean: -18.133922
episode 782.000000, reward total was -17.000000. running mean: -18.122583
episode 783.000000, reward total was -19.000000. running mean: -18.131357
episode 784.000000, reward total was -17.000000. running mean: -18.120044
episode 785.000000, reward total was -19.000000. running mean: -18.128843
episode 786.000000, reward total was -19.000000. running mean: -18.137555
episode 787.000000, reward total was -15.000000. running mean: -18.106179
episode 788.000000, reward total was -18.000000. running mean: -18.105117
episode 789.000000, reward total was -18.000000. running mean: -18.104066
episode 790.000000, reward total was -21.000000. running mean: -18.133025
episode 791.000000, reward total was -16.000000. running mean: -18.111695
episode 792.000000, reward total was -21.000000. running mean: -18.140578
episode 793.000000, reward total was -

episode 891.000000, reward total was -19.000000. running mean: -17.476566
episode 892.000000, reward total was -15.000000. running mean: -17.451800
episode 893.000000, reward total was -9.000000. running mean: -17.367282
episode 894.000000, reward total was -16.000000. running mean: -17.353609
episode 895.000000, reward total was -17.000000. running mean: -17.350073
episode 896.000000, reward total was -17.000000. running mean: -17.346573
episode 897.000000, reward total was -17.000000. running mean: -17.343107
episode 898.000000, reward total was -19.000000. running mean: -17.359676
episode 899.000000, reward total was -18.000000. running mean: -17.366079
episode 900.000000, reward total was -15.000000. running mean: -17.342418
episode 901.000000, reward total was -16.000000. running mean: -17.328994
episode 902.000000, reward total was -13.000000. running mean: -17.285704
episode 903.000000, reward total was -21.000000. running mean: -17.322847
episode 904.000000, reward total was -1

episode 1002.000000, reward total was -13.000000. running mean: -17.007910
episode 1003.000000, reward total was -13.000000. running mean: -16.967831
episode 1004.000000, reward total was -17.000000. running mean: -16.968153
episode 1005.000000, reward total was -16.000000. running mean: -16.958471
episode 1006.000000, reward total was -14.000000. running mean: -16.928886
episode 1007.000000, reward total was -17.000000. running mean: -16.929597
episode 1008.000000, reward total was -17.000000. running mean: -16.930301
episode 1009.000000, reward total was -12.000000. running mean: -16.880998
episode 1010.000000, reward total was -15.000000. running mean: -16.862188
episode 1011.000000, reward total was -16.000000. running mean: -16.853567
episode 1012.000000, reward total was -16.000000. running mean: -16.845031
episode 1013.000000, reward total was -18.000000. running mean: -16.856581
episode 1014.000000, reward total was -18.000000. running mean: -16.868015
episode 1015.000000, rewa

episode 1112.000000, reward total was -14.000000. running mean: -15.852421
episode 1113.000000, reward total was -15.000000. running mean: -15.843897
episode 1114.000000, reward total was -16.000000. running mean: -15.845458
episode 1115.000000, reward total was -9.000000. running mean: -15.777004
episode 1116.000000, reward total was -15.000000. running mean: -15.769234
episode 1117.000000, reward total was -19.000000. running mean: -15.801541
episode 1118.000000, reward total was -19.000000. running mean: -15.833526
episode 1119.000000, reward total was -14.000000. running mean: -15.815191
episode 1120.000000, reward total was -14.000000. running mean: -15.797039
episode 1121.000000, reward total was -20.000000. running mean: -15.839068
episode 1122.000000, reward total was -15.000000. running mean: -15.830678
episode 1123.000000, reward total was -14.000000. running mean: -15.812371
episode 1124.000000, reward total was -17.000000. running mean: -15.824247
episode 1125.000000, rewar

episode 1222.000000, reward total was -8.000000. running mean: -14.550289
episode 1223.000000, reward total was -17.000000. running mean: -14.574786
episode 1224.000000, reward total was -15.000000. running mean: -14.579038
episode 1225.000000, reward total was -14.000000. running mean: -14.573248
episode 1226.000000, reward total was -14.000000. running mean: -14.567516
episode 1227.000000, reward total was -16.000000. running mean: -14.581840
episode 1228.000000, reward total was -15.000000. running mean: -14.586022
episode 1229.000000, reward total was -18.000000. running mean: -14.620162
episode 1230.000000, reward total was -12.000000. running mean: -14.593960
episode 1231.000000, reward total was -19.000000. running mean: -14.638021
episode 1232.000000, reward total was -19.000000. running mean: -14.681640
episode 1233.000000, reward total was -17.000000. running mean: -14.704824
episode 1234.000000, reward total was -13.000000. running mean: -14.687776
episode 1235.000000, rewar

episode 1332.000000, reward total was -17.000000. running mean: -13.953111
episode 1333.000000, reward total was -13.000000. running mean: -13.943580
episode 1334.000000, reward total was -13.000000. running mean: -13.934144
episode 1335.000000, reward total was -8.000000. running mean: -13.874803
episode 1336.000000, reward total was -16.000000. running mean: -13.896055
episode 1337.000000, reward total was -15.000000. running mean: -13.907094
episode 1338.000000, reward total was -21.000000. running mean: -13.978023
episode 1339.000000, reward total was -14.000000. running mean: -13.978243
episode 1340.000000, reward total was -13.000000. running mean: -13.968460
episode 1341.000000, reward total was -14.000000. running mean: -13.968776
episode 1342.000000, reward total was -5.000000. running mean: -13.879088
episode 1343.000000, reward total was -15.000000. running mean: -13.890297
episode 1344.000000, reward total was -14.000000. running mean: -13.891394
episode 1345.000000, reward

episode 1442.000000, reward total was -16.000000. running mean: -13.001273
episode 1443.000000, reward total was -9.000000. running mean: -12.961261
episode 1444.000000, reward total was -11.000000. running mean: -12.941648
episode 1445.000000, reward total was -15.000000. running mean: -12.962231
episode 1446.000000, reward total was -11.000000. running mean: -12.942609
episode 1447.000000, reward total was -12.000000. running mean: -12.933183
episode 1448.000000, reward total was -15.000000. running mean: -12.953851
episode 1449.000000, reward total was -17.000000. running mean: -12.994313
episode 1450.000000, reward total was -15.000000. running mean: -13.014370
episode 1451.000000, reward total was -8.000000. running mean: -12.964226
episode 1452.000000, reward total was -16.000000. running mean: -12.994584
episode 1453.000000, reward total was -9.000000. running mean: -12.954638
episode 1454.000000, reward total was -18.000000. running mean: -13.005091
episode 1455.000000, reward 

episode 1552.000000, reward total was -10.000000. running mean: -11.539608
episode 1553.000000, reward total was -17.000000. running mean: -11.594212
episode 1554.000000, reward total was -13.000000. running mean: -11.608270
episode 1555.000000, reward total was -13.000000. running mean: -11.622188
episode 1556.000000, reward total was -11.000000. running mean: -11.615966
episode 1557.000000, reward total was -5.000000. running mean: -11.549806
episode 1558.000000, reward total was -8.000000. running mean: -11.514308
episode 1559.000000, reward total was -18.000000. running mean: -11.579165
episode 1560.000000, reward total was -19.000000. running mean: -11.653373
episode 1561.000000, reward total was -14.000000. running mean: -11.676840
episode 1562.000000, reward total was -10.000000. running mean: -11.660071
episode 1563.000000, reward total was -12.000000. running mean: -11.663470
episode 1564.000000, reward total was -5.000000. running mean: -11.596836
episode 1565.000000, reward 

episode 1662.000000, reward total was -16.000000. running mean: -11.055986
episode 1663.000000, reward total was -11.000000. running mean: -11.055427
episode 1664.000000, reward total was -1.000000. running mean: -10.954872
episode 1665.000000, reward total was -17.000000. running mean: -11.015324
episode 1666.000000, reward total was -9.000000. running mean: -10.995170
episode 1667.000000, reward total was -12.000000. running mean: -11.005219
episode 1668.000000, reward total was -10.000000. running mean: -10.995166
episode 1669.000000, reward total was -13.000000. running mean: -11.015215
episode 1670.000000, reward total was -14.000000. running mean: -11.045063
episode 1671.000000, reward total was -14.000000. running mean: -11.074612
episode 1672.000000, reward total was -5.000000. running mean: -11.013866
episode 1673.000000, reward total was -6.000000. running mean: -10.963727
episode 1674.000000, reward total was -15.000000. running mean: -11.004090
episode 1675.000000, reward t

episode 1772.000000, reward total was -10.000000. running mean: -10.733385
episode 1773.000000, reward total was -13.000000. running mean: -10.756051
episode 1774.000000, reward total was -7.000000. running mean: -10.718491
episode 1775.000000, reward total was -8.000000. running mean: -10.691306
episode 1776.000000, reward total was -18.000000. running mean: -10.764393
episode 1777.000000, reward total was -8.000000. running mean: -10.736749
episode 1778.000000, reward total was -12.000000. running mean: -10.749381
episode 1779.000000, reward total was -10.000000. running mean: -10.741887
episode 1780.000000, reward total was -10.000000. running mean: -10.734469
episode 1781.000000, reward total was -11.000000. running mean: -10.737124
episode 1782.000000, reward total was -11.000000. running mean: -10.739753
episode 1783.000000, reward total was -6.000000. running mean: -10.692355
episode 1784.000000, reward total was -11.000000. running mean: -10.695432
episode 1785.000000, reward t

episode 1882.000000, reward total was -18.000000. running mean: -10.537658
episode 1883.000000, reward total was -6.000000. running mean: -10.492281
episode 1884.000000, reward total was -11.000000. running mean: -10.497358
episode 1885.000000, reward total was -10.000000. running mean: -10.492385
episode 1886.000000, reward total was -9.000000. running mean: -10.477461
episode 1887.000000, reward total was -7.000000. running mean: -10.442686
episode 1888.000000, reward total was -11.000000. running mean: -10.448259
episode 1889.000000, reward total was -12.000000. running mean: -10.463777
episode 1890.000000, reward total was -4.000000. running mean: -10.399139
episode 1891.000000, reward total was -14.000000. running mean: -10.435148
episode 1892.000000, reward total was -12.000000. running mean: -10.450796
episode 1893.000000, reward total was -12.000000. running mean: -10.466288
episode 1894.000000, reward total was -4.000000. running mean: -10.401625
episode 1895.000000, reward to

episode 1993.000000, reward total was -12.000000. running mean: -9.225813
episode 1994.000000, reward total was -17.000000. running mean: -9.303555
episode 1995.000000, reward total was -10.000000. running mean: -9.310520
episode 1996.000000, reward total was -2.000000. running mean: -9.237414
episode 1997.000000, reward total was -15.000000. running mean: -9.295040
episode 1998.000000, reward total was -13.000000. running mean: -9.332090
episode 1999.000000, reward total was -7.000000. running mean: -9.308769
episode 2000.000000, reward total was -10.000000. running mean: -9.315681
episode 2001.000000, reward total was -17.000000. running mean: -9.392524
episode 2002.000000, reward total was -9.000000. running mean: -9.388599
episode 2003.000000, reward total was -2.000000. running mean: -9.314713
episode 2004.000000, reward total was -9.000000. running mean: -9.311566
episode 2005.000000, reward total was -14.000000. running mean: -9.358450
episode 2006.000000, reward total was -9.00

episode 2105.000000, reward total was -8.000000. running mean: -8.875535
episode 2106.000000, reward total was -8.000000. running mean: -8.866779
episode 2107.000000, reward total was -12.000000. running mean: -8.898111
episode 2108.000000, reward total was -12.000000. running mean: -8.929130
episode 2109.000000, reward total was -11.000000. running mean: -8.949839
episode 2110.000000, reward total was -13.000000. running mean: -8.990341
episode 2111.000000, reward total was -4.000000. running mean: -8.940437
episode 2112.000000, reward total was -9.000000. running mean: -8.941033
episode 2113.000000, reward total was -9.000000. running mean: -8.941623
episode 2114.000000, reward total was -7.000000. running mean: -8.922206
episode 2115.000000, reward total was -11.000000. running mean: -8.942984
episode 2116.000000, reward total was -15.000000. running mean: -9.003554
episode 2117.000000, reward total was -5.000000. running mean: -8.963519
episode 2118.000000, reward total was -6.0000

episode 2217.000000, reward total was -9.000000. running mean: -8.592611
episode 2218.000000, reward total was -6.000000. running mean: -8.566685
episode 2219.000000, reward total was -11.000000. running mean: -8.591018
episode 2220.000000, reward total was -10.000000. running mean: -8.605108
episode 2221.000000, reward total was -8.000000. running mean: -8.599057
episode 2222.000000, reward total was -15.000000. running mean: -8.663066
episode 2223.000000, reward total was -13.000000. running mean: -8.706436
episode 2224.000000, reward total was -9.000000. running mean: -8.709371
episode 2225.000000, reward total was -11.000000. running mean: -8.732278
episode 2226.000000, reward total was -9.000000. running mean: -8.734955
episode 2227.000000, reward total was -13.000000. running mean: -8.777605
episode 2228.000000, reward total was -12.000000. running mean: -8.809829
episode 2229.000000, reward total was -7.000000. running mean: -8.791731
episode 2230.000000, reward total was -8.000

episode 2329.000000, reward total was 6.000000. running mean: -8.288398
episode 2330.000000, reward total was 1.000000. running mean: -8.195514
episode 2331.000000, reward total was -8.000000. running mean: -8.193558
episode 2332.000000, reward total was -6.000000. running mean: -8.171623
episode 2333.000000, reward total was 3.000000. running mean: -8.059907
episode 2334.000000, reward total was -7.000000. running mean: -8.049308
episode 2335.000000, reward total was -7.000000. running mean: -8.038814
episode 2336.000000, reward total was -11.000000. running mean: -8.068426
episode 2337.000000, reward total was -13.000000. running mean: -8.117742
episode 2338.000000, reward total was -8.000000. running mean: -8.116565
episode 2339.000000, reward total was -4.000000. running mean: -8.075399
episode 2340.000000, reward total was -8.000000. running mean: -8.074645
episode 2341.000000, reward total was -9.000000. running mean: -8.083899
episode 2342.000000, reward total was -13.000000. ru

episode 2441.000000, reward total was -9.000000. running mean: -7.879684
episode 2442.000000, reward total was -2.000000. running mean: -7.820887
episode 2443.000000, reward total was -10.000000. running mean: -7.842679
episode 2444.000000, reward total was -14.000000. running mean: -7.904252
episode 2445.000000, reward total was -10.000000. running mean: -7.925209
episode 2446.000000, reward total was -5.000000. running mean: -7.895957
episode 2447.000000, reward total was -11.000000. running mean: -7.926998
episode 2448.000000, reward total was -6.000000. running mean: -7.907728
episode 2449.000000, reward total was -2.000000. running mean: -7.848650
episode 2450.000000, reward total was -1.000000. running mean: -7.780164
episode 2451.000000, reward total was -7.000000. running mean: -7.772362
episode 2452.000000, reward total was -12.000000. running mean: -7.814639
episode 2453.000000, reward total was -14.000000. running mean: -7.876492
episode 2454.000000, reward total was -5.0000

episode 2553.000000, reward total was -2.000000. running mean: -7.691665
episode 2554.000000, reward total was 1.000000. running mean: -7.604748
episode 2555.000000, reward total was -8.000000. running mean: -7.608701
episode 2556.000000, reward total was -3.000000. running mean: -7.562614
episode 2557.000000, reward total was -2.000000. running mean: -7.506988
episode 2558.000000, reward total was -4.000000. running mean: -7.471918
episode 2559.000000, reward total was -4.000000. running mean: -7.437198
episode 2560.000000, reward total was -8.000000. running mean: -7.442826
episode 2561.000000, reward total was -15.000000. running mean: -7.518398
episode 2562.000000, reward total was -9.000000. running mean: -7.533214
episode 2563.000000, reward total was -5.000000. running mean: -7.507882
episode 2564.000000, reward total was -17.000000. running mean: -7.602803
episode 2565.000000, reward total was -8.000000. running mean: -7.606775
episode 2566.000000, reward total was -6.000000. r

episode 2665.000000, reward total was 5.000000. running mean: -6.540282
episode 2666.000000, reward total was -9.000000. running mean: -6.564880
episode 2667.000000, reward total was -7.000000. running mean: -6.569231
episode 2668.000000, reward total was -6.000000. running mean: -6.563538
episode 2669.000000, reward total was -8.000000. running mean: -6.577903
episode 2670.000000, reward total was -2.000000. running mean: -6.532124
episode 2671.000000, reward total was -13.000000. running mean: -6.596803
episode 2672.000000, reward total was -12.000000. running mean: -6.650835
episode 2673.000000, reward total was 2.000000. running mean: -6.564326
episode 2674.000000, reward total was -6.000000. running mean: -6.558683
episode 2675.000000, reward total was -15.000000. running mean: -6.643096
episode 2676.000000, reward total was -8.000000. running mean: -6.656665
episode 2677.000000, reward total was -9.000000. running mean: -6.680099
episode 2678.000000, reward total was -10.000000. 

episode 2777.000000, reward total was -9.000000. running mean: -6.577565
episode 2778.000000, reward total was -11.000000. running mean: -6.621789
episode 2779.000000, reward total was -16.000000. running mean: -6.715571
episode 2780.000000, reward total was -13.000000. running mean: -6.778416
episode 2781.000000, reward total was -10.000000. running mean: -6.810632
episode 2782.000000, reward total was -10.000000. running mean: -6.842525
episode 2783.000000, reward total was -11.000000. running mean: -6.884100
episode 2784.000000, reward total was -16.000000. running mean: -6.975259
episode 2785.000000, reward total was -9.000000. running mean: -6.995506
episode 2786.000000, reward total was -9.000000. running mean: -7.015551
episode 2787.000000, reward total was -12.000000. running mean: -7.065396
episode 2788.000000, reward total was -1.000000. running mean: -7.004742
episode 2789.000000, reward total was -14.000000. running mean: -7.074694
episode 2790.000000, reward total was -11.

episode 2890.000000, reward total was -4.000000. running mean: -4.800175
episode 2891.000000, reward total was -5.000000. running mean: -4.802174
episode 2892.000000, reward total was 3.000000. running mean: -4.724152
episode 2893.000000, reward total was -10.000000. running mean: -4.776910
episode 2894.000000, reward total was 4.000000. running mean: -4.689141
episode 2895.000000, reward total was 5.000000. running mean: -4.592250
episode 2896.000000, reward total was 12.000000. running mean: -4.426327
episode 2897.000000, reward total was -8.000000. running mean: -4.462064
episode 2898.000000, reward total was -4.000000. running mean: -4.457444
episode 2899.000000, reward total was -9.000000. running mean: -4.502869
episode 2900.000000, reward total was -7.000000. running mean: -4.527840
episode 2901.000000, reward total was -15.000000. running mean: -4.632562
episode 2902.000000, reward total was 1.000000. running mean: -4.576236
episode 2903.000000, reward total was 4.000000. runni

episode 3003.000000, reward total was -13.000000. running mean: -4.459620
episode 3004.000000, reward total was -9.000000. running mean: -4.505024
episode 3005.000000, reward total was -14.000000. running mean: -4.599974
episode 3006.000000, reward total was -7.000000. running mean: -4.623974
episode 3007.000000, reward total was -8.000000. running mean: -4.657734
episode 3008.000000, reward total was -11.000000. running mean: -4.721157
episode 3009.000000, reward total was 4.000000. running mean: -4.633946
episode 3010.000000, reward total was -3.000000. running mean: -4.617606
episode 3011.000000, reward total was 2.000000. running mean: -4.551430
episode 3012.000000, reward total was -9.000000. running mean: -4.595916
episode 3013.000000, reward total was -7.000000. running mean: -4.619957
episode 3014.000000, reward total was -9.000000. running mean: -4.663757
episode 3015.000000, reward total was -2.000000. running mean: -4.637119
episode 3016.000000, reward total was -16.000000. 

episode 3116.000000, reward total was -9.000000. running mean: -4.984519
episode 3117.000000, reward total was -9.000000. running mean: -5.024674
episode 3118.000000, reward total was -9.000000. running mean: -5.064427
episode 3119.000000, reward total was -7.000000. running mean: -5.083783
episode 3120.000000, reward total was -7.000000. running mean: -5.102945
episode 3121.000000, reward total was 1.000000. running mean: -5.041916
episode 3122.000000, reward total was -3.000000. running mean: -5.021497
episode 3123.000000, reward total was -9.000000. running mean: -5.061282
episode 3124.000000, reward total was -10.000000. running mean: -5.110669
episode 3125.000000, reward total was 5.000000. running mean: -5.009562
episode 3126.000000, reward total was -9.000000. running mean: -5.049467
episode 3127.000000, reward total was 5.000000. running mean: -4.948972
episode 3128.000000, reward total was -13.000000. running mean: -5.029482
episode 3129.000000, reward total was -4.000000. run

episode 3229.000000, reward total was -3.000000. running mean: -5.092588
episode 3230.000000, reward total was 7.000000. running mean: -4.971662
episode 3231.000000, reward total was -2.000000. running mean: -4.941945
episode 3232.000000, reward total was 5.000000. running mean: -4.842526
episode 3233.000000, reward total was -8.000000. running mean: -4.874101
episode 3234.000000, reward total was -12.000000. running mean: -4.945360
episode 3235.000000, reward total was -10.000000. running mean: -4.995906
episode 3236.000000, reward total was -1.000000. running mean: -4.955947
episode 3237.000000, reward total was -10.000000. running mean: -5.006388
episode 3238.000000, reward total was -7.000000. running mean: -5.026324
episode 3239.000000, reward total was -2.000000. running mean: -4.996060
episode 3240.000000, reward total was -12.000000. running mean: -5.066100
episode 3241.000000, reward total was -8.000000. running mean: -5.095439
episode 3242.000000, reward total was -5.000000. 

episode 3342.000000, reward total was -3.000000. running mean: -4.308070
episode 3343.000000, reward total was -2.000000. running mean: -4.284989
episode 3344.000000, reward total was -4.000000. running mean: -4.282139
episode 3345.000000, reward total was 5.000000. running mean: -4.189318
episode 3346.000000, reward total was -11.000000. running mean: -4.257425
episode 3347.000000, reward total was -10.000000. running mean: -4.314850
episode 3348.000000, reward total was -13.000000. running mean: -4.401702
episode 3349.000000, reward total was -3.000000. running mean: -4.387685
episode 3350.000000, reward total was -2.000000. running mean: -4.363808
episode 3351.000000, reward total was -1.000000. running mean: -4.330170
episode 3352.000000, reward total was -7.000000. running mean: -4.356868
episode 3353.000000, reward total was -3.000000. running mean: -4.343300
episode 3354.000000, reward total was -7.000000. running mean: -4.369867
episode 3355.000000, reward total was -8.000000. 

episode 3455.000000, reward total was -9.000000. running mean: -3.897192
episode 3456.000000, reward total was -6.000000. running mean: -3.918220
episode 3457.000000, reward total was 5.000000. running mean: -3.829038
episode 3458.000000, reward total was -3.000000. running mean: -3.820748
episode 3459.000000, reward total was -3.000000. running mean: -3.812540
episode 3460.000000, reward total was -12.000000. running mean: -3.894415
episode 3461.000000, reward total was 5.000000. running mean: -3.805471
episode 3462.000000, reward total was -11.000000. running mean: -3.877416
episode 3463.000000, reward total was -11.000000. running mean: -3.948642
episode 3464.000000, reward total was -6.000000. running mean: -3.969155
episode 3465.000000, reward total was -13.000000. running mean: -4.059464
episode 3466.000000, reward total was -7.000000. running mean: -4.088869
episode 3467.000000, reward total was 9.000000. running mean: -3.957980
episode 3468.000000, reward total was -1.000000. r

episode 3568.000000, reward total was -17.000000. running mean: -3.631212
episode 3569.000000, reward total was -11.000000. running mean: -3.704900
episode 3570.000000, reward total was -4.000000. running mean: -3.707851
episode 3571.000000, reward total was 9.000000. running mean: -3.580772
episode 3572.000000, reward total was 2.000000. running mean: -3.524965
episode 3573.000000, reward total was 2.000000. running mean: -3.469715
episode 3574.000000, reward total was -8.000000. running mean: -3.515018
episode 3575.000000, reward total was -10.000000. running mean: -3.579868
episode 3576.000000, reward total was 4.000000. running mean: -3.504069
episode 3577.000000, reward total was -4.000000. running mean: -3.509028
episode 3578.000000, reward total was -9.000000. running mean: -3.563938
episode 3579.000000, reward total was -3.000000. running mean: -3.558299
episode 3580.000000, reward total was -7.000000. running mean: -3.592716
episode 3581.000000, reward total was 5.000000. runn

episode 3681.000000, reward total was -11.000000. running mean: -2.870095
episode 3682.000000, reward total was -3.000000. running mean: -2.871394
episode 3683.000000, reward total was 1.000000. running mean: -2.832680
episode 3684.000000, reward total was 2.000000. running mean: -2.784353
episode 3685.000000, reward total was 2.000000. running mean: -2.736510
episode 3686.000000, reward total was -9.000000. running mean: -2.799144
episode 3687.000000, reward total was -5.000000. running mean: -2.821153
episode 3688.000000, reward total was -3.000000. running mean: -2.822941
episode 3689.000000, reward total was 6.000000. running mean: -2.734712
episode 3690.000000, reward total was -8.000000. running mean: -2.787365
episode 3691.000000, reward total was -9.000000. running mean: -2.849491
episode 3692.000000, reward total was 10.000000. running mean: -2.720996
episode 3693.000000, reward total was -1.000000. running mean: -2.703786
episode 3694.000000, reward total was -4.000000. runni

episode 3794.000000, reward total was -14.000000. running mean: -3.529417
episode 3795.000000, reward total was -3.000000. running mean: -3.524122
episode 3796.000000, reward total was 12.000000. running mean: -3.368881
episode 3797.000000, reward total was -1.000000. running mean: -3.345192
episode 3798.000000, reward total was -3.000000. running mean: -3.341741
episode 3799.000000, reward total was -5.000000. running mean: -3.358323
episode 3800.000000, reward total was 11.000000. running mean: -3.214740
episode 3801.000000, reward total was -11.000000. running mean: -3.292592
episode 3802.000000, reward total was -10.000000. running mean: -3.359667
episode 3803.000000, reward total was -3.000000. running mean: -3.356070
episode 3804.000000, reward total was -1.000000. running mean: -3.332509
episode 3805.000000, reward total was -5.000000. running mean: -3.349184
episode 3806.000000, reward total was -1.000000. running mean: -3.325692
episode 3807.000000, reward total was -1.000000.

episode 3907.000000, reward total was -3.000000. running mean: -2.292874
episode 3908.000000, reward total was -9.000000. running mean: -2.359945
episode 3909.000000, reward total was -1.000000. running mean: -2.346346
episode 3910.000000, reward total was -3.000000. running mean: -2.352882
episode 3911.000000, reward total was 8.000000. running mean: -2.249353
episode 3912.000000, reward total was -4.000000. running mean: -2.266860
episode 3913.000000, reward total was 1.000000. running mean: -2.234191
episode 3914.000000, reward total was -3.000000. running mean: -2.241849
episode 3915.000000, reward total was 7.000000. running mean: -2.149431
episode 3916.000000, reward total was -15.000000. running mean: -2.277937
episode 3917.000000, reward total was -7.000000. running mean: -2.325157
episode 3918.000000, reward total was -15.000000. running mean: -2.451906
episode 3919.000000, reward total was -7.000000. running mean: -2.497387
episode 3920.000000, reward total was 6.000000. runn

episode 4020.000000, reward total was -8.000000. running mean: -4.044740
episode 4021.000000, reward total was -3.000000. running mean: -4.034293
episode 4022.000000, reward total was -1.000000. running mean: -4.003950
episode 4023.000000, reward total was 5.000000. running mean: -3.913910
episode 4024.000000, reward total was 3.000000. running mean: -3.844771
episode 4025.000000, reward total was -13.000000. running mean: -3.936323
episode 4026.000000, reward total was 3.000000. running mean: -3.866960
episode 4027.000000, reward total was 1.000000. running mean: -3.818291
episode 4028.000000, reward total was 2.000000. running mean: -3.760108
episode 4029.000000, reward total was -3.000000. running mean: -3.752507
episode 4030.000000, reward total was -9.000000. running mean: -3.804981
episode 4031.000000, reward total was 3.000000. running mean: -3.736932
episode 4032.000000, reward total was 2.000000. running mean: -3.679562
episode 4033.000000, reward total was -15.000000. running

episode 4133.000000, reward total was -5.000000. running mean: -2.733321
episode 4134.000000, reward total was -5.000000. running mean: -2.755988
episode 4135.000000, reward total was 10.000000. running mean: -2.628428
episode 4136.000000, reward total was -1.000000. running mean: -2.612144
episode 4137.000000, reward total was 5.000000. running mean: -2.536023
episode 4138.000000, reward total was -8.000000. running mean: -2.590662
episode 4139.000000, reward total was -3.000000. running mean: -2.594756
episode 4140.000000, reward total was -5.000000. running mean: -2.618808
episode 4141.000000, reward total was -5.000000. running mean: -2.642620
episode 4142.000000, reward total was 7.000000. running mean: -2.546194
episode 4143.000000, reward total was -3.000000. running mean: -2.550732
episode 4144.000000, reward total was -3.000000. running mean: -2.555225
episode 4145.000000, reward total was 5.000000. running mean: -2.479672
episode 4146.000000, reward total was 4.000000. runnin

episode 4246.000000, reward total was 5.000000. running mean: -2.592738
episode 4247.000000, reward total was 6.000000. running mean: -2.506811
episode 4248.000000, reward total was -13.000000. running mean: -2.611743
episode 4249.000000, reward total was -7.000000. running mean: -2.655625
episode 4250.000000, reward total was -5.000000. running mean: -2.679069
episode 4251.000000, reward total was -8.000000. running mean: -2.732279
episode 4252.000000, reward total was -1.000000. running mean: -2.714956
episode 4253.000000, reward total was 2.000000. running mean: -2.667806
episode 4254.000000, reward total was -7.000000. running mean: -2.711128
episode 4255.000000, reward total was -8.000000. running mean: -2.764017
episode 4256.000000, reward total was -8.000000. running mean: -2.816377
episode 4257.000000, reward total was -5.000000. running mean: -2.838213
episode 4258.000000, reward total was 2.000000. running mean: -2.789831
episode 4259.000000, reward total was -4.000000. runni

episode 4359.000000, reward total was -5.000000. running mean: -2.647393
episode 4360.000000, reward total was 6.000000. running mean: -2.560919
episode 4361.000000, reward total was -14.000000. running mean: -2.675310
episode 4362.000000, reward total was -8.000000. running mean: -2.728557
episode 4363.000000, reward total was -7.000000. running mean: -2.771271
episode 4364.000000, reward total was -11.000000. running mean: -2.853559
episode 4365.000000, reward total was 4.000000. running mean: -2.785023
episode 4366.000000, reward total was -8.000000. running mean: -2.837173
episode 4367.000000, reward total was -5.000000. running mean: -2.858801
episode 4368.000000, reward total was -9.000000. running mean: -2.920213
episode 4369.000000, reward total was -9.000000. running mean: -2.981011
episode 4370.000000, reward total was 7.000000. running mean: -2.881201
episode 4371.000000, reward total was -13.000000. running mean: -2.982389
episode 4372.000000, reward total was -2.000000. ru

episode 4472.000000, reward total was 1.000000. running mean: -1.942533
episode 4473.000000, reward total was -2.000000. running mean: -1.943108
episode 4474.000000, reward total was -3.000000. running mean: -1.953676
episode 4475.000000, reward total was -14.000000. running mean: -2.074140
episode 4476.000000, reward total was -7.000000. running mean: -2.123398
episode 4477.000000, reward total was 7.000000. running mean: -2.032164
episode 4478.000000, reward total was -11.000000. running mean: -2.121843
episode 4479.000000, reward total was 1.000000. running mean: -2.090624
episode 4480.000000, reward total was -14.000000. running mean: -2.209718
episode 4481.000000, reward total was 5.000000. running mean: -2.137621
episode 4482.000000, reward total was -5.000000. running mean: -2.166245
episode 4483.000000, reward total was 1.000000. running mean: -2.134582
episode 4484.000000, reward total was 5.000000. running mean: -2.063236
episode 4485.000000, reward total was -14.000000. runn

episode 4585.000000, reward total was 3.000000. running mean: -2.242278
episode 4586.000000, reward total was 5.000000. running mean: -2.169855
episode 4587.000000, reward total was 5.000000. running mean: -2.098157
episode 4588.000000, reward total was -3.000000. running mean: -2.107175
episode 4589.000000, reward total was -9.000000. running mean: -2.176103
episode 4590.000000, reward total was 11.000000. running mean: -2.044342
episode 4591.000000, reward total was 1.000000. running mean: -2.013899
episode 4592.000000, reward total was -3.000000. running mean: -2.023760
episode 4593.000000, reward total was -3.000000. running mean: -2.033522
episode 4594.000000, reward total was -4.000000. running mean: -2.053187
episode 4595.000000, reward total was -3.000000. running mean: -2.062655
episode 4596.000000, reward total was 3.000000. running mean: -2.012029
episode 4597.000000, reward total was 7.000000. running mean: -1.921908
episode 4598.000000, reward total was -15.000000. running

episode 4698.000000, reward total was -4.000000. running mean: -1.802348
episode 4699.000000, reward total was 1.000000. running mean: -1.774325
episode 4700.000000, reward total was -9.000000. running mean: -1.846582
episode 4701.000000, reward total was 11.000000. running mean: -1.718116
episode 4702.000000, reward total was 8.000000. running mean: -1.620935
episode 4703.000000, reward total was -14.000000. running mean: -1.744725
episode 4704.000000, reward total was 6.000000. running mean: -1.667278
episode 4705.000000, reward total was 6.000000. running mean: -1.590605
episode 4706.000000, reward total was -1.000000. running mean: -1.584699
episode 4707.000000, reward total was -4.000000. running mean: -1.608852
episode 4708.000000, reward total was -7.000000. running mean: -1.662764
episode 4709.000000, reward total was -2.000000. running mean: -1.666136
episode 4710.000000, reward total was 1.000000. running mean: -1.639475
episode 4711.000000, reward total was 2.000000. running

episode 4811.000000, reward total was -12.000000. running mean: -3.326679
episode 4812.000000, reward total was -9.000000. running mean: -3.383412
episode 4813.000000, reward total was -1.000000. running mean: -3.359578
episode 4814.000000, reward total was -2.000000. running mean: -3.345982
episode 4815.000000, reward total was 9.000000. running mean: -3.222522
episode 4816.000000, reward total was 1.000000. running mean: -3.180297
episode 4817.000000, reward total was -10.000000. running mean: -3.248494
episode 4818.000000, reward total was -6.000000. running mean: -3.276009
episode 4819.000000, reward total was 2.000000. running mean: -3.223249
episode 4820.000000, reward total was 2.000000. running mean: -3.171017
episode 4821.000000, reward total was -3.000000. running mean: -3.169307
episode 4822.000000, reward total was 5.000000. running mean: -3.087613
episode 4823.000000, reward total was 1.000000. running mean: -3.046737
episode 4824.000000, reward total was 9.000000. running

episode 4924.000000, reward total was -4.000000. running mean: -2.620311
episode 4925.000000, reward total was 2.000000. running mean: -2.574108
episode 4926.000000, reward total was -6.000000. running mean: -2.608367
episode 4927.000000, reward total was 4.000000. running mean: -2.542283
episode 4928.000000, reward total was -6.000000. running mean: -2.576860
episode 4929.000000, reward total was -13.000000. running mean: -2.681092
episode 4930.000000, reward total was 5.000000. running mean: -2.604281
episode 4931.000000, reward total was -4.000000. running mean: -2.618238
episode 4932.000000, reward total was -10.000000. running mean: -2.692056
episode 4933.000000, reward total was -8.000000. running mean: -2.745135
episode 4934.000000, reward total was 3.000000. running mean: -2.687684
episode 4935.000000, reward total was -2.000000. running mean: -2.680807
episode 4936.000000, reward total was -15.000000. running mean: -2.803999
episode 4937.000000, reward total was -1.000000. run

episode 5037.000000, reward total was -3.000000. running mean: -1.562455
episode 5038.000000, reward total was -9.000000. running mean: -1.636830
episode 5039.000000, reward total was 4.000000. running mean: -1.580462
episode 5040.000000, reward total was 3.000000. running mean: -1.534657
episode 5041.000000, reward total was -10.000000. running mean: -1.619311
episode 5042.000000, reward total was -6.000000. running mean: -1.663118
episode 5043.000000, reward total was 4.000000. running mean: -1.606486
episode 5044.000000, reward total was -1.000000. running mean: -1.600422
episode 5045.000000, reward total was -1.000000. running mean: -1.594417
episode 5046.000000, reward total was -9.000000. running mean: -1.668473
episode 5047.000000, reward total was -3.000000. running mean: -1.681788
episode 5048.000000, reward total was -4.000000. running mean: -1.704971
episode 5049.000000, reward total was -1.000000. running mean: -1.697921
episode 5050.000000, reward total was -2.000000. runn

episode 5150.000000, reward total was -3.000000. running mean: -1.112069
episode 5151.000000, reward total was 3.000000. running mean: -1.070949
episode 5152.000000, reward total was -10.000000. running mean: -1.160239
episode 5153.000000, reward total was -12.000000. running mean: -1.268637
episode 5154.000000, reward total was 5.000000. running mean: -1.205950
episode 5155.000000, reward total was 7.000000. running mean: -1.123891
episode 5156.000000, reward total was -3.000000. running mean: -1.142652
episode 5157.000000, reward total was 8.000000. running mean: -1.051225
episode 5158.000000, reward total was 8.000000. running mean: -0.960713
episode 5159.000000, reward total was 9.000000. running mean: -0.861106
episode 5160.000000, reward total was -4.000000. running mean: -0.892495
episode 5161.000000, reward total was 2.000000. running mean: -0.863570
episode 5162.000000, reward total was 7.000000. running mean: -0.784934
episode 5163.000000, reward total was -3.000000. running 

episode 5263.000000, reward total was -5.000000. running mean: -0.680014
episode 5264.000000, reward total was -6.000000. running mean: -0.733214
episode 5265.000000, reward total was 5.000000. running mean: -0.675882
episode 5266.000000, reward total was -3.000000. running mean: -0.699123
episode 5267.000000, reward total was 4.000000. running mean: -0.652131
episode 5268.000000, reward total was 5.000000. running mean: -0.595610
episode 5269.000000, reward total was -11.000000. running mean: -0.699654
episode 5270.000000, reward total was 2.000000. running mean: -0.672658
episode 5271.000000, reward total was 8.000000. running mean: -0.585931
episode 5272.000000, reward total was -1.000000. running mean: -0.590072
episode 5273.000000, reward total was -7.000000. running mean: -0.654171
episode 5274.000000, reward total was 8.000000. running mean: -0.567629
episode 5275.000000, reward total was -1.000000. running mean: -0.571953
episode 5276.000000, reward total was 1.000000. running 

episode 5376.000000, reward total was -9.000000. running mean: -0.434778
episode 5377.000000, reward total was 6.000000. running mean: -0.370430
episode 5378.000000, reward total was -1.000000. running mean: -0.376725
episode 5379.000000, reward total was 8.000000. running mean: -0.292958
episode 5380.000000, reward total was -13.000000. running mean: -0.420029
episode 5381.000000, reward total was -5.000000. running mean: -0.465828
episode 5382.000000, reward total was -4.000000. running mean: -0.501170
episode 5383.000000, reward total was -1.000000. running mean: -0.506158
episode 5384.000000, reward total was 2.000000. running mean: -0.481097
episode 5385.000000, reward total was -9.000000. running mean: -0.566286
episode 5386.000000, reward total was -1.000000. running mean: -0.570623
episode 5387.000000, reward total was -1.000000. running mean: -0.574917
episode 5388.000000, reward total was 4.000000. running mean: -0.529168
episode 5389.000000, reward total was 10.000000. runni

episode 5489.000000, reward total was 5.000000. running mean: -1.134567
episode 5490.000000, reward total was 3.000000. running mean: -1.093222
episode 5491.000000, reward total was -10.000000. running mean: -1.182289
episode 5492.000000, reward total was 2.000000. running mean: -1.150466
episode 5493.000000, reward total was -5.000000. running mean: -1.188962
episode 5494.000000, reward total was -7.000000. running mean: -1.247072
episode 5495.000000, reward total was -11.000000. running mean: -1.344601
episode 5496.000000, reward total was -1.000000. running mean: -1.341155
episode 5497.000000, reward total was -4.000000. running mean: -1.367744
episode 5498.000000, reward total was 5.000000. running mean: -1.304066
episode 5499.000000, reward total was -8.000000. running mean: -1.371026
episode 5500.000000, reward total was -3.000000. running mean: -1.387316
episode 5501.000000, reward total was -6.000000. running mean: -1.433442
episode 5502.000000, reward total was 5.000000. runni

episode 5602.000000, reward total was 6.000000. running mean: -0.394667
episode 5603.000000, reward total was -9.000000. running mean: -0.480720
episode 5604.000000, reward total was 12.000000. running mean: -0.355913
episode 5605.000000, reward total was -1.000000. running mean: -0.362354
episode 5606.000000, reward total was 4.000000. running mean: -0.318730
episode 5607.000000, reward total was 7.000000. running mean: -0.245543
episode 5608.000000, reward total was 8.000000. running mean: -0.163087
episode 5609.000000, reward total was -5.000000. running mean: -0.211457
episode 5610.000000, reward total was 6.000000. running mean: -0.149342
episode 5611.000000, reward total was -10.000000. running mean: -0.247849
episode 5612.000000, reward total was -4.000000. running mean: -0.285370
episode 5613.000000, reward total was 3.000000. running mean: -0.252516
episode 5614.000000, reward total was -6.000000. running mean: -0.309991
episode 5615.000000, reward total was 11.000000. running

episode 5715.000000, reward total was 7.000000. running mean: -0.360562
episode 5716.000000, reward total was -4.000000. running mean: -0.396957
episode 5717.000000, reward total was -11.000000. running mean: -0.502987
episode 5718.000000, reward total was -3.000000. running mean: -0.527957
episode 5719.000000, reward total was -1.000000. running mean: -0.532678
episode 5720.000000, reward total was -11.000000. running mean: -0.637351
episode 5721.000000, reward total was -3.000000. running mean: -0.660977
episode 5722.000000, reward total was -6.000000. running mean: -0.714368
episode 5723.000000, reward total was 13.000000. running mean: -0.577224
episode 5724.000000, reward total was -5.000000. running mean: -0.621452
episode 5725.000000, reward total was 2.000000. running mean: -0.595237
episode 5726.000000, reward total was -6.000000. running mean: -0.649285
episode 5727.000000, reward total was -9.000000. running mean: -0.732792
episode 5728.000000, reward total was 8.000000. run

episode 5828.000000, reward total was -3.000000. running mean: -0.832815
episode 5829.000000, reward total was -4.000000. running mean: -0.864487
episode 5830.000000, reward total was -10.000000. running mean: -0.955842
episode 5831.000000, reward total was -9.000000. running mean: -1.036283
episode 5832.000000, reward total was 2.000000. running mean: -1.005921
episode 5833.000000, reward total was 4.000000. running mean: -0.955861
episode 5834.000000, reward total was -8.000000. running mean: -1.026303
episode 5835.000000, reward total was -4.000000. running mean: -1.056040
episode 5836.000000, reward total was -1.000000. running mean: -1.055479
episode 5837.000000, reward total was -4.000000. running mean: -1.084925
episode 5838.000000, reward total was 2.000000. running mean: -1.054075
episode 5839.000000, reward total was -3.000000. running mean: -1.073535
episode 5840.000000, reward total was -5.000000. running mean: -1.112799
episode 5841.000000, reward total was -15.000000. run

episode 5941.000000, reward total was -15.000000. running mean: -1.333584
episode 5942.000000, reward total was 5.000000. running mean: -1.270248
episode 5943.000000, reward total was 10.000000. running mean: -1.157545
episode 5944.000000, reward total was -1.000000. running mean: -1.155970
episode 5945.000000, reward total was 1.000000. running mean: -1.134410
episode 5946.000000, reward total was -2.000000. running mean: -1.143066
episode 5947.000000, reward total was 4.000000. running mean: -1.091635
episode 5948.000000, reward total was -5.000000. running mean: -1.130719
episode 5949.000000, reward total was 7.000000. running mean: -1.049412
episode 5950.000000, reward total was 3.000000. running mean: -1.008918
episode 5951.000000, reward total was -18.000000. running mean: -1.178829
episode 5952.000000, reward total was -5.000000. running mean: -1.217040
episode 5953.000000, reward total was -6.000000. running mean: -1.264870
episode 5954.000000, reward total was -5.000000. runni

episode 6054.000000, reward total was -4.000000. running mean: -0.309523
episode 6055.000000, reward total was -3.000000. running mean: -0.336428
episode 6056.000000, reward total was 13.000000. running mean: -0.203064
episode 6057.000000, reward total was 3.000000. running mean: -0.171033
episode 6058.000000, reward total was 2.000000. running mean: -0.149323
episode 6059.000000, reward total was 5.000000. running mean: -0.097830
episode 6060.000000, reward total was -7.000000. running mean: -0.166851
episode 6061.000000, reward total was -7.000000. running mean: -0.235183
episode 6062.000000, reward total was 2.000000. running mean: -0.212831
episode 6063.000000, reward total was -1.000000. running mean: -0.220703
episode 6064.000000, reward total was 2.000000. running mean: -0.198496
episode 6065.000000, reward total was 5.000000. running mean: -0.146511
episode 6066.000000, reward total was -13.000000. running mean: -0.275046
episode 6067.000000, reward total was 1.000000. running 

episode 6168.000000, reward total was 8.000000. running mean: -0.058089
episode 6169.000000, reward total was 7.000000. running mean: 0.012492
episode 6170.000000, reward total was -5.000000. running mean: -0.037633
episode 6171.000000, reward total was 5.000000. running mean: 0.012743
episode 6172.000000, reward total was 9.000000. running mean: 0.102616
episode 6173.000000, reward total was -4.000000. running mean: 0.061590
episode 6174.000000, reward total was -16.000000. running mean: -0.099026
episode 6175.000000, reward total was 3.000000. running mean: -0.068036
episode 6176.000000, reward total was 3.000000. running mean: -0.037356
episode 6177.000000, reward total was -6.000000. running mean: -0.096982
episode 6178.000000, reward total was -9.000000. running mean: -0.186012
episode 6179.000000, reward total was 6.000000. running mean: -0.124152
episode 6180.000000, reward total was -7.000000. running mean: -0.192911
episode 6181.000000, reward total was -3.000000. running mean

episode 6282.000000, reward total was -1.000000. running mean: -0.109800
episode 6283.000000, reward total was 7.000000. running mean: -0.038702
episode 6284.000000, reward total was 13.000000. running mean: 0.091685
episode 6285.000000, reward total was -6.000000. running mean: 0.030768
episode 6286.000000, reward total was -4.000000. running mean: -0.009540
episode 6287.000000, reward total was -3.000000. running mean: -0.039444
episode 6288.000000, reward total was -2.000000. running mean: -0.059050
episode 6289.000000, reward total was -2.000000. running mean: -0.078459
episode 6290.000000, reward total was 3.000000. running mean: -0.047675
episode 6291.000000, reward total was -7.000000. running mean: -0.117198
episode 6292.000000, reward total was 7.000000. running mean: -0.046026
episode 6293.000000, reward total was -7.000000. running mean: -0.115566
episode 6294.000000, reward total was 4.000000. running mean: -0.074410
episode 6295.000000, reward total was -4.000000. running 

episode 6397.000000, reward total was 12.000000. running mean: 0.884802
episode 6398.000000, reward total was -3.000000. running mean: 0.845954
episode 6399.000000, reward total was -1.000000. running mean: 0.827495
episode 6400.000000, reward total was -3.000000. running mean: 0.789220
episode 6401.000000, reward total was 8.000000. running mean: 0.861328
episode 6402.000000, reward total was 3.000000. running mean: 0.882714
episode 6403.000000, reward total was 7.000000. running mean: 0.943887
episode 6404.000000, reward total was 1.000000. running mean: 0.944448
episode 6405.000000, reward total was -7.000000. running mean: 0.865004
episode 6406.000000, reward total was 6.000000. running mean: 0.916354
episode 6407.000000, reward total was -1.000000. running mean: 0.897190
episode 6408.000000, reward total was -7.000000. running mean: 0.818218
episode 6409.000000, reward total was -9.000000. running mean: 0.720036
episode 6410.000000, reward total was 8.000000. running mean: 0.79283

episode 6512.000000, reward total was 7.000000. running mean: 0.649053
episode 6513.000000, reward total was -3.000000. running mean: 0.612562
episode 6514.000000, reward total was -4.000000. running mean: 0.566437
episode 6515.000000, reward total was 9.000000. running mean: 0.650772
episode 6516.000000, reward total was 12.000000. running mean: 0.764265
episode 6517.000000, reward total was 2.000000. running mean: 0.776622
episode 6518.000000, reward total was 13.000000. running mean: 0.898856
episode 6519.000000, reward total was -2.000000. running mean: 0.869867
episode 6520.000000, reward total was -5.000000. running mean: 0.811169
episode 6521.000000, reward total was 8.000000. running mean: 0.883057
episode 6522.000000, reward total was 10.000000. running mean: 0.974226
episode 6523.000000, reward total was -3.000000. running mean: 0.934484
episode 6524.000000, reward total was 4.000000. running mean: 0.965139
episode 6525.000000, reward total was 11.000000. running mean: 1.0654

episode 6627.000000, reward total was -2.000000. running mean: 0.825944
episode 6628.000000, reward total was 3.000000. running mean: 0.847685
episode 6629.000000, reward total was 8.000000. running mean: 0.919208
episode 6630.000000, reward total was 2.000000. running mean: 0.930016
episode 6631.000000, reward total was 10.000000. running mean: 1.020716
episode 6632.000000, reward total was 4.000000. running mean: 1.050509
episode 6633.000000, reward total was 11.000000. running mean: 1.150004
episode 6634.000000, reward total was -17.000000. running mean: 0.968504
episode 6635.000000, reward total was 7.000000. running mean: 1.028819
episode 6636.000000, reward total was -3.000000. running mean: 0.988530
episode 6637.000000, reward total was -1.000000. running mean: 0.968645
episode 6638.000000, reward total was 5.000000. running mean: 1.008959
episode 6639.000000, reward total was 1.000000. running mean: 1.008869
episode 6640.000000, reward total was -1.000000. running mean: 0.98878

episode 6742.000000, reward total was 13.000000. running mean: 0.815233
episode 6743.000000, reward total was 9.000000. running mean: 0.897081
episode 6744.000000, reward total was 2.000000. running mean: 0.908110
episode 6745.000000, reward total was 8.000000. running mean: 0.979029
episode 6746.000000, reward total was 3.000000. running mean: 0.999239
episode 6747.000000, reward total was 5.000000. running mean: 1.039246
episode 6748.000000, reward total was 3.000000. running mean: 1.058854
episode 6749.000000, reward total was 2.000000. running mean: 1.068265
episode 6750.000000, reward total was 9.000000. running mean: 1.147583
episode 6751.000000, reward total was -4.000000. running mean: 1.096107
episode 6752.000000, reward total was -9.000000. running mean: 0.995146
episode 6753.000000, reward total was 11.000000. running mean: 1.095194
episode 6754.000000, reward total was 5.000000. running mean: 1.134242
episode 6755.000000, reward total was -3.000000. running mean: 1.092900
e

episode 6857.000000, reward total was -5.000000. running mean: 0.011372
episode 6858.000000, reward total was 4.000000. running mean: 0.051259
episode 6859.000000, reward total was 11.000000. running mean: 0.160746
episode 6860.000000, reward total was -10.000000. running mean: 0.059139
episode 6861.000000, reward total was -13.000000. running mean: -0.071453
episode 6862.000000, reward total was -2.000000. running mean: -0.090738
episode 6863.000000, reward total was -7.000000. running mean: -0.159831
episode 6864.000000, reward total was 8.000000. running mean: -0.078232
episode 6865.000000, reward total was 5.000000. running mean: -0.027450
episode 6866.000000, reward total was 4.000000. running mean: 0.012824
episode 6867.000000, reward total was 3.000000. running mean: 0.042696
episode 6868.000000, reward total was 2.000000. running mean: 0.062269
episode 6869.000000, reward total was 7.000000. running mean: 0.131646
episode 6870.000000, reward total was -11.000000. running mean: 

episode 6972.000000, reward total was 9.000000. running mean: 1.385064
episode 6973.000000, reward total was -1.000000. running mean: 1.361214
episode 6974.000000, reward total was 3.000000. running mean: 1.377601
episode 6975.000000, reward total was 6.000000. running mean: 1.423825
episode 6976.000000, reward total was -1.000000. running mean: 1.399587
episode 6977.000000, reward total was 11.000000. running mean: 1.495591
episode 6978.000000, reward total was 9.000000. running mean: 1.570635
episode 6979.000000, reward total was -7.000000. running mean: 1.484929
episode 6980.000000, reward total was -9.000000. running mean: 1.380080
episode 6981.000000, reward total was 7.000000. running mean: 1.436279
episode 6982.000000, reward total was 12.000000. running mean: 1.541916
episode 6983.000000, reward total was 4.000000. running mean: 1.566497
episode 6984.000000, reward total was 3.000000. running mean: 1.580832
episode 6985.000000, reward total was -5.000000. running mean: 1.515024