In [1]:
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute called `new_step_api` on the
# imported gym module object; whether gym actually reads it is not shown in this
# file - confirm against the installed gym version. The loops further down
# unpack env.step() into four values (observation, reward, done, info).
gym.new_step_api=True
# Pong environment instance that is later handed to train_model().
env = gym.make('Pong-v0')

# ---- network dimensions ----
H = 200      # width of the hidden layer
D = 80 * 80  # length of the flattened 80x80 input vector

# ---- policy weights ----
# Each matrix is drawn from a standard normal and scaled by 1/sqrt(fan_in)
# ("Xavier"-style initialisation). W1 is drawn before W2.
model = {
  'W1': np.random.randn(H, D) / np.sqrt(D),
  'W2': np.random.randn(H) / np.sqrt(H),
}

# ---- hyperparameters ----
batch_size = 10       # number of episodes accumulated per parameter update
learning_rate = 1e-3  # step size of the RMSProp update
gamma = 0.99          # reward discount factor
decay_rate = 0.99     # decay of RMSProp's leaky running sum of grad^2

# ---- optimiser state, one zero array per weight matrix ----
# grad_buffer sums gradients over a batch of episodes; rmsprop_cache holds the
# running average of squared gradients.
grad_buffer = {name: np.zeros_like(weights) for name, weights in model.items()}
rmsprop_cache = {name: np.zeros_like(weights) for name, weights in model.items()}

def sigmoid(x):
  """Logistic "squashing" function: maps x (scalar or array, elementwise) into [0, 1]."""
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """Turn a raw frame into a flat binary float vector.

  The frame is cropped to rows 35..194, every second row and column of
  channel 0 is kept, and each remaining pixel becomes 0.0 if its value is
  0, 144 or 109 (the two background colours) and 1.0 otherwise (paddles,
  ball).

  Unlike a slice-then-assign implementation, this never writes into the
  caller's array: slicing an ndarray yields views, so in-place masking would
  overwrite the original observation (and fail on read-only frames).

  Args:
    I: image indexable as [row, col, channel]. Presumably the 210x160x3
      Atari frame, which yields 80*80 values - confirm against the env.

  Returns:
    1-D float array of 0.0/1.0 values.
  """
  frame = np.asarray(I)
  # Crop + downsample by 2 in one slicing step; channel 0 only. This is a view.
  channel = frame[35:195:2, ::2, 0]
  # Foreground = anything that is neither already 0 nor a background colour.
  foreground = (channel != 0) & (channel != 144) & (channel != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns, walking the reward sequence backwards.

  A non-zero reward marks a game boundary (Pong specific), so the running
  sum is reset there before the reward itself is added.

  Args:
    r: array of rewards indexed by time step along the first axis, either
      1-D or a column of shape (T, 1).
    discount: per-step discount factor. When omitted, the module-level
      `gamma` is used.

  Returns:
    Array with the same shape as `r` holding the discounted return of each
    step. The result is always floating point: an integer reward array
    would otherwise truncate every discounted value.
  """
  rewards = np.asarray(r)
  factor = gamma if discount is None else discount
  # Keep a float input's precision, but never accumulate into an int buffer.
  out_dtype = rewards.dtype if np.issubdtype(rewards.dtype, np.floating) else float
  discounted_r = np.zeros(rewards.shape, dtype=out_dtype)
  running_add = 0
  for t in reversed(range(rewards.size)):
    if rewards[t] != 0:
      running_add = 0  # game boundary: do not leak return across points
    running_add = running_add * factor + rewards[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy stored in the module-level `model`.

  Args:
    x: input vector multiplied by model['W1'].

  Returns:
    (p, h): p is the sigmoid of the output unit, used by the callers as the
    probability of taking action 2; h is the hidden layer after the ReLU.
  """
  pre_activation = np.dot(model['W1'], x)
  # ReLU nonlinearity: negative entries are replaced by zero.
  hidden = np.where(pre_activation < 0, 0, pre_activation)
  score = np.dot(model['W2'], hidden)
  return sigmoid(score), hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns the gradients for W1 and W2.

  Args:
    epx: stacked inputs, one row per time step.
    eph: stacked hidden states (post-ReLU), one row per time step.
    epdlogp: per-step output gradients, already scaled by the caller.

  Returns:
    Dict with keys 'W1' and 'W2' holding the corresponding gradients.
  """
  grad_W2 = np.dot(eph.T, epdlogp).ravel()
  # Push the output gradient back through W2 (read from the global model) ...
  grad_hidden = np.outer(epdlogp, model['W2'])
  # ... and through the ReLU: no gradient where the unit was not active.
  grad_hidden = np.where(eph <= 0, 0, grad_hidden)
  grad_W1 = np.dot(grad_hidden.T, epx)
  return dict(W1=grad_W1, W2=grad_W2)

def model_step(model, observation, prev_x):
  """Pick the next action for one observation.

  Args:
    model: accepted for interface symmetry; this body does not read it
      (policy_forward is called with the input vector only).
    observation: raw frame, passed through prepro().
    prev_x: preprocessed previous frame, or None on the first step.

  Returns:
    (action, cur_x): action is 2 or 3; cur_x is the preprocessed current
    frame, to be passed back as prev_x on the next call.
  """
  cur_x = prepro(observation)

  # The policy input is the difference frame; an all-zero vector on the first step.
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # Greedy choice: no sampling here, action 2 whenever its probability is >= 0.5.
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model):
  """Play one episode (at most 1000 steps) and show it as a GIF.

  Each step renders a frame, asks model_step() for an action and advances
  the environment. Exactly one summary line is printed: the "finished after
  N timesteps" message when the environment reports done, otherwise the
  "finished without success" message once the step budget is used up.
  The environment is closed even if a step raises.

  Args:
    env: environment exposing reset(), render(mode=...), step() returning
      (observation, reward, done, info), and close().
    model: forwarded unchanged to model_step().
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  try:
    for t in range(1000):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
        print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
        break
    else:
      # Runs only when the loop was NOT left via break, i.e. the episode
      # never reported done within the step budget.
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  finally:
    env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients and RMSProp.

  Plays episodes in `env`, sampling actions 2/3 from the policy output.
  At the end of each episode the per-step gradients are weighted by the
  standardised discounted returns and added to the module-level
  `grad_buffer`; every `batch_size` episodes the accumulated gradient is
  applied to `model` in place using the module-level `rmsprop_cache`.

  Args:
    env: environment whose reset() returns an observation and whose step()
      returns (observation, reward, done, info).
    model: dict of weight arrays, updated in place. policy_forward() and
      policy_backward() are called without it, so pass the same dict those
      helpers read.
    total_episodes: number of episodes to play. Values <= 0 return an empty
      history immediately.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per
    finished episode.
  """
  hist = []
  if total_episodes <= 0:
    return hist  # nothing to do; an `==` check below would never fire

  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # recorded after step(): it is the reward for the action just taken

    if not done:
      continue

    # ---- an episode finished ----
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    # force float so the in-place normalisation below also works for integer rewards
    epr = np.vstack(drs).astype(float)
    xs,hs,dlogps,drs = [],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    reward_std = np.std(discounted_epr)
    if reward_std > 0:
      # skip the division for a constant return: 0/0 would put NaN into the weights
      discounted_epr /= reward_std

    epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(epx, eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.items():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        # "+=": gradient ascent on the expected reward
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # book-keeping: exponential moving average of the episode reward
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    hist.append((episode_number, reward_sum, running_reward))
    # episode_number is an integer counter, so format it with %d rather than %f
    print ('episode %d, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None
    if episode_number >= total_episodes:
      return hist

    # if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
    #   print('ep {}: game finished, reward: {}'.format(episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

  logger.warn(
  deprecation(
  deprecation(


In [2]:


# Train for 5500 episodes. IPython's %time magic reports how long the call took;
# the per-episode history returned by train_model is kept in hist1.
%time hist1 = train_model(env, model, total_episodes=5500)

  logger.deprecation(


episode 1.000000, reward total was -20.000000. running mean: -20.000000
episode 2.000000, reward total was -18.000000. running mean: -19.980000
episode 3.000000, reward total was -18.000000. running mean: -19.960200
episode 4.000000, reward total was -19.000000. running mean: -19.950598
episode 5.000000, reward total was -21.000000. running mean: -19.961092
episode 6.000000, reward total was -21.000000. running mean: -19.971481
episode 7.000000, reward total was -21.000000. running mean: -19.981766
episode 8.000000, reward total was -20.000000. running mean: -19.981949
episode 9.000000, reward total was -18.000000. running mean: -19.962129
episode 10.000000, reward total was -21.000000. running mean: -19.972508
episode 11.000000, reward total was -21.000000. running mean: -19.982783
episode 12.000000, reward total was -19.000000. running mean: -19.972955
episode 13.000000, reward total was -21.000000. running mean: -19.983225
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.349441
episode 115.000000, reward total was -20.000000. running mean: -20.345946
episode 116.000000, reward total was -20.000000. running mean: -20.342487
episode 117.000000, reward total was -21.000000. running mean: -20.349062
episode 118.000000, reward total was -19.000000. running mean: -20.335571
episode 119.000000, reward total was -20.000000. running mean: -20.332215
episode 120.000000, reward total was -21.000000. running mean: -20.338893
episode 121.000000, reward total was -20.000000. running mean: -20.335504
episode 122.000000, reward total was -20.000000. running mean: -20.332149
episode 123.000000, reward total was -21.000000. running mean: -20.338828
episode 124.000000, reward total was -20.000000. running mean: -20.335440
episode 125.000000, reward total was -20.000000. running mean: -20.332085
episode 126.000000, reward total was -20.000000. running mean: -20.328764
episode 127.000000, reward total was -

episode 225.000000, reward total was -19.000000. running mean: -20.111024
episode 226.000000, reward total was -20.000000. running mean: -20.109913
episode 227.000000, reward total was -21.000000. running mean: -20.118814
episode 228.000000, reward total was -20.000000. running mean: -20.117626
episode 229.000000, reward total was -21.000000. running mean: -20.126450
episode 230.000000, reward total was -21.000000. running mean: -20.135185
episode 231.000000, reward total was -21.000000. running mean: -20.143833
episode 232.000000, reward total was -20.000000. running mean: -20.142395
episode 233.000000, reward total was -21.000000. running mean: -20.150971
episode 234.000000, reward total was -20.000000. running mean: -20.149461
episode 235.000000, reward total was -21.000000. running mean: -20.157967
episode 236.000000, reward total was -20.000000. running mean: -20.156387
episode 237.000000, reward total was -21.000000. running mean: -20.164823
episode 238.000000, reward total was -

episode 336.000000, reward total was -20.000000. running mean: -19.966986
episode 337.000000, reward total was -21.000000. running mean: -19.977316
episode 338.000000, reward total was -20.000000. running mean: -19.977543
episode 339.000000, reward total was -21.000000. running mean: -19.987768
episode 340.000000, reward total was -20.000000. running mean: -19.987890
episode 341.000000, reward total was -18.000000. running mean: -19.968011
episode 342.000000, reward total was -19.000000. running mean: -19.958331
episode 343.000000, reward total was -17.000000. running mean: -19.928748
episode 344.000000, reward total was -18.000000. running mean: -19.909460
episode 345.000000, reward total was -20.000000. running mean: -19.910366
episode 346.000000, reward total was -20.000000. running mean: -19.911262
episode 347.000000, reward total was -20.000000. running mean: -19.912149
episode 348.000000, reward total was -20.000000. running mean: -19.913028
episode 349.000000, reward total was -

episode 447.000000, reward total was -19.000000. running mean: -19.702809
episode 448.000000, reward total was -21.000000. running mean: -19.715781
episode 449.000000, reward total was -16.000000. running mean: -19.678623
episode 450.000000, reward total was -20.000000. running mean: -19.681837
episode 451.000000, reward total was -21.000000. running mean: -19.695019
episode 452.000000, reward total was -19.000000. running mean: -19.688068
episode 453.000000, reward total was -20.000000. running mean: -19.691188
episode 454.000000, reward total was -17.000000. running mean: -19.664276
episode 455.000000, reward total was -20.000000. running mean: -19.667633
episode 456.000000, reward total was -15.000000. running mean: -19.620957
episode 457.000000, reward total was -20.000000. running mean: -19.624747
episode 458.000000, reward total was -15.000000. running mean: -19.578500
episode 459.000000, reward total was -19.000000. running mean: -19.572715
episode 460.000000, reward total was -

episode 558.000000, reward total was -18.000000. running mean: -19.209531
episode 559.000000, reward total was -19.000000. running mean: -19.207436
episode 560.000000, reward total was -20.000000. running mean: -19.215362
episode 561.000000, reward total was -18.000000. running mean: -19.203208
episode 562.000000, reward total was -18.000000. running mean: -19.191176
episode 563.000000, reward total was -20.000000. running mean: -19.199264
episode 564.000000, reward total was -20.000000. running mean: -19.207272
episode 565.000000, reward total was -20.000000. running mean: -19.215199
episode 566.000000, reward total was -17.000000. running mean: -19.193047
episode 567.000000, reward total was -21.000000. running mean: -19.211116
episode 568.000000, reward total was -17.000000. running mean: -19.189005
episode 569.000000, reward total was -18.000000. running mean: -19.177115
episode 570.000000, reward total was -19.000000. running mean: -19.175344
episode 571.000000, reward total was -

episode 669.000000, reward total was -19.000000. running mean: -18.891716
episode 670.000000, reward total was -21.000000. running mean: -18.912798
episode 671.000000, reward total was -21.000000. running mean: -18.933670
episode 672.000000, reward total was -21.000000. running mean: -18.954334
episode 673.000000, reward total was -21.000000. running mean: -18.974790
episode 674.000000, reward total was -19.000000. running mean: -18.975042
episode 675.000000, reward total was -19.000000. running mean: -18.975292
episode 676.000000, reward total was -17.000000. running mean: -18.955539
episode 677.000000, reward total was -17.000000. running mean: -18.935984
episode 678.000000, reward total was -17.000000. running mean: -18.916624
episode 679.000000, reward total was -19.000000. running mean: -18.917458
episode 680.000000, reward total was -19.000000. running mean: -18.918283
episode 681.000000, reward total was -18.000000. running mean: -18.909100
episode 682.000000, reward total was -

episode 780.000000, reward total was -15.000000. running mean: -18.301582
episode 781.000000, reward total was -18.000000. running mean: -18.298566
episode 782.000000, reward total was -17.000000. running mean: -18.285580
episode 783.000000, reward total was -13.000000. running mean: -18.232725
episode 784.000000, reward total was -21.000000. running mean: -18.260397
episode 785.000000, reward total was -21.000000. running mean: -18.287793
episode 786.000000, reward total was -21.000000. running mean: -18.314916
episode 787.000000, reward total was -16.000000. running mean: -18.291766
episode 788.000000, reward total was -18.000000. running mean: -18.288849
episode 789.000000, reward total was -13.000000. running mean: -18.235960
episode 790.000000, reward total was -21.000000. running mean: -18.263601
episode 791.000000, reward total was -16.000000. running mean: -18.240965
episode 792.000000, reward total was -18.000000. running mean: -18.238555
episode 793.000000, reward total was -

episode 891.000000, reward total was -21.000000. running mean: -17.752600
episode 892.000000, reward total was -13.000000. running mean: -17.705074
episode 893.000000, reward total was -17.000000. running mean: -17.698023
episode 894.000000, reward total was -16.000000. running mean: -17.681043
episode 895.000000, reward total was -15.000000. running mean: -17.654232
episode 896.000000, reward total was -17.000000. running mean: -17.647690
episode 897.000000, reward total was -19.000000. running mean: -17.661213
episode 898.000000, reward total was -19.000000. running mean: -17.674601
episode 899.000000, reward total was -15.000000. running mean: -17.647855
episode 900.000000, reward total was -19.000000. running mean: -17.661376
episode 901.000000, reward total was -17.000000. running mean: -17.654763
episode 902.000000, reward total was -17.000000. running mean: -17.648215
episode 903.000000, reward total was -15.000000. running mean: -17.621733
episode 904.000000, reward total was -

episode 1002.000000, reward total was -20.000000. running mean: -17.249104
episode 1003.000000, reward total was -19.000000. running mean: -17.266613
episode 1004.000000, reward total was -18.000000. running mean: -17.273947
episode 1005.000000, reward total was -13.000000. running mean: -17.231208
episode 1006.000000, reward total was -16.000000. running mean: -17.218896
episode 1007.000000, reward total was -15.000000. running mean: -17.196707
episode 1008.000000, reward total was -19.000000. running mean: -17.214740
episode 1009.000000, reward total was -19.000000. running mean: -17.232592
episode 1010.000000, reward total was -15.000000. running mean: -17.210266
episode 1011.000000, reward total was -19.000000. running mean: -17.228164
episode 1012.000000, reward total was -17.000000. running mean: -17.225882
episode 1013.000000, reward total was -13.000000. running mean: -17.183623
episode 1014.000000, reward total was -15.000000. running mean: -17.161787
episode 1015.000000, rewa

episode 1112.000000, reward total was -17.000000. running mean: -16.600627
episode 1113.000000, reward total was -17.000000. running mean: -16.604621
episode 1114.000000, reward total was -12.000000. running mean: -16.558575
episode 1115.000000, reward total was -19.000000. running mean: -16.582989
episode 1116.000000, reward total was -21.000000. running mean: -16.627159
episode 1117.000000, reward total was -9.000000. running mean: -16.550888
episode 1118.000000, reward total was -14.000000. running mean: -16.525379
episode 1119.000000, reward total was -13.000000. running mean: -16.490125
episode 1120.000000, reward total was -17.000000. running mean: -16.495224
episode 1121.000000, reward total was -17.000000. running mean: -16.500271
episode 1122.000000, reward total was -15.000000. running mean: -16.485269
episode 1123.000000, reward total was -19.000000. running mean: -16.510416
episode 1124.000000, reward total was -18.000000. running mean: -16.525312
episode 1125.000000, rewar

episode 1222.000000, reward total was -17.000000. running mean: -16.193755
episode 1223.000000, reward total was -17.000000. running mean: -16.201817
episode 1224.000000, reward total was -17.000000. running mean: -16.209799
episode 1225.000000, reward total was -11.000000. running mean: -16.157701
episode 1226.000000, reward total was -15.000000. running mean: -16.146124
episode 1227.000000, reward total was -15.000000. running mean: -16.134663
episode 1228.000000, reward total was -15.000000. running mean: -16.123316
episode 1229.000000, reward total was -15.000000. running mean: -16.112083
episode 1230.000000, reward total was -18.000000. running mean: -16.130962
episode 1231.000000, reward total was -19.000000. running mean: -16.159653
episode 1232.000000, reward total was -13.000000. running mean: -16.128056
episode 1233.000000, reward total was -21.000000. running mean: -16.176776
episode 1234.000000, reward total was -9.000000. running mean: -16.105008
episode 1235.000000, rewar

episode 1332.000000, reward total was -17.000000. running mean: -16.002194
episode 1333.000000, reward total was -17.000000. running mean: -16.012172
episode 1334.000000, reward total was -15.000000. running mean: -16.002050
episode 1335.000000, reward total was -17.000000. running mean: -16.012030
episode 1336.000000, reward total was -14.000000. running mean: -15.991910
episode 1337.000000, reward total was -16.000000. running mean: -15.991991
episode 1338.000000, reward total was -17.000000. running mean: -16.002071
episode 1339.000000, reward total was -16.000000. running mean: -16.002050
episode 1340.000000, reward total was -15.000000. running mean: -15.992029
episode 1341.000000, reward total was -17.000000. running mean: -16.002109
episode 1342.000000, reward total was -15.000000. running mean: -15.992088
episode 1343.000000, reward total was -14.000000. running mean: -15.972167
episode 1344.000000, reward total was -17.000000. running mean: -15.982445
episode 1345.000000, rewa

episode 1442.000000, reward total was -12.000000. running mean: -15.653510
episode 1443.000000, reward total was -13.000000. running mean: -15.626975
episode 1444.000000, reward total was -15.000000. running mean: -15.620705
episode 1445.000000, reward total was -14.000000. running mean: -15.604498
episode 1446.000000, reward total was -17.000000. running mean: -15.618453
episode 1447.000000, reward total was -19.000000. running mean: -15.652269
episode 1448.000000, reward total was -8.000000. running mean: -15.575746
episode 1449.000000, reward total was -13.000000. running mean: -15.549989
episode 1450.000000, reward total was -13.000000. running mean: -15.524489
episode 1451.000000, reward total was -17.000000. running mean: -15.539244
episode 1452.000000, reward total was -17.000000. running mean: -15.553852
episode 1453.000000, reward total was -15.000000. running mean: -15.548313
episode 1454.000000, reward total was -13.000000. running mean: -15.522830
episode 1455.000000, rewar

episode 1552.000000, reward total was -15.000000. running mean: -15.316367
episode 1553.000000, reward total was -12.000000. running mean: -15.283203
episode 1554.000000, reward total was -17.000000. running mean: -15.300371
episode 1555.000000, reward total was -17.000000. running mean: -15.317367
episode 1556.000000, reward total was -15.000000. running mean: -15.314194
episode 1557.000000, reward total was -15.000000. running mean: -15.311052
episode 1558.000000, reward total was -17.000000. running mean: -15.327941
episode 1559.000000, reward total was -15.000000. running mean: -15.324662
episode 1560.000000, reward total was -19.000000. running mean: -15.361415
episode 1561.000000, reward total was -15.000000. running mean: -15.357801
episode 1562.000000, reward total was -13.000000. running mean: -15.334223
episode 1563.000000, reward total was -12.000000. running mean: -15.300881
episode 1564.000000, reward total was -15.000000. running mean: -15.297872
episode 1565.000000, rewa

episode 1662.000000, reward total was -15.000000. running mean: -14.663175
episode 1663.000000, reward total was -15.000000. running mean: -14.666544
episode 1664.000000, reward total was -16.000000. running mean: -14.679878
episode 1665.000000, reward total was -17.000000. running mean: -14.703079
episode 1666.000000, reward total was -12.000000. running mean: -14.676049
episode 1667.000000, reward total was -10.000000. running mean: -14.629288
episode 1668.000000, reward total was -17.000000. running mean: -14.652995
episode 1669.000000, reward total was -17.000000. running mean: -14.676465
episode 1670.000000, reward total was -11.000000. running mean: -14.639701
episode 1671.000000, reward total was -5.000000. running mean: -14.543304
episode 1672.000000, reward total was -11.000000. running mean: -14.507871
episode 1673.000000, reward total was -14.000000. running mean: -14.502792
episode 1674.000000, reward total was -12.000000. running mean: -14.477764
episode 1675.000000, rewar

episode 1772.000000, reward total was -11.000000. running mean: -14.112945
episode 1773.000000, reward total was -17.000000. running mean: -14.141816
episode 1774.000000, reward total was -10.000000. running mean: -14.100398
episode 1775.000000, reward total was -13.000000. running mean: -14.089394
episode 1776.000000, reward total was -11.000000. running mean: -14.058500
episode 1777.000000, reward total was -7.000000. running mean: -13.987915
episode 1778.000000, reward total was -16.000000. running mean: -14.008036
episode 1779.000000, reward total was -15.000000. running mean: -14.017955
episode 1780.000000, reward total was -11.000000. running mean: -13.987776
episode 1781.000000, reward total was -15.000000. running mean: -13.997898
episode 1782.000000, reward total was -12.000000. running mean: -13.977919
episode 1783.000000, reward total was -13.000000. running mean: -13.968140
episode 1784.000000, reward total was -16.000000. running mean: -13.988458
episode 1785.000000, rewar

episode 1882.000000, reward total was -13.000000. running mean: -13.676003
episode 1883.000000, reward total was -19.000000. running mean: -13.729243
episode 1884.000000, reward total was -10.000000. running mean: -13.691950
episode 1885.000000, reward total was -16.000000. running mean: -13.715031
episode 1886.000000, reward total was -14.000000. running mean: -13.717880
episode 1887.000000, reward total was -11.000000. running mean: -13.690702
episode 1888.000000, reward total was -14.000000. running mean: -13.693795
episode 1889.000000, reward total was -13.000000. running mean: -13.686857
episode 1890.000000, reward total was -11.000000. running mean: -13.659988
episode 1891.000000, reward total was -18.000000. running mean: -13.703388
episode 1892.000000, reward total was -10.000000. running mean: -13.666354
episode 1893.000000, reward total was -16.000000. running mean: -13.689691
episode 1894.000000, reward total was -13.000000. running mean: -13.682794
episode 1895.000000, rewa

episode 1992.000000, reward total was -16.000000. running mean: -13.303965
episode 1993.000000, reward total was -16.000000. running mean: -13.330926
episode 1994.000000, reward total was -17.000000. running mean: -13.367616
episode 1995.000000, reward total was -12.000000. running mean: -13.353940
episode 1996.000000, reward total was -21.000000. running mean: -13.430401
episode 1997.000000, reward total was -17.000000. running mean: -13.466097
episode 1998.000000, reward total was -17.000000. running mean: -13.501436
episode 1999.000000, reward total was -11.000000. running mean: -13.476421
episode 2000.000000, reward total was -12.000000. running mean: -13.461657
episode 2001.000000, reward total was -16.000000. running mean: -13.487041
episode 2002.000000, reward total was -14.000000. running mean: -13.492170
episode 2003.000000, reward total was -20.000000. running mean: -13.557248
episode 2004.000000, reward total was -5.000000. running mean: -13.471676
episode 2005.000000, rewar

episode 2102.000000, reward total was -16.000000. running mean: -13.214585
episode 2103.000000, reward total was -9.000000. running mean: -13.172439
episode 2104.000000, reward total was -13.000000. running mean: -13.170715
episode 2105.000000, reward total was -13.000000. running mean: -13.169008
episode 2106.000000, reward total was -18.000000. running mean: -13.217318
episode 2107.000000, reward total was -8.000000. running mean: -13.165144
episode 2108.000000, reward total was -12.000000. running mean: -13.153493
episode 2109.000000, reward total was -7.000000. running mean: -13.091958
episode 2110.000000, reward total was -16.000000. running mean: -13.121038
episode 2111.000000, reward total was -14.000000. running mean: -13.129828
episode 2112.000000, reward total was -17.000000. running mean: -13.168530
episode 2113.000000, reward total was -13.000000. running mean: -13.166844
episode 2114.000000, reward total was -13.000000. running mean: -13.165176
episode 2115.000000, reward 

episode 2212.000000, reward total was -12.000000. running mean: -13.220845
episode 2213.000000, reward total was -15.000000. running mean: -13.238637
episode 2214.000000, reward total was -15.000000. running mean: -13.256251
episode 2215.000000, reward total was -11.000000. running mean: -13.233688
episode 2216.000000, reward total was -13.000000. running mean: -13.231351
episode 2217.000000, reward total was -8.000000. running mean: -13.179038
episode 2218.000000, reward total was -11.000000. running mean: -13.157247
episode 2219.000000, reward total was -15.000000. running mean: -13.175675
episode 2220.000000, reward total was -15.000000. running mean: -13.193918
episode 2221.000000, reward total was -15.000000. running mean: -13.211979
episode 2222.000000, reward total was -17.000000. running mean: -13.249859
episode 2223.000000, reward total was -15.000000. running mean: -13.267361
episode 2224.000000, reward total was -12.000000. running mean: -13.254687
episode 2225.000000, rewar

episode 2322.000000, reward total was -13.000000. running mean: -12.802064
episode 2323.000000, reward total was -11.000000. running mean: -12.784044
episode 2324.000000, reward total was -14.000000. running mean: -12.796203
episode 2325.000000, reward total was -17.000000. running mean: -12.838241
episode 2326.000000, reward total was -12.000000. running mean: -12.829859
episode 2327.000000, reward total was -12.000000. running mean: -12.821560
episode 2328.000000, reward total was -3.000000. running mean: -12.723344
episode 2329.000000, reward total was -10.000000. running mean: -12.696111
episode 2330.000000, reward total was -5.000000. running mean: -12.619150
episode 2331.000000, reward total was -14.000000. running mean: -12.632958
episode 2332.000000, reward total was -10.000000. running mean: -12.606629
episode 2333.000000, reward total was -13.000000. running mean: -12.610563
episode 2334.000000, reward total was -8.000000. running mean: -12.564457
episode 2335.000000, reward 

episode 2432.000000, reward total was -15.000000. running mean: -12.764370
episode 2433.000000, reward total was -10.000000. running mean: -12.736727
episode 2434.000000, reward total was -10.000000. running mean: -12.709359
episode 2435.000000, reward total was -11.000000. running mean: -12.692266
episode 2436.000000, reward total was -5.000000. running mean: -12.615343
episode 2437.000000, reward total was -4.000000. running mean: -12.529190
episode 2438.000000, reward total was -13.000000. running mean: -12.533898
episode 2439.000000, reward total was -18.000000. running mean: -12.588559
episode 2440.000000, reward total was -17.000000. running mean: -12.632673
episode 2441.000000, reward total was -16.000000. running mean: -12.666347
episode 2442.000000, reward total was -15.000000. running mean: -12.689683
episode 2443.000000, reward total was -16.000000. running mean: -12.722786
episode 2444.000000, reward total was -11.000000. running mean: -12.705558
episode 2445.000000, reward

episode 2542.000000, reward total was -15.000000. running mean: -12.206489
episode 2543.000000, reward total was -11.000000. running mean: -12.194424
episode 2544.000000, reward total was -12.000000. running mean: -12.192480
episode 2545.000000, reward total was -12.000000. running mean: -12.190555
episode 2546.000000, reward total was -7.000000. running mean: -12.138649
episode 2547.000000, reward total was -13.000000. running mean: -12.147263
episode 2548.000000, reward total was -11.000000. running mean: -12.135790
episode 2549.000000, reward total was -8.000000. running mean: -12.094432
episode 2550.000000, reward total was 1.000000. running mean: -11.963488
episode 2551.000000, reward total was -15.000000. running mean: -11.993853
episode 2552.000000, reward total was -14.000000. running mean: -12.013915
episode 2553.000000, reward total was -11.000000. running mean: -12.003775
episode 2554.000000, reward total was -10.000000. running mean: -11.983738
episode 2555.000000, reward t

episode 2652.000000, reward total was -14.000000. running mean: -11.327590
episode 2653.000000, reward total was -15.000000. running mean: -11.364314
episode 2654.000000, reward total was -12.000000. running mean: -11.370671
episode 2655.000000, reward total was -6.000000. running mean: -11.316964
episode 2656.000000, reward total was -11.000000. running mean: -11.313794
episode 2657.000000, reward total was -7.000000. running mean: -11.270656
episode 2658.000000, reward total was -13.000000. running mean: -11.287950
episode 2659.000000, reward total was -12.000000. running mean: -11.295070
episode 2660.000000, reward total was -15.000000. running mean: -11.332120
episode 2661.000000, reward total was -7.000000. running mean: -11.288798
episode 2662.000000, reward total was -15.000000. running mean: -11.325910
episode 2663.000000, reward total was -9.000000. running mean: -11.302651
episode 2664.000000, reward total was -3.000000. running mean: -11.219625
episode 2665.000000, reward to

episode 2762.000000, reward total was -11.000000. running mean: -10.701337
episode 2763.000000, reward total was -17.000000. running mean: -10.764323
episode 2764.000000, reward total was -11.000000. running mean: -10.766680
episode 2765.000000, reward total was -2.000000. running mean: -10.679013
episode 2766.000000, reward total was -7.000000. running mean: -10.642223
episode 2767.000000, reward total was -9.000000. running mean: -10.625801
episode 2768.000000, reward total was -7.000000. running mean: -10.589543
episode 2769.000000, reward total was -8.000000. running mean: -10.563647
episode 2770.000000, reward total was -12.000000. running mean: -10.578011
episode 2771.000000, reward total was -6.000000. running mean: -10.532231
episode 2772.000000, reward total was -1.000000. running mean: -10.436909
episode 2773.000000, reward total was -12.000000. running mean: -10.452539
episode 2774.000000, reward total was -8.000000. running mean: -10.428014
episode 2775.000000, reward total

episode 2874.000000, reward total was -13.000000. running mean: -9.072681
episode 2875.000000, reward total was -12.000000. running mean: -9.101954
episode 2876.000000, reward total was -8.000000. running mean: -9.090934
episode 2877.000000, reward total was -15.000000. running mean: -9.150025
episode 2878.000000, reward total was -7.000000. running mean: -9.128525
episode 2879.000000, reward total was -12.000000. running mean: -9.157239
episode 2880.000000, reward total was -16.000000. running mean: -9.225667
episode 2881.000000, reward total was -4.000000. running mean: -9.173410
episode 2882.000000, reward total was 1.000000. running mean: -9.071676
episode 2883.000000, reward total was -7.000000. running mean: -9.050959
episode 2884.000000, reward total was -13.000000. running mean: -9.090450
episode 2885.000000, reward total was -11.000000. running mean: -9.109545
episode 2886.000000, reward total was -5.000000. running mean: -9.068450
episode 2887.000000, reward total was -7.0000

episode 2986.000000, reward total was -10.000000. running mean: -9.436669
episode 2987.000000, reward total was -15.000000. running mean: -9.492302
episode 2988.000000, reward total was -17.000000. running mean: -9.567379
episode 2989.000000, reward total was -11.000000. running mean: -9.581705
episode 2990.000000, reward total was -11.000000. running mean: -9.595888
episode 2991.000000, reward total was -15.000000. running mean: -9.649930
episode 2992.000000, reward total was -3.000000. running mean: -9.583430
episode 2993.000000, reward total was -11.000000. running mean: -9.597596
episode 2994.000000, reward total was -6.000000. running mean: -9.561620
episode 2995.000000, reward total was -11.000000. running mean: -9.576004
episode 2996.000000, reward total was -3.000000. running mean: -9.510244
episode 2997.000000, reward total was -10.000000. running mean: -9.515141
episode 2998.000000, reward total was -13.000000. running mean: -9.549990
episode 2999.000000, reward total was -16

episode 3098.000000, reward total was 2.000000. running mean: -9.612818
episode 3099.000000, reward total was 2.000000. running mean: -9.496690
episode 3100.000000, reward total was -9.000000. running mean: -9.491723
episode 3101.000000, reward total was -6.000000. running mean: -9.456806
episode 3102.000000, reward total was -12.000000. running mean: -9.482238
episode 3103.000000, reward total was -8.000000. running mean: -9.467415
episode 3104.000000, reward total was -12.000000. running mean: -9.492741
episode 3105.000000, reward total was -2.000000. running mean: -9.417814
episode 3106.000000, reward total was -8.000000. running mean: -9.403635
episode 3107.000000, reward total was -8.000000. running mean: -9.389599
episode 3108.000000, reward total was -11.000000. running mean: -9.405703
episode 3109.000000, reward total was -11.000000. running mean: -9.421646
episode 3110.000000, reward total was -7.000000. running mean: -9.397430
episode 3111.000000, reward total was -10.000000.

episode 3210.000000, reward total was -17.000000. running mean: -9.577718
episode 3211.000000, reward total was -3.000000. running mean: -9.511941
episode 3212.000000, reward total was -10.000000. running mean: -9.516822
episode 3213.000000, reward total was -12.000000. running mean: -9.541653
episode 3214.000000, reward total was -8.000000. running mean: -9.526237
episode 3215.000000, reward total was -7.000000. running mean: -9.500974
episode 3216.000000, reward total was -11.000000. running mean: -9.515965
episode 3217.000000, reward total was -11.000000. running mean: -9.530805
episode 3218.000000, reward total was -3.000000. running mean: -9.465497
episode 3219.000000, reward total was -15.000000. running mean: -9.520842
episode 3220.000000, reward total was -7.000000. running mean: -9.495634
episode 3221.000000, reward total was -13.000000. running mean: -9.530677
episode 3222.000000, reward total was -3.000000. running mean: -9.465370
episode 3223.000000, reward total was -10.00

episode 3322.000000, reward total was -1.000000. running mean: -8.760061
episode 3323.000000, reward total was -14.000000. running mean: -8.812461
episode 3324.000000, reward total was -9.000000. running mean: -8.814336
episode 3325.000000, reward total was -10.000000. running mean: -8.826193
episode 3326.000000, reward total was -6.000000. running mean: -8.797931
episode 3327.000000, reward total was -5.000000. running mean: -8.759952
episode 3328.000000, reward total was -13.000000. running mean: -8.802352
episode 3329.000000, reward total was -13.000000. running mean: -8.844329
episode 3330.000000, reward total was -10.000000. running mean: -8.855885
episode 3331.000000, reward total was 1.000000. running mean: -8.757326
episode 3332.000000, reward total was -9.000000. running mean: -8.759753
episode 3333.000000, reward total was -5.000000. running mean: -8.722156
episode 3334.000000, reward total was -14.000000. running mean: -8.774934
episode 3335.000000, reward total was -15.0000

episode 3434.000000, reward total was -15.000000. running mean: -8.659572
episode 3435.000000, reward total was -15.000000. running mean: -8.722977
episode 3436.000000, reward total was -7.000000. running mean: -8.705747
episode 3437.000000, reward total was -8.000000. running mean: -8.698689
episode 3438.000000, reward total was -14.000000. running mean: -8.751702
episode 3439.000000, reward total was -1.000000. running mean: -8.674185
episode 3440.000000, reward total was -6.000000. running mean: -8.647444
episode 3441.000000, reward total was -6.000000. running mean: -8.620969
episode 3442.000000, reward total was -5.000000. running mean: -8.584759
episode 3443.000000, reward total was -7.000000. running mean: -8.568912
episode 3444.000000, reward total was -15.000000. running mean: -8.633223
episode 3445.000000, reward total was -13.000000. running mean: -8.676891
episode 3446.000000, reward total was -4.000000. running mean: -8.630122
episode 3447.000000, reward total was -12.0000

episode 3546.000000, reward total was -3.000000. running mean: -8.792078
episode 3547.000000, reward total was -8.000000. running mean: -8.784157
episode 3548.000000, reward total was -9.000000. running mean: -8.786316
episode 3549.000000, reward total was -3.000000. running mean: -8.728453
episode 3550.000000, reward total was -17.000000. running mean: -8.811168
episode 3551.000000, reward total was -2.000000. running mean: -8.743056
episode 3552.000000, reward total was -6.000000. running mean: -8.715626
episode 3553.000000, reward total was -3.000000. running mean: -8.658470
episode 3554.000000, reward total was -9.000000. running mean: -8.661885
episode 3555.000000, reward total was -9.000000. running mean: -8.665266
episode 3556.000000, reward total was -6.000000. running mean: -8.638613
episode 3557.000000, reward total was -8.000000. running mean: -8.632227
episode 3558.000000, reward total was -7.000000. running mean: -8.615905
episode 3559.000000, reward total was -5.000000. r

episode 3658.000000, reward total was -12.000000. running mean: -8.494445
episode 3659.000000, reward total was -9.000000. running mean: -8.499501
episode 3660.000000, reward total was -13.000000. running mean: -8.544506
episode 3661.000000, reward total was -9.000000. running mean: -8.549061
episode 3662.000000, reward total was -11.000000. running mean: -8.573570
episode 3663.000000, reward total was -11.000000. running mean: -8.597834
episode 3664.000000, reward total was -15.000000. running mean: -8.661856
episode 3665.000000, reward total was -9.000000. running mean: -8.665238
episode 3666.000000, reward total was -5.000000. running mean: -8.628585
episode 3667.000000, reward total was -12.000000. running mean: -8.662299
episode 3668.000000, reward total was -9.000000. running mean: -8.665676
episode 3669.000000, reward total was -5.000000. running mean: -8.629020
episode 3670.000000, reward total was -14.000000. running mean: -8.682729
episode 3671.000000, reward total was -3.000

episode 3770.000000, reward total was -15.000000. running mean: -7.383870
episode 3771.000000, reward total was -6.000000. running mean: -7.370031
episode 3772.000000, reward total was -11.000000. running mean: -7.406331
episode 3773.000000, reward total was -1.000000. running mean: -7.342268
episode 3774.000000, reward total was -7.000000. running mean: -7.338845
episode 3775.000000, reward total was -8.000000. running mean: -7.345457
episode 3776.000000, reward total was -8.000000. running mean: -7.352002
episode 3777.000000, reward total was -10.000000. running mean: -7.378482
episode 3778.000000, reward total was -6.000000. running mean: -7.364697
episode 3779.000000, reward total was -11.000000. running mean: -7.401050
episode 3780.000000, reward total was -13.000000. running mean: -7.457040
episode 3781.000000, reward total was 1.000000. running mean: -7.372469
episode 3782.000000, reward total was -7.000000. running mean: -7.368745
episode 3783.000000, reward total was -9.000000

episode 3882.000000, reward total was -2.000000. running mean: -7.152460
episode 3883.000000, reward total was -4.000000. running mean: -7.120936
episode 3884.000000, reward total was -7.000000. running mean: -7.119726
episode 3885.000000, reward total was -5.000000. running mean: -7.098529
episode 3886.000000, reward total was -13.000000. running mean: -7.157544
episode 3887.000000, reward total was -7.000000. running mean: -7.155968
episode 3888.000000, reward total was -9.000000. running mean: -7.174409
episode 3889.000000, reward total was -4.000000. running mean: -7.142665
episode 3890.000000, reward total was -7.000000. running mean: -7.141238
episode 3891.000000, reward total was -9.000000. running mean: -7.159826
episode 3892.000000, reward total was -7.000000. running mean: -7.158227
episode 3893.000000, reward total was -5.000000. running mean: -7.136645
episode 3894.000000, reward total was -9.000000. running mean: -7.155279
episode 3895.000000, reward total was -11.000000. 

episode 3994.000000, reward total was -2.000000. running mean: -7.855101
episode 3995.000000, reward total was -12.000000. running mean: -7.896550
episode 3996.000000, reward total was -6.000000. running mean: -7.877585
episode 3997.000000, reward total was -17.000000. running mean: -7.968809
episode 3998.000000, reward total was -3.000000. running mean: -7.919121
episode 3999.000000, reward total was 1.000000. running mean: -7.829930
episode 4000.000000, reward total was -12.000000. running mean: -7.871630
episode 4001.000000, reward total was -1.000000. running mean: -7.802914
episode 4002.000000, reward total was -11.000000. running mean: -7.834885
episode 4003.000000, reward total was -3.000000. running mean: -7.786536
episode 4004.000000, reward total was -2.000000. running mean: -7.728671
episode 4005.000000, reward total was -14.000000. running mean: -7.791384
episode 4006.000000, reward total was 1.000000. running mean: -7.703470
episode 4007.000000, reward total was -16.000000

episode 4106.000000, reward total was -14.000000. running mean: -6.617808
episode 4107.000000, reward total was -3.000000. running mean: -6.581630
episode 4108.000000, reward total was -14.000000. running mean: -6.655814
episode 4109.000000, reward total was -9.000000. running mean: -6.679256
episode 4110.000000, reward total was -10.000000. running mean: -6.712463
episode 4111.000000, reward total was -4.000000. running mean: -6.685338
episode 4112.000000, reward total was -3.000000. running mean: -6.648485
episode 4113.000000, reward total was -4.000000. running mean: -6.622000
episode 4114.000000, reward total was -4.000000. running mean: -6.595780
episode 4115.000000, reward total was 2.000000. running mean: -6.509822
episode 4116.000000, reward total was -1.000000. running mean: -6.454724
episode 4117.000000, reward total was 5.000000. running mean: -6.340177
episode 4118.000000, reward total was -7.000000. running mean: -6.346775
episode 4119.000000, reward total was -7.000000. r

episode 4218.000000, reward total was -2.000000. running mean: -6.487690
episode 4219.000000, reward total was 2.000000. running mean: -6.402813
episode 4220.000000, reward total was -7.000000. running mean: -6.408785
episode 4221.000000, reward total was -12.000000. running mean: -6.464697
episode 4222.000000, reward total was -6.000000. running mean: -6.460050
episode 4223.000000, reward total was -10.000000. running mean: -6.495450
episode 4224.000000, reward total was -1.000000. running mean: -6.440495
episode 4225.000000, reward total was -1.000000. running mean: -6.386090
episode 4226.000000, reward total was -7.000000. running mean: -6.392230
episode 4227.000000, reward total was -8.000000. running mean: -6.408307
episode 4228.000000, reward total was -6.000000. running mean: -6.404224
episode 4229.000000, reward total was -3.000000. running mean: -6.370182
episode 4230.000000, reward total was -14.000000. running mean: -6.446480
episode 4231.000000, reward total was -5.000000. 

episode 4331.000000, reward total was -7.000000. running mean: -5.633332
episode 4332.000000, reward total was 3.000000. running mean: -5.546999
episode 4333.000000, reward total was -5.000000. running mean: -5.541529
episode 4334.000000, reward total was -7.000000. running mean: -5.556114
episode 4335.000000, reward total was -3.000000. running mean: -5.530552
episode 4336.000000, reward total was -2.000000. running mean: -5.495247
episode 4337.000000, reward total was -3.000000. running mean: -5.470294
episode 4338.000000, reward total was -4.000000. running mean: -5.455592
episode 4339.000000, reward total was -2.000000. running mean: -5.421036
episode 4340.000000, reward total was -12.000000. running mean: -5.486825
episode 4341.000000, reward total was 3.000000. running mean: -5.401957
episode 4342.000000, reward total was -8.000000. running mean: -5.427937
episode 4343.000000, reward total was -13.000000. running mean: -5.503658
episode 4344.000000, reward total was -7.000000. ru

episode 4444.000000, reward total was -8.000000. running mean: -5.899808
episode 4445.000000, reward total was -17.000000. running mean: -6.010809
episode 4446.000000, reward total was -5.000000. running mean: -6.000701
episode 4447.000000, reward total was -8.000000. running mean: -6.020694
episode 4448.000000, reward total was -13.000000. running mean: -6.090487
episode 4449.000000, reward total was -4.000000. running mean: -6.069583
episode 4450.000000, reward total was -9.000000. running mean: -6.098887
episode 4451.000000, reward total was -17.000000. running mean: -6.207898
episode 4452.000000, reward total was -15.000000. running mean: -6.295819
episode 4453.000000, reward total was -11.000000. running mean: -6.342861
episode 4454.000000, reward total was -8.000000. running mean: -6.359432
episode 4455.000000, reward total was -10.000000. running mean: -6.395838
episode 4456.000000, reward total was -12.000000. running mean: -6.451879
episode 4457.000000, reward total was -6.000

episode 4556.000000, reward total was -6.000000. running mean: -6.165149
episode 4557.000000, reward total was -5.000000. running mean: -6.153498
episode 4558.000000, reward total was 3.000000. running mean: -6.061963
episode 4559.000000, reward total was -9.000000. running mean: -6.091343
episode 4560.000000, reward total was -11.000000. running mean: -6.140430
episode 4561.000000, reward total was -10.000000. running mean: -6.179026
episode 4562.000000, reward total was -11.000000. running mean: -6.227235
episode 4563.000000, reward total was -3.000000. running mean: -6.194963
episode 4564.000000, reward total was 7.000000. running mean: -6.063013
episode 4565.000000, reward total was -6.000000. running mean: -6.062383
episode 4566.000000, reward total was 2.000000. running mean: -5.981759
episode 4567.000000, reward total was 5.000000. running mean: -5.871942
episode 4568.000000, reward total was -6.000000. running mean: -5.873222
episode 4569.000000, reward total was -8.000000. run

episode 4669.000000, reward total was -13.000000. running mean: -5.875959
episode 4670.000000, reward total was 6.000000. running mean: -5.757199
episode 4671.000000, reward total was -5.000000. running mean: -5.749627
episode 4672.000000, reward total was -11.000000. running mean: -5.802131
episode 4673.000000, reward total was -13.000000. running mean: -5.874110
episode 4674.000000, reward total was -9.000000. running mean: -5.905369
episode 4675.000000, reward total was 3.000000. running mean: -5.816315
episode 4676.000000, reward total was -17.000000. running mean: -5.928152
episode 4677.000000, reward total was -5.000000. running mean: -5.918870
episode 4678.000000, reward total was -11.000000. running mean: -5.969682
episode 4679.000000, reward total was 9.000000. running mean: -5.819985
episode 4680.000000, reward total was 4.000000. running mean: -5.721785
episode 4681.000000, reward total was -9.000000. running mean: -5.754567
episode 4682.000000, reward total was -9.000000. r

episode 4782.000000, reward total was -12.000000. running mean: -5.085262
episode 4783.000000, reward total was -6.000000. running mean: -5.094409
episode 4784.000000, reward total was -11.000000. running mean: -5.153465
episode 4785.000000, reward total was -11.000000. running mean: -5.211931
episode 4786.000000, reward total was -7.000000. running mean: -5.229811
episode 4787.000000, reward total was -2.000000. running mean: -5.197513
episode 4788.000000, reward total was 5.000000. running mean: -5.095538
episode 4789.000000, reward total was -3.000000. running mean: -5.074583
episode 4790.000000, reward total was 1.000000. running mean: -5.013837
episode 4791.000000, reward total was -12.000000. running mean: -5.083699
episode 4792.000000, reward total was 6.000000. running mean: -4.972862
episode 4793.000000, reward total was -4.000000. running mean: -4.963133
episode 4794.000000, reward total was -2.000000. running mean: -4.933502
episode 4795.000000, reward total was -9.000000. r

episode 4895.000000, reward total was -11.000000. running mean: -5.200813
episode 4896.000000, reward total was -2.000000. running mean: -5.168804
episode 4897.000000, reward total was -7.000000. running mean: -5.187116
episode 4898.000000, reward total was -3.000000. running mean: -5.165245
episode 4899.000000, reward total was 2.000000. running mean: -5.093593
episode 4900.000000, reward total was -3.000000. running mean: -5.072657
episode 4901.000000, reward total was -1.000000. running mean: -5.031930
episode 4902.000000, reward total was -3.000000. running mean: -5.011611
episode 4903.000000, reward total was -1.000000. running mean: -4.971495
episode 4904.000000, reward total was 6.000000. running mean: -4.861780
episode 4905.000000, reward total was -10.000000. running mean: -4.913162
episode 4906.000000, reward total was -14.000000. running mean: -5.004031
episode 4907.000000, reward total was -5.000000. running mean: -5.003990
episode 4908.000000, reward total was 1.000000. ru

episode 5008.000000, reward total was -6.000000. running mean: -4.638596
episode 5009.000000, reward total was -9.000000. running mean: -4.682210
episode 5010.000000, reward total was -4.000000. running mean: -4.675388
episode 5011.000000, reward total was -2.000000. running mean: -4.648634
episode 5012.000000, reward total was -6.000000. running mean: -4.662147
episode 5013.000000, reward total was -1.000000. running mean: -4.625526
episode 5014.000000, reward total was -8.000000. running mean: -4.659271
episode 5015.000000, reward total was -8.000000. running mean: -4.692678
episode 5016.000000, reward total was -12.000000. running mean: -4.765751
episode 5017.000000, reward total was -7.000000. running mean: -4.788094
episode 5018.000000, reward total was -5.000000. running mean: -4.790213
episode 5019.000000, reward total was -5.000000. running mean: -4.792311
episode 5020.000000, reward total was -2.000000. running mean: -4.764387
episode 5021.000000, reward total was -9.000000. r

episode 5121.000000, reward total was -7.000000. running mean: -4.780121
episode 5122.000000, reward total was -11.000000. running mean: -4.842320
episode 5123.000000, reward total was -11.000000. running mean: -4.903897
episode 5124.000000, reward total was 7.000000. running mean: -4.784858
episode 5125.000000, reward total was -7.000000. running mean: -4.807009
episode 5126.000000, reward total was -5.000000. running mean: -4.808939
episode 5127.000000, reward total was -7.000000. running mean: -4.830850
episode 5128.000000, reward total was -7.000000. running mean: -4.852541
episode 5129.000000, reward total was -4.000000. running mean: -4.844016
episode 5130.000000, reward total was -11.000000. running mean: -4.905576
episode 5131.000000, reward total was -5.000000. running mean: -4.906520
episode 5132.000000, reward total was 2.000000. running mean: -4.837455
episode 5133.000000, reward total was -11.000000. running mean: -4.899080
episode 5134.000000, reward total was -7.000000. 

episode 5234.000000, reward total was -6.000000. running mean: -5.458244
episode 5235.000000, reward total was -10.000000. running mean: -5.503662
episode 5236.000000, reward total was -7.000000. running mean: -5.518625
episode 5237.000000, reward total was -9.000000. running mean: -5.553439
episode 5238.000000, reward total was 7.000000. running mean: -5.427905
episode 5239.000000, reward total was -8.000000. running mean: -5.453626
episode 5240.000000, reward total was -9.000000. running mean: -5.489089
episode 5241.000000, reward total was -3.000000. running mean: -5.464198
episode 5242.000000, reward total was 6.000000. running mean: -5.349556
episode 5243.000000, reward total was 1.000000. running mean: -5.286061
episode 5244.000000, reward total was -10.000000. running mean: -5.333200
episode 5245.000000, reward total was 11.000000. running mean: -5.169868
episode 5246.000000, reward total was -3.000000. running mean: -5.148170
episode 5247.000000, reward total was -14.000000. ru

episode 5347.000000, reward total was -5.000000. running mean: -5.215736
episode 5348.000000, reward total was -1.000000. running mean: -5.173579
episode 5349.000000, reward total was 1.000000. running mean: -5.111843
episode 5350.000000, reward total was 7.000000. running mean: -4.990724
episode 5351.000000, reward total was -7.000000. running mean: -5.010817
episode 5352.000000, reward total was 16.000000. running mean: -4.800709
episode 5353.000000, reward total was -5.000000. running mean: -4.802702
episode 5354.000000, reward total was -1.000000. running mean: -4.764675
episode 5355.000000, reward total was -10.000000. running mean: -4.817028
episode 5356.000000, reward total was -6.000000. running mean: -4.828858
episode 5357.000000, reward total was -10.000000. running mean: -4.880569
episode 5358.000000, reward total was 4.000000. running mean: -4.791764
episode 5359.000000, reward total was -11.000000. running mean: -4.853846
episode 5360.000000, reward total was 3.000000. run

episode 5460.000000, reward total was 8.000000. running mean: -4.804721
episode 5461.000000, reward total was -5.000000. running mean: -4.806674
episode 5462.000000, reward total was -3.000000. running mean: -4.788607
episode 5463.000000, reward total was -7.000000. running mean: -4.810721
episode 5464.000000, reward total was -7.000000. running mean: -4.832614
episode 5465.000000, reward total was -2.000000. running mean: -4.804288
episode 5466.000000, reward total was -3.000000. running mean: -4.786245
episode 5467.000000, reward total was -7.000000. running mean: -4.808382
episode 5468.000000, reward total was -11.000000. running mean: -4.870298
episode 5469.000000, reward total was -12.000000. running mean: -4.941595
episode 5470.000000, reward total was 3.000000. running mean: -4.862179
episode 5471.000000, reward total was 4.000000. running mean: -4.773558
episode 5472.000000, reward total was -12.000000. running mean: -4.845822
episode 5473.000000, reward total was -9.000000. ru