In [1]:
import gym
import numpy as np


In [2]:


from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the imported gym module object; whether gym
# actually consults it is not visible here — confirm it has any effect.
gym.new_step_api=True
# Environment instance that is later passed to train_model().
env = gym.make('Pong-v0')

# Policy network sizes and parameters (two weight arrays, no biases).
H = 400 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
model = {} # 'W1': (H, D) input->hidden weights, 'W2': (H,) hidden->output weights
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H)
# NOTE(review): no random seed is set in the visible cells, so the initial weights differ on every run.
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dicts mirror the keys and shapes of `model` and start out as zeros.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
  """Logistic function: maps x (scalar or array, element-wise) into the interval [0, 1]."""
  denominator = 1.0 + np.exp(-x)
  return 1.0 / denominator

def prepro(I):
  """Turn a raw game frame into a flat binary float vector.

  The frame is cropped to rows 35..194, every second row and column is kept,
  and only channel 0 is used. Pixels equal to 144 or 109 (the two background
  values) become 0; every other non-zero pixel becomes 1.

  Args:
    I: array-like frame indexed as [row, column, channel]. A frame with at
       least 195 rows and 160 columns yields an 80x80 grid (6400 values).

  Returns:
    1-D float array holding the flattened binary grid.

  The input is never modified.
  """
  # Basic slicing returns a view of the caller's array, so take an explicit copy
  # before masking; otherwise the assignments below would overwrite the observation
  # (or fail outright if the frame is read-only).
  I = np.asarray(I)[35:195:2, ::2, 0].copy() # crop + downsample by factor of 2, channel 0 only
  I[I == 144] = 0 # erase background (background type 1)
  I[I == 109] = 0 # erase background (background type 2)
  I[I != 0] = 1 # everything else (paddles, ball) just set to 1
  return I.astype(float).ravel()

def discount_rewards(r, discount_factor=None):
  """Compute discounted returns over a reward sequence, walking backwards in time.

  Any non-zero reward is treated as a game boundary (Pong specific): the running
  sum is reset there, so a return never leaks across a boundary.

  Args:
    r: array-like of rewards; elements are visited in flattened order.
    discount_factor: per-step discount. Defaults to the module-level `gamma`.

  Returns:
    Float array with the same shape as `r` holding the discounted returns.
  """
  if discount_factor is None:
    discount_factor = gamma
  r = np.asarray(r)
  # Force a float result: zeros_like would otherwise inherit an integer dtype
  # from `r` and silently truncate the fractional discounted values.
  discounted_r = np.zeros_like(r, dtype=float)
  running_add = 0.0
  for t in reversed(range(r.size)):
    step_reward = r.flat[t]
    if step_reward != 0: running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount_factor + step_reward
    discounted_r.flat[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy stored in the module-level `model`.

  Returns a pair (p, h): p is the sigmoid output, interpreted by the callers as
  the probability of taking action 2, and h is the hidden activation after ReLU.
  """
  pre_activation = model['W1'].dot(x)
  h = np.where(pre_activation < 0, 0, pre_activation) # ReLU: negative entries become zero
  p = sigmoid(model['W2'].dot(h))
  return p, h

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns gradients keyed like `model`.

  epx holds the stacked inputs, eph the stacked hidden states and epdlogp the
  stacked per-step output gradients (one row per time step in each).
  """
  hidden_to_out = model['W2']
  grad_w2 = eph.T.dot(epdlogp).ravel()
  # Push the output gradient back through W2, then gate it by the ReLU:
  # units that were inactive (hidden state <= 0) receive no gradient.
  grad_hidden = np.outer(epdlogp, hidden_to_out)
  grad_hidden = np.where(eph <= 0, 0.0, grad_hidden)
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Choose the action for one frame and return (action, preprocessed frame).

  The policy input is the difference between the current and previous
  preprocessed frames, or all zeros when prev_x is None (first frame).
  The choice is greedy: action 2 when the policy output is >= 0.5, else 3.
  The `model` argument is not read here; policy_forward uses the module-level model.
  """
  frame = prepro(observation)
  if prev_x is None:
    diff = np.zeros(D)
  else:
    diff = frame - prev_x

  up_probability, _hidden = policy_forward(diff)
  if up_probability >= 0.5:
    chosen = 2
  else:
    chosen = 3

  # the current frame becomes the caller's prev_x for the next step
  return chosen, frame

def play_game(env, model):
  """Play one episode (at most 1000 steps) with the greedy policy and show it as a GIF.

  Args:
    env: environment exposing reset(), render(mode='rgb_array'), step() and close().
    model: forwarded to model_step().

  Prints the accumulated reward, closes `env`, then hands the rendered frames
  to display_frames_as_gif (not defined in the visible cells — it must already
  exist in the kernel).
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(1000):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # Only reached when the step budget ran out without `done`; previously this
    # message was also printed right after the success message above.
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients for `total_episodes` episodes.

  Args:
    env: environment exposing reset() and a step() that returns
      (observation, reward, done, info).
    model: dict of weight arrays, updated in place. policy_forward and
      policy_backward read the module-level `model`, so pass that same object.
    total_episodes: number of episodes to run; values <= 0 return immediately.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per episode.

  Side effects: accumulates into the module-level grad_buffer and rmsprop_cache,
  and prints one progress line per episode. Gradients are applied with RMSProp
  every `batch_size` episodes.
  """
  hist = []
  if total_episodes <= 0:
    # the termination check below could never fire for such values
    return hist
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs).astype(float) # float so the in-place standardization below cannot hit an integer dtype
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0:
        # skip the division for a constant return sequence: dividing by zero
        # would put NaNs into the gradients and from there into the weights
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      if episode_number >= total_episodes:
        return hist

    # if reward != 0: # Pong has either +1 or -1 reward exactly when game ends.
    #   print('ep {}: game finished, reward: {}'.format(episode_number, reward) + ('' if reward == -1 else ' !!!!!!!!'))

  logger.warn(
  deprecation(
  deprecation(


In [3]:
# Run 5500 training episodes and keep the per-episode history
# (episode number, episode reward, running mean reward) in hist1; %time reports how long the call took.
%time hist1 = train_model(env, model, total_episodes=5500)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -20.000000. running mean: -20.990000
episode 3.000000, reward total was -21.000000. running mean: -20.990100
episode 4.000000, reward total was -20.000000. running mean: -20.980199
episode 5.000000, reward total was -21.000000. running mean: -20.980397
episode 6.000000, reward total was -21.000000. running mean: -20.980593
episode 7.000000, reward total was -19.000000. running mean: -20.960787
episode 8.000000, reward total was -21.000000. running mean: -20.961179
episode 9.000000, reward total was -21.000000. running mean: -20.961567
episode 10.000000, reward total was -21.000000. running mean: -20.961952
episode 11.000000, reward total was -21.000000. running mean: -20.962332
episode 12.000000, reward total was -21.000000. running mean: -20.962709
episode 13.000000, reward total was -21.000000. running mean: -20.963082
episode 14.000000, reward total was -20.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.642302
episode 115.000000, reward total was -20.000000. running mean: -20.635879
episode 116.000000, reward total was -21.000000. running mean: -20.639520
episode 117.000000, reward total was -21.000000. running mean: -20.643125
episode 118.000000, reward total was -21.000000. running mean: -20.646693
episode 119.000000, reward total was -21.000000. running mean: -20.650226
episode 120.000000, reward total was -20.000000. running mean: -20.643724
episode 121.000000, reward total was -21.000000. running mean: -20.647287
episode 122.000000, reward total was -21.000000. running mean: -20.650814
episode 123.000000, reward total was -21.000000. running mean: -20.654306
episode 124.000000, reward total was -21.000000. running mean: -20.657763
episode 125.000000, reward total was -20.000000. running mean: -20.651185
episode 126.000000, reward total was -21.000000. running mean: -20.654673
episode 127.000000, reward total was -

episode 225.000000, reward total was -21.000000. running mean: -20.361948
episode 226.000000, reward total was -21.000000. running mean: -20.368329
episode 227.000000, reward total was -21.000000. running mean: -20.374646
episode 228.000000, reward total was -19.000000. running mean: -20.360899
episode 229.000000, reward total was -18.000000. running mean: -20.337290
episode 230.000000, reward total was -21.000000. running mean: -20.343917
episode 231.000000, reward total was -21.000000. running mean: -20.350478
episode 232.000000, reward total was -20.000000. running mean: -20.346973
episode 233.000000, reward total was -21.000000. running mean: -20.353504
episode 234.000000, reward total was -21.000000. running mean: -20.359969
episode 235.000000, reward total was -21.000000. running mean: -20.366369
episode 236.000000, reward total was -21.000000. running mean: -20.372705
episode 237.000000, reward total was -21.000000. running mean: -20.378978
episode 238.000000, reward total was -

episode 336.000000, reward total was -21.000000. running mean: -20.431145
episode 337.000000, reward total was -20.000000. running mean: -20.426834
episode 338.000000, reward total was -21.000000. running mean: -20.432565
episode 339.000000, reward total was -20.000000. running mean: -20.428240
episode 340.000000, reward total was -21.000000. running mean: -20.433957
episode 341.000000, reward total was -20.000000. running mean: -20.429618
episode 342.000000, reward total was -21.000000. running mean: -20.435321
episode 343.000000, reward total was -20.000000. running mean: -20.430968
episode 344.000000, reward total was -21.000000. running mean: -20.436659
episode 345.000000, reward total was -21.000000. running mean: -20.442292
episode 346.000000, reward total was -21.000000. running mean: -20.447869
episode 347.000000, reward total was -19.000000. running mean: -20.433390
episode 348.000000, reward total was -21.000000. running mean: -20.439056
episode 349.000000, reward total was -

episode 447.000000, reward total was -19.000000. running mean: -20.374227
episode 448.000000, reward total was -20.000000. running mean: -20.370485
episode 449.000000, reward total was -20.000000. running mean: -20.366780
episode 450.000000, reward total was -19.000000. running mean: -20.353112
episode 451.000000, reward total was -21.000000. running mean: -20.359581
episode 452.000000, reward total was -20.000000. running mean: -20.355985
episode 453.000000, reward total was -19.000000. running mean: -20.342425
episode 454.000000, reward total was -21.000000. running mean: -20.349001
episode 455.000000, reward total was -21.000000. running mean: -20.355511
episode 456.000000, reward total was -21.000000. running mean: -20.361956
episode 457.000000, reward total was -21.000000. running mean: -20.368336
episode 458.000000, reward total was -21.000000. running mean: -20.374653
episode 459.000000, reward total was -21.000000. running mean: -20.380906
episode 460.000000, reward total was -

episode 558.000000, reward total was -21.000000. running mean: -20.362740
episode 559.000000, reward total was -20.000000. running mean: -20.359112
episode 560.000000, reward total was -21.000000. running mean: -20.365521
episode 561.000000, reward total was -21.000000. running mean: -20.371866
episode 562.000000, reward total was -21.000000. running mean: -20.378147
episode 563.000000, reward total was -20.000000. running mean: -20.374366
episode 564.000000, reward total was -21.000000. running mean: -20.380622
episode 565.000000, reward total was -18.000000. running mean: -20.356816
episode 566.000000, reward total was -19.000000. running mean: -20.343248
episode 567.000000, reward total was -21.000000. running mean: -20.349815
episode 568.000000, reward total was -21.000000. running mean: -20.356317
episode 569.000000, reward total was -19.000000. running mean: -20.342754
episode 570.000000, reward total was -21.000000. running mean: -20.349326
episode 571.000000, reward total was -

episode 669.000000, reward total was -19.000000. running mean: -20.317592
episode 670.000000, reward total was -21.000000. running mean: -20.324416
episode 671.000000, reward total was -21.000000. running mean: -20.331172
episode 672.000000, reward total was -21.000000. running mean: -20.337860
episode 673.000000, reward total was -20.000000. running mean: -20.334481
episode 674.000000, reward total was -21.000000. running mean: -20.341136
episode 675.000000, reward total was -20.000000. running mean: -20.337725
episode 676.000000, reward total was -20.000000. running mean: -20.334348
episode 677.000000, reward total was -21.000000. running mean: -20.341004
episode 678.000000, reward total was -20.000000. running mean: -20.337594
episode 679.000000, reward total was -20.000000. running mean: -20.334218
episode 680.000000, reward total was -17.000000. running mean: -20.300876
episode 681.000000, reward total was -19.000000. running mean: -20.287867
episode 682.000000, reward total was -

episode 780.000000, reward total was -19.000000. running mean: -20.184073
episode 781.000000, reward total was -19.000000. running mean: -20.172232
episode 782.000000, reward total was -20.000000. running mean: -20.170510
episode 783.000000, reward total was -20.000000. running mean: -20.168805
episode 784.000000, reward total was -21.000000. running mean: -20.177117
episode 785.000000, reward total was -21.000000. running mean: -20.185345
episode 786.000000, reward total was -21.000000. running mean: -20.193492
episode 787.000000, reward total was -21.000000. running mean: -20.201557
episode 788.000000, reward total was -20.000000. running mean: -20.199542
episode 789.000000, reward total was -20.000000. running mean: -20.197546
episode 790.000000, reward total was -21.000000. running mean: -20.205571
episode 791.000000, reward total was -21.000000. running mean: -20.213515
episode 792.000000, reward total was -20.000000. running mean: -20.211380
episode 793.000000, reward total was -

episode 891.000000, reward total was -21.000000. running mean: -20.235734
episode 892.000000, reward total was -21.000000. running mean: -20.243377
episode 893.000000, reward total was -20.000000. running mean: -20.240943
episode 894.000000, reward total was -21.000000. running mean: -20.248534
episode 895.000000, reward total was -19.000000. running mean: -20.236048
episode 896.000000, reward total was -20.000000. running mean: -20.233688
episode 897.000000, reward total was -21.000000. running mean: -20.241351
episode 898.000000, reward total was -21.000000. running mean: -20.248937
episode 899.000000, reward total was -21.000000. running mean: -20.256448
episode 900.000000, reward total was -21.000000. running mean: -20.263883
episode 901.000000, reward total was -20.000000. running mean: -20.261245
episode 902.000000, reward total was -21.000000. running mean: -20.268632
episode 903.000000, reward total was -19.000000. running mean: -20.255946
episode 904.000000, reward total was -

episode 1002.000000, reward total was -21.000000. running mean: -20.165837
episode 1003.000000, reward total was -21.000000. running mean: -20.174178
episode 1004.000000, reward total was -20.000000. running mean: -20.172436
episode 1005.000000, reward total was -21.000000. running mean: -20.180712
episode 1006.000000, reward total was -21.000000. running mean: -20.188905
episode 1007.000000, reward total was -21.000000. running mean: -20.197016
episode 1008.000000, reward total was -19.000000. running mean: -20.185046
episode 1009.000000, reward total was -20.000000. running mean: -20.183195
episode 1010.000000, reward total was -20.000000. running mean: -20.181363
episode 1011.000000, reward total was -17.000000. running mean: -20.149550
episode 1012.000000, reward total was -20.000000. running mean: -20.148054
episode 1013.000000, reward total was -20.000000. running mean: -20.146574
episode 1014.000000, reward total was -20.000000. running mean: -20.145108
episode 1015.000000, rewa

episode 1112.000000, reward total was -19.000000. running mean: -20.107633
episode 1113.000000, reward total was -20.000000. running mean: -20.106557
episode 1114.000000, reward total was -21.000000. running mean: -20.115491
episode 1115.000000, reward total was -21.000000. running mean: -20.124337
episode 1116.000000, reward total was -21.000000. running mean: -20.133093
episode 1117.000000, reward total was -21.000000. running mean: -20.141762
episode 1118.000000, reward total was -21.000000. running mean: -20.150345
episode 1119.000000, reward total was -20.000000. running mean: -20.148841
episode 1120.000000, reward total was -20.000000. running mean: -20.147353
episode 1121.000000, reward total was -20.000000. running mean: -20.145879
episode 1122.000000, reward total was -18.000000. running mean: -20.124420
episode 1123.000000, reward total was -20.000000. running mean: -20.123176
episode 1124.000000, reward total was -21.000000. running mean: -20.131945
episode 1125.000000, rewa

episode 1222.000000, reward total was -20.000000. running mean: -20.221699
episode 1223.000000, reward total was -20.000000. running mean: -20.219482
episode 1224.000000, reward total was -21.000000. running mean: -20.227288
episode 1225.000000, reward total was -21.000000. running mean: -20.235015
episode 1226.000000, reward total was -20.000000. running mean: -20.232664
episode 1227.000000, reward total was -20.000000. running mean: -20.230338
episode 1228.000000, reward total was -21.000000. running mean: -20.238034
episode 1229.000000, reward total was -21.000000. running mean: -20.245654
episode 1230.000000, reward total was -19.000000. running mean: -20.233198
episode 1231.000000, reward total was -21.000000. running mean: -20.240866
episode 1232.000000, reward total was -20.000000. running mean: -20.238457
episode 1233.000000, reward total was -18.000000. running mean: -20.216072
episode 1234.000000, reward total was -21.000000. running mean: -20.223912
episode 1235.000000, rewa

episode 1332.000000, reward total was -21.000000. running mean: -20.148761
episode 1333.000000, reward total was -20.000000. running mean: -20.147273
episode 1334.000000, reward total was -21.000000. running mean: -20.155800
episode 1335.000000, reward total was -20.000000. running mean: -20.154242
episode 1336.000000, reward total was -20.000000. running mean: -20.152700
episode 1337.000000, reward total was -21.000000. running mean: -20.161173
episode 1338.000000, reward total was -21.000000. running mean: -20.169561
episode 1339.000000, reward total was -21.000000. running mean: -20.177866
episode 1340.000000, reward total was -21.000000. running mean: -20.186087
episode 1341.000000, reward total was -20.000000. running mean: -20.184226
episode 1342.000000, reward total was -20.000000. running mean: -20.182384
episode 1343.000000, reward total was -20.000000. running mean: -20.180560
episode 1344.000000, reward total was -21.000000. running mean: -20.188754
episode 1345.000000, rewa

episode 1442.000000, reward total was -20.000000. running mean: -20.141189
episode 1443.000000, reward total was -21.000000. running mean: -20.149777
episode 1444.000000, reward total was -21.000000. running mean: -20.158280
episode 1445.000000, reward total was -21.000000. running mean: -20.166697
episode 1446.000000, reward total was -20.000000. running mean: -20.165030
episode 1447.000000, reward total was -21.000000. running mean: -20.173380
episode 1448.000000, reward total was -19.000000. running mean: -20.161646
episode 1449.000000, reward total was -20.000000. running mean: -20.160029
episode 1450.000000, reward total was -19.000000. running mean: -20.148429
episode 1451.000000, reward total was -20.000000. running mean: -20.146945
episode 1452.000000, reward total was -21.000000. running mean: -20.155475
episode 1453.000000, reward total was -21.000000. running mean: -20.163921
episode 1454.000000, reward total was -21.000000. running mean: -20.172281
episode 1455.000000, rewa

episode 1552.000000, reward total was -21.000000. running mean: -20.170548
episode 1553.000000, reward total was -20.000000. running mean: -20.168842
episode 1554.000000, reward total was -21.000000. running mean: -20.177154
episode 1555.000000, reward total was -20.000000. running mean: -20.175382
episode 1556.000000, reward total was -21.000000. running mean: -20.183629
episode 1557.000000, reward total was -21.000000. running mean: -20.191792
episode 1558.000000, reward total was -19.000000. running mean: -20.179874
episode 1559.000000, reward total was -21.000000. running mean: -20.188076
episode 1560.000000, reward total was -19.000000. running mean: -20.176195
episode 1561.000000, reward total was -21.000000. running mean: -20.184433
episode 1562.000000, reward total was -21.000000. running mean: -20.192589
episode 1563.000000, reward total was -21.000000. running mean: -20.200663
episode 1564.000000, reward total was -20.000000. running mean: -20.198656
episode 1565.000000, rewa

episode 1662.000000, reward total was -20.000000. running mean: -19.999583
episode 1663.000000, reward total was -19.000000. running mean: -19.989587
episode 1664.000000, reward total was -21.000000. running mean: -19.999691
episode 1665.000000, reward total was -20.000000. running mean: -19.999694
episode 1666.000000, reward total was -21.000000. running mean: -20.009697
episode 1667.000000, reward total was -21.000000. running mean: -20.019600
episode 1668.000000, reward total was -21.000000. running mean: -20.029404
episode 1669.000000, reward total was -20.000000. running mean: -20.029110
episode 1670.000000, reward total was -20.000000. running mean: -20.028819
episode 1671.000000, reward total was -21.000000. running mean: -20.038531
episode 1672.000000, reward total was -21.000000. running mean: -20.048146
episode 1673.000000, reward total was -20.000000. running mean: -20.047664
episode 1674.000000, reward total was -21.000000. running mean: -20.057188
episode 1675.000000, rewa

episode 1772.000000, reward total was -20.000000. running mean: -19.931587
episode 1773.000000, reward total was -20.000000. running mean: -19.932271
episode 1774.000000, reward total was -20.000000. running mean: -19.932948
episode 1775.000000, reward total was -21.000000. running mean: -19.943619
episode 1776.000000, reward total was -19.000000. running mean: -19.934183
episode 1777.000000, reward total was -21.000000. running mean: -19.944841
episode 1778.000000, reward total was -21.000000. running mean: -19.955392
episode 1779.000000, reward total was -19.000000. running mean: -19.945839
episode 1780.000000, reward total was -21.000000. running mean: -19.956380
episode 1781.000000, reward total was -20.000000. running mean: -19.956816
episode 1782.000000, reward total was -19.000000. running mean: -19.947248
episode 1783.000000, reward total was -20.000000. running mean: -19.947776
episode 1784.000000, reward total was -20.000000. running mean: -19.948298
episode 1785.000000, rewa

episode 1882.000000, reward total was -21.000000. running mean: -19.958596
episode 1883.000000, reward total was -19.000000. running mean: -19.949010
episode 1884.000000, reward total was -20.000000. running mean: -19.949520
episode 1885.000000, reward total was -21.000000. running mean: -19.960024
episode 1886.000000, reward total was -21.000000. running mean: -19.970424
episode 1887.000000, reward total was -19.000000. running mean: -19.960720
episode 1888.000000, reward total was -17.000000. running mean: -19.931113
episode 1889.000000, reward total was -20.000000. running mean: -19.931802
episode 1890.000000, reward total was -21.000000. running mean: -19.942484
episode 1891.000000, reward total was -21.000000. running mean: -19.953059
episode 1892.000000, reward total was -20.000000. running mean: -19.953528
episode 1893.000000, reward total was -21.000000. running mean: -19.963993
episode 1894.000000, reward total was -20.000000. running mean: -19.964353
episode 1895.000000, rewa

episode 1992.000000, reward total was -21.000000. running mean: -19.995674
episode 1993.000000, reward total was -21.000000. running mean: -20.005717
episode 1994.000000, reward total was -19.000000. running mean: -19.995660
episode 1995.000000, reward total was -21.000000. running mean: -20.005704
episode 1996.000000, reward total was -18.000000. running mean: -19.985647
episode 1997.000000, reward total was -20.000000. running mean: -19.985790
episode 1998.000000, reward total was -20.000000. running mean: -19.985932
episode 1999.000000, reward total was -21.000000. running mean: -19.996073
episode 2000.000000, reward total was -17.000000. running mean: -19.966112
episode 2001.000000, reward total was -21.000000. running mean: -19.976451
episode 2002.000000, reward total was -20.000000. running mean: -19.976687
episode 2003.000000, reward total was -21.000000. running mean: -19.986920
episode 2004.000000, reward total was -20.000000. running mean: -19.987050
episode 2005.000000, rewa

episode 2102.000000, reward total was -20.000000. running mean: -20.048920
episode 2103.000000, reward total was -18.000000. running mean: -20.028431
episode 2104.000000, reward total was -20.000000. running mean: -20.028146
episode 2105.000000, reward total was -21.000000. running mean: -20.037865
episode 2106.000000, reward total was -19.000000. running mean: -20.027486
episode 2107.000000, reward total was -20.000000. running mean: -20.027212
episode 2108.000000, reward total was -20.000000. running mean: -20.026939
episode 2109.000000, reward total was -21.000000. running mean: -20.036670
episode 2110.000000, reward total was -19.000000. running mean: -20.026303
episode 2111.000000, reward total was -19.000000. running mean: -20.016040
episode 2112.000000, reward total was -21.000000. running mean: -20.025880
episode 2113.000000, reward total was -20.000000. running mean: -20.025621
episode 2114.000000, reward total was -20.000000. running mean: -20.025365
episode 2115.000000, rewa

episode 2212.000000, reward total was -20.000000. running mean: -20.010107
episode 2213.000000, reward total was -20.000000. running mean: -20.010006
episode 2214.000000, reward total was -17.000000. running mean: -19.979906
episode 2215.000000, reward total was -19.000000. running mean: -19.970107
episode 2216.000000, reward total was -19.000000. running mean: -19.960406
episode 2217.000000, reward total was -19.000000. running mean: -19.950802
episode 2218.000000, reward total was -21.000000. running mean: -19.961294
episode 2219.000000, reward total was -21.000000. running mean: -19.971681
episode 2220.000000, reward total was -21.000000. running mean: -19.981964
episode 2221.000000, reward total was -19.000000. running mean: -19.972145
episode 2222.000000, reward total was -21.000000. running mean: -19.982423
episode 2223.000000, reward total was -17.000000. running mean: -19.952599
episode 2224.000000, reward total was -20.000000. running mean: -19.953073
episode 2225.000000, rewa

episode 2322.000000, reward total was -19.000000. running mean: -20.016142
episode 2323.000000, reward total was -20.000000. running mean: -20.015981
episode 2324.000000, reward total was -21.000000. running mean: -20.025821
episode 2325.000000, reward total was -21.000000. running mean: -20.035563
episode 2326.000000, reward total was -21.000000. running mean: -20.045207
episode 2327.000000, reward total was -21.000000. running mean: -20.054755
episode 2328.000000, reward total was -19.000000. running mean: -20.044208
episode 2329.000000, reward total was -19.000000. running mean: -20.033766
episode 2330.000000, reward total was -20.000000. running mean: -20.033428
episode 2331.000000, reward total was -20.000000. running mean: -20.033094
episode 2332.000000, reward total was -19.000000. running mean: -20.022763
episode 2333.000000, reward total was -21.000000. running mean: -20.032535
episode 2334.000000, reward total was -20.000000. running mean: -20.032210
episode 2335.000000, rewa

episode 2432.000000, reward total was -21.000000. running mean: -19.969431
episode 2433.000000, reward total was -19.000000. running mean: -19.959737
episode 2434.000000, reward total was -20.000000. running mean: -19.960140
episode 2435.000000, reward total was -21.000000. running mean: -19.970538
episode 2436.000000, reward total was -18.000000. running mean: -19.950833
episode 2437.000000, reward total was -17.000000. running mean: -19.921325
episode 2438.000000, reward total was -21.000000. running mean: -19.932111
episode 2439.000000, reward total was -21.000000. running mean: -19.942790
episode 2440.000000, reward total was -21.000000. running mean: -19.953362
episode 2441.000000, reward total was -21.000000. running mean: -19.963829
episode 2442.000000, reward total was -20.000000. running mean: -19.964190
episode 2443.000000, reward total was -20.000000. running mean: -19.964548
episode 2444.000000, reward total was -20.000000. running mean: -19.964903
episode 2445.000000, rewa

episode 2542.000000, reward total was -20.000000. running mean: -19.907381
episode 2543.000000, reward total was -19.000000. running mean: -19.898308
episode 2544.000000, reward total was -19.000000. running mean: -19.889324
episode 2545.000000, reward total was -20.000000. running mean: -19.890431
episode 2546.000000, reward total was -18.000000. running mean: -19.871527
episode 2547.000000, reward total was -20.000000. running mean: -19.872812
episode 2548.000000, reward total was -20.000000. running mean: -19.874083
episode 2549.000000, reward total was -21.000000. running mean: -19.885343
episode 2550.000000, reward total was -21.000000. running mean: -19.896489
episode 2551.000000, reward total was -19.000000. running mean: -19.887524
episode 2552.000000, reward total was -20.000000. running mean: -19.888649
episode 2553.000000, reward total was -21.000000. running mean: -19.899763
episode 2554.000000, reward total was -20.000000. running mean: -19.900765
episode 2555.000000, rewa

episode 2652.000000, reward total was -20.000000. running mean: -19.782128
episode 2653.000000, reward total was -18.000000. running mean: -19.764307
episode 2654.000000, reward total was -20.000000. running mean: -19.766664
episode 2655.000000, reward total was -20.000000. running mean: -19.768997
episode 2656.000000, reward total was -19.000000. running mean: -19.761307
episode 2657.000000, reward total was -21.000000. running mean: -19.773694
episode 2658.000000, reward total was -21.000000. running mean: -19.785957
episode 2659.000000, reward total was -18.000000. running mean: -19.768098
episode 2660.000000, reward total was -19.000000. running mean: -19.760417
episode 2661.000000, reward total was -19.000000. running mean: -19.752813
episode 2662.000000, reward total was -21.000000. running mean: -19.765285
episode 2663.000000, reward total was -21.000000. running mean: -19.777632
episode 2664.000000, reward total was -19.000000. running mean: -19.769855
episode 2665.000000, rewa

episode 2762.000000, reward total was -20.000000. running mean: -19.708734
episode 2763.000000, reward total was -21.000000. running mean: -19.721647
episode 2764.000000, reward total was -21.000000. running mean: -19.734431
episode 2765.000000, reward total was -19.000000. running mean: -19.727086
episode 2766.000000, reward total was -19.000000. running mean: -19.719815
episode 2767.000000, reward total was -20.000000. running mean: -19.722617
episode 2768.000000, reward total was -19.000000. running mean: -19.715391
episode 2769.000000, reward total was -21.000000. running mean: -19.728237
episode 2770.000000, reward total was -21.000000. running mean: -19.740955
episode 2771.000000, reward total was -17.000000. running mean: -19.713545
episode 2772.000000, reward total was -18.000000. running mean: -19.696410
episode 2773.000000, reward total was -21.000000. running mean: -19.709446
episode 2774.000000, reward total was -20.000000. running mean: -19.712351
episode 2775.000000, rewa

episode 2872.000000, reward total was -20.000000. running mean: -19.815309
episode 2873.000000, reward total was -21.000000. running mean: -19.827156
episode 2874.000000, reward total was -18.000000. running mean: -19.808884
episode 2875.000000, reward total was -15.000000. running mean: -19.760795
episode 2876.000000, reward total was -21.000000. running mean: -19.773187
episode 2877.000000, reward total was -21.000000. running mean: -19.785456
episode 2878.000000, reward total was -19.000000. running mean: -19.777601
episode 2879.000000, reward total was -17.000000. running mean: -19.749825
episode 2880.000000, reward total was -19.000000. running mean: -19.742327
episode 2881.000000, reward total was -20.000000. running mean: -19.744903
episode 2882.000000, reward total was -20.000000. running mean: -19.747454
episode 2883.000000, reward total was -21.000000. running mean: -19.759980
episode 2884.000000, reward total was -21.000000. running mean: -19.772380
episode 2885.000000, rewa

episode 2982.000000, reward total was -20.000000. running mean: -19.536542
episode 2983.000000, reward total was -20.000000. running mean: -19.541176
episode 2984.000000, reward total was -20.000000. running mean: -19.545765
episode 2985.000000, reward total was -21.000000. running mean: -19.560307
episode 2986.000000, reward total was -21.000000. running mean: -19.574704
episode 2987.000000, reward total was -20.000000. running mean: -19.578957
episode 2988.000000, reward total was -19.000000. running mean: -19.573167
episode 2989.000000, reward total was -20.000000. running mean: -19.577436
episode 2990.000000, reward total was -20.000000. running mean: -19.581661
episode 2991.000000, reward total was -19.000000. running mean: -19.575845
episode 2992.000000, reward total was -19.000000. running mean: -19.570086
episode 2993.000000, reward total was -20.000000. running mean: -19.574385
episode 2994.000000, reward total was -21.000000. running mean: -19.588642
episode 2995.000000, rewa

episode 3092.000000, reward total was -21.000000. running mean: -19.732281
episode 3093.000000, reward total was -19.000000. running mean: -19.724958
episode 3094.000000, reward total was -19.000000. running mean: -19.717709
episode 3095.000000, reward total was -21.000000. running mean: -19.730531
episode 3096.000000, reward total was -20.000000. running mean: -19.733226
episode 3097.000000, reward total was -20.000000. running mean: -19.735894
episode 3098.000000, reward total was -21.000000. running mean: -19.748535
episode 3099.000000, reward total was -19.000000. running mean: -19.741050
episode 3100.000000, reward total was -20.000000. running mean: -19.743639
episode 3101.000000, reward total was -18.000000. running mean: -19.726203
episode 3102.000000, reward total was -20.000000. running mean: -19.728941
episode 3103.000000, reward total was -21.000000. running mean: -19.741651
episode 3104.000000, reward total was -18.000000. running mean: -19.724235
episode 3105.000000, rewa

episode 3202.000000, reward total was -19.000000. running mean: -19.569141
episode 3203.000000, reward total was -19.000000. running mean: -19.563450
episode 3204.000000, reward total was -21.000000. running mean: -19.577815
episode 3205.000000, reward total was -20.000000. running mean: -19.582037
episode 3206.000000, reward total was -17.000000. running mean: -19.556217
episode 3207.000000, reward total was -21.000000. running mean: -19.570655
episode 3208.000000, reward total was -20.000000. running mean: -19.574948
episode 3209.000000, reward total was -17.000000. running mean: -19.549199
episode 3210.000000, reward total was -20.000000. running mean: -19.553707
episode 3211.000000, reward total was -19.000000. running mean: -19.548169
episode 3212.000000, reward total was -21.000000. running mean: -19.562688
episode 3213.000000, reward total was -21.000000. running mean: -19.577061
episode 3214.000000, reward total was -20.000000. running mean: -19.581290
episode 3215.000000, rewa

episode 3312.000000, reward total was -21.000000. running mean: -19.590503
episode 3313.000000, reward total was -21.000000. running mean: -19.604598
episode 3314.000000, reward total was -17.000000. running mean: -19.578552
episode 3315.000000, reward total was -19.000000. running mean: -19.572767
episode 3316.000000, reward total was -20.000000. running mean: -19.577039
episode 3317.000000, reward total was -20.000000. running mean: -19.581269
episode 3318.000000, reward total was -19.000000. running mean: -19.575456
episode 3319.000000, reward total was -20.000000. running mean: -19.579701
episode 3320.000000, reward total was -17.000000. running mean: -19.553904
episode 3321.000000, reward total was -19.000000. running mean: -19.548365
episode 3322.000000, reward total was -19.000000. running mean: -19.542882
episode 3323.000000, reward total was -19.000000. running mean: -19.537453
episode 3324.000000, reward total was -18.000000. running mean: -19.522078
episode 3325.000000, rewa

episode 3422.000000, reward total was -20.000000. running mean: -19.550023
episode 3423.000000, reward total was -19.000000. running mean: -19.544523
episode 3424.000000, reward total was -20.000000. running mean: -19.549078
episode 3425.000000, reward total was -20.000000. running mean: -19.553587
episode 3426.000000, reward total was -21.000000. running mean: -19.568051
episode 3427.000000, reward total was -19.000000. running mean: -19.562371
episode 3428.000000, reward total was -19.000000. running mean: -19.556747
episode 3429.000000, reward total was -21.000000. running mean: -19.571179
episode 3430.000000, reward total was -20.000000. running mean: -19.575468
episode 3431.000000, reward total was -18.000000. running mean: -19.559713
episode 3432.000000, reward total was -19.000000. running mean: -19.554116
episode 3433.000000, reward total was -20.000000. running mean: -19.558575
episode 3434.000000, reward total was -20.000000. running mean: -19.562989
episode 3435.000000, rewa

episode 3532.000000, reward total was -17.000000. running mean: -19.554864
episode 3533.000000, reward total was -21.000000. running mean: -19.569315
episode 3534.000000, reward total was -16.000000. running mean: -19.533622
episode 3535.000000, reward total was -17.000000. running mean: -19.508286
episode 3536.000000, reward total was -21.000000. running mean: -19.523203
episode 3537.000000, reward total was -17.000000. running mean: -19.497971
episode 3538.000000, reward total was -18.000000. running mean: -19.482991
episode 3539.000000, reward total was -21.000000. running mean: -19.498161
episode 3540.000000, reward total was -18.000000. running mean: -19.483180
episode 3541.000000, reward total was -17.000000. running mean: -19.458348
episode 3542.000000, reward total was -21.000000. running mean: -19.473764
episode 3543.000000, reward total was -19.000000. running mean: -19.469027
episode 3544.000000, reward total was -18.000000. running mean: -19.454336
episode 3545.000000, rewa

episode 3642.000000, reward total was -20.000000. running mean: -19.496857
episode 3643.000000, reward total was -18.000000. running mean: -19.481888
episode 3644.000000, reward total was -20.000000. running mean: -19.487069
episode 3645.000000, reward total was -19.000000. running mean: -19.482198
episode 3646.000000, reward total was -19.000000. running mean: -19.477376
episode 3647.000000, reward total was -19.000000. running mean: -19.472603
episode 3648.000000, reward total was -17.000000. running mean: -19.447877
episode 3649.000000, reward total was -21.000000. running mean: -19.463398
episode 3650.000000, reward total was -19.000000. running mean: -19.458764
episode 3651.000000, reward total was -16.000000. running mean: -19.424176
episode 3652.000000, reward total was -17.000000. running mean: -19.399934
episode 3653.000000, reward total was -19.000000. running mean: -19.395935
episode 3654.000000, reward total was -16.000000. running mean: -19.361976
episode 3655.000000, rewa

episode 3752.000000, reward total was -17.000000. running mean: -19.385251
episode 3753.000000, reward total was -18.000000. running mean: -19.371399
episode 3754.000000, reward total was -19.000000. running mean: -19.367685
episode 3755.000000, reward total was -21.000000. running mean: -19.384008
episode 3756.000000, reward total was -19.000000. running mean: -19.380168
episode 3757.000000, reward total was -19.000000. running mean: -19.376366
episode 3758.000000, reward total was -19.000000. running mean: -19.372603
episode 3759.000000, reward total was -20.000000. running mean: -19.378877
episode 3760.000000, reward total was -18.000000. running mean: -19.365088
episode 3761.000000, reward total was -21.000000. running mean: -19.381437
episode 3762.000000, reward total was -20.000000. running mean: -19.387623
episode 3763.000000, reward total was -21.000000. running mean: -19.403746
episode 3764.000000, reward total was -19.000000. running mean: -19.399709
episode 3765.000000, rewa

episode 3862.000000, reward total was -18.000000. running mean: -19.403803
episode 3863.000000, reward total was -18.000000. running mean: -19.389765
episode 3864.000000, reward total was -20.000000. running mean: -19.395867
episode 3865.000000, reward total was -18.000000. running mean: -19.381909
episode 3866.000000, reward total was -20.000000. running mean: -19.388090
episode 3867.000000, reward total was -21.000000. running mean: -19.404209
episode 3868.000000, reward total was -19.000000. running mean: -19.400167
episode 3869.000000, reward total was -21.000000. running mean: -19.416165
episode 3870.000000, reward total was -21.000000. running mean: -19.432003
episode 3871.000000, reward total was -21.000000. running mean: -19.447683
episode 3872.000000, reward total was -19.000000. running mean: -19.443206
episode 3873.000000, reward total was -21.000000. running mean: -19.458774
episode 3874.000000, reward total was -18.000000. running mean: -19.444187
episode 3875.000000, rewa

episode 3972.000000, reward total was -19.000000. running mean: -19.466394
episode 3973.000000, reward total was -19.000000. running mean: -19.461731
episode 3974.000000, reward total was -19.000000. running mean: -19.457113
episode 3975.000000, reward total was -19.000000. running mean: -19.452542
episode 3976.000000, reward total was -20.000000. running mean: -19.458017
episode 3977.000000, reward total was -20.000000. running mean: -19.463436
episode 3978.000000, reward total was -18.000000. running mean: -19.448802
episode 3979.000000, reward total was -21.000000. running mean: -19.464314
episode 3980.000000, reward total was -19.000000. running mean: -19.459671
episode 3981.000000, reward total was -21.000000. running mean: -19.475074
episode 3982.000000, reward total was -16.000000. running mean: -19.440324
episode 3983.000000, reward total was -21.000000. running mean: -19.455920
episode 3984.000000, reward total was -20.000000. running mean: -19.461361
episode 3985.000000, rewa

episode 4082.000000, reward total was -18.000000. running mean: -19.327470
episode 4083.000000, reward total was -20.000000. running mean: -19.334195
episode 4084.000000, reward total was -20.000000. running mean: -19.340853
episode 4085.000000, reward total was -19.000000. running mean: -19.337445
episode 4086.000000, reward total was -19.000000. running mean: -19.334070
episode 4087.000000, reward total was -17.000000. running mean: -19.310729
episode 4088.000000, reward total was -20.000000. running mean: -19.317622
episode 4089.000000, reward total was -21.000000. running mean: -19.334446
episode 4090.000000, reward total was -17.000000. running mean: -19.311101
episode 4091.000000, reward total was -19.000000. running mean: -19.307990
episode 4092.000000, reward total was -17.000000. running mean: -19.284911
episode 4093.000000, reward total was -21.000000. running mean: -19.302061
episode 4094.000000, reward total was -20.000000. running mean: -19.309041
episode 4095.000000, rewa

episode 4192.000000, reward total was -19.000000. running mean: -19.147955
episode 4193.000000, reward total was -20.000000. running mean: -19.156476
episode 4194.000000, reward total was -19.000000. running mean: -19.154911
episode 4195.000000, reward total was -21.000000. running mean: -19.173362
episode 4196.000000, reward total was -17.000000. running mean: -19.151628
episode 4197.000000, reward total was -18.000000. running mean: -19.140112
episode 4198.000000, reward total was -20.000000. running mean: -19.148711
episode 4199.000000, reward total was -20.000000. running mean: -19.157224
episode 4200.000000, reward total was -17.000000. running mean: -19.135652
episode 4201.000000, reward total was -17.000000. running mean: -19.114295
episode 4202.000000, reward total was -20.000000. running mean: -19.123152
episode 4203.000000, reward total was -20.000000. running mean: -19.131921
episode 4204.000000, reward total was -20.000000. running mean: -19.140601
episode 4205.000000, rewa

episode 4302.000000, reward total was -21.000000. running mean: -19.176347
episode 4303.000000, reward total was -19.000000. running mean: -19.174583
episode 4304.000000, reward total was -19.000000. running mean: -19.172837
episode 4305.000000, reward total was -21.000000. running mean: -19.191109
episode 4306.000000, reward total was -20.000000. running mean: -19.199198
episode 4307.000000, reward total was -21.000000. running mean: -19.217206
episode 4308.000000, reward total was -18.000000. running mean: -19.205034
episode 4309.000000, reward total was -19.000000. running mean: -19.202984
episode 4310.000000, reward total was -17.000000. running mean: -19.180954
episode 4311.000000, reward total was -20.000000. running mean: -19.189144
episode 4312.000000, reward total was -18.000000. running mean: -19.177253
episode 4313.000000, reward total was -20.000000. running mean: -19.185480
episode 4314.000000, reward total was -18.000000. running mean: -19.173625
episode 4315.000000, rewa

episode 4412.000000, reward total was -19.000000. running mean: -19.260846
episode 4413.000000, reward total was -20.000000. running mean: -19.268238
episode 4414.000000, reward total was -19.000000. running mean: -19.265556
episode 4415.000000, reward total was -21.000000. running mean: -19.282900
episode 4416.000000, reward total was -21.000000. running mean: -19.300071
episode 4417.000000, reward total was -20.000000. running mean: -19.307070
episode 4418.000000, reward total was -18.000000. running mean: -19.294000
episode 4419.000000, reward total was -18.000000. running mean: -19.281060
episode 4420.000000, reward total was -20.000000. running mean: -19.288249
episode 4421.000000, reward total was -21.000000. running mean: -19.305367
episode 4422.000000, reward total was -21.000000. running mean: -19.322313
episode 4423.000000, reward total was -21.000000. running mean: -19.339090
episode 4424.000000, reward total was -19.000000. running mean: -19.335699
episode 4425.000000, rewa

episode 4522.000000, reward total was -20.000000. running mean: -19.273915
episode 4523.000000, reward total was -21.000000. running mean: -19.291176
episode 4524.000000, reward total was -17.000000. running mean: -19.268264
episode 4525.000000, reward total was -20.000000. running mean: -19.275582
episode 4526.000000, reward total was -21.000000. running mean: -19.292826
episode 4527.000000, reward total was -21.000000. running mean: -19.309898
episode 4528.000000, reward total was -19.000000. running mean: -19.306799
episode 4529.000000, reward total was -19.000000. running mean: -19.303731
episode 4530.000000, reward total was -20.000000. running mean: -19.310693
episode 4531.000000, reward total was -20.000000. running mean: -19.317586
episode 4532.000000, reward total was -20.000000. running mean: -19.324411
episode 4533.000000, reward total was -17.000000. running mean: -19.301166
episode 4534.000000, reward total was -19.000000. running mean: -19.298155
episode 4535.000000, rewa

episode 4632.000000, reward total was -19.000000. running mean: -19.074216
episode 4633.000000, reward total was -19.000000. running mean: -19.073474
episode 4634.000000, reward total was -21.000000. running mean: -19.092739
episode 4635.000000, reward total was -18.000000. running mean: -19.081811
episode 4636.000000, reward total was -16.000000. running mean: -19.050993
episode 4637.000000, reward total was -19.000000. running mean: -19.050483
episode 4638.000000, reward total was -18.000000. running mean: -19.039979
episode 4639.000000, reward total was -20.000000. running mean: -19.049579
episode 4640.000000, reward total was -19.000000. running mean: -19.049083
episode 4641.000000, reward total was -20.000000. running mean: -19.058592
episode 4642.000000, reward total was -20.000000. running mean: -19.068006
episode 4643.000000, reward total was -18.000000. running mean: -19.057326
episode 4644.000000, reward total was -19.000000. running mean: -19.056753
episode 4645.000000, rewa

episode 4742.000000, reward total was -19.000000. running mean: -18.999375
episode 4743.000000, reward total was -17.000000. running mean: -18.979382
episode 4744.000000, reward total was -18.000000. running mean: -18.969588
episode 4745.000000, reward total was -20.000000. running mean: -18.979892
episode 4746.000000, reward total was -18.000000. running mean: -18.970093
episode 4747.000000, reward total was -18.000000. running mean: -18.960392
episode 4748.000000, reward total was -19.000000. running mean: -18.960788
episode 4749.000000, reward total was -19.000000. running mean: -18.961180
episode 4750.000000, reward total was -18.000000. running mean: -18.951568
episode 4751.000000, reward total was -17.000000. running mean: -18.932053
episode 4752.000000, reward total was -19.000000. running mean: -18.932732
episode 4753.000000, reward total was -17.000000. running mean: -18.913405
episode 4754.000000, reward total was -17.000000. running mean: -18.894271
episode 4755.000000, rewa

episode 4852.000000, reward total was -16.000000. running mean: -18.918807
episode 4853.000000, reward total was -19.000000. running mean: -18.919619
episode 4854.000000, reward total was -20.000000. running mean: -18.930422
episode 4855.000000, reward total was -19.000000. running mean: -18.931118
episode 4856.000000, reward total was -17.000000. running mean: -18.911807
episode 4857.000000, reward total was -19.000000. running mean: -18.912689
episode 4858.000000, reward total was -17.000000. running mean: -18.893562
episode 4859.000000, reward total was -17.000000. running mean: -18.874626
episode 4860.000000, reward total was -20.000000. running mean: -18.885880
episode 4861.000000, reward total was -21.000000. running mean: -18.907021
episode 4862.000000, reward total was -20.000000. running mean: -18.917951
episode 4863.000000, reward total was -21.000000. running mean: -18.938772
episode 4864.000000, reward total was -18.000000. running mean: -18.929384
episode 4865.000000, rewa

episode 4962.000000, reward total was -19.000000. running mean: -18.887634
episode 4963.000000, reward total was -21.000000. running mean: -18.908758
episode 4964.000000, reward total was -17.000000. running mean: -18.889670
episode 4965.000000, reward total was -19.000000. running mean: -18.890774
episode 4966.000000, reward total was -19.000000. running mean: -18.891866
episode 4967.000000, reward total was -17.000000. running mean: -18.872947
episode 4968.000000, reward total was -19.000000. running mean: -18.874218
episode 4969.000000, reward total was -20.000000. running mean: -18.885476
episode 4970.000000, reward total was -17.000000. running mean: -18.866621
episode 4971.000000, reward total was -19.000000. running mean: -18.867955
episode 4972.000000, reward total was -18.000000. running mean: -18.859275
episode 4973.000000, reward total was -19.000000. running mean: -18.860682
episode 4974.000000, reward total was -19.000000. running mean: -18.862075
episode 4975.000000, rewa

episode 5072.000000, reward total was -15.000000. running mean: -18.999077
episode 5073.000000, reward total was -18.000000. running mean: -18.989087
episode 5074.000000, reward total was -20.000000. running mean: -18.999196
episode 5075.000000, reward total was -15.000000. running mean: -18.959204
episode 5076.000000, reward total was -16.000000. running mean: -18.929612
episode 5077.000000, reward total was -18.000000. running mean: -18.920316
episode 5078.000000, reward total was -20.000000. running mean: -18.931112
episode 5079.000000, reward total was -20.000000. running mean: -18.941801
episode 5080.000000, reward total was -17.000000. running mean: -18.922383
episode 5081.000000, reward total was -19.000000. running mean: -18.923159
episode 5082.000000, reward total was -18.000000. running mean: -18.913928
episode 5083.000000, reward total was -19.000000. running mean: -18.914789
episode 5084.000000, reward total was -18.000000. running mean: -18.905641
episode 5085.000000, rewa

episode 5182.000000, reward total was -19.000000. running mean: -18.870406
episode 5183.000000, reward total was -15.000000. running mean: -18.831702
episode 5184.000000, reward total was -21.000000. running mean: -18.853385
episode 5185.000000, reward total was -20.000000. running mean: -18.864851
episode 5186.000000, reward total was -18.000000. running mean: -18.856202
episode 5187.000000, reward total was -18.000000. running mean: -18.847640
episode 5188.000000, reward total was -19.000000. running mean: -18.849164
episode 5189.000000, reward total was -19.000000. running mean: -18.850672
episode 5190.000000, reward total was -17.000000. running mean: -18.832166
episode 5191.000000, reward total was -18.000000. running mean: -18.823844
episode 5192.000000, reward total was -17.000000. running mean: -18.805605
episode 5193.000000, reward total was -19.000000. running mean: -18.807549
episode 5194.000000, reward total was -21.000000. running mean: -18.829474
episode 5195.000000, rewa

episode 5292.000000, reward total was -15.000000. running mean: -18.824359
episode 5293.000000, reward total was -17.000000. running mean: -18.806115
episode 5294.000000, reward total was -19.000000. running mean: -18.808054
episode 5295.000000, reward total was -18.000000. running mean: -18.799973
episode 5296.000000, reward total was -18.000000. running mean: -18.791974
episode 5297.000000, reward total was -21.000000. running mean: -18.814054
episode 5298.000000, reward total was -20.000000. running mean: -18.825913
episode 5299.000000, reward total was -20.000000. running mean: -18.837654
episode 5300.000000, reward total was -20.000000. running mean: -18.849278
episode 5301.000000, reward total was -19.000000. running mean: -18.850785
episode 5302.000000, reward total was -21.000000. running mean: -18.872277
episode 5303.000000, reward total was -21.000000. running mean: -18.893554
episode 5304.000000, reward total was -20.000000. running mean: -18.904619
episode 5305.000000, rewa

episode 5402.000000, reward total was -19.000000. running mean: -18.921778
episode 5403.000000, reward total was -19.000000. running mean: -18.922560
episode 5404.000000, reward total was -19.000000. running mean: -18.923335
episode 5405.000000, reward total was -21.000000. running mean: -18.944101
episode 5406.000000, reward total was -19.000000. running mean: -18.944660
episode 5407.000000, reward total was -16.000000. running mean: -18.915214
episode 5408.000000, reward total was -19.000000. running mean: -18.916062
episode 5409.000000, reward total was -16.000000. running mean: -18.886901
episode 5410.000000, reward total was -20.000000. running mean: -18.898032
episode 5411.000000, reward total was -21.000000. running mean: -18.919052
episode 5412.000000, reward total was -14.000000. running mean: -18.869861
episode 5413.000000, reward total was -21.000000. running mean: -18.891163
episode 5414.000000, reward total was -17.000000. running mean: -18.872251
episode 5415.000000, rewa