In [None]:
import numpy as np
import time
from Helper import LearningCurvePlot, smooth
from baseline_subtraction import *

def average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes, learning_rate,
                             gamma, batch_update_size):
    """Average the reward curve of repeated `cartpole` runs and smooth it.

    Args:
        smoothing_window: Window argument forwarded to `smooth`.
        plot: Accepted for interface compatibility; not used in this function.
        n_repetitions: Number of independent `cartpole` runs to average.
        n_episodes: Width of each row in the result array. Each run's return
            value is assigned to one row, so it presumably holds one reward
            per episode -- verify against `cartpole`.
        learning_rate: Forwarded to `cartpole`.
        gamma: Forwarded to `cartpole`.
        batch_update_size: Forwarded to `cartpole`.

    Returns:
        The output of `smooth` applied to the per-episode mean over all runs.
    """
    per_run_rewards = np.empty((n_repetitions, n_episodes))  # one row per run
    started_at = time.time()

    for run_index in range(n_repetitions):
        # Note the argument order: cartpole takes the batch size before gamma.
        per_run_rewards[run_index] = cartpole(learning_rate, batch_update_size, gamma)

    elapsed_minutes = (time.time() - started_at) / 60
    print('Running one setting takes {} minutes'.format(elapsed_minutes))

    mean_curve = per_run_rewards.mean(axis=0)  # average across repetitions
    return smooth(mean_curve, smoothing_window)  # additional smoothing

def experiment():
    """Run the learning-rate and batch-size sweeps and save one figure per sweep.

    For the selected policy, each hyperparameter setting is evaluated with
    `average_over_repetitions` and its smoothed learning curve is added to a
    `LearningCurvePlot`. Two figures are written:

    * ``cartpole_test<label>learning_rate.png`` -- one curve per learning rate,
      with the batch update size held at 1.
    * ``cartpole_test<label>batch_update.png`` -- one curve per batch update
      size, with the learning rate held at 0.001.

    Both figures carry a horizontal reference line at `good_average_reward`.
    """
    ####### Settings
    # Experiment
    n_repetitions = 5
    smoothing_window = 1001
    n_episodes = 10000
    gamma = 0.99

    # Plotting parameters
    plot = True

    # Nice labels for plotting
    policy_labels = {'basesub': 'baseline subtraction',
                     'boot': 'bootstrap',
                     'b+b': 'bootstrap with baseline subtraction'}

    ####### Experiments

    # Benchmark of a good average reward reached by the algorithm; drawn as a
    # horizontal reference line in both figures.
    good_average_reward = 250

    policy = 'basesub'
    label = policy_labels[policy]
    # The separating space before the label was missing in the original title.
    title = 'Cartpole experiment solved with ' + label

    learning_rates = [0.001, 0.01, 0.1, 0.0001]
    batch_sizes = [1, 8, 16, 24]
    fixed_batch_update_size = 1   # held constant during the learning-rate sweep
    fixed_learning_rate = 0.001   # held constant during the batch-size sweep

    # --- Sweep 1: learning rate ---
    lr_plot = LearningCurvePlot(title=title)
    for learning_rate in learning_rates:
        learning_curve = average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes,
                                                  learning_rate, gamma, fixed_batch_update_size)
        lr_plot.add_curve(learning_curve, label=label + ', learning rate: ' + str(learning_rate))

    # Use the benchmark defined above; the name previously passed here was
    # never defined in this file.
    lr_plot.add_hline(good_average_reward)
    lr_plot.save('cartpole_test' + label + 'learning_rate' + '.png')

    # --- Sweep 2: batch update size ---
    # A fresh plot keeps the learning-rate curves (and their reference line)
    # out of the batch-size figure.
    batch_plot = LearningCurvePlot(title=title)
    for batch_update_size in batch_sizes:
        learning_curve = average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes,
                                                  fixed_learning_rate, gamma, batch_update_size)
        batch_plot.add_curve(learning_curve, label=label + ', batch update: ' + str(batch_update_size))

    batch_plot.add_hline(good_average_reward)
    batch_plot.save('cartpole_test' + label + 'batch_update' + '.png')

# Run the full hyperparameter sweep only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    experiment()

    
# To run with actor_critic: change the import, add N as an input to average_over_repetitions,
# add N as a hyperparameter, and change the number of batch sizes and learning rates.

  and should_run_async(code)
Episode 9999: 100%|██████████████████████| 10000/10000 [41:04<00:00,  4.06it/s, episode_reward=500, running_reward=469]
Episode 9999: 100%|██████████████████████| 10000/10000 [41:04<00:00,  4.06it/s, episode_reward=500, running_reward=470]
Episode 9999: 100%|██████████████████████| 10000/10000 [40:13<00:00,  4.14it/s, episode_reward=500, running_reward=497]
Episode 9999: 100%|██████████████████████| 10000/10000 [58:52<00:00,  2.83it/s, episode_reward=500, running_reward=496]
Episode 9999: 100%|████████████████████| 10000/10000 [1:10:32<00:00,  2.36it/s, episode_reward=500, running_reward=468]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 251.7858483393987 minutes


Episode 9999: 100%|██████████████████████| 10000/10000 [37:34<00:00,  4.43it/s, episode_reward=458, running_reward=490]
Episode 9999: 100%|███████████████████████| 10000/10000 [03:23<00:00, 49.12it/s, episode_reward=9, running_reward=9.37]
Episode 9999: 100%|███████████████████████| 10000/10000 [03:34<00:00, 46.64it/s, episode_reward=9, running_reward=9.36]
Episode 9999: 100%|███████████████████████| 10000/10000 [04:32<00:00, 36.71it/s, episode_reward=9, running_reward=9.45]
Episode 9999: 100%|██████████████████████| 10000/10000 [04:00<00:00, 41.50it/s, episode_reward=10, running_reward=9.45]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 53.108317069212596 minutes


Episode 9999: 100%|███████████████████████| 10000/10000 [03:11<00:00, 52.32it/s, episode_reward=8, running_reward=9.35]
Episode 9999: 100%|██████████████████████| 10000/10000 [03:03<00:00, 54.60it/s, episode_reward=10, running_reward=9.36]
Episode 9999: 100%|██████████████████████| 10000/10000 [03:04<00:00, 54.24it/s, episode_reward=10, running_reward=9.35]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:04<00:00, 32.79it/s, episode_reward=9, running_reward=9.36]
Episode 9999: 100%|███████████████████████| 10000/10000 [03:09<00:00, 52.78it/s, episode_reward=9, running_reward=9.31]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 17.554421857992807 minutes


Episode 9999: 100%|██████████████████████| 10000/10000 [23:24<00:00,  7.12it/s, episode_reward=500, running_reward=294]
Episode 9999: 100%|██████████████████████| 10000/10000 [09:41<00:00, 17.19it/s, episode_reward=131, running_reward=110]
Episode 9567:  96%|██████████████████████ | 9568/10000 [11:29<01:02,  6.95it/s, episode_reward=161, running_reward=143]