In [None]:
import numpy as np
import time
from Helper import LearningCurvePlot, smooth
from baseline_subtraction import *

def average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes, learning_rate,
                             gamma, batch_update_size):
    """Run `cartpole` several times with one setting and return the averaged curve.

    Each repetition calls ``cartpole(learning_rate, batch_update_size, gamma,
    n_episodes)`` and its result is stored as one row of an
    ``(n_repetitions, n_episodes)`` array. The rows are averaged per episode
    and the mean curve is passed through ``smooth`` with ``smoothing_window``.
    The wall-clock time spent on all repetitions is printed in minutes.

    ``plot`` is accepted for interface compatibility but is not used here.

    Returns whatever ``smooth`` returns for the averaged curve.
    """
    per_run_rewards = np.empty((n_repetitions, n_episodes))  # one row per repetition
    start_time = time.time()

    for run_idx in range(n_repetitions):
        per_run_rewards[run_idx] = cartpole(learning_rate, batch_update_size, gamma, n_episodes)

    elapsed_minutes = (time.time() - start_time) / 60
    print(f'Running one setting takes {elapsed_minutes} minutes')

    # Average across repetitions first, then smooth the resulting single curve.
    mean_curve = per_run_rewards.mean(axis=0)
    return smooth(mean_curve, smoothing_window)

def experiment():
    """Run two hyperparameter sweeps and save one learning-curve figure per sweep.

    Sweep 1 varies the learning rate with the batch update size fixed at 1.
    Sweep 2 varies the batch update size with the learning rate fixed at 0.001.
    Every setting is evaluated through ``average_over_repetitions``; each
    figure also gets a horizontal reference line at the "good reward" level.
    """
    # ---- Experiment settings
    n_repetitions = 5
    smoothing_window = 501
    n_episodes = 10000
    gamma = 0.99

    # Forwarded to average_over_repetitions.
    plot = True

    # Human-readable names used in plot titles and output filenames.
    # NOTE: only 'basesub' is selected below; the other entries are unused here.
    policy_labels = {
        'basesub': 'baseline subtraction',
        'boot': 'bootstrap',
        'b+b': 'bootstrap with baseline subtraction',
    }

    # Benchmark for a good average reward, drawn as a horizontal line.
    good_average_reward = 250

    policy = 'basesub'
    policy_name = policy_labels[policy]
    plot_title = 'Cartpole experiment solved with ' + policy_name

    # Both figures are created up front, before any training run starts.
    lr_plot = LearningCurvePlot(title=plot_title)
    batch_plot = LearningCurvePlot(title=plot_title)

    lr = [0.001, 0.01, 0.1, 0.0001]
    batch_sizes = [1, 8, 16, 24]
    fixed_batch_update_size = 1   # held constant during the learning-rate sweep
    fixed_learning_rate = 0.001   # held constant during the batch-size sweep

    def run_setting(learning_rate, batch_update_size):
        # Only these two hyperparameters differ between settings.
        return average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes,
                                        learning_rate, gamma, batch_update_size)

    # ---- Sweep 1: learning rate
    for learning_rate in lr:
        curve = run_setting(learning_rate, fixed_batch_update_size)
        lr_plot.add_curve(curve, label=f'learning rate: {learning_rate}')
    lr_plot.add_hline(good_average_reward, label='Threshold for good reward')
    lr_plot.save(f'cartpole_test_{policy_name}_learning_rate.png')

    # ---- Sweep 2: batch update size
    for batch_update_size in batch_sizes:
        curve = run_setting(fixed_learning_rate, batch_update_size)
        batch_plot.add_curve(curve, label=f'batch update: {batch_update_size}')
    batch_plot.add_hline(good_average_reward, label='Threshold for good reward')
    batch_plot.save(f'cartpole_test_{policy_name}_batch_update.png')

# Run the full experiment only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    experiment()

    
# To run with actor_critic: change the import, add N as an input to average_over_repetitions,
# add N as a hyperparameter, and change the number of batch sizes and learning rates.

Episode 9999: 100%|████████████████████| 10000/10000 [1:06:37<00:00,  2.50it/s, episode_reward=500, running_reward=393]
Episode 9999: 100%|████████████████████| 10000/10000 [1:11:04<00:00,  2.35it/s, episode_reward=500, running_reward=418]
Episode 9999: 100%|████████████████████| 10000/10000 [1:10:02<00:00,  2.38it/s, episode_reward=500, running_reward=412]
Episode 9999: 100%|████████████████████| 10000/10000 [1:12:51<00:00,  2.29it/s, episode_reward=500, running_reward=418]
Episode 9999: 100%|████████████████████| 10000/10000 [1:17:56<00:00,  2.14it/s, episode_reward=500, running_reward=425]


Running one setting takes 358.5271384835243 minutes


Episode 9999: 100%|███████████████████████| 10000/10000 [05:39<00:00, 29.47it/s, episode_reward=8, running_reward=13.3]
Episode 9999: 100%|████████████████████████| 10000/10000 [05:18<00:00, 31.38it/s, episode_reward=10, running_reward=10]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:19<00:00, 31.33it/s, episode_reward=9, running_reward=10.5]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:21<00:00, 31.13it/s, episode_reward=8, running_reward=10.3]
Episode 9999: 100%|██████████████████████| 10000/10000 [05:36<00:00, 29.73it/s, episode_reward=10, running_reward=11.5]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 27.25035835901896 minutes


Episode 9999: 100%|████████████████████████| 10000/10000 [05:28<00:00, 30.42it/s, episode_reward=8, running_reward=9.4]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:29<00:00, 30.34it/s, episode_reward=9, running_reward=9.58]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:50<00:00, 28.52it/s, episode_reward=9, running_reward=11.4]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:59<00:00, 27.81it/s, episode_reward=9, running_reward=13.4]
Episode 9999: 100%|███████████████████████| 10000/10000 [05:26<00:00, 30.64it/s, episode_reward=9, running_reward=9.41]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 28.252520815531412 minutes


Episode 9999: 100%|██████████████████████| 10000/10000 [28:27<00:00,  5.85it/s, episode_reward=500, running_reward=149]
Episode 9999: 100%|██████████████████████| 10000/10000 [21:33<00:00,  7.73it/s, episode_reward=327, running_reward=118]
Episode 9999: 100%|█████████████████████| 10000/10000 [15:54<00:00, 10.48it/s, episode_reward=189, running_reward=89.7]
Episode 9999: 100%|██████████████████████| 10000/10000 [23:27<00:00,  7.11it/s, episode_reward=115, running_reward=128]
Episode 9999: 100%|██████████████████████| 10000/10000 [28:28<00:00,  5.85it/s, episode_reward=446, running_reward=154]


Running one setting takes 117.85247792402903 minutes


Episode 9999: 100%|████████████████████| 10000/10000 [1:29:05<00:00,  1.87it/s, episode_reward=500, running_reward=398]
Episode 9999: 100%|████████████████████| 10000/10000 [1:28:20<00:00,  1.89it/s, episode_reward=500, running_reward=394]
Episode 9999: 100%|████████████████████| 10000/10000 [1:29:19<00:00,  1.87it/s, episode_reward=500, running_reward=395]
Episode 9999: 100%|████████████████████| 10000/10000 [1:30:35<00:00,  1.84it/s, episode_reward=500, running_reward=408]
Episode 5519:  55%|████████████▋          | 5520/10000 [41:49<49:15,  1.52it/s, episode_reward=500, running_reward=363]