In [None]:
import numpy as np
import time
from Helper import LearningCurvePlot, smooth
from actor_critic import *

def average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes, learning_rate,
                             gamma, batch_update_size, N):
    """Average the per-episode rewards of several `cartpole` runs and smooth them.

    `cartpole(learning_rate, batch_update_size, gamma, n_episodes, N)` is
    called `n_repetitions` times; each result is stored as one row of an
    (n_repetitions, n_episodes) array. The rows are averaged element-wise and
    the mean curve is passed through `smooth` with `smoothing_window`.

    The total wall-clock time of all repetitions is printed in minutes.

    Note: `plot` is accepted for interface compatibility but is not used here.

    Returns whatever `smooth` returns for the averaged curve.
    """
    all_runs = np.empty([n_repetitions, n_episodes])  # one row per repetition
    started_at = time.time()

    # Iterating a 2-D array yields row views, so writing into `run_row[:]`
    # fills the corresponding row of `all_runs` in place.
    for run_row in all_runs:
        run_row[:] = cartpole(learning_rate, batch_update_size, gamma, n_episodes, N)

    elapsed_minutes = (time.time() - started_at) / 60
    print('Running one setting takes {} minutes'.format(elapsed_minutes))

    mean_curve = np.mean(all_runs, axis=0)  # average over repetitions
    return smooth(mean_curve, smoothing_window)  # additional smoothing

def experiment():
    """Run the CartPole hyperparameter sweeps and save the learning-curve plots.

    Two sweeps are run, each setting averaged over `n_repetitions` runs of
    `n_episodes` episodes via `average_over_repetitions`:

    1. vary `N` over `N_sizes` with a fixed batch update size and learning rate;
    2. vary the batch update size over `batch_sizes` with a fixed learning
       rate and a fixed `N`.

    Each sweep is drawn on its own `LearningCurvePlot`, together with a
    horizontal line at `good_average_reward`, and saved as a PNG.
    """
    ####### Settings
    # Experiment
    n_repetitions = 5
    smoothing_window = 501
    n_episodes = 10000
    gamma = 0.99

    # Plotting parameters
    plot = True

    # Nice labels for plotting
    # NOTE: these labels might need to be changed.
    policy_labels = {'basesub': 'baseline subtraction',
                  'boot': 'bootstrap', 'b+b': 'bootstrap with baseline subtraction'}

    ####### Experiments

    good_average_reward = 250 # We set this as a benchmark of good average reward reached by the algorithm

    policy = 'boot'
    Plot1 = LearningCurvePlot(title = 'Cartpole experiment solved with ' + policy_labels[policy])
    Plot2 = LearningCurvePlot(title = 'Cartpole experiment solved with ' + policy_labels[policy])

    batch_sizes = [1,8,16,24, 48]
    N_sizes = [6, 12, 24, 48]

    # Fixed values used for whichever hyperparameter is not being swept.
    batch_update_size_1 = 16
    learning_rate_1 = 0.001
    # The batch-size sweep used to pick up `N` from the leaked loop variable of
    # the N sweep (i.e. the last entry of N_sizes). Bind that value explicitly
    # so the second sweep no longer depends on the first loop having run.
    N_1 = N_sizes[-1]

    # Sweep 1: vary N.
    for N in N_sizes:
        learning_curve = average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes, learning_rate_1,
                             gamma, batch_update_size_1, N)
        Plot1.add_curve(learning_curve,label = 'N: ' + str(N))

    Plot1.add_hline(good_average_reward, label = 'Threshold for good reward')
    Plot1.save('cartpole_test_' + policy_labels[policy] + '_bootstrap' + '.png')

    # Sweep 2: vary the batch update size at fixed N.
    for batch_update_size in batch_sizes:
        learning_curve = average_over_repetitions(smoothing_window, plot, n_repetitions, n_episodes, learning_rate_1,
                             gamma, batch_update_size, N_1)
        Plot2.add_curve(learning_curve, label = 'batch update: ' + str(batch_update_size))

    Plot2.add_hline(good_average_reward, label = 'Threshold for good reward')
    Plot2.save('cartpole_test_' + policy_labels[policy] + 'batch_update' + '.png')


# Run the full experiment only when this file is executed as a script/cell,
# not when it is imported.
if __name__ == '__main__':
    experiment()

    
# To run with actor_critic: change the import, add N as an input to average_over_repetitions,
# add N as a hyperparameter, and change the number of batch sizes and learning rates.


Episode 9999: 100%|██████████████████████| 10000/10000 [07:44<00:00, 21.53it/s, episode_reward=12, running_reward=39.1]
Episode 9999: 100%|██████████████████████| 10000/10000 [06:13<00:00, 26.79it/s, episode_reward=11, running_reward=27.3]
Episode 9999: 100%|██████████████████████| 10000/10000 [07:36<00:00, 21.89it/s, episode_reward=11, running_reward=39.5]
Episode 9999: 100%|██████████████████████| 10000/10000 [09:05<00:00, 18.32it/s, episode_reward=25, running_reward=49.6]
Episode 9999: 100%|██████████████████████| 10000/10000 [07:07<00:00, 23.38it/s, episode_reward=14, running_reward=34.4]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 37.82010194460551 minutes


Episode 9999: 100%|██████████████████████| 10000/10000 [17:08<00:00,  9.72it/s, episode_reward=27, running_reward=96.5]
Episode 9999: 100%|██████████████████████| 10000/10000 [16:53<00:00,  9.87it/s, episode_reward=69, running_reward=94.1]
Episode 9999: 100%|███████████████████████| 10000/10000 [19:41<00:00,  8.46it/s, episode_reward=17, running_reward=103]
Episode 9999: 100%|█████████████████████| 10000/10000 [11:43<00:00, 14.21it/s, episode_reward=137, running_reward=64.7]
Episode 9999: 100%|███████████████████████| 10000/10000 [22:04<00:00,  7.55it/s, episode_reward=39, running_reward=114]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 87.54305943250657 minutes


Episode 9999: 100%|██████████████████████| 10000/10000 [37:32<00:00,  4.44it/s, episode_reward=500, running_reward=177]
Episode 9999: 100%|██████████████████████| 10000/10000 [27:47<00:00,  6.00it/s, episode_reward=363, running_reward=141]
Episode 9999: 100%|██████████████████████| 10000/10000 [32:39<00:00,  5.10it/s, episode_reward=500, running_reward=160]
Episode 9999: 100%|██████████████████████| 10000/10000 [25:15<00:00,  6.60it/s, episode_reward=341, running_reward=133]
Episode 9999: 100%|██████████████████████| 10000/10000 [30:11<00:00,  5.52it/s, episode_reward=500, running_reward=157]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 153.44017761945724 minutes


Episode 9999: 100%|█████████████████████| 10000/10000 [16:26<00:00, 10.14it/s, episode_reward=305, running_reward=91.5]
Episode 9999: 100%|███████████████████████| 10000/10000 [15:47<00:00, 10.55it/s, episode_reward=235, running_reward=85]
Episode 9999: 100%|█████████████████████| 10000/10000 [12:43<00:00, 13.10it/s, episode_reward=333, running_reward=66.9]
Episode 9999: 100%|█████████████████████| 10000/10000 [13:46<00:00, 12.09it/s, episode_reward=208, running_reward=76.4]
Episode 9999: 100%|██████████████████████| 10000/10000 [10:21<00:00, 16.10it/s, episode_reward=77, running_reward=56.4]


Running one setting takes 69.09656020402909 minutes


Episode 9999: 100%|████████████████████████| 10000/10000 [20:56<00:00,  7.96it/s, episode_reward=10, running_reward=74]
Episode 9999: 100%|███████████████████████| 10000/10000 [27:43<00:00,  6.01it/s, episode_reward=11, running_reward=101]
Episode 9999: 100%|███████████████████████| 10000/10000 [27:36<00:00,  6.04it/s, episode_reward=10, running_reward=104]
Episode 9999: 100%|███████████████████████| 10000/10000 [23:05<00:00,  7.22it/s, episode_reward=9, running_reward=83.3]
Episode 9999: 100%|████████████████████████| 10000/10000 [28:30<00:00,  5.85it/s, episode_reward=9, running_reward=103]
  0%|                                                                                        | 0/10000 [00:00<?, ?it/s]

Running one setting takes 127.84725482861201 minutes


Episode 9999: 100%|██████████████████████| 10000/10000 [40:38<00:00,  4.10it/s, episode_reward=500, running_reward=202]
Episode 9999: 100%|██████████████████████| 10000/10000 [54:13<00:00,  3.07it/s, episode_reward=307, running_reward=258]
Episode 7376:  74%|████████████████▉      | 7377/10000 [27:40<25:36,  1.71it/s, episode_reward=500, running_reward=193]