In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.special
from pathlib import Path

import nengo
import learnrules as rules
import representations as rp
import minigrid_wrap
from ac_learn import ActorCriticLearn

**Testing values:** <br>
alphas = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0] <br>
betas = [0.01, 0.1, 0.25, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99] <br>
gammas = [0.01, 0.1, 0.25, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99] <br>
lambdas = [0.01, 0.1, 0.25, 0.5, 0.6, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99] <br>
<br>
n_neurons =  [10, 100, 500, 1000, 1500, 2000, 2500, 3000, 5000] <br>
n_dims = [64, 128, 256, 532] <br>
sparsity = [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99] <br>

In [None]:
# Path for this experiment's data, relative to the notebook's working directory.
# It is handed to the experiment as `data_dir` and later passed to `pd.read_pickle`.
data_folder = Path('../WAT002_RL_Data/MG_TD0_1H_nn_alpha')

In [None]:
#Set testing values and number of runs
variable = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75, 0.9, 0.95, 0.99, 1.0]
runs = 5
max_attempts = 5  # retries per (value, seed) pair before the value is abandoned

# One record per successful run, accumulated across ALL tested values.
# The frame is built once at the end: re-creating it whenever `run == 0`
# silently discarded the results of every previously tested value.
results = []

#Run experiment
for v in variable:
    for run in range(runs):
        for attempt in range(max_attempts):
            try:
                out = ActorCriticLearn().run(env='MiniGrid',
                                             rep=rp.OneHotRep((8,8,4)),
                                             trials = 100, #10000,
                                             steps = 200,
                                             rule=rules.ActorCriticTD0,
                                             alpha = v, #0.5, 
                                             beta = 0.9, 
                                             gamma = 0.95, 
                                             n_neurons = None,
                                             sparsity = None,
                                             # NOTE(review): this is the *string* 'False', which is
                                             # truthy in Python. Confirm how ActorCriticLearn
                                             # interprets it before switching to the boolean False.
                                             sample_encoders = 'False',
                                             lambd = None,
                                             verbose = False,
                                             seed = run,
                                             dims = None,
                                             data_dir = data_folder, 
                                             data_format = "npz")
            except (FloatingPointError, ValueError):
                # NOTE(review): the retry reuses the same seed; if the failure is
                # deterministic, every attempt will fail the same way -- confirm.
                print('NaNs found. Starting again')
                continue
            results.append(out)
            print("Finished test number ", run+1)
            break
        else:
            # Every attempt failed: skip the remaining runs for this value.
            print('Could not do it. Value we could not test: ', v)
            break

# Single construction instead of growing a DataFrame row by row.
Results_df = pd.DataFrame(results)

**Target Mean Reward:** <br>
MiniGrid = 0.95 (arbitrary) 

In [None]:
# Make np.load default to allow_pickle=True.
# The patch is installed only once: re-running this cell used to wrap the wrapper,
# after which every np.load call failed with
# "got multiple values for keyword argument 'allow_pickle'".
# setdefault also lets a caller pass allow_pickle explicitly without that error.
# NOTE: loading pickled data can execute arbitrary code -- only load trusted files.
if not getattr(np.load, '_allow_pickle_default', False):
    np_load_old = np.load

    def _np_load_allow_pickle(*a, **k):
        """Call the original np.load, defaulting allow_pickle to True."""
        k.setdefault('allow_pickle', True)
        return np_load_old(*a, **k)

    # Marker used by the guard above to detect an already-patched np.load.
    _np_load_allow_pickle._allow_pickle_default = True
    np.load = _np_load_allow_pickle

In [None]:
#Load the data
# NOTE(review): `data_folder` has no file extension and is also used as `data_dir`
# for the experiment runs -- confirm it names a pickle file rather than a directory.
# pd.read_pickle unpickles arbitrary objects; only load files from a trusted source.
all_data = pd.read_pickle(data_folder)

In [None]:
#Print the maximum reward for each run
# Entries before position `skip` of each rolling-mean series are ignored.
skip = 100
for i in range(len(all_data)):
    roll_mean = all_data['roll_mean'][i][0]
    # Take the maximum of the sliced tail directly. The previous code applied an
    # argmax index that was relative to the slice to the *unsliced* series, so it
    # printed a value `skip` positions too early instead of the maximum.
    print(roll_mean[skip:].max())

In [None]:
#Calculate the mean number of runs to reach the goal rolling average reward
goal = 0.95
# NOTE(review): the test below is |value| < goal, i.e. it records the first position
# whose absolute rolling mean is *below* the goal -- confirm this is the intended
# direction for this environment.
first_hits = []
for row in range(len(all_data)):
    rewards = all_data['roll_mean'][row][0]
    hits = [step for step, value in enumerate(rewards) if np.abs(value) < goal]
    # NaN marks a run that never satisfied the condition.
    first_hits.append(hits[0] if hits else np.nan)

goal_reached = np.asarray(first_hits)
# Group consecutive entries in blocks of 5 and average each block, ignoring NaNs.
mean_runs = np.nanmean(goal_reached.reshape(-1, 5), axis=1)

In [None]:
#Plot the results
# Tested values are drawn at evenly spaced positions and labelled with the value itself.
values = range(len(variable))

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(values, mean_runs, marker='o')
ax.set_xticks(values)
ax.set_xticklabels(variable)
ax.set_ylabel('Number of Runs to reach Mean Reward <0.95')
ax.set_xlabel('Testing Variable')
plt.show()

| Environment | Rule | rep | runs | steps | alpha | beta | gamma | n_neurons| sparsity | lambda | sample_encoders | dims |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| MiniGrid | TD(0) | OneHotRep((8,8,4)) | 500 | 200 | 0.5 | 0.9 | 0.95 | None | None | None | False | None |
| MiniGrid | TD(0) | OneHotRep((8,8,4)) | 1000 | 200 | 0.5 | 0.8 | 0.8 | 3000 | 0.1 | None | False | None |
| MiniGrid | TD(0) | SSPRep(N=3, D=256, scale=[0.75,0.75,1.0]) | 500 | 200 | 0.5 | 0.6 | 0.7 | None | None | None | False | 256 |
| MiniGrid | TD(0) | SSPRep(N=3, D=128, scale=[0.75,0.75,1.0]) | 300 | 200 | 0.5 | 0.6 | 0.8 | 3000 | 0.25 | None | False | 128 |
| MiniGrid | TD(0) | GridSSPRep(3) | 300 | 200 | 0.1 | 0.85 | 0.95 | None | None | None | False | None |
| MiniGrid | TD(0) | GridSSPRep(3) | 300 | 200 | 0.1 | 0.85 | 0.95 | 1000 | 0.1 | None | False | None |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| MiniGrid | TD($\lambda$) | OneHotRep((8,8,4)) | 300 | 200 | 0.1 | 0.9 | 0.95 | None | None | 0.9 | False | None |
| MiniGrid | TD($\lambda$) | OneHotRep((8,8,4)) | 300 | 200 | 0.1 | 0.85 | 0.85 | 2000 | 0.005 | 0.8 | False | None |
| MiniGrid | TD($\lambda$) | SSPRep(N=3, D=256, scale=[0.75,0.75,1.0]) | 500 | 200 | 0.1 | 0.9 | 0.7 | None | None | 0.5 | False | 256 |
| MiniGrid | TD($\lambda$) | SSPRep(N=3, D=256, scale=[0.75,0.75,1.0]) | 500 | 200 | 0.1 | 0.9 | 0.7 | 5000 | 0.2 | 0.5 | False | 256 |
| MiniGrid | TD($\lambda$) | GridSSPRep(3) | 50 | 200 | 0.1 | 0.85 | 0.95 | None | None | 0.9 | False | None |
| MiniGrid | TD($\lambda$) | GridSSPRep(3) | 50 | 200 | 0.1 | 0.85 | 0.95 | 2000 | 0.2 | 0.9 | False | None |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |

