In [4]:
import pickle

import gymnasium as gym
import numpy as np
from helper import initialize_grids, initialize_q_table, initialize_state_dict, initialize_random_start, \
    epsilon_greedy_policy, get_closest_in_grid, plot_rewards, plot_steps, evaluate
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"


def train(q, rand_init=True):
    """Run tabular Q-learning on the module-level ``env`` and return the learned table.

    For every step one line ``state[0],state[1],action`` is written to
    ``trace/trace_train.txt`` (the directory is created if it is missing and
    the file is closed when training ends or fails).

    The function reads these module-level names at call time: ``env``,
    ``grid_x``, ``grid_v``, ``state_to_qtable``, ``n_training_episodes``,
    ``max_steps``, ``gamma``, ``min_epsilon``, ``max_epsilon``, ``decay_rate``,
    ``initial_lr``, ``min_lr`` and ``k``.

    Args:
        q: Q table indexed as ``q[state_index][action]``. It is updated in place.
        rand_init: When true, the observation returned by ``env.reset()`` is
            replaced by ``initialize_random_start(grid_x, grid_v)``.

    Returns:
        A tuple ``(q, avg_reward_list, total_steps)`` where
        ``avg_reward_list`` holds one entry per block of 10 episodes and
        ``total_steps`` holds the step count of every episode that ended with
        ``terminated`` (episodes that hit ``max_steps`` are not recorded).
    """
    reward_list = []
    avg_reward_list = []
    total_steps = []

    trace_path = 'trace/trace_train.txt'
    os.makedirs(os.path.dirname(trace_path), exist_ok=True)

    # The context manager guarantees the trace is flushed and closed, even if
    # training is interrupted.
    with open(trace_path, 'w') as file:
        for episode in range(n_training_episodes):
            # Both schedules depend only on the episode index, so compute them
            # once per episode instead of once per step.
            epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
            lr = max(min_lr, initial_lr * np.exp(-k * episode))

            # gymnasium's reset() returns (observation, info); keep only the observation.
            state, _info = env.reset()
            if rand_init:
                # NOTE(review): initialize_random_start is not given `env`, so the
                # environment itself presumably still starts from the reset state
                # and only the first Q update uses this random state -- confirm.
                state = initialize_random_start(grid_x, grid_v)

            steps = 0
            tot_reward = 0

            for _step in range(max_steps):
                # Choose the action At using epsilon greedy policy
                action = epsilon_greedy_policy(q, state, epsilon, grid_x, grid_v, state_to_qtable, env)

                file.write(f'{state[0]},{state[1]},{action}\n')

                # Truncation is deliberately ignored: an episode only ends on
                # `terminated` or after `max_steps` steps.
                new_state, reward, terminated, _truncated, _info = env.step(action)

                s = state_to_qtable[get_closest_in_grid(state, grid_x, grid_v)]
                ns = state_to_qtable[get_closest_in_grid(new_state, grid_x, grid_v)]

                # Q-learning update.
                # NOTE(review): the target bootstraps from q[ns] even on the
                # terminating step -- confirm this is intended.
                q[s][action] = q[s][action] + lr * (reward + gamma * np.max(q[ns]) - q[s][action])

                steps += 1

                # If done, finish the episode
                if terminated:
                    total_steps.append(steps)
                    break

                state = new_state

                # NOTE(review): reward_list collects the *running* episode total
                # after every non-terminal step, so the reported "average reward"
                # is the mean of those running totals, not the mean episode
                # return. Kept as-is so saved curves stay comparable.
                tot_reward += reward
                reward_list.append(tot_reward)

            if (episode + 1) % 10 == 0:
                avg_reward = np.mean(reward_list)
                avg_reward_list.append(avg_reward)
                reward_list = []

                # No episode may have terminated yet; avoid numpy's
                # "mean of empty slice" warning and report nan explicitly.
                recent_steps = total_steps[-10:]
                avg_steps = np.mean(recent_steps) if recent_steps else float('nan')
                print(f"episode: {episode}\t average reward: {avg_reward}\t avg steps: {avg_steps}"
                      f"\t lr: {lr}\t epsilon: {epsilon}")

    return q, avg_reward_list, total_steps

In [6]:
# Create every output directory used by this notebook (trace/, plots/, data/)
# before the long training run, so a missing folder cannot discard the results
# when they are saved afterwards.
for out_dir in ('trace', 'plots', 'data'):
    os.makedirs(out_dir, exist_ok=True)

env = gym.make("MountainCar-v0")

# Size of the discretised state space (20 x 20 cells) and of the action space
state_space = 20 * 20
action_space = env.action_space.n

# Seed the environment once
observation, info = env.reset(seed=42)

# Training parameters
n_training_episodes = 50000  # Total training episodes
initial_lr = 0.2  # Learning rate at episode 0                                # old: 1
k = 0.0005  # Exponential decay rate of the learning rate                     # old: 0.005
min_lr = 0.0005  # Lower bound of the learning rate


# Environment parameters
max_steps = 10000  # Max steps per episode
gamma = 0.99  # Discounting rate
eval_seed = []  # The evaluation seed of the environment

# Exploration parameters
max_epsilon = 1  # Exploration probability at start                          # old: 1 (0.95 also tried)
min_epsilon = 0.05  # Minimum exploration probability
decay_rate = 0.001  # Exponential decay rate for exploration prob           # old: 0.002


# Discretisation helpers; train() reads these as module-level names
grid_x, grid_v = initialize_grids()
state_to_qtable = initialize_state_dict()

# Training
q_car = initialize_q_table(state_space, action_space)
q_car, avg_rewards, total_steps = train(q_car)


#
# Heatmap of the largest Q value of every discretised state (random-init training)
actions = np.max(q_car, axis=1).reshape((20, 20))

fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(111)
ax = sns.heatmap(actions, annot=True, ax=ax)
ax.set_ylim(0, 20)
ax.set_xlabel("Position", fontsize=20)
ax.set_ylabel("Velocity", fontsize=20)
ax.set_title("Q values for optimal action - random init w/ lr decay", fontdict={'fontsize': 25})

# Keep a pickled copy of the axes next to the rendered image
with open('plots/q_values.pkl', 'wb') as fid:
    pickle.dump(ax, fid)
fig.savefig('plots/q_values_rand_decay.png')
plt.close(fig)

# Learning curves (helper functions; presumably they save figures tagged with
# the given suffix -- see helper.py)
plot_steps(total_steps, 'rand_decay')
plot_rewards(avg_rewards, 'rand_decay')

# Save the Q table and the training curves as plain text.
# np.savetxt accepts lists directly, so no prior np.array(...) conversion is
# needed (the former bare np.array(...) statements discarded their result).
np.savetxt('data/q_rand_decay.txt', q_car)
np.savetxt('data/avg_rewards_rand_decay.txt', avg_rewards)
np.savetxt('data/total_steps_rand_decay.txt', total_steps)

env.close()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


episode: 9	 average reward: -5000.5	 avg steps: nan	 lr: 0.19910202196591412	 epsilon: 0.9914883598342394
episode: 19	 average reward: -4557.308114536639	 avg steps: 6863.0	 lr: 0.1981089964885801	 epsilon: 0.9821203941306657
episode: 29	 average reward: -4365.029278453155	 avg steps: 5977.0	 lr: 0.1971209237464765	 epsilon: 0.9728456412432746
episode: 39	 average reward: -3751.307521902795	 avg steps: 5594.1	 lr: 0.19613777903773325	 epsilon: 0.9636631736890484
episode: 49	 average reward: -4739.215335731415	 avg steps: 4405.7	 lr: 0.19515953778368147	 epsilon: 0.9545720732135796
episode: 59	 average reward: -3868.677663332836	 avg steps: 5098.3	 lr: 0.19418617552823886	 epsilon: 0.9455714306992448
episode: 69	 average reward: -4235.458823688006	 avg steps: 5245.5	 lr: 0.19321766793729833	 epsilon: 0.9366603460742918
episode: 79	 average reward: -3988.238947022798	 avg steps: 5752.9	 lr: 0.19225399079811967	 epsilon: 0.9278379282228324
episode: 89	 average reward: -4052.8162255645493	

episode: 689	 average reward: -263.78660915999177	 avg steps: 487.9	 lr: 0.14171491043736692	 epsilon: 0.5269740012064343
episode: 699	 average reward: -292.64963346269946	 avg steps: 464.8	 lr: 0.14100810437285335	 epsilon: 0.5222280305971058
episode: 709	 average reward: -247.85723556132382	 avg steps: 463.3	 lr: 0.14030482351829335	 epsilon: 0.5175292831843616
episode: 719	 average reward: -329.49413833528723	 avg steps: 512.8	 lr: 0.1396050502916288	 epsilon: 0.512877289089545
episode: 729	 average reward: -373.0449685534591	 avg steps: 637.0	 lr: 0.13890876719849268	 epsilon: 0.5082715831093696
episode: 739	 average reward: -224.2089150134705	 avg steps: 409.3	 lr: 0.13821595683177135	 epsilon: 0.5037117046693995
episode: 749	 average reward: -234.81237911025144	 avg steps: 414.6	 lr: 0.13752660187116958	 epsilon: 0.49919719777799054
episode: 759	 average reward: -233.53235225414622	 avg steps: 429.1	 lr: 0.13684068508277744	 epsilon: 0.4947276109806917
episode: 769	 average rewar

episode: 1359	 average reward: -140.35737586081913	 avg steps: 276.9	 lr: 0.10137407283989047	 epsilon: 0.2940716877985012
episode: 1369	 average reward: -307.97703788748566	 avg steps: 436.5	 lr: 0.10086846754227896	 epsilon: 0.2916431339277849
episode: 1379	 average reward: -172.42777413000655	 avg steps: 305.6	 lr: 0.10036538396160961	 epsilon: 0.28923874457183135
episode: 1389	 average reward: -121.45701943844493	 avg steps: 232.5	 lr: 0.09986480952076665	 epsilon: 0.28685827928970137
episode: 1399	 average reward: -125.73900414937759	 avg steps: 242.0	 lr: 0.099366731705363	 epsilon: 0.2845015000328828
episode: 1409	 average reward: -115.49582947173309	 avg steps: 216.8	 lr: 0.09887113806342734	 epsilon: 0.2821681711214862
episode: 1419	 average reward: -141.91647940074907	 avg steps: 268.0	 lr: 0.09837801620509283	 epsilon: 0.27985805922067575
episode: 1429	 average reward: -191.1558516801854	 avg steps: 346.2	 lr: 0.0978873538022873	 epsilon: 0.27757093331733645
episode: 1439	 a

episode: 2029	 average reward: -120.71721132897603	 avg steps: 230.5	 lr: 0.07251673527105228	 epsilon: 0.17489357624133203
episode: 2039	 average reward: -127.6517269736842	 avg steps: 244.2	 lr: 0.07215505654500913	 epsilon: 0.1736508643940698
episode: 2049	 average reward: -90.45444319460067	 avg steps: 178.8	 lr: 0.07179518169913772	 epsilon: 0.17242051773628975
episode: 2059	 average reward: -113.96134606639382	 avg steps: 220.9	 lr: 0.07143710173654812	 epsilon: 0.17120241323230073
episode: 2069	 average reward: -107.57509505703422	 avg steps: 211.4	 lr: 0.07108080770522261	 epsilon: 0.16999642907063733
episode: 2079	 average reward: -107.24735322425408	 avg steps: 208.8	 lr: 0.07072629069779186	 epsilon: 0.16880244465187827
episode: 2089	 average reward: -113.23722789871168	 avg steps: 226.1	 lr: 0.07037354185131224	 epsilon: 0.16762034057658687
episode: 2099	 average reward: -99.47013388259526	 avg steps: 195.2	 lr: 0.07002255234704417	 epsilon: 0.16644999863337032
episode: 210

episode: 2699	 average reward: -96.04777777777778	 avg steps: 181.0	 lr: 0.05187398263732972	 epsilon: 0.11390911427312714
episode: 2709	 average reward: -91.3257790368272	 avg steps: 177.5	 lr: 0.0516152600695676	 epsilon: 0.11327320796116609
episode: 2719	 average reward: -105.43724696356276	 avg steps: 198.6	 lr: 0.051357827885995515	 epsilon: 0.11264362902272904
episode: 2729	 average reward: -78.87841352405722	 avg steps: 154.8	 lr: 0.05110167965079546	 epsilon: 0.11202031449939742
episode: 2739	 average reward: -114.14400757934628	 avg steps: 212.1	 lr: 0.05084680896024825	 epsilon: 0.11140320205919955
episode: 2749	 average reward: -170.3913498430415	 avg steps: 287.7	 lr: 0.0505932094425733	 epsilon: 0.11079222999037708
episode: 2759	 average reward: -119.13796133567662	 avg steps: 228.6	 lr: 0.05034087475776949	 epsilon: 0.11018733719521402
episode: 2769	 average reward: -86.98892128279883	 avg steps: 172.5	 lr: 0.05008979859745653	 epsilon: 0.10958846318392676
episode: 2779	 

episode: 3369	 average reward: -89.1455733808675	 avg steps: 169.3	 lr: 0.03710743547127335	 epsilon: 0.08270284197229949
episode: 3379	 average reward: -73.54584221748401	 avg steps: 141.7	 lr: 0.03692236136475418	 epsilon: 0.08237744325780044
episode: 3389	 average reward: -78.70950819672132	 avg steps: 153.5	 lr: 0.03673821031919216	 epsilon: 0.08205528231460844
episode: 3399	 average reward: -81.64505978602895	 avg steps: 159.9	 lr: 0.03655497773080158	 epsilon: 0.08173632692636074
episode: 3409	 average reward: -79.2939649578196	 avg steps: 155.1	 lr: 0.03637265901875817	 epsilon: 0.0814205451972527
episode: 3419	 average reward: -82.0834890965732	 avg steps: 161.5	 lr: 0.036191249625084645	 epsilon: 0.08110790554884825
episode: 3429	 average reward: -80.17473287240729	 avg steps: 160.1	 lr: 0.03601074501453671	 epsilon: 0.08079837671692203
episode: 3439	 average reward: -85.45843373493976	 avg steps: 167.0	 lr: 0.03583114067448969	 epsilon: 0.08049192774833291
episode: 3449	 aver

episode: 4039	 average reward: -86.57913887204366	 avg steps: 165.9	 lr: 0.02654436187947179	 epsilon: 0.06673432475522344
episode: 4049	 average reward: -90.00519630484989	 avg steps: 174.2	 lr: 0.026411971322280944	 epsilon: 0.06656781544181353
episode: 4059	 average reward: -81.62430598396053	 avg steps: 163.1	 lr: 0.026280241065748795	 epsilon: 0.06640296292375439
episode: 4069	 average reward: -78.83873056994818	 avg steps: 155.4	 lr: 0.026149167816612046	 epsilon: 0.06623975071565681
episode: 4079	 average reward: -81.54579439252336	 avg steps: 161.5	 lr: 0.02601874829803266	 epsilon: 0.06607816249616397
episode: 4089	 average reward: -83.83018867924528	 avg steps: 165.3	 lr: 0.02588897924951585	 epsilon: 0.06591818210631922
episode: 4099	 average reward: -83.74728588661037	 avg steps: 166.8	 lr: 0.025759857426828677	 epsilon: 0.06575979354795033
episode: 4109	 average reward: -80.80474111041796	 avg steps: 161.3	 lr: 0.02563137960191883	 epsilon: 0.06560298098206944
episode: 411

episode: 4709	 average reward: -79.91861198738171	 avg steps: 159.5	 lr: 0.018988193030311185	 epsilon: 0.05856309752071351
episode: 4719	 average reward: -93.94698660714286	 avg steps: 180.2	 lr: 0.018893489022479143	 epsilon: 0.058477893276760326
episode: 4729	 average reward: -78.05912930474334	 avg steps: 154.9	 lr: 0.018799257352856702	 epsilon: 0.058393536829199746
episode: 4739	 average reward: -81.4669163545568	 avg steps: 161.2	 lr: 0.01870549566564721	 epsilon: 0.058310019742316734
episode: 4749	 average reward: -84.71978021978022	 avg steps: 164.8	 lr: 0.018612201616803607	 epsilon: 0.058227333664332984
episode: 4759	 average reward: -80.63276836158192	 avg steps: 160.3	 lr: 0.018519372873969802	 epsilon: 0.0581454703265718
episode: 4769	 average reward: -81.24811557788945	 avg steps: 160.2	 lr: 0.018427007116422405	 epsilon: 0.058064421542631195
episode: 4779	 average reward: -90.57289879931389	 avg steps: 175.9	 lr: 0.01833510203501266	 epsilon: 0.057984179207565226
episod

episode: 5379	 average reward: -75.5954454119223	 avg steps: 150.3	 lr: 0.013582977665595818	 epsilon: 0.05438181045377178
episode: 5389	 average reward: -76.85507246376811	 avg steps: 152.8	 lr: 0.01351523228186333	 epsilon: 0.05433821071127712
episode: 5399	 average reward: -75.50503018108652	 avg steps: 150.1	 lr: 0.013447824779641805	 epsilon: 0.05429504479346877
episode: 5409	 average reward: -74.05223367697594	 avg steps: 146.5	 lr: 0.013380753473740181	 epsilon: 0.054252308383718986
episode: 5419	 average reward: -73.96978021978022	 avg steps: 146.6	 lr: 0.013314016687372308	 epsilon: 0.05420999720835117
episode: 5429	 average reward: -78.51579626047712	 avg steps: 156.1	 lr: 0.013247612752115065	 epsilon: 0.05416810703621254
episode: 5439	 average reward: -74.3828178694158	 avg steps: 146.5	 lr: 0.013181540007866605	 epsilon: 0.05412663367825096
episode: 5449	 average reward: -77.41858932102835	 avg steps: 152.7	 lr: 0.013115796802804884	 epsilon: 0.054085572987096084
episode: 

episode: 6049	 average reward: -88.236	 avg steps: 176.0	 lr: 0.009716421250276875	 epsilon: 0.052242209995429764
episode: 6059	 average reward: -88.02527283170592	 avg steps: 175.1	 lr: 0.009667960397118456	 epsilon: 0.05221989963320596
episode: 6069	 average reward: -89.56369785794814	 avg steps: 178.4	 lr: 0.009619741243473506	 epsilon: 0.0521978112627954
episode: 6079	 average reward: -89.368986983588	 avg steps: 177.7	 lr: 0.009571762583860671	 epsilon: 0.05217594267534264
episode: 6089	 average reward: -93.17128874388254	 avg steps: 184.9	 lr: 0.009524023218810957	 epsilon: 0.0521542916839707
episode: 6099	 average reward: -87.66013824884793	 avg steps: 174.6	 lr: 0.00947652195483776	 epsilon: 0.0521328561235624
episode: 6109	 average reward: -85.17399049881236	 avg steps: 169.4	 lr: 0.009429257604407007	 epsilon: 0.052111633850543855
episode: 6119	 average reward: -86.76864801864802	 avg steps: 172.6	 lr: 0.009382228985907468	 epsilon: 0.052090622742670054
episode: 6129	 average

episode: 6719	 average reward: -76.58486842105263	 avg steps: 153.0	 lr: 0.006950526183368407	 epsilon: 0.05114735808786013
episode: 6729	 average reward: -73.46782006920415	 avg steps: 145.5	 lr: 0.0069158602894070515	 epsilon: 0.05113594168413669
episode: 6739	 average reward: -76.65697290152016	 avg steps: 152.3	 lr: 0.006881367292313133	 epsilon: 0.05112463887552828
episode: 6749	 average reward: -68.68195488721804	 avg steps: 134.0	 lr: 0.006847046329759923	 epsilon: 0.051113448531744626
episode: 6759	 average reward: -73.05233775296581	 avg steps: 144.3	 lr: 0.006812896543721578	 epsilon: 0.051102369533742024
episode: 6769	 average reward: -77.54025974025974	 avg steps: 155.0	 lr: 0.006778917080451665	 epsilon: 0.051091400773611434
episode: 6779	 average reward: -77.29511400651465	 avg steps: 154.5	 lr: 0.006745107090461832	 epsilon: 0.05108054115446772
episode: 6789	 average reward: -74.2319236016371	 avg steps: 147.6	 lr: 0.006711465728500569	 epsilon: 0.0510697895903399
episod

episode: 7389	 average reward: -78.51119894598155	 avg steps: 152.8	 lr: 0.004971976099154118	 epsilon: 0.0505871129753508
episode: 7399	 average reward: -81.54615881213687	 avg steps: 155.9	 lr: 0.004947178264906101	 epsilon: 0.05058127110363804
episode: 7409	 average reward: -78.12639161755075	 avg steps: 153.7	 lr: 0.004922504110372373	 epsilon: 0.050575487359520034
episode: 7419	 average reward: -83.82125307125307	 avg steps: 163.8	 lr: 0.004897953018697783	 epsilon: 0.05056976116461756
episode: 7429	 average reward: -71.8421433743664	 avg steps: 139.1	 lr: 0.004873524376103763	 epsilon: 0.050564091946306344
episode: 7439	 average reward: -78.57758620689656	 avg steps: 151.8	 lr: 0.004849217571872976	 epsilon: 0.050558479137659845
episode: 7449	 average reward: -76.58090185676393	 avg steps: 151.8	 lr: 0.0048250319983340504	 epsilon: 0.050552922177392504
episode: 7459	 average reward: -75.76117411607738	 avg steps: 150.9	 lr: 0.004800967050846383	 epsilon: 0.050547420509803674
epis

episode: 8049	 average reward: -78.3363344051447	 avg steps: 156.5	 lr: 0.0035744716197381853	 epsilon: 0.050303450124807456
episode: 8059	 average reward: -78.50096215522771	 avg steps: 156.9	 lr: 0.003556643868159572	 epsilon: 0.050300430745616784
episode: 8069	 average reward: -84.38164251207729	 avg steps: 166.6	 lr: 0.003538905032862907	 epsilon: 0.05029744140975104
episode: 8079	 average reward: -80.51635220125786	 avg steps: 160.0	 lr: 0.0035212546703763826	 epsilon: 0.05029448181827413
episode: 8089	 average reward: -84.89268585131894	 avg steps: 167.8	 lr: 0.0035036923394400157	 epsilon: 0.050291551675224457
episode: 8099	 average reward: -83.91883706844337	 avg steps: 166.1	 lr: 0.00348621760099462	 epsilon: 0.050288650687585264
episode: 8109	 average reward: -79.80342422320862	 avg steps: 158.7	 lr: 0.0034688300181708225	 epsilon: 0.050285778565255373
episode: 8119	 average reward: -79.34735500318675	 avg steps: 157.9	 lr: 0.003451529156278147	 epsilon: 0.05028293502102016
e

episode: 8709	 average reward: -81.56546134663341	 avg steps: 161.4	 lr: 0.0025697724819086403	 epsilon: 0.05015683860195841
episode: 8719	 average reward: -86.024231678487	 avg steps: 170.2	 lr: 0.0025569556881850485	 epsilon: 0.05015527803179437
episode: 8729	 average reward: -77.5812133072407	 avg steps: 154.3	 lr: 0.0025442028184868345	 epsilon: 0.05015373298956292
episode: 8739	 average reward: -76.36206896551724	 avg steps: 151.8	 lr: 0.002531513553991595	 epsilon: 0.05015220332075853
episode: 8749	 average reward: -76.9795918367347	 avg steps: 152.9	 lr: 0.002518887577467056	 epsilon: 0.05015068887241305
episode: 8759	 average reward: -80.45110410094637	 avg steps: 159.5	 lr: 0.002506324573263145	 epsilon: 0.05014918949308039
episode: 8769	 average reward: -80.61061946902655	 avg steps: 159.2	 lr: 0.002493824227304104	 epsilon: 0.050147705032821364
episode: 8779	 average reward: -74.67611336032388	 avg steps: 149.2	 lr: 0.002481386227080632	 epsilon: 0.050146235343188704
episode

episode: 9379	 average reward: -78.43811394891945	 avg steps: 153.7	 lr: 0.001838256129569995	 epsilon: 0.050080255657950164
episode: 9389	 average reward: -77.12425644415069	 avg steps: 152.3	 lr: 0.0018290877888745855	 epsilon: 0.05007945710081099
episode: 9399	 average reward: -74.91509433962264	 avg steps: 149.4	 lr: 0.0018199651754691615	 epsilon: 0.050078666489448115
episode: 9409	 average reward: -76.57057256990679	 avg steps: 151.2	 lr: 0.0018108880612879156	 epsilon: 0.05007788374479974
episode: 9419	 average reward: -77.49078947368422	 avg steps: 153.0	 lr: 0.001801856219402519	 epsilon: 0.05007710878859074
episode: 9429	 average reward: -80.64071474154436	 avg steps: 157.7	 lr: 0.0017928694240164546	 epsilon: 0.05007634154332486
episode: 9439	 average reward: -74.88745762711865	 avg steps: 148.5	 lr: 0.0017839274504593688	 epsilon: 0.05007558193227694
episode: 9449	 average reward: -74.82905405405405	 avg steps: 149.0	 lr: 0.001775030075181458	 epsilon: 0.05007482987948522
e

episode: 10039	 average reward: -85.08348457350272	 avg steps: 166.3	 lr: 0.0013215659596745833	 epsilon: 0.050041480243912057
episode: 10049	 average reward: -80.75445292620866	 avg steps: 158.2	 lr: 0.0013149746219524637	 epsilon: 0.050041067508589006
episode: 10059	 average reward: -79.24805194805195	 avg steps: 155.0	 lr: 0.00130841615866438	 epsilon: 0.050040658880051034
episode: 10069	 average reward: -78.46797385620916	 avg steps: 154.0	 lr: 0.001301890405848411	 epsilon: 0.05004025431743496
episode: 10079	 average reward: -80.4093529788597	 avg steps: 157.1	 lr: 0.001295397200360395	 epsilon: 0.050039853780284166
episode: 10089	 average reward: -82.86068111455108	 avg steps: 162.5	 lr: 0.0012889363798698566	 epsilon: 0.050039457228544616
episode: 10099	 average reward: -83.50215384615385	 avg steps: 163.5	 lr: 0.0012825077828559472	 epsilon: 0.050039064622560796
episode: 10109	 average reward: -78.04805194805195	 avg steps: 155.0	 lr: 0.0012761112486034062	 epsilon: 0.050038675

episode: 10709	 average reward: -76.20397350993377	 avg steps: 152.0	 lr: 0.0009453664645823009	 epsilon: 0.050021225796618475
episode: 10719	 average reward: -80.00830140485313	 avg steps: 157.6	 lr: 0.0009406514296696564	 epsilon: 0.05002101459641332
episode: 10729	 average reward: -75.40482573726541	 avg steps: 150.2	 lr: 0.0009359599110917451	 epsilon: 0.05002080549768531
episode: 10739	 average reward: -80.63440860215054	 avg steps: 159.1	 lr: 0.0009312917915603599	 epsilon: 0.05002059847952441
episode: 10749	 average reward: -80.5966709346991	 avg steps: 157.2	 lr: 0.0009266469543722687	 epsilon: 0.050020393521228626
episode: 10759	 average reward: -79.47422024188415	 avg steps: 158.1	 lr: 0.0009220252834062994	 epsilon: 0.050020190602301964
episode: 10769	 average reward: -79.93821656050956	 avg steps: 158.0	 lr: 0.0009174266631204376	 epsilon: 0.050019989702452354
episode: 10779	 average reward: -74.26766304347827	 avg steps: 148.2	 lr: 0.0009128509785489365	 epsilon: 0.0500197

episode: 11369	 average reward: -81.908125	 avg steps: 161.0	 lr: 0.0006796463881788486	 epsilon: 0.05001097058130791
episode: 11379	 average reward: -83.62186927306048	 avg steps: 164.7	 lr: 0.0006762566376761883	 epsilon: 0.05001086142220003
episode: 11389	 average reward: -81.58640049906425	 avg steps: 161.3	 lr: 0.0006728837936246911	 epsilon: 0.05001075334924342
episode: 11399	 average reward: -82.14800995024876	 avg steps: 161.8	 lr: 0.0006695277717030812	 epsilon: 0.050010646351630696
episode: 11409	 average reward: -83.1988950276243	 avg steps: 163.9	 lr: 0.000666188488010635	 epsilon: 0.050010540418662
episode: 11419	 average reward: -81.2396486825596	 avg steps: 160.4	 lr: 0.0006628658590650865	 epsilon: 0.050010435539743964
episode: 11429	 average reward: -89.53776263486655	 avg steps: 177.1	 lr: 0.0006595598018005389	 epsilon: 0.050010331704388594
episode: 11439	 average reward: -80.11820480404552	 avg steps: 159.2	 lr: 0.0006562702335653885	 epsilon: 0.050010228902212274
e

episode: 12029	 average reward: -78.43948220064725	 avg steps: 155.5	 lr: 0.0005	 epsilon: 0.05000567015958914
episode: 12039	 average reward: -82.8840490797546	 avg steps: 164.0	 lr: 0.0005	 epsilon: 0.05000561374055856
episode: 12049	 average reward: -83.0147329650092	 avg steps: 163.9	 lr: 0.0005	 epsilon: 0.05000555788290672
episode: 12059	 average reward: -84.04239854633555	 avg steps: 166.1	 lr: 0.0005	 epsilon: 0.050005502581047793
episode: 12069	 average reward: -81.2735790131168	 avg steps: 161.1	 lr: 0.0005	 epsilon: 0.05000544782945156
episode: 12079	 average reward: -84.64333132166566	 avg steps: 166.7	 lr: 0.0005	 epsilon: 0.05000539362264281
episode: 12089	 average reward: -81.66313162819714	 avg steps: 161.3	 lr: 0.0005	 epsilon: 0.05000533995520082
episode: 12099	 average reward: -79.84939374601149	 avg steps: 157.7	 lr: 0.0005	 epsilon: 0.050005286821758796
episode: 12109	 average reward: -79.55176848874598	 avg steps: 156.5	 lr: 0.0005	 epsilon: 0.05000523421700336
ep

episode: 12769	 average reward: -82.27142857142857	 avg steps: 162.0	 lr: 0.0005	 epsilon: 0.050002705312043205
episode: 12779	 average reward: -85.06080674292595	 avg steps: 167.1	 lr: 0.0005	 epsilon: 0.05000267839373862
episode: 12789	 average reward: -75.45460614152204	 avg steps: 150.8	 lr: 0.0005	 epsilon: 0.05000265174327563
episode: 12799	 average reward: -83.1298224127373	 avg steps: 164.3	 lr: 0.0005	 epsilon: 0.05000262535798919
episode: 12809	 average reward: -81.75529265255292	 avg steps: 161.6	 lr: 0.0005	 epsilon: 0.050002599235240724
episode: 12819	 average reward: -79.77168367346938	 avg steps: 157.8	 lr: 0.0005	 epsilon: 0.05000257337241795
episode: 12829	 average reward: -81.46355140186915	 avg steps: 161.5	 lr: 0.0005	 epsilon: 0.05000254776693457
episode: 12839	 average reward: -81.89267990074441	 avg steps: 162.2	 lr: 0.0005	 epsilon: 0.05000252241623
episode: 12849	 average reward: -80.26485461441213	 avg steps: 159.2	 lr: 0.0005	 epsilon: 0.050002497317769165
ep

episode: 13519	 average reward: -76.75857519788919	 avg steps: 152.6	 lr: 0.0005	 epsilon: 0.05000127789892394
episode: 13529	 average reward: -85.54138972809668	 avg steps: 166.5	 lr: 0.0005	 epsilon: 0.050001265183617194
episode: 13539	 average reward: -80.16898734177215	 avg steps: 159.0	 lr: 0.0005	 epsilon: 0.05000125259482987
episode: 13549	 average reward: -85.20958083832335	 avg steps: 168.0	 lr: 0.0005	 epsilon: 0.05000124013130306
episode: 13559	 average reward: -86.83303938859494	 avg steps: 171.1	 lr: 0.0005	 epsilon: 0.05000122779179043
episode: 13569	 average reward: -80.5110829639012	 avg steps: 158.9	 lr: 0.0005	 epsilon: 0.05000121557505799
episode: 13579	 average reward: -80.55583437892095	 avg steps: 160.4	 lr: 0.0005	 epsilon: 0.05000120347988407
episode: 13589	 average reward: -78.76591639871383	 avg steps: 156.5	 lr: 0.0005	 epsilon: 0.05000119150505915
episode: 13599	 average reward: -78.40903225806451	 avg steps: 156.0	 lr: 0.0005	 epsilon: 0.05000117964938572
e

episode: 14259	 average reward: -79.6734693877551	 avg steps: 157.8	 lr: 0.0005	 epsilon: 0.05000060970335924
episode: 14269	 average reward: -76.46891534391534	 avg steps: 152.2	 lr: 0.0005	 epsilon: 0.050000603636709454
episode: 14279	 average reward: -74.20977596741344	 avg steps: 148.3	 lr: 0.0005	 epsilon: 0.05000059763042384
episode: 14289	 average reward: -77.59568909209666	 avg steps: 154.1	 lr: 0.0005	 epsilon: 0.050000591683901766
episode: 14299	 average reward: -78.22186076772934	 avg steps: 154.7	 lr: 0.0005	 epsilon: 0.05000058579654858
episode: 14309	 average reward: -75.57161981258366	 avg steps: 150.4	 lr: 0.0005	 epsilon: 0.05000057996777553
episode: 14319	 average reward: -77.63529411764706	 avg steps: 154.0	 lr: 0.0005	 epsilon: 0.05000057419699974
episode: 14329	 average reward: -76.66139657444005	 avg steps: 152.8	 lr: 0.0005	 epsilon: 0.050000568483644134
episode: 14339	 average reward: -84.11732522796352	 avg steps: 165.5	 lr: 0.0005	 epsilon: 0.05000056282713736

episode: 14999	 average reward: -77.29646596858639	 avg steps: 153.8	 lr: 0.0005	 epsilon: 0.05000029089795704
episode: 15009	 average reward: -78.57928802588997	 avg steps: 155.5	 lr: 0.0005	 epsilon: 0.050000288003474
episode: 15019	 average reward: -77.849609375	 avg steps: 154.6	 lr: 0.0005	 epsilon: 0.050000285137791554
episode: 15029	 average reward: -75.27175368139224	 avg steps: 150.4	 lr: 0.0005	 epsilon: 0.050000282300623124
episode: 15039	 average reward: -79.80830670926518	 avg steps: 157.5	 lr: 0.0005	 epsilon: 0.05000027949168499
episode: 15049	 average reward: -79.47603833865814	 avg steps: 157.5	 lr: 0.0005	 epsilon: 0.05000027671069626
episode: 15059	 average reward: -77.4397116644823	 avg steps: 153.6	 lr: 0.0005	 epsilon: 0.05000027395737883
episode: 15069	 average reward: -80.01651842439644	 avg steps: 158.4	 lr: 0.0005	 epsilon: 0.050000271231457366
episode: 15079	 average reward: -79.50223642172524	 avg steps: 157.5	 lr: 0.0005	 epsilon: 0.05000026853265927
episod

episode: 15739	 average reward: -80.18939393939394	 avg steps: 159.4	 lr: 0.0005	 epsilon: 0.0500001387914633
episode: 15749	 average reward: -80.70025188916877	 avg steps: 159.8	 lr: 0.0005	 epsilon: 0.05000013741046517
episode: 15759	 average reward: -76.0877659574468	 avg steps: 151.4	 lr: 0.0005	 epsilon: 0.050000136043208195
episode: 15769	 average reward: -76.86939313984169	 avg steps: 152.6	 lr: 0.0005	 epsilon: 0.05000013468955566
episode: 15779	 average reward: -82.03233830845771	 avg steps: 161.8	 lr: 0.0005	 epsilon: 0.050000133349372186
episode: 15789	 average reward: -76.47546419098143	 avg steps: 151.8	 lr: 0.0005	 epsilon: 0.05000013202252376
episode: 15799	 average reward: -80.62192816635161	 avg steps: 159.7	 lr: 0.0005	 epsilon: 0.0500001307088777
episode: 15809	 average reward: -78.86791237113403	 avg steps: 156.2	 lr: 0.0005	 epsilon: 0.05000012940830264
episode: 15819	 average reward: -78.2444733420026	 avg steps: 154.8	 lr: 0.0005	 epsilon: 0.050000128120668515
ep

episode: 16479	 average reward: -75.27443105756359	 avg steps: 150.4	 lr: 0.0005	 epsilon: 0.050000066219338495
episode: 16489	 average reward: -74.58175675675676	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05000006556044507
episode: 16499	 average reward: -80.16836086404066	 avg steps: 158.4	 lr: 0.0005	 epsilon: 0.050000064908107746
episode: 16509	 average reward: -79.49808673469387	 avg steps: 157.8	 lr: 0.0005	 epsilon: 0.05000006426226128
episode: 16519	 average reward: -74.33559322033898	 avg steps: 148.5	 lr: 0.0005	 epsilon: 0.050000063622841096
episode: 16529	 average reward: -84.11218335343787	 avg steps: 166.8	 lr: 0.0005	 epsilon: 0.05000006298978325
episode: 16539	 average reward: -79.03729903536977	 avg steps: 156.5	 lr: 0.0005	 epsilon: 0.050000062363024435
episode: 16549	 average reward: -75.72066666666667	 avg steps: 151.0	 lr: 0.0005	 epsilon: 0.050000061742501975
episode: 16559	 average reward: -81.04072681704261	 avg steps: 160.6	 lr: 0.0005	 epsilon: 0.05000006112815

episode: 17219	 average reward: -77.41781270464963	 avg steps: 153.7	 lr: 0.0005	 epsilon: 0.050000031594167875
episode: 17229	 average reward: -74.44993234100136	 avg steps: 148.8	 lr: 0.0005	 epsilon: 0.050000031279800655
episode: 17239	 average reward: -79.18473380372033	 avg steps: 156.9	 lr: 0.0005	 epsilon: 0.05000003096856143
episode: 17249	 average reward: -77.94729993493819	 avg steps: 154.7	 lr: 0.0005	 epsilon: 0.0500000306604191
episode: 17259	 average reward: -74.0374149659864	 avg steps: 148.0	 lr: 0.0005	 epsilon: 0.05000003035534283
episode: 17269	 average reward: -78.93496458467482	 avg steps: 156.3	 lr: 0.0005	 epsilon: 0.05000003005330213
episode: 17279	 average reward: -80.52937460518004	 avg steps: 159.3	 lr: 0.0005	 epsilon: 0.050000029754266775
episode: 17289	 average reward: -78.0078277886497	 avg steps: 154.3	 lr: 0.0005	 epsilon: 0.05000002945820687
episode: 17299	 average reward: -78.88766946417043	 avg steps: 155.9	 lr: 0.0005	 epsilon: 0.050000029165092816


episode: 17969	 average reward: -75.62868632707774	 avg steps: 150.2	 lr: 0.0005	 epsilon: 0.05000001492402817
episode: 17979	 average reward: -77.1218872870249	 avg steps: 153.6	 lr: 0.0005	 epsilon: 0.05000001477553161
episode: 17989	 average reward: -77.50850785340315	 avg steps: 153.8	 lr: 0.0005	 epsilon: 0.05000001462851261
episode: 17999	 average reward: -78.29785853341986	 avg steps: 155.1	 lr: 0.0005	 epsilon: 0.050000014482956476
episode: 18009	 average reward: -77.77254901960784	 avg steps: 154.0	 lr: 0.0005	 epsilon: 0.05000001433884865
episode: 18019	 average reward: -78.74645161290323	 avg steps: 156.0	 lr: 0.0005	 epsilon: 0.050000014196174725
episode: 18029	 average reward: -80.71994884910485	 avg steps: 157.4	 lr: 0.0005	 epsilon: 0.050000014054920425
episode: 18039	 average reward: -78.76162790697674	 avg steps: 155.8	 lr: 0.0005	 epsilon: 0.05000001391507163
episode: 18049	 average reward: -76.5933774834437	 avg steps: 152.0	 lr: 0.0005	 epsilon: 0.05000001377661435


episode: 18709	 average reward: -76.76897689768977	 avg steps: 152.5	 lr: 0.0005	 epsilon: 0.050000007120461515
episode: 18719	 average reward: -81.47371714643305	 avg steps: 160.8	 lr: 0.0005	 epsilon: 0.05000000704961174
episode: 18729	 average reward: -77.09671052631579	 avg steps: 153.0	 lr: 0.0005	 epsilon: 0.05000000697946693
episode: 18739	 average reward: -80.32637571157495	 avg steps: 159.1	 lr: 0.0005	 epsilon: 0.05000000691002007
episode: 18749	 average reward: -79.19730077120822	 avg steps: 156.6	 lr: 0.0005	 epsilon: 0.050000006841264225
episode: 18759	 average reward: -79.31153846153846	 avg steps: 157.0	 lr: 0.0005	 epsilon: 0.05000000677319251
episode: 18769	 average reward: -76.91578947368421	 avg steps: 153.0	 lr: 0.0005	 epsilon: 0.05000000670579812
episode: 18779	 average reward: -77.83071895424837	 avg steps: 154.0	 lr: 0.0005	 epsilon: 0.050000006639074314
episode: 18789	 average reward: -82.13490099009901	 avg steps: 162.6	 lr: 0.0005	 epsilon: 0.0500000065730144

episode: 19459	 average reward: -81.15809284818067	 avg steps: 160.4	 lr: 0.0005	 epsilon: 0.05000000336346786
episode: 19469	 average reward: -81.38784461152882	 avg steps: 160.6	 lr: 0.0005	 epsilon: 0.0500000033300008
episode: 19479	 average reward: -80.80754716981131	 avg steps: 160.0	 lr: 0.0005	 epsilon: 0.050000003296866734
episode: 19489	 average reward: -79.32415118513774	 avg steps: 157.1	 lr: 0.0005	 epsilon: 0.05000000326406236
episode: 19499	 average reward: -77.83799609629148	 avg steps: 154.7	 lr: 0.0005	 epsilon: 0.0500000032315844
episode: 19509	 average reward: -77.44546048334422	 avg steps: 154.1	 lr: 0.0005	 epsilon: 0.0500000031994296
episode: 19519	 average reward: -79.7123724489796	 avg steps: 157.8	 lr: 0.0005	 epsilon: 0.050000003167594745
episode: 19529	 average reward: -77.37508196721312	 avg steps: 153.5	 lr: 0.0005	 epsilon: 0.05000000313607665
episode: 19539	 average reward: -79.40089514066496	 avg steps: 157.4	 lr: 0.0005	 epsilon: 0.05000000310487216
epi

episode: 20199	 average reward: -79.04434447300771	 avg steps: 156.6	 lr: 0.0005	 epsilon: 0.05000000160475732
episode: 20209	 average reward: -79.95825426944971	 avg steps: 159.1	 lr: 0.0005	 epsilon: 0.05000000158878972
episode: 20219	 average reward: -80.7369414726243	 avg steps: 159.9	 lr: 0.0005	 epsilon: 0.050000001572981
episode: 20229	 average reward: -77.4869109947644	 avg steps: 153.8	 lr: 0.0005	 epsilon: 0.050000001557329574
episode: 20239	 average reward: -78.52512886597938	 avg steps: 156.2	 lr: 0.0005	 epsilon: 0.050000001541833886
episode: 20249	 average reward: -84.97462235649547	 avg steps: 166.5	 lr: 0.0005	 epsilon: 0.05000000152649239
episode: 20259	 average reward: -88.13267670915411	 avg steps: 173.6	 lr: 0.0005	 epsilon: 0.05000000151130353
episode: 20269	 average reward: -85.24835032993401	 avg steps: 167.7	 lr: 0.0005	 epsilon: 0.05000000149626581
episode: 20279	 average reward: -89.38355376653249	 avg steps: 174.9	 lr: 0.0005	 epsilon: 0.05000000148137772
epi

episode: 20939	 average reward: -81.9857849196539	 avg steps: 162.8	 lr: 0.0005	 epsilon: 0.05000000076565205
episode: 20949	 average reward: -81.91687344913151	 avg steps: 162.2	 lr: 0.0005	 epsilon: 0.05000000075803369
episode: 20959	 average reward: -80.69716088328076	 avg steps: 159.5	 lr: 0.0005	 epsilon: 0.05000000075049112
episode: 20969	 average reward: -80.56075949367089	 avg steps: 159.0	 lr: 0.0005	 epsilon: 0.05000000074302361
episode: 20979	 average reward: -77.79569190600522	 avg steps: 154.2	 lr: 0.0005	 epsilon: 0.05000000073563041
episode: 20989	 average reward: -77.68770331815224	 avg steps: 154.7	 lr: 0.0005	 epsilon: 0.05000000072831076
episode: 20999	 average reward: -78.25307443365696	 avg steps: 155.5	 lr: 0.0005	 epsilon: 0.05000000072106395
episode: 21009	 average reward: -76.41990771259064	 avg steps: 152.7	 lr: 0.0005	 epsilon: 0.05000000071388924
episode: 21019	 average reward: -74.61782579338285	 avg steps: 149.1	 lr: 0.0005	 epsilon: 0.050000000706785924
e

episode: 21679	 average reward: -80.39570164348926	 avg steps: 159.2	 lr: 0.0005	 epsilon: 0.05000000036530325
episode: 21689	 average reward: -80.13869537682078	 avg steps: 158.9	 lr: 0.0005	 epsilon: 0.05000000036166842
episode: 21699	 average reward: -79.4077855775367	 avg steps: 157.7	 lr: 0.0005	 epsilon: 0.05000000035806976
episode: 21709	 average reward: -77.36499674690957	 avg steps: 154.7	 lr: 0.0005	 epsilon: 0.050000000354506906
episode: 21719	 average reward: -74.84656796769852	 avg steps: 149.6	 lr: 0.0005	 epsilon: 0.050000000350979505
episode: 21729	 average reward: -78.9800899165061	 avg steps: 156.7	 lr: 0.0005	 epsilon: 0.0500000003474872
episode: 21739	 average reward: -83.86901152213463	 avg steps: 165.9	 lr: 0.0005	 epsilon: 0.05000000034402965
episode: 21749	 average reward: -78.3638726445744	 avg steps: 154.9	 lr: 0.0005	 epsilon: 0.05000000034060649
episode: 21759	 average reward: -77.48041775456919	 avg steps: 154.2	 lr: 0.0005	 epsilon: 0.0500000003372174
epis

episode: 22419	 average reward: -82.15747055176689	 avg steps: 162.3	 lr: 0.0005	 epsilon: 0.050000000174291265
episode: 22429	 average reward: -74.62052667116814	 avg steps: 149.1	 lr: 0.0005	 epsilon: 0.05000000017255704
episode: 22439	 average reward: -80.09936305732484	 avg steps: 158.0	 lr: 0.0005	 epsilon: 0.05000000017084007
episode: 22449	 average reward: -76.98484848484848	 avg steps: 152.8	 lr: 0.0005	 epsilon: 0.05000000016914018
episode: 22459	 average reward: -83.22596448254745	 avg steps: 164.3	 lr: 0.0005	 epsilon: 0.050000000167457204
episode: 22469	 average reward: -81.71285892634208	 avg steps: 161.2	 lr: 0.0005	 epsilon: 0.05000000016579098
episode: 22479	 average reward: -75.34231805929919	 avg steps: 149.4	 lr: 0.0005	 epsilon: 0.050000000164141335
episode: 22489	 average reward: -75.52101400933957	 avg steps: 150.9	 lr: 0.0005	 epsilon: 0.0500000001625081
episode: 22499	 average reward: -76.53509933774835	 avg steps: 152.0	 lr: 0.0005	 epsilon: 0.05000000016089111

episode: 23159	 average reward: -77.67756703727927	 avg steps: 153.9	 lr: 0.0005	 epsilon: 0.05000000008315679
episode: 23169	 average reward: -76.2271818787475	 avg steps: 151.1	 lr: 0.0005	 epsilon: 0.05000000008232937
episode: 23179	 average reward: -77.44094488188976	 avg steps: 153.4	 lr: 0.0005	 epsilon: 0.050000000081510176
episode: 23189	 average reward: -78.59108527131782	 avg steps: 155.8	 lr: 0.0005	 epsilon: 0.05000000008069914
episode: 23199	 average reward: -78.20544394037589	 avg steps: 155.3	 lr: 0.0005	 epsilon: 0.05000000007989616
episode: 23209	 average reward: -77.07293035479633	 avg steps: 153.2	 lr: 0.0005	 epsilon: 0.05000000007910119
episode: 23219	 average reward: -76.58090185676393	 avg steps: 151.8	 lr: 0.0005	 epsilon: 0.050000000078314115
episode: 23229	 average reward: -80.53324889170361	 avg steps: 158.9	 lr: 0.0005	 epsilon: 0.05000000007753488
episode: 23239	 average reward: -76.99208965062624	 avg steps: 152.7	 lr: 0.0005	 epsilon: 0.05000000007676339


episode: 23899	 average reward: -72.66876310272536	 avg steps: 144.1	 lr: 0.0005	 epsilon: 0.050000000039675266
episode: 23909	 average reward: -76.77938279711097	 avg steps: 153.3	 lr: 0.0005	 epsilon: 0.050000000039280484
episode: 23919	 average reward: -74.39659863945579	 avg steps: 148.0	 lr: 0.0005	 epsilon: 0.05000000003888964
episode: 23929	 average reward: -74.58136394328157	 avg steps: 149.1	 lr: 0.0005	 epsilon: 0.05000000003850268
episode: 23939	 average reward: -74.73584905660377	 avg steps: 149.4	 lr: 0.0005	 epsilon: 0.05000000003811957
episode: 23949	 average reward: -70.45039826212889	 avg steps: 139.1	 lr: 0.0005	 epsilon: 0.05000000003774028
episode: 23959	 average reward: -75.09919028340082	 avg steps: 149.2	 lr: 0.0005	 epsilon: 0.05000000003736475
episode: 23969	 average reward: -75.68662674650699	 avg steps: 151.3	 lr: 0.0005	 epsilon: 0.05000000003699297
episode: 23979	 average reward: -71.7412935323383	 avg steps: 141.7	 lr: 0.0005	 epsilon: 0.050000000036624886

episode: 24639	 average reward: -73.3487250172295	 avg steps: 146.1	 lr: 0.0005	 epsilon: 0.050000000018929625
episode: 24649	 average reward: -77.08240680183127	 avg steps: 153.9	 lr: 0.0005	 epsilon: 0.05000000001874127
episode: 24659	 average reward: -74.27297481279783	 avg steps: 147.9	 lr: 0.0005	 epsilon: 0.05000000001855479
episode: 24669	 average reward: -70.29884225759768	 avg steps: 139.2	 lr: 0.0005	 epsilon: 0.05000000001837017
episode: 24679	 average reward: -72.97007654836464	 avg steps: 144.7	 lr: 0.0005	 epsilon: 0.05000000001818738
episode: 24689	 average reward: -74.57567567567567	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05000000001800641
episode: 24699	 average reward: -70.3367198838897	 avg steps: 138.8	 lr: 0.0005	 epsilon: 0.05000000001782725
episode: 24709	 average reward: -73.15475364330327	 avg steps: 145.1	 lr: 0.0005	 epsilon: 0.050000000017649864
episode: 24719	 average reward: -75.6711051930759	 avg steps: 151.2	 lr: 0.0005	 epsilon: 0.05000000001747424
ep

episode: 25379	 average reward: -75.92572944297082	 avg steps: 151.8	 lr: 0.0005	 epsilon: 0.050000000009031584
episode: 25389	 average reward: -75.56495669553631	 avg steps: 151.1	 lr: 0.0005	 epsilon: 0.05000000000894172
episode: 25399	 average reward: -75.42256341789052	 avg steps: 150.8	 lr: 0.0005	 epsilon: 0.05000000000885275
episode: 25409	 average reward: -76.44993412384717	 avg steps: 152.8	 lr: 0.0005	 epsilon: 0.050000000008764665
episode: 25419	 average reward: -75.4188376753507	 avg steps: 150.7	 lr: 0.0005	 epsilon: 0.05000000000867745
episode: 25429	 average reward: -74.5195945945946	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05000000000859111
episode: 25439	 average reward: -74.96908602150538	 avg steps: 149.8	 lr: 0.0005	 epsilon: 0.05000000000850563
episode: 25449	 average reward: -74.81897711978466	 avg steps: 149.6	 lr: 0.0005	 epsilon: 0.050000000008420996
episode: 25459	 average reward: -76.40804218852999	 avg steps: 152.7	 lr: 0.0005	 epsilon: 0.05000000000833721


episode: 26119	 average reward: -77.16960208741031	 avg steps: 154.3	 lr: 0.0005	 epsilon: 0.0500000000043091
episode: 26129	 average reward: -76.44927536231884	 avg steps: 152.8	 lr: 0.0005	 epsilon: 0.05000000000426622
episode: 26139	 average reward: -77.15851272015655	 avg steps: 154.3	 lr: 0.0005	 epsilon: 0.05000000000422377
episode: 26149	 average reward: -77.63683527885863	 avg steps: 155.2	 lr: 0.0005	 epsilon: 0.05000000000418174
episode: 26159	 average reward: -76.92277486910994	 avg steps: 153.8	 lr: 0.0005	 epsilon: 0.050000000004140135
episode: 26169	 average reward: -76.7769028871391	 avg steps: 153.4	 lr: 0.0005	 epsilon: 0.05000000000409894
episode: 26179	 average reward: -76.6585686145765	 avg steps: 153.3	 lr: 0.0005	 epsilon: 0.05000000000405815
episode: 26189	 average reward: -77.17416829745596	 avg steps: 154.3	 lr: 0.0005	 epsilon: 0.050000000004017775
episode: 26199	 average reward: -77.72344559585493	 avg steps: 155.4	 lr: 0.0005	 epsilon: 0.0500000000039778
epi

episode: 26859	 average reward: -79.06815286624204	 avg steps: 158.0	 lr: 0.0005	 epsilon: 0.050000000002055935
episode: 26869	 average reward: -78.45926876202694	 avg steps: 156.9	 lr: 0.0005	 epsilon: 0.05000000000203547
episode: 26879	 average reward: -77.11618798955614	 avg steps: 154.2	 lr: 0.0005	 epsilon: 0.050000000002015224
episode: 26889	 average reward: -78.67946257197697	 avg steps: 157.3	 lr: 0.0005	 epsilon: 0.05000000000199517
episode: 26899	 average reward: -77.7279792746114	 avg steps: 155.4	 lr: 0.0005	 epsilon: 0.05000000000197532
episode: 26909	 average reward: -78.42618741976894	 avg steps: 156.8	 lr: 0.0005	 epsilon: 0.05000000000195566
episode: 26919	 average reward: -78.02646868947708	 avg steps: 155.9	 lr: 0.0005	 epsilon: 0.050000000001936204
episode: 26929	 average reward: -78.42554557124518	 avg steps: 156.8	 lr: 0.0005	 epsilon: 0.05000000000191694
episode: 26939	 average reward: -77.88041370394312	 avg steps: 155.7	 lr: 0.0005	 epsilon: 0.05000000000189787

episode: 27599	 average reward: -79.02421924792861	 avg steps: 157.9	 lr: 0.0005	 epsilon: 0.05000000000098091
episode: 27609	 average reward: -77.63748378728924	 avg steps: 155.2	 lr: 0.0005	 epsilon: 0.050000000000971156
episode: 27619	 average reward: -76.9109364767518	 avg steps: 153.7	 lr: 0.0005	 epsilon: 0.05000000000096149
episode: 27629	 average reward: -77.52207792207793	 avg steps: 155.0	 lr: 0.0005	 epsilon: 0.05000000000095192
episode: 27639	 average reward: -77.081045751634	 avg steps: 154.0	 lr: 0.0005	 epsilon: 0.05000000000094245
episode: 27649	 average reward: -78.24517374517374	 avg steps: 156.4	 lr: 0.0005	 epsilon: 0.050000000000933076
episode: 27659	 average reward: -77.63748378728924	 avg steps: 155.2	 lr: 0.0005	 epsilon: 0.05000000000092379
episode: 27669	 average reward: -78.03806451612903	 avg steps: 156.0	 lr: 0.0005	 epsilon: 0.0500000000009146
episode: 27679	 average reward: -77.45578673602081	 avg steps: 154.8	 lr: 0.0005	 epsilon: 0.0500000000009055
epis

episode: 28349	 average reward: -72.01328671328672	 avg steps: 144.0	 lr: 0.0005	 epsilon: 0.050000000000463354
episode: 28359	 average reward: -97.45427375971309	 avg steps: 168.3	 lr: 0.0005	 epsilon: 0.05000000000045874
episode: 28369	 average reward: -73.65708418891171	 avg steps: 147.1	 lr: 0.0005	 epsilon: 0.05000000000045418
episode: 28379	 average reward: -75.38101604278074	 avg steps: 150.6	 lr: 0.0005	 epsilon: 0.05000000000044966
episode: 28389	 average reward: -76.1093439363817	 avg steps: 151.9	 lr: 0.0005	 epsilon: 0.05000000000044518
episode: 28399	 average reward: -76.1929940515532	 avg steps: 152.3	 lr: 0.0005	 epsilon: 0.050000000000440754
episode: 28409	 average reward: -75.48331108144193	 avg steps: 150.8	 lr: 0.0005	 epsilon: 0.05000000000043637
episode: 28419	 average reward: -78.49870967741936	 avg steps: 156.0	 lr: 0.0005	 epsilon: 0.050000000000432025
episode: 28429	 average reward: -120.85535617183251	 avg steps: 184.9	 lr: 0.0005	 epsilon: 0.05000000000042773

episode: 29099	 average reward: -68.28379387602689	 avg steps: 134.9	 lr: 0.0005	 epsilon: 0.050000000000218876
episode: 29109	 average reward: -70.38294797687861	 avg steps: 139.4	 lr: 0.0005	 epsilon: 0.0500000000002167
episode: 29119	 average reward: -72.58951048951049	 avg steps: 144.0	 lr: 0.0005	 epsilon: 0.05000000000021454
episode: 29129	 average reward: -73.37077877325981	 avg steps: 146.1	 lr: 0.0005	 epsilon: 0.0500000000002124
episode: 29139	 average reward: -73.30171821305842	 avg steps: 146.5	 lr: 0.0005	 epsilon: 0.05000000000021029
episode: 29149	 average reward: -71.63514467184191	 avg steps: 142.7	 lr: 0.0005	 epsilon: 0.0500000000002082
episode: 29159	 average reward: -71.4559659090909	 avg steps: 141.8	 lr: 0.0005	 epsilon: 0.05000000000020613
episode: 29169	 average reward: -73.3328738800827	 avg steps: 146.1	 lr: 0.0005	 epsilon: 0.050000000000204076
episode: 29179	 average reward: -73.91603578802477	 avg steps: 146.3	 lr: 0.0005	 epsilon: 0.05000000000020205
epis

episode: 29849	 average reward: -73.60096818810511	 avg steps: 145.6	 lr: 0.0005	 epsilon: 0.05000000000010339
episode: 29859	 average reward: -73.77244688142564	 avg steps: 146.9	 lr: 0.0005	 epsilon: 0.05000000000010236
episode: 29869	 average reward: -74.39659863945579	 avg steps: 148.0	 lr: 0.0005	 epsilon: 0.050000000000101345
episode: 29879	 average reward: -73.2864044168392	 avg steps: 145.9	 lr: 0.0005	 epsilon: 0.05000000000010033
episode: 29889	 average reward: -73.33287101248267	 avg steps: 145.2	 lr: 0.0005	 epsilon: 0.05000000000009933
episode: 29899	 average reward: -74.64439946018894	 avg steps: 149.2	 lr: 0.0005	 epsilon: 0.05000000000009835
episode: 29909	 average reward: -71.09162491052255	 avg steps: 140.7	 lr: 0.0005	 epsilon: 0.05000000000009737
episode: 29919	 average reward: -73.26334026334027	 avg steps: 145.3	 lr: 0.0005	 epsilon: 0.0500000000000964
episode: 29929	 average reward: -73.7517146776406	 avg steps: 146.8	 lr: 0.0005	 epsilon: 0.05000000000009544
epi

episode: 30589	 average reward: -78.89003215434083	 avg steps: 156.5	 lr: 0.0005	 epsilon: 0.05000000000004933
episode: 30599	 average reward: -70.26923076923077	 avg steps: 138.8	 lr: 0.0005	 epsilon: 0.05000000000004884
episode: 30609	 average reward: -79.15877080665813	 avg steps: 157.2	 lr: 0.0005	 epsilon: 0.05000000000004835
episode: 30619	 average reward: -74.81725543478261	 avg steps: 148.2	 lr: 0.0005	 epsilon: 0.050000000000047874
episode: 30629	 average reward: -77.2107682206172	 avg steps: 153.3	 lr: 0.0005	 epsilon: 0.050000000000047395
episode: 30639	 average reward: -78.10902011680727	 avg steps: 155.1	 lr: 0.0005	 epsilon: 0.050000000000046924
episode: 30649	 average reward: -75.30384875084403	 avg steps: 149.1	 lr: 0.0005	 epsilon: 0.05000000000004646
episode: 30659	 average reward: -72.40042075736325	 avg steps: 143.6	 lr: 0.0005	 epsilon: 0.050000000000045994
episode: 30669	 average reward: -73.03894297635605	 avg steps: 144.8	 lr: 0.0005	 epsilon: 0.0500000000000455

episode: 31329	 average reward: -78.51189710610933	 avg steps: 156.5	 lr: 0.0005	 epsilon: 0.05000000000002354
episode: 31339	 average reward: -74.0710868079289	 avg steps: 147.3	 lr: 0.0005	 epsilon: 0.050000000000023304
episode: 31349	 average reward: -76.2753036437247	 avg steps: 149.2	 lr: 0.0005	 epsilon: 0.050000000000023075
episode: 31359	 average reward: -76.39104278074866	 avg steps: 150.6	 lr: 0.0005	 epsilon: 0.05000000000002284
episode: 31369	 average reward: -73.85123966942149	 avg steps: 146.2	 lr: 0.0005	 epsilon: 0.05000000000002262
episode: 31379	 average reward: -74.43643779741673	 avg steps: 148.1	 lr: 0.0005	 epsilon: 0.05000000000002239
episode: 31389	 average reward: -75.0281879194631	 avg steps: 150.0	 lr: 0.0005	 epsilon: 0.050000000000022166
episode: 31399	 average reward: -74.22305593451568	 avg steps: 147.6	 lr: 0.0005	 epsilon: 0.050000000000021944
episode: 31409	 average reward: -71.05483405483406	 avg steps: 139.6	 lr: 0.0005	 epsilon: 0.05000000000002173


episode: 32079	 average reward: -79.18228829993537	 avg steps: 155.7	 lr: 0.0005	 epsilon: 0.05000000000001112
episode: 32089	 average reward: -74.88046448087432	 avg steps: 147.4	 lr: 0.0005	 epsilon: 0.05000000000001101
episode: 32099	 average reward: -74.85332419465388	 avg steps: 146.9	 lr: 0.0005	 epsilon: 0.0500000000000109
episode: 32109	 average reward: -74.323991797676	 avg steps: 147.3	 lr: 0.0005	 epsilon: 0.05000000000001079
episode: 32119	 average reward: -74.80462899931926	 avg steps: 147.9	 lr: 0.0005	 epsilon: 0.05000000000001068
episode: 32129	 average reward: -73.88211103495544	 avg steps: 146.9	 lr: 0.0005	 epsilon: 0.05000000000001058
episode: 32139	 average reward: -70.06734250543084	 avg steps: 139.1	 lr: 0.0005	 epsilon: 0.050000000000010474
episode: 32149	 average reward: -71.89373680506685	 avg steps: 143.1	 lr: 0.0005	 epsilon: 0.05000000000001037
episode: 32159	 average reward: -70.79597701149426	 avg steps: 140.2	 lr: 0.0005	 epsilon: 0.050000000000010265
ep

episode: 32819	 average reward: -73.23414634146341	 avg steps: 144.5	 lr: 0.0005	 epsilon: 0.050000000000005304
episode: 32829	 average reward: -72.8728403593642	 avg steps: 145.7	 lr: 0.0005	 epsilon: 0.050000000000005256
episode: 32839	 average reward: -75.63291139240506	 avg steps: 151.1	 lr: 0.0005	 epsilon: 0.0500000000000052
episode: 32849	 average reward: -70.74622573687994	 avg steps: 140.1	 lr: 0.0005	 epsilon: 0.05000000000000515
episode: 32859	 average reward: -73.67770876466528	 avg steps: 145.9	 lr: 0.0005	 epsilon: 0.050000000000005096
episode: 32869	 average reward: -73.37508602890571	 avg steps: 146.3	 lr: 0.0005	 epsilon: 0.05000000000000505
episode: 32879	 average reward: -76.676	 avg steps: 151.0	 lr: 0.0005	 epsilon: 0.050000000000005
episode: 32889	 average reward: -71.22675250357653	 avg steps: 140.8	 lr: 0.0005	 epsilon: 0.05000000000000495
episode: 32899	 average reward: -73.59751037344398	 avg steps: 145.6	 lr: 0.0005	 epsilon: 0.0500000000000049
episode: 32909

episode: 33559	 average reward: -78.08311688311689	 avg steps: 155.0	 lr: 0.0005	 epsilon: 0.050000000000002535
episode: 33569	 average reward: -76.1998658618377	 avg steps: 150.1	 lr: 0.0005	 epsilon: 0.05000000000000251
episode: 33579	 average reward: -76.32042723631508	 avg steps: 150.8	 lr: 0.0005	 epsilon: 0.05000000000000248
episode: 33589	 average reward: -77.51513157894736	 avg steps: 153.0	 lr: 0.0005	 epsilon: 0.05000000000000246
episode: 33599	 average reward: -81.10575719649562	 avg steps: 160.8	 lr: 0.0005	 epsilon: 0.05000000000000243
episode: 33609	 average reward: -82.30560690080098	 avg steps: 163.3	 lr: 0.0005	 epsilon: 0.05000000000000241
episode: 33619	 average reward: -76.06486486486486	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05000000000000238
episode: 33629	 average reward: -77.6984126984127	 avg steps: 152.2	 lr: 0.0005	 epsilon: 0.05000000000000236
episode: 33639	 average reward: -76.0468854655057	 avg steps: 150.3	 lr: 0.0005	 epsilon: 0.05000000000000234
epi

episode: 34299	 average reward: -69.64014869888476	 avg steps: 135.5	 lr: 0.0005	 epsilon: 0.05000000000000121
episode: 34309	 average reward: -74.91836734693878	 avg steps: 148.0	 lr: 0.0005	 epsilon: 0.050000000000001196
episode: 34319	 average reward: -73.37204450625869	 avg steps: 144.8	 lr: 0.0005	 epsilon: 0.05000000000000119
episode: 34329	 average reward: -76.29940515532056	 avg steps: 152.3	 lr: 0.0005	 epsilon: 0.050000000000001175
episode: 34339	 average reward: -74.32807137954701	 avg steps: 146.7	 lr: 0.0005	 epsilon: 0.05000000000000116
episode: 34349	 average reward: -72.64170748775368	 avg steps: 143.9	 lr: 0.0005	 epsilon: 0.050000000000001155
episode: 34359	 average reward: -79.22037155669443	 avg steps: 157.1	 lr: 0.0005	 epsilon: 0.05000000000000114
episode: 34369	 average reward: -79.85604606525912	 avg steps: 157.3	 lr: 0.0005	 epsilon: 0.05000000000000113
episode: 34379	 average reward: -83.95705128205128	 avg steps: 157.0	 lr: 0.0005	 epsilon: 0.0500000000000011

episode: 35039	 average reward: -73.36907216494845	 avg steps: 146.5	 lr: 0.0005	 epsilon: 0.05000000000000058
episode: 35049	 average reward: -70.32129963898917	 avg steps: 139.5	 lr: 0.0005	 epsilon: 0.05000000000000057
episode: 35059	 average reward: -71.38284066330209	 avg steps: 139.7	 lr: 0.0005	 epsilon: 0.050000000000000565
episode: 35069	 average reward: -72.73625608907446	 avg steps: 144.7	 lr: 0.0005	 epsilon: 0.050000000000000565
episode: 35079	 average reward: -75.92585170340682	 avg steps: 150.7	 lr: 0.0005	 epsilon: 0.05000000000000056
episode: 35089	 average reward: -76.68729208250166	 avg steps: 151.3	 lr: 0.0005	 epsilon: 0.05000000000000055
episode: 35099	 average reward: -75.44083840432725	 avg steps: 148.9	 lr: 0.0005	 epsilon: 0.050000000000000544
episode: 35109	 average reward: -82.19330855018588	 avg steps: 162.4	 lr: 0.0005	 epsilon: 0.05000000000000054
episode: 35119	 average reward: -67.63966691900076	 avg steps: 133.1	 lr: 0.0005	 epsilon: 0.0500000000000005

episode: 35779	 average reward: -79.36872218690401	 avg steps: 158.3	 lr: 0.0005	 epsilon: 0.05000000000000028
episode: 35789	 average reward: -78.85047923322684	 avg steps: 157.5	 lr: 0.0005	 epsilon: 0.05000000000000027
episode: 35799	 average reward: -77.79209844559585	 avg steps: 155.4	 lr: 0.0005	 epsilon: 0.05000000000000027
episode: 35809	 average reward: -76.34704711347047	 avg steps: 151.7	 lr: 0.0005	 epsilon: 0.050000000000000266
episode: 35819	 average reward: -76.5592885375494	 avg steps: 152.8	 lr: 0.0005	 epsilon: 0.050000000000000266
episode: 35829	 average reward: -74.72677966101695	 avg steps: 148.5	 lr: 0.0005	 epsilon: 0.050000000000000266
episode: 35839	 average reward: -76.48611111111111	 avg steps: 152.2	 lr: 0.0005	 epsilon: 0.05000000000000026
episode: 35849	 average reward: -75.42886041807148	 avg steps: 149.3	 lr: 0.0005	 epsilon: 0.05000000000000026
episode: 35859	 average reward: -78.42149292149293	 avg steps: 156.4	 lr: 0.0005	 epsilon: 0.05000000000000026

episode: 36519	 average reward: -76.95313531353135	 avg steps: 152.5	 lr: 0.0005	 epsilon: 0.050000000000000135
episode: 36529	 average reward: -76.52272727272727	 avg steps: 150.6	 lr: 0.0005	 epsilon: 0.050000000000000135
episode: 36539	 average reward: -69.7727603787327	 avg steps: 138.3	 lr: 0.0005	 epsilon: 0.050000000000000135
episode: 36549	 average reward: -75.80106453759149	 avg steps: 151.3	 lr: 0.0005	 epsilon: 0.05000000000000013
episode: 36559	 average reward: -74.36136205698402	 avg steps: 144.9	 lr: 0.0005	 epsilon: 0.05000000000000013
episode: 36569	 average reward: -74.89348710990502	 avg steps: 148.4	 lr: 0.0005	 epsilon: 0.05000000000000013
episode: 36579	 average reward: -68.01495886312641	 avg steps: 134.7	 lr: 0.0005	 epsilon: 0.05000000000000013
episode: 36589	 average reward: -81.96112852664577	 avg steps: 160.5	 lr: 0.0005	 epsilon: 0.05000000000000013
episode: 36599	 average reward: -83.55144291091594	 avg steps: 160.4	 lr: 0.0005	 epsilon: 0.05000000000000012

episode: 37269	 average reward: -111.29518664047151	 avg steps: 204.6	 lr: 0.0005	 epsilon: 0.050000000000000065
episode: 37279	 average reward: -87.24736842105263	 avg steps: 172.0	 lr: 0.0005	 epsilon: 0.050000000000000065
episode: 37289	 average reward: -79.20788302606485	 avg steps: 158.3	 lr: 0.0005	 epsilon: 0.050000000000000065
episode: 37299	 average reward: -78.81765834932821	 avg steps: 157.3	 lr: 0.0005	 epsilon: 0.050000000000000065
episode: 37309	 average reward: -79.2531806615776	 avg steps: 158.2	 lr: 0.0005	 epsilon: 0.050000000000000065
episode: 37319	 average reward: -77.04930966469428	 avg steps: 153.1	 lr: 0.0005	 epsilon: 0.05000000000000006
episode: 37329	 average reward: -74.72150170648464	 avg steps: 147.5	 lr: 0.0005	 epsilon: 0.05000000000000006
episode: 37339	 average reward: -75.79797297297297	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05000000000000006
episode: 37349	 average reward: -79.25063613231552	 avg steps: 158.2	 lr: 0.0005	 epsilon: 0.05000000000000

episode: 38009	 average reward: -73.33680555555556	 avg steps: 145.0	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38019	 average reward: -74.93711967545639	 avg steps: 148.9	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38029	 average reward: -73.00698324022346	 avg steps: 144.2	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38039	 average reward: -78.28635778635778	 avg steps: 156.4	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38049	 average reward: -68.5829596412556	 avg steps: 134.8	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38059	 average reward: -76.69047619047619	 avg steps: 152.2	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38069	 average reward: -79.50765306122449	 avg steps: 157.8	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38079	 average reward: -79.0019267822736	 avg steps: 156.7	 lr: 0.0005	 epsilon: 0.05000000000000003
episode: 38089	 average reward: -78.99166132135984	 avg steps: 156.9	 lr: 0.0005	 epsilon: 0.05000000000000003
epi

episode: 38749	 average reward: -72.60676532769556	 avg steps: 142.9	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38759	 average reward: -75.80681818181819	 avg steps: 150.6	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38769	 average reward: -76.46798679867987	 avg steps: 152.5	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38779	 average reward: -77.86854153041203	 avg steps: 153.9	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38789	 average reward: -74.75967413441956	 avg steps: 148.3	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38799	 average reward: -73.64099037138926	 avg steps: 146.4	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38809	 average reward: -85.32349323493234	 avg steps: 163.6	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38819	 average reward: -100.3420745920746	 avg steps: 172.6	 lr: 0.0005	 epsilon: 0.05000000000000002
episode: 38829	 average reward: -78.84947643979058	 avg steps: 153.8	 lr: 0.0005	 epsilon: 0.05000000000000002
e

episode: 39499	 average reward: -74.61709520500348	 avg steps: 144.9	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39509	 average reward: -84.8360450563204	 avg steps: 160.8	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39519	 average reward: -71.31724627395316	 avg steps: 141.9	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39529	 average reward: -74.12032630863358	 avg steps: 148.1	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39539	 average reward: -75.651677852349	 avg steps: 150.0	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39549	 average reward: -73.1459649122807	 avg steps: 143.5	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39559	 average reward: -75.59324324324324	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39569	 average reward: -78.36522301228183	 avg steps: 155.7	 lr: 0.0005	 epsilon: 0.05000000000000001
episode: 39579	 average reward: -80.87695190505934	 avg steps: 161.1	 lr: 0.0005	 epsilon: 0.05000000000000001
episo

episode: 40259	 average reward: -81.94164619164619	 avg steps: 163.8	 lr: 0.0005	 epsilon: 0.05
episode: 40269	 average reward: -80.08825396825397	 avg steps: 158.5	 lr: 0.0005	 epsilon: 0.05
episode: 40279	 average reward: -71.52567760342367	 avg steps: 141.2	 lr: 0.0005	 epsilon: 0.05
episode: 40289	 average reward: -67.05837755875663	 avg steps: 132.9	 lr: 0.0005	 epsilon: 0.05
episode: 40299	 average reward: -69.78209831254586	 avg steps: 137.3	 lr: 0.0005	 epsilon: 0.05
episode: 40309	 average reward: -74.19417475728156	 avg steps: 145.2	 lr: 0.0005	 epsilon: 0.05
episode: 40319	 average reward: -72.5025053686471	 avg steps: 140.7	 lr: 0.0005	 epsilon: 0.05
episode: 40329	 average reward: -71.92095977417078	 avg steps: 142.7	 lr: 0.0005	 epsilon: 0.05
episode: 40339	 average reward: -76.95767195767196	 avg steps: 152.2	 lr: 0.0005	 epsilon: 0.05
episode: 40349	 average reward: -75.62892223738062	 avg steps: 147.6	 lr: 0.0005	 epsilon: 0.05
episode: 40359	 average reward: -76.67526

episode: 41119	 average reward: -68.79422930903569	 avg steps: 132.7	 lr: 0.0005	 epsilon: 0.05
episode: 41129	 average reward: -66.57098283931357	 avg steps: 129.2	 lr: 0.0005	 epsilon: 0.05
episode: 41139	 average reward: -77.91716368455931	 avg steps: 151.9	 lr: 0.0005	 epsilon: 0.05
episode: 41149	 average reward: -77.00816882232812	 avg steps: 147.9	 lr: 0.0005	 epsilon: 0.05
episode: 41159	 average reward: -68.78201219512195	 avg steps: 132.2	 lr: 0.0005	 epsilon: 0.05
episode: 41169	 average reward: -74.54690757470466	 avg steps: 144.9	 lr: 0.0005	 epsilon: 0.05
episode: 41179	 average reward: -60.140797285835454	 avg steps: 118.9	 lr: 0.0005	 epsilon: 0.05
episode: 41189	 average reward: -73.32408759124088	 avg steps: 138.0	 lr: 0.0005	 epsilon: 0.05
episode: 41199	 average reward: -72.95667870036101	 avg steps: 139.5	 lr: 0.0005	 epsilon: 0.05
episode: 41209	 average reward: -69.93721633888049	 avg steps: 133.2	 lr: 0.0005	 epsilon: 0.05
episode: 41219	 average reward: -70.307

episode: 41979	 average reward: -69.4810408921933	 avg steps: 135.5	 lr: 0.0005	 epsilon: 0.05
episode: 41989	 average reward: -68.65399239543726	 avg steps: 132.5	 lr: 0.0005	 epsilon: 0.05
episode: 41999	 average reward: -75.07268877911079	 avg steps: 142.7	 lr: 0.0005	 epsilon: 0.05
episode: 42009	 average reward: -69.69821162444113	 avg steps: 135.2	 lr: 0.0005	 epsilon: 0.05
episode: 42019	 average reward: -75.65578231292517	 avg steps: 148.0	 lr: 0.0005	 epsilon: 0.05
episode: 42029	 average reward: -66.7919254658385	 avg steps: 129.8	 lr: 0.0005	 epsilon: 0.05
episode: 42039	 average reward: -72.72838616714698	 avg steps: 139.8	 lr: 0.0005	 epsilon: 0.05
episode: 42049	 average reward: -68.20789074355083	 avg steps: 132.8	 lr: 0.0005	 epsilon: 0.05
episode: 42059	 average reward: -69.65952732644018	 avg steps: 136.4	 lr: 0.0005	 epsilon: 0.05
episode: 42069	 average reward: -67.66946564885497	 avg steps: 132.0	 lr: 0.0005	 epsilon: 0.05
episode: 42079	 average reward: -68.013709

episode: 42839	 average reward: -82.7230376515635	 avg steps: 157.7	 lr: 0.0005	 epsilon: 0.05
episode: 42849	 average reward: -85.78373493975904	 avg steps: 167.0	 lr: 0.0005	 epsilon: 0.05
episode: 42859	 average reward: -75.72746628814762	 avg steps: 141.9	 lr: 0.0005	 epsilon: 0.05
episode: 42869	 average reward: -63.046744574290486	 avg steps: 120.8	 lr: 0.0005	 epsilon: 0.05
episode: 42879	 average reward: -79.76205997392438	 avg steps: 154.4	 lr: 0.0005	 epsilon: 0.05
episode: 42889	 average reward: -83.07770700636942	 avg steps: 158.0	 lr: 0.0005	 epsilon: 0.05
episode: 42899	 average reward: -79.9953642384106	 avg steps: 152.0	 lr: 0.0005	 epsilon: 0.05
episode: 42909	 average reward: -76.70604781997187	 avg steps: 143.2	 lr: 0.0005	 epsilon: 0.05
episode: 42919	 average reward: -69.55623565416985	 avg steps: 131.7	 lr: 0.0005	 epsilon: 0.05
episode: 42929	 average reward: -79.57415254237289	 avg steps: 142.6	 lr: 0.0005	 epsilon: 0.05
episode: 42939	 average reward: -74.56832

episode: 43709	 average reward: -69.05167173252279	 avg steps: 132.6	 lr: 0.0005	 epsilon: 0.05
episode: 43719	 average reward: -73.13163636363636	 avg steps: 138.5	 lr: 0.0005	 epsilon: 0.05
episode: 43729	 average reward: -61.9635761589404	 avg steps: 121.8	 lr: 0.0005	 epsilon: 0.05
episode: 43739	 average reward: -75.57102473498233	 avg steps: 142.5	 lr: 0.0005	 epsilon: 0.05
episode: 43749	 average reward: -68.82194934765924	 avg steps: 131.3	 lr: 0.0005	 epsilon: 0.05
episode: 43759	 average reward: -75.05911330049261	 avg steps: 143.1	 lr: 0.0005	 epsilon: 0.05
episode: 43769	 average reward: -68.10355486862441	 avg steps: 130.4	 lr: 0.0005	 epsilon: 0.05
episode: 43779	 average reward: -81.86722797927462	 avg steps: 155.4	 lr: 0.0005	 epsilon: 0.05
episode: 43789	 average reward: -68.78056188306758	 avg steps: 132.7	 lr: 0.0005	 epsilon: 0.05
episode: 43799	 average reward: -68.24750957854405	 avg steps: 131.5	 lr: 0.0005	 epsilon: 0.05
episode: 43809	 average reward: -75.20446

episode: 44569	 average reward: -71.01490312965723	 avg steps: 135.2	 lr: 0.0005	 epsilon: 0.05
episode: 44579	 average reward: -70.47067557535263	 avg steps: 135.7	 lr: 0.0005	 epsilon: 0.05
episode: 44589	 average reward: -68.3216995447648	 avg steps: 132.8	 lr: 0.0005	 epsilon: 0.05
episode: 44599	 average reward: -80.50839793281654	 avg steps: 155.8	 lr: 0.0005	 epsilon: 0.05
episode: 44609	 average reward: -77.34794520547945	 avg steps: 147.0	 lr: 0.0005	 epsilon: 0.05
episode: 44619	 average reward: -77.13445945945946	 avg steps: 149.0	 lr: 0.0005	 epsilon: 0.05
episode: 44629	 average reward: -65.31633466135459	 avg steps: 126.5	 lr: 0.0005	 epsilon: 0.05
episode: 44639	 average reward: -76.48168624740843	 avg steps: 145.7	 lr: 0.0005	 epsilon: 0.05
episode: 44649	 average reward: -78.10187667560322	 avg steps: 150.2	 lr: 0.0005	 epsilon: 0.05
episode: 44659	 average reward: -61.11392405063291	 avg steps: 119.5	 lr: 0.0005	 epsilon: 0.05
episode: 44669	 average reward: -68.71230

episode: 45429	 average reward: -74.0321004884857	 avg steps: 144.3	 lr: 0.0005	 epsilon: 0.05
episode: 45439	 average reward: -68.70428893905192	 avg steps: 133.9	 lr: 0.0005	 epsilon: 0.05
episode: 45449	 average reward: -78.90543130990416	 avg steps: 157.5	 lr: 0.0005	 epsilon: 0.05
episode: 45459	 average reward: -71.42908309455588	 avg steps: 140.6	 lr: 0.0005	 epsilon: 0.05
episode: 45469	 average reward: -71.7971119133574	 avg steps: 139.5	 lr: 0.0005	 epsilon: 0.05
episode: 45479	 average reward: -75.26004084411164	 avg steps: 147.9	 lr: 0.0005	 epsilon: 0.05
episode: 45489	 average reward: -75.74982911825018	 avg steps: 147.3	 lr: 0.0005	 epsilon: 0.05
episode: 45499	 average reward: -78.7797619047619	 avg steps: 152.2	 lr: 0.0005	 epsilon: 0.05
episode: 45509	 average reward: -78.08144796380091	 avg steps: 155.7	 lr: 0.0005	 epsilon: 0.05
episode: 45519	 average reward: -80.2784090909091	 avg steps: 159.4	 lr: 0.0005	 epsilon: 0.05
episode: 45529	 average reward: -79.56822549

episode: 46289	 average reward: -76.96893588896232	 avg steps: 152.3	 lr: 0.0005	 epsilon: 0.05
episode: 46299	 average reward: -74.77754820936639	 avg steps: 146.2	 lr: 0.0005	 epsilon: 0.05
episode: 46309	 average reward: -79.88164556962025	 avg steps: 159.0	 lr: 0.0005	 epsilon: 0.05
episode: 46319	 average reward: -81.52230483271376	 avg steps: 162.4	 lr: 0.0005	 epsilon: 0.05
episode: 46329	 average reward: -77.99288486416559	 avg steps: 155.6	 lr: 0.0005	 epsilon: 0.05
episode: 46339	 average reward: -76.82939632545931	 avg steps: 153.4	 lr: 0.0005	 epsilon: 0.05
episode: 46349	 average reward: -79.90192307692308	 avg steps: 157.0	 lr: 0.0005	 epsilon: 0.05
episode: 46359	 average reward: -73.9126213592233	 avg steps: 145.2	 lr: 0.0005	 epsilon: 0.05
episode: 46369	 average reward: -77.54586857514639	 avg steps: 154.7	 lr: 0.0005	 epsilon: 0.05
episode: 46379	 average reward: -77.92043984476068	 avg steps: 155.6	 lr: 0.0005	 epsilon: 0.05
episode: 46389	 average reward: -81.90998

episode: 47149	 average reward: -78.94042280589366	 avg steps: 157.1	 lr: 0.0005	 epsilon: 0.05
episode: 47159	 average reward: -79.3879365079365	 avg steps: 158.5	 lr: 0.0005	 epsilon: 0.05
episode: 47169	 average reward: -78.03102779573368	 avg steps: 155.7	 lr: 0.0005	 epsilon: 0.05
episode: 47179	 average reward: -75.71124417831005	 avg steps: 151.3	 lr: 0.0005	 epsilon: 0.05
episode: 47189	 average reward: -75.80735785953178	 avg steps: 150.5	 lr: 0.0005	 epsilon: 0.05
episode: 47199	 average reward: -72.85594405594405	 avg steps: 144.0	 lr: 0.0005	 epsilon: 0.05
episode: 47209	 average reward: -73.39623955431755	 avg steps: 144.6	 lr: 0.0005	 epsilon: 0.05
episode: 47219	 average reward: -73.29647546648238	 avg steps: 145.7	 lr: 0.0005	 epsilon: 0.05
episode: 47229	 average reward: -72.88440111420613	 avg steps: 144.6	 lr: 0.0005	 epsilon: 0.05
episode: 47239	 average reward: -70.27545787545787	 avg steps: 137.5	 lr: 0.0005	 epsilon: 0.05
episode: 47249	 average reward: -75.63848

episode: 48009	 average reward: -77.15862524785194	 avg steps: 152.3	 lr: 0.0005	 epsilon: 0.05
episode: 48019	 average reward: -78.41365979381443	 avg steps: 156.2	 lr: 0.0005	 epsilon: 0.05
episode: 48029	 average reward: -72.62991573033707	 avg steps: 143.4	 lr: 0.0005	 epsilon: 0.05
episode: 48039	 average reward: -74.87759336099585	 avg steps: 145.6	 lr: 0.0005	 epsilon: 0.05
episode: 48049	 average reward: -75.14844804318489	 avg steps: 149.2	 lr: 0.0005	 epsilon: 0.05
episode: 48059	 average reward: -75.87959866220736	 avg steps: 150.5	 lr: 0.0005	 epsilon: 0.05
episode: 48069	 average reward: -71.65103056147832	 avg steps: 141.7	 lr: 0.0005	 epsilon: 0.05
episode: 48079	 average reward: -75.28881355932204	 avg steps: 148.5	 lr: 0.0005	 epsilon: 0.05
episode: 48089	 average reward: -75.40080160320642	 avg steps: 150.7	 lr: 0.0005	 epsilon: 0.05
episode: 48099	 average reward: -75.00402955003358	 avg steps: 149.9	 lr: 0.0005	 epsilon: 0.05
episode: 48109	 average reward: -76.7872

episode: 48879	 average reward: -74.34010840108401	 avg steps: 148.6	 lr: 0.0005	 epsilon: 0.05
episode: 48889	 average reward: -73.72131147540983	 avg steps: 147.4	 lr: 0.0005	 epsilon: 0.05
episode: 48899	 average reward: -74.18138586956522	 avg steps: 148.2	 lr: 0.0005	 epsilon: 0.05
episode: 48909	 average reward: -73.9550408719346	 avg steps: 147.8	 lr: 0.0005	 epsilon: 0.05
episode: 48919	 average reward: -74.16700610997964	 avg steps: 148.3	 lr: 0.0005	 epsilon: 0.05
episode: 48929	 average reward: -73.87593728698023	 avg steps: 147.7	 lr: 0.0005	 epsilon: 0.05
episode: 48939	 average reward: -74.27137042062415	 avg steps: 148.4	 lr: 0.0005	 epsilon: 0.05
episode: 48949	 average reward: -72.4394993045897	 avg steps: 144.8	 lr: 0.0005	 epsilon: 0.05
episode: 48959	 average reward: -70.71632216678546	 avg steps: 141.3	 lr: 0.0005	 epsilon: 0.05
episode: 48969	 average reward: -73.20853406744666	 avg steps: 146.3	 lr: 0.0005	 epsilon: 0.05
episode: 48979	 average reward: -87.084649

episode: 49739	 average reward: -75.399064171123	 avg steps: 150.6	 lr: 0.0005	 epsilon: 0.05
episode: 49749	 average reward: -72.06991525423729	 avg steps: 142.6	 lr: 0.0005	 epsilon: 0.05
episode: 49759	 average reward: -74.15953835709436	 avg steps: 148.3	 lr: 0.0005	 epsilon: 0.05
episode: 49769	 average reward: -70.45896877269426	 avg steps: 138.7	 lr: 0.0005	 epsilon: 0.05
episode: 49779	 average reward: -74.76429567642957	 avg steps: 144.4	 lr: 0.0005	 epsilon: 0.05
episode: 49789	 average reward: -69.97520058351569	 avg steps: 138.1	 lr: 0.0005	 epsilon: 0.05
episode: 49799	 average reward: -75.50803212851406	 avg steps: 150.4	 lr: 0.0005	 epsilon: 0.05
episode: 49809	 average reward: -73.49072164948454	 avg steps: 146.5	 lr: 0.0005	 epsilon: 0.05
episode: 49819	 average reward: -70.60057887120115	 avg steps: 139.2	 lr: 0.0005	 epsilon: 0.05
episode: 49829	 average reward: -72.4179523141655	 avg steps: 143.6	 lr: 0.0005	 epsilon: 0.05
episode: 49839	 average reward: -71.6281050

ImportError: DLL load failed while importing _backend_agg: The specified module could not be found.

In [7]:
from helper import greedy_policy, initialize_grids, initialize_state_dict, get_closest_in_grid
import matplotlib.image as img


# Evaluation cell: roll out the greedy policy of a saved Q-table on
# MountainCar-v0 and write every visited (position, velocity, action) triple
# to 'trace/trace_eval.txt', one comma-separated line per step.

# Evaluation Parameters
seed = []  # optional per-episode seeds; an empty list means unseeded resets
n_eval_episodes = 1
max_steps = 10000

# load q table
q = np.loadtxt('data/q_rand_decay.txt')

# initialize discretization
grid_x, grid_v = initialize_grids()
state_to_qtable = initialize_state_dict()

env = gym.make('MountainCar-v0')

try:
    # The context manager guarantees the trace is flushed and closed before
    # any later cell reads 'trace/trace_eval.txt'.
    with open('trace/trace_eval.txt', 'w') as file:
        for episode in range(n_eval_episodes):
            # gymnasium's reset() returns (observation, info) in both the
            # seeded and the unseeded case; only the observation is the state.
            if seed:
                state, _ = env.reset(seed=seed[episode])
            else:
                state, _ = env.reset()
            terminated = False
            total_rewards_ep = 0

            for step in range(max_steps):
                action = greedy_policy(q, state, grid_x, grid_v, state_to_qtable)

                # write current state and action taken to trace
                file.write(f'{state[0]},{state[1]},{action}\n')

                new_state, reward, terminated, truncated, info = env.step(action)
                total_rewards_ep += reward

                # Truncation is deliberately ignored: the rollout only stops
                # on termination or after max_steps.
                if terminated:  # or truncated:
                    break
                state = new_state
finally:
    # Release the environment even if the rollout raises.
    env.close()


In [10]:
# Trace plotting cell: read the evaluation trace, map every visited state to
# its grid-cell centre, and draw the path (plus start/finish markers) on top
# of the pickled Q-value axes, saving the result to 'trace/trace.png'.

trace = []
actions = []

# load trace and actions into list
with open('trace/trace_eval.txt') as f:
    for line in f:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        x, v, a = line.split(',')
        trace.append((float(x), float(v)))
        actions.append(int(a))

if not trace:
    raise ValueError("trace/trace_eval.txt contains no steps; run the evaluation cell first")

# discretize trace
grid_x, grid_v = initialize_grids()
# kept so the name stays defined for other cells; not used by the plotting below
state_to_q = initialize_state_dict()
trace_discrete = []
for point in trace:
    cell_x, cell_v = get_closest_in_grid(np.asarray(point), grid_x, grid_v)
    # +0.5 shifts from the grid index to the centre of the cell
    trace_discrete.append([grid_x.index(cell_x) + 0.5, grid_v.index(cell_v) + 0.5])

# load the previously saved Q-value axes
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project, never files from an untrusted source.
with open('plots/q_values.pkl', 'rb') as fid:
    ax = pickle.load(fid)
fig = ax.figure

# plot trace onto plot
ax.plot(*zip(*trace_discrete))

start_x, start_y = trace_discrete[0]
end_x, end_y = trace_discrete[-1]

# plot starting point (drawn explicitly on the loaded axes, not on whatever
# figure pyplot currently considers active)
sns.lineplot(x=[start_x], y=[start_y], marker='o', markersize=25,
             markeredgecolor='green', color='green', markeredgewidth=2, ax=ax)
# plot finish point
# NOTE(review): the +1 offset on x is kept from the original; presumably it
# accounts for the final transition not being recorded in the trace - confirm.
sns.lineplot(x=[end_x + 1], y=[end_y], marker='o', markersize=25,
             markeredgecolor='red', color='red', markeredgewidth=2, ax=ax)

ax.set_xticklabels(grid_x)
ax.set_yticklabels(grid_v)

# Save and close the figure that owns the loaded axes explicitly, rather than
# relying on pyplot's notion of the current figure.
fig.savefig('trace/trace.png')
plt.close(fig)


ImportError: DLL load failed while importing _backend_agg: The specified module could not be found.