# DQN vs. DSQN for the CartPole Environment

In [1]:
import os
import gym
import torch
import random

import numpy as np
import seaborn as sns
import torch.optim as optim
import matplotlib.pyplot as plt

from datetime import date
from model import QNetwork, DSNN
from agent import Agent, ReplayBuffer
from matplotlib.gridspec import GridSpec

%matplotlib inline

In [2]:
# Experiment setup: the Gym task and the size of the experiment
env_name = 'CartPole-v0'
num_episodes = 1000   # handed to every Agent constructor below
max_steps = 200       # handed to every Agent constructor below
n_runs = 10           # one training run per entry of `seeds`
n_evaluations = 100   # number of evaluation seeds drawn further below

# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Create Results Directory
# Directories are named result_<id>_<YYYYMMDD>. The id is one larger than the
# highest id already present in the working directory, so creating the new
# directory cannot collide with an existing one (even after older result
# directories were deleted), and unrelated entries that merely contain the
# word "result" are ignored.
existing_ids = []
for entry in os.listdir('.'):
    parts = entry.split('_')
    if len(parts) >= 2 and parts[0] == 'result' and parts[1].isdecimal():
        existing_ids.append(int(parts[1]))
result_id = max(existing_ids, default=0) + 1

# Get today's date and add it to the results directory
# (zero-padded so that e.g. 2021-01-22 and 2021-12-02 cannot produce the same suffix)
d = date.today()
result_dir = 'result_{}_{}'.format(result_id, d.strftime('%Y%m%d'))
os.mkdir(result_dir)
print('Created Directory {} to store the results in'.format(result_dir))

Created Directory result_2_20211018 to store the results in


In [4]:
# Hyperparameters (all of them are passed to the Agent constructor below)

# replay memory and optimisation
replay_memory_size = 40000
batch_size = 128
learning_rate = 1e-3
discount_factor = 0.999

# epsilon schedule (start value, floor, decay factor)
eps_start = 1.0
eps_end = 0.05
eps_decay = 0.999

# update cadence
update_every = 4
target_update_frequency = 100
tau = 0.001

In [5]:
# SNN Hyperparameters (passed to the spiking network constructors below)
architecture = [4] + [64] * 2 + [2]   # presumably the layer sizes, input -> output; TODO confirm
threshold = 0.1
weight_scale = 1
alpha = 1
beta = 1
simulation_time = 10   # re-assigned by the DSQN / quantized DSQN training cells
time_step = 0.001      # not referenced by any other cell visible in this notebook

In [6]:
# Draw one 32-bit seed per training run and store the list on disk so the
# runs can be repeated later.
seeds = []
for _run_idx in range(n_runs):
    seeds.append(random.getrandbits(32))

np.save('seeds', seeds)

## DQN Training

In [None]:
# Train one DQN agent per seed and keep its learning curve for the plots below.
smoothed_scores_dqn_all = []   # smoothed score curve of every run
dqn_completion_after = []      # `best_average_after` as returned by train_agent, one per run

for i_run in range(n_runs):
    print("Run # {}".format(i_run))
    seed = seeds[i_run]

    # Seed every random number generator imported by this notebook so that a
    # run can be repeated from the saved seed list.
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # The target network starts as an exact copy of the policy network
    policy_net = QNetwork(architecture, seed).to(device)
    target_net = QNetwork(architecture, seed).to(device)
    target_net.load_state_dict(policy_net.state_dict())

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    agent = Agent(env_name, policy_net, target_net, architecture, batch_size,
                  replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
                  update_every, target_update_frequency, optimizer, learning_rate,
                  num_episodes, max_steps, i_run, result_dir, seed, tau)

    smoothed_scores, scores, best_average_after = agent.train_agent()

    # The raw scores carry the model name in the file name: the DSQN training
    # cells save their raw scores into the same directory, and a shared
    # 'scores_<run>' name would let them overwrite the DQN files.
    np.save(result_dir + '/scores_DQN_{}'.format(i_run), scores)
    np.save(result_dir + '/smoothed_scores_DQN_{}'.format(i_run), smoothed_scores)

    # save smoothed scores in list to plot later
    dqn_completion_after.append(best_average_after)
    smoothed_scores_dqn_all.append(smoothed_scores)
    print("")

In [None]:
# One figure per DQN run: the smoothed score curve, saved as a PNG in the
# results directory and shown inline.
for run_idx, run_scores in enumerate(smoothed_scores_dqn_all):
    fig = plt.figure()
    plt.plot(run_scores)
    plt.ylim(0, 250)
    plt.grid(True)
    run_png = result_dir + '/training_dqn_{}.png'.format(run_idx)
    plt.savefig(run_png, dpi=1000)
    plt.show()

In [None]:
# Plot results (mean)
# Aggregates the smoothed DQN curves of the runs listed in `best_runs` and
# plots their mean together with the 2nd-97th percentile band.

# `best_runs` is assigned in a later cell of this notebook. Fall back to
# "all runs" when it does not exist yet so that this cell also works on a
# fresh kernel with Restart & Run All; an existing selection is left untouched.
if 'best_runs' not in globals():
    best_runs = list(range(len(smoothed_scores_dqn_all)))

# Works for any number of selected runs (no hand-written list of ten indices)
best_smoothed_scores_dqn = [smoothed_scores_dqn_all[i_best] for i_best in best_runs]
mean_smoothed_scores_dqn = np.mean(best_smoothed_scores_dqn, axis=0)
std_smoothed_scores = np.std(best_smoothed_scores_dqn, axis=0)

# Mean of the `best_average_after` values of the selected runs; drawn as a
# vertical marker below
avg_dqn_completion_after = np.mean([dqn_completion_after[i_best] for i_best in best_runs])

episodes_dqn = range(len(best_smoothed_scores_dqn[0]))

fig = plt.figure()
plt.plot(episodes_dqn, mean_smoothed_scores_dqn)
plt.fill_between(episodes_dqn,
                 np.nanpercentile(best_smoothed_scores_dqn, 2, axis=0),
                 np.nanpercentile(best_smoothed_scores_dqn, 97, axis=0), alpha=0.25)
plt.vlines(avg_dqn_completion_after, 0, 250, 'C0')
plt.ylim(0, 250)
plt.grid(True)
plt.savefig(result_dir + '/DQN_training.png', dpi=1000)
plt.show()

## DSQN Training

DSQNs without two-neurons-input encoding

In [None]:
# Train one spiking (DSQN) agent per seed, reusing the seeds of the DQN runs.
smoothed_scores_dsqn_all = []   # smoothed score curve of every run
dsqn_completion_after = []      # `best_average_after` as returned by train_agent, one per run
simulation_time = 10            # overrides the notebook-level value for the cells that follow

for i_run in range(n_runs):
    print("Run # {}".format(i_run))
    seed = seeds[i_run]

    # Seed every random number generator imported by this notebook so that a
    # run can be repeated from the saved seed list.
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # The target network starts as an exact copy of the policy network
    policy_net = DSNN(architecture, seed, alpha, beta, batch_size, threshold, simulation_time)
    target_net = DSNN(architecture, seed, alpha, beta, batch_size, threshold, simulation_time)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

    agent = Agent(env_name, policy_net, target_net, architecture, batch_size,
                  replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
                  update_every, target_update_frequency, optimizer, learning_rate,
                  num_episodes, max_steps, i_run, result_dir, seed, tau, SQN=True, quantization=False)

    smoothed_scores, scores, best_average_after = agent.train_agent()

    # The raw scores carry the model name in the file name; a shared
    # 'scores_<run>' name would overwrite the raw scores that the DQN training
    # cell stores in the same directory.
    np.save(result_dir + '/scores_DSQN_{}'.format(i_run), scores)
    np.save(result_dir + '/smoothed_scores_DSQN_{}'.format(i_run), smoothed_scores)

    # save smoothed scores in list to plot later
    smoothed_scores_dsqn_all.append(smoothed_scores)
    dsqn_completion_after.append(best_average_after)
    print("")

Run # 0
Episode 1	Average Score: 33.00	 Epsilon: 0.97Episode 2	Average Score: 22.00	 Epsilon: 0.96Episode 3	Average Score: 19.67	 Epsilon: 0.94Episode 4	Average Score: 28.00	 Epsilon: 0.89



Episode 100	Average Score: 44.20	 Epsilon: 0.05
Episode 200	Average Score: 41.85	 Epsilon: 0.05
Episode 300	Average Score: 38.64	 Epsilon: 0.05
Episode 400	Average Score: 35.87	 Epsilon: 0.05
Episode 500	Average Score: 35.94	 Epsilon: 0.05
Episode 600	Average Score: 45.84	 Epsilon: 0.05
Episode 700	Average Score: 48.64	 Epsilon: 0.05
Episode 778	Average Score: 49.41	 Epsilon: 0.05

## Quantized DSQN Training

In [None]:
# Train one quantized spiking agent per seed (Agent is created with quantization=True).
# NOTE(review): `DSQN` is not among the names imported at the top of the
# notebook (only QNetwork and DSNN are) - confirm where it comes from.
smoothed_scores_dsqn_quantized_all = []   # smoothed score curve of every run
dsqn_quantized_completion_after = []      # `best_average_after` as returned by train_agent, one per run
simulation_time = 8                       # overrides the notebook-level value for the cells that follow

for i_run in range(n_runs):
    print("Run # {}".format(i_run))
    seed = seeds[i_run]

    # Seed every random number generator imported by this notebook so that a
    # run can be repeated from the saved seed list.
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

    # The target network starts as an exact copy of the policy network
    policy_net = DSQN(architecture, seed, alpha, beta, weight_scale, batch_size, threshold, simulation_time)
    target_net = DSQN(architecture, seed, alpha, beta, weight_scale, batch_size, threshold, simulation_time)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

    agent = Agent(env_name, policy_net, target_net, architecture, batch_size,
                  replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
                  update_every, target_update_frequency, optimizer, learning_rate,
                  num_episodes, max_steps, i_run, result_dir, seed, tau, SQN=True, two_neurons=False,
                  quantization=True)

    smoothed_scores, scores, best_average_after = agent.train_agent()

    # The raw scores carry the same model tag as the smoothed scores; a shared
    # 'scores_<run>' name would overwrite the raw scores that the other
    # training cells store in the same directory.
    np.save(result_dir + '/scores_DSQN_Loihi_{}'.format(i_run), scores)
    np.save(result_dir + '/smoothed_scores_DSQN_Loihi_{}'.format(i_run), smoothed_scores)

    # save smoothed scores in list to plot later
    smoothed_scores_dsqn_quantized_all.append(smoothed_scores)
    dsqn_quantized_completion_after.append(best_average_after)
    print("")

In [None]:
# Point the "quantized" result names at the plain DSQN results.
# NOTE(review): when the quantized training cell has been run, this replaces
# its results with the non-quantized ones - confirm this cell is meant to be
# executed in a top-to-bottom run.
(smoothed_scores_dsqn_quantized_all,
 dsqn_quantized_completion_after) = (smoothed_scores_dsqn_all, dsqn_completion_after)

In [None]:
# Scratch: build a single policy network with two_neurons=False. `seed` and
# `simulation_time` are whatever values earlier cells left behind.
# NOTE(review): `DSQN` is not imported in the import cell (only QNetwork and DSNN) - confirm.
policy_net = DSQN(architecture, seed, alpha, beta, weight_scale, batch_size, threshold, simulation_time, two_neurons=False)

In [None]:
# Display the `best_average_after` values collected during DSQN training
dsqn_completion_after

In [None]:
# Scratch: put a previously captured weight list back onto the network.
# NOTE(review): `weights` is only assigned in a later cell
# (`weights = policy_net.weights`), so this fails on a fresh top-to-bottom run.
policy_net.weights = weights

In [None]:
# Scratch setup for the weight-quantization experiments below: a policy and a
# target network built from the first run seed, plus an Agent wrapping them.
seed = seeds[0]
dsqn_args = (architecture, seed, alpha, beta, weight_scale, batch_size, threshold, simulation_time)
policy_net = DSQN(*dsqn_args, two_neurons=False)
target_net = DSQN(*dsqn_args, two_neurons=False)
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

agent = Agent(
    env_name, policy_net, target_net, architecture, batch_size,
    replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
    update_every, target_update_frequency, optimizer, learning_rate,
    num_episodes, max_steps, 0, result_dir, seed, tau,
    SQN=True, two_neurons=False,
)

In [None]:
# Keep a reference to the network's weight list; indexed per layer and
# quantized in the cells below
weights = policy_net.weights

In [None]:
# Display the full-precision weights
weights

In [None]:
# Quantize the weights with the agent's helper; the next cells read `.tensor`
# from every element of the result
q_weights = agent.quantize_weights(weights)

In [None]:
# Display the quantized weight objects
q_weights

In [None]:
# Convert every quantized weight object into a float tensor
quant_weights = []
for quantized in q_weights:
    quant_weights.append(quantized.tensor.float())

In [None]:
# Enable gradient tracking for the first quantized weight tensor only
quant_weights[0].requires_grad = True

In [None]:
# Display the quantized weights as float tensors
quant_weights

In [None]:
# Bin width for the FP32 weight histogram: the interval [-1.8, 1.8] split into 255 steps
step = (2 * 1.8) / 255

In [None]:
# Histogram of the full-precision weights of the first layer (rows 0-3).
# .cpu() is a no-op for CPU tensors and makes the conversion work when the
# weights live on a GPU.
layer0_fp32 = weights[0].detach().cpu().numpy()
w = np.concatenate((layer0_fp32[0], layer0_fp32[1], layer0_fp32[2], layer0_fp32[3]))
# 256 edges give 255 bins of width 3.6/255 over [-1.8, 1.8] with both end
# points included. np.arange with a float step leaves out the upper edge, so
# weights in the top bin would be missing from the histogram.
bins = np.linspace(-1.8, 1.8, 256)
plt.hist(w, bins)
plt.title('FP32 Weights')
plt.savefig('weights_fp32.png', dpi=1000)

In [None]:
# Histogram of the quantized weights of the first layer (rows 0-3).
# .cpu() is a no-op for CPU tensors and makes the conversion work when the
# weights live on a GPU.
layer0_quant = quant_weights[0].detach().cpu().numpy()
w = np.concatenate((layer0_quant[0], layer0_quant[1], layer0_quant[2], layer0_quant[3]), axis=0)
# Bin edges -128 ... 128 give every integer from -128 to 127 its own bin.
# With edges that stop at 126, the value 127 is dropped and 125/126 share the
# last bin. (Assumes the quantized values are integers in [-128, 127] - TODO confirm.)
bins = np.arange(-128, 129)
plt.hist(w, bins)
plt.title('Quantized Weights')
plt.savefig('weights_quantized.png', dpi=1000)

In [None]:
# Replace the network's weights with the quantized float tensors
policy_net.weights = quant_weights

In [None]:
# Scratch: create the Gym environment named in the configuration cell
env = gym.make(env_name)

In [None]:
# Scratch: reset the environment; `obs` is not read by any other cell visible here
obs = env.reset()

In [None]:
# Indices of the runs that enter the aggregate plots and the evaluation
# (currently all ten runs)
best_runs = list(range(10))

In [None]:
# Aggregate the smoothed DSQN curves of the runs listed in `best_runs` and
# plot their mean together with the 2nd-97th percentile band.
# Works for any number of selected runs (no hand-written list of ten indices).
best_smoothed_scores_dsqn = [smoothed_scores_dsqn_all[i_best] for i_best in best_runs]
mean_smoothed_scores_dsqn = np.mean(best_smoothed_scores_dsqn, axis=0)

# Mean of the `best_average_after` values of the selected runs; drawn as a
# vertical marker below
avg_dsqn_completion_after = np.mean([dsqn_completion_after[i_best] for i_best in best_runs])

episodes_dsqn = range(len(best_smoothed_scores_dsqn[0]))

fig = plt.figure()
plt.plot(episodes_dsqn, mean_smoothed_scores_dsqn)
plt.fill_between(episodes_dsqn,
                 np.nanpercentile(best_smoothed_scores_dsqn, 2, axis=0),
                 np.nanpercentile(best_smoothed_scores_dsqn, 97, axis=0), alpha=0.25)

plt.vlines(avg_dsqn_completion_after, 0, 250, 'C0')

plt.ylim(0, 250)
plt.grid(True)
# The title has to be set before saving, otherwise the PNG on disk has no title
plt.title('CartPole-v0 DSQN')
plt.savefig(result_dir + '/DSQN_training.png', dpi=1000)
plt.show()

## Plot Quantized DSQN Training

In [None]:
# Load the ten smoothed score curves (runs 0-9) of an earlier experiment from disk.
# NOTE(review): the files are named smoothed_scores_DSQN_<run>, while the
# quantized training cell in this notebook writes smoothed_scores_DSQN_Loihi_<run>
# - confirm that this directory really holds the quantized results.
smoothed_scores_dsqn_quantized_all = [
    np.load('result_23_2021416/smoothed_scores_DSQN_{}.npy'.format(i_file))
    for i_file in range(10)
]

In [None]:
# Aggregate the smoothed quantized-DSQN curves of the runs listed in
# `best_runs` and plot their mean together with the 2nd-97th percentile band.
# Works for any number of selected runs (no hand-written list of ten indices).
best_smoothed_scores_dsqn_quantized = [smoothed_scores_dsqn_quantized_all[i_best]
                                       for i_best in best_runs]
mean_smoothed_scores_dsqn_quantized = np.mean(best_smoothed_scores_dsqn_quantized, axis=0)

# Mean of the `best_average_after` values of the selected runs; drawn as a
# vertical marker below
avg_dsqn_quantized_completion_after = np.mean(
    [dsqn_quantized_completion_after[i_best] for i_best in best_runs])

episodes_dsqn_quantized = range(len(best_smoothed_scores_dsqn_quantized[0]))

fig = plt.figure()
plt.plot(episodes_dsqn_quantized, mean_smoothed_scores_dsqn_quantized)
plt.fill_between(episodes_dsqn_quantized,
                 np.nanpercentile(best_smoothed_scores_dsqn_quantized, 2, axis=0),
                 np.nanpercentile(best_smoothed_scores_dsqn_quantized, 97, axis=0), alpha=0.25)

plt.vlines(avg_dsqn_quantized_completion_after, 0, 250, 'C0')

plt.ylim(0, 250)
plt.grid(True)
# The title has to be set before saving, otherwise the PNG on disk has no title
plt.title('CartPole-v0 DSQN Quantized')
# Own file name: '/DSQN_training.png' is already written by the plain DSQN
# plot and would be overwritten here
plt.savefig(result_dir + '/DSQN_quantized_training.png', dpi=1000)
plt.show()

In [None]:
# Overlay the mean smoothed training curves of DQN, DSQN and quantized DSQN.
# Each model gets its mean curve, a 2nd-97th percentile band and a vertical
# line at its average `completion_after` value.
fig = plt.figure()

# --- DQN
x_dqn = range(len(best_smoothed_scores_dqn[0]))
dqn = plt.plot(x_dqn, mean_smoothed_scores_dqn, color='C0', label='DQN')
band_lo, band_hi = np.nanpercentile(best_smoothed_scores_dqn, [2, 97], axis=0)
plt.fill_between(x_dqn, band_lo, band_hi, alpha=0.25)
plt.vlines(avg_dqn_completion_after, 0, 250, 'C0')

# --- DSQN
x_dsqn = range(len(best_smoothed_scores_dsqn[0]))
dsqn = plt.plot(x_dsqn, mean_smoothed_scores_dsqn, color='C1', label='DSQN')
band_lo, band_hi = np.nanpercentile(best_smoothed_scores_dsqn, [2, 97], axis=0)
plt.fill_between(x_dsqn, band_lo, band_hi, alpha=0.25)
plt.vlines(avg_dsqn_completion_after, 0, 250, 'C1')

# --- quantized DSQN
x_dsqn_quantized = range(len(best_smoothed_scores_dsqn_quantized[0]))
dsqn_quantized = plt.plot(x_dsqn_quantized, mean_smoothed_scores_dsqn_quantized,
                          color='C2', label='Quantized DSQN')
band_lo, band_hi = np.nanpercentile(best_smoothed_scores_dsqn_quantized, [2, 97], axis=0)
plt.fill_between(x_dsqn_quantized, band_lo, band_hi, alpha=0.25)
plt.vlines(avg_dsqn_quantized_completion_after, 0, 250, 'C2')

# Axes decoration, then save and show
plt.grid(True)
plt.ylim(0, 250)
plt.legend(loc='lower right')
plt.xlabel('episode')
plt.ylabel('sum of rewards')
plt.title(env_name)
plt.savefig(result_dir + '/DQN_vs_DSQN_training.png', dpi=1000)
plt.show()

## Evaluate trained DQN and DSQN models

In [None]:
# One 32-bit seed per evaluation; the same list is passed to every
# evaluate_agent call below
gym_evaluation_seeds = []
for _eval_idx in range(n_evaluations):
    gym_evaluation_seeds.append(random.getrandbits(32))

In [None]:
# Test best trained DQN on the same environment for 200 timesteps
# NOTE(review): `agent` is whichever Agent object was created last by an
# earlier cell - confirm that evaluate_agent does not depend on its settings.
evaluation_dqn_200 = []
for i in best_runs:
    print("Run # {}".format(i))
    dqn = QNetwork(architecture, 1).to(device)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dqn.load_state_dict(torch.load(result_dir + '/checkpoint_DQN_{}.pt'.format(i),
                                   map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dqn, 100, 200, gym_evaluation_seeds)
    evaluation_dqn_200.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Rewards of all evaluated runs, pooled
np.save(result_dir + '/evaluation_dqn_200', evaluation_dqn_200)
print("Total Mean Reward: {}".format(np.mean(evaluation_dqn_200)))
print("Total Deviation: {}".format(np.std(evaluation_dqn_200)))

In [None]:
# Test best trained DQN on the same environment for 500 timesteps
# NOTE(review): `agent` is whichever Agent object was created last by an
# earlier cell - confirm that evaluate_agent does not depend on its settings.
evaluation_dqn_500 = []
for i in best_runs:
    print("Run # {}".format(i))
    dqn = QNetwork(architecture, 1).to(device)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dqn.load_state_dict(torch.load(result_dir + '/checkpoint_DQN_{}.pt'.format(i),
                                   map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dqn, 100, 500, gym_evaluation_seeds)
    evaluation_dqn_500.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Rewards of all evaluated runs, pooled
np.save(result_dir + '/evaluation_dqn_500', evaluation_dqn_500)
print("Total Mean Reward: {}".format(np.mean(evaluation_dqn_500)))
print("Total Deviation: {}".format(np.std(evaluation_dqn_500)))

In [None]:
# Test best trained DQN on the same environment for 1000 timesteps
# NOTE(review): `agent` is whichever Agent object was created last by an
# earlier cell - confirm that evaluate_agent does not depend on its settings.
evaluation_dqn_1000 = []
for i in best_runs:
    print("Run # {}".format(i))
    dqn = QNetwork(architecture, 1).to(device)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dqn.load_state_dict(torch.load(result_dir + '/checkpoint_DQN_{}.pt'.format(i),
                                   map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dqn, 100, 1000, gym_evaluation_seeds)
    evaluation_dqn_1000.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Rewards of all evaluated runs, pooled
np.save(result_dir + '/evaluation_dqn_1000', evaluation_dqn_1000)
print("Total Mean Reward: {}".format(np.mean(evaluation_dqn_1000)))
print("Total Deviation: {}".format(np.std(evaluation_dqn_1000)))

In [None]:
# Test best trained DSQN on the same environment for 200 timesteps
# NOTE(review): `simulation_time` is a notebook global that the training cells
# re-assign (10 for DSQN, 8 for the quantized variant); it holds whichever
# value was set last - confirm it matches the loaded checkpoints.
# NOTE(review): `DSQN` is not imported in the import cell (only QNetwork and DSNN) - confirm.
evaluation_dsqn_200 = []
for i in best_runs:
    print("Run # {}".format(i))
    # The 1 sits in the position where the training cells pass batch_size
    dsqn = DSQN(architecture, 0, alpha, beta, weight_scale, 1, threshold, simulation_time)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dsqn.load_state_dict(torch.load(result_dir + '/checkpoint_DSQN_{}.pt'.format(i),
                                    map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dsqn, 100, 200, gym_evaluation_seeds)
    evaluation_dsqn_200.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Rewards of all evaluated runs, pooled
np.save(result_dir + '/evaluation_dsqn_200', evaluation_dsqn_200)
print("Total Mean Reward: {}".format(np.mean(evaluation_dsqn_200)))
print("Total Deviation: {}".format(np.std(evaluation_dsqn_200)))

In [None]:
# Test best trained DSQN on the same environment for 500 timesteps
# NOTE(review): `simulation_time` is a notebook global that the training cells
# re-assign (10 for DSQN, 8 for the quantized variant); it holds whichever
# value was set last - confirm it matches the loaded checkpoints.
# NOTE(review): `DSQN` is not imported in the import cell (only QNetwork and DSNN) - confirm.
evaluation_dsqn_500 = []
for i in best_runs:
    print("Run # {}".format(i))
    # The 1 sits in the position where the training cells pass batch_size
    dsqn = DSQN(architecture, 0, alpha, beta, weight_scale, 1, threshold, simulation_time)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dsqn.load_state_dict(torch.load(result_dir + '/checkpoint_DSQN_{}.pt'.format(i),
                                    map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dsqn, 100, 500, gym_evaluation_seeds)
    evaluation_dsqn_500.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Rewards of all evaluated runs, pooled
np.save(result_dir + '/evaluation_dsqn_500', evaluation_dsqn_500)
print("Total Mean Reward: {}".format(np.mean(evaluation_dsqn_500)))
print("Total Deviation: {}".format(np.std(evaluation_dsqn_500)))

In [None]:
# Test best trained DSQN on the same environment for 1000 timesteps
# NOTE(review): `simulation_time` is a notebook global that the training cells
# re-assign (10 for DSQN, 8 for the quantized variant); it holds whichever
# value was set last - confirm it matches the loaded checkpoints.
# NOTE(review): `DSQN` is not imported in the import cell (only QNetwork and DSNN) - confirm.
evaluation_dsqn_1000 = []
for i in best_runs:
    print("Run # {}".format(i))
    # The 1 sits in the position where the training cells pass batch_size
    dsqn = DSQN(architecture, 0, alpha, beta, weight_scale, 1, threshold, simulation_time)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dsqn.load_state_dict(torch.load(result_dir + '/checkpoint_DSQN_{}.pt'.format(i),
                                    map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dsqn, 100, 1000, gym_evaluation_seeds)
    evaluation_dsqn_1000.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Rewards of all evaluated runs, pooled
np.save(result_dir + '/evaluation_dsqn_1000', evaluation_dsqn_1000)
print("Total Mean Reward: {}".format(np.mean(evaluation_dsqn_1000)))
print("Total Deviation: {}".format(np.std(evaluation_dsqn_1000)))

In [None]:
# Bar chart: mean evaluation reward of DQN vs. DSQN for the 200-step
# evaluation, with the standard deviation as error bar.
means = [np.mean(evaluation_dqn_200), np.mean(evaluation_dsqn_200)]
stds = [np.std(evaluation_dqn_200), np.std(evaluation_dsqn_200)]
x_pos = [0.5, .65]   # bar centres; the bars are 0.1 wide

plt.bar(x_pos, means, yerr=stds, align='center', alpha=0.5, capsize=10, width=0.1)
plt.ylim(0, 250)
plt.xticks(x_pos, ['DQN', 'DSQN'])
plt.ylabel('Accumulative Reward')   # spelling corrected in the axis label
plt.title('CartPole-v0 Evaluation over 200 timesteps')
plt.grid(True)
plt.savefig(result_dir + '/CartPole_evaluation_200.png', dpi=1000)

In [None]:
# Bar chart: mean evaluation reward of DQN vs. DSQN for the 500-step
# evaluation, with the standard deviation as error bar.
means = [np.mean(evaluation_dqn_500), np.mean(evaluation_dsqn_500)]
stds = [np.std(evaluation_dqn_500), np.std(evaluation_dsqn_500)]
x_pos = [0.5, .65]   # bar centres; the bars are 0.1 wide

plt.bar(x_pos, means, yerr=stds, align='center', alpha=0.5, capsize=10, width=0.1)
plt.ylim(0, 550)
plt.xticks(x_pos, ['DQN', 'DSQN'])
plt.ylabel('Accumulative Reward')   # spelling corrected in the axis label
plt.title('CartPole-v0 Evaluation over 500 timesteps')
plt.grid(True)
plt.savefig(result_dir + '/CartPole_evaluation_500.png', dpi=1000)

In [None]:
# Bar chart: mean evaluation reward of DQN vs. DSQN for the 1000-step
# evaluation, with the standard deviation as error bar.
means = [np.mean(evaluation_dqn_1000), np.mean(evaluation_dsqn_1000)]
stds = [np.std(evaluation_dqn_1000), np.std(evaluation_dsqn_1000)]
x_pos = [0.5, .65]   # bar centres; the bars are 0.1 wide

plt.bar(x_pos, means, yerr=stds, align='center', alpha=0.5, capsize=10, width=0.1)
plt.ylim(0, 1150)
plt.xticks(x_pos, ['DQN', 'DSQN'])
plt.ylabel('Accumulative Reward')   # spelling corrected in the axis label
plt.title('CartPole-v0 Evaluation over 1000 timesteps')
plt.grid(True)
plt.savefig(result_dir + '/CartPole_evaluation_1000.png', dpi=1000)

In [None]:
# Membrane potential traces of two neurons, first item in the batch.
# Reads entry 1 of every record in `mem_rec`; the plotting cell labels these
# traces as the output neurons - presumably entry 1 is the output layer, TODO confirm.
# NOTE(review): `mem_rec` is not assigned by any cell visible in this notebook.
potential = [record[1][0] for record in mem_rec]
neuron1 = []
neuron2 = []
for p in potential:
    neuron1.append(p[0])
    neuron2.append(p[1])

In [None]:
# Membrane potential of the two output neurons over time, for one random run
# before training; the figure is written to the working directory.
for trace, colour, trace_label in ((neuron1, 'b', 'Output Neuron 1'),
                                   (neuron2, 'g', 'Output Neuron 2')):
    plt.plot(trace, color=colour, label=trace_label)
plt.grid(True)
plt.ylim(-25, 25)
plt.xlabel('time steps')
plt.ylabel('membrane potential')
plt.legend(loc='upper right')
plt.savefig('cartpole_output_neurons_potential_b4_training.png', dpi=1000)

In [None]:
# Membrane potential of the hidden layer neurons (entry 0 of every record,
# first item in the batch), regrouped into one time series per neuron.
potential = [record[0][0] for record in mem_rec]
neurons = [[p[k] for p in potential] for k in range(len(potential[0]))]

In [None]:
# One line per hidden layer neuron, numbered from 1 in the legend
for number, trace in enumerate(neurons, start=1):
    plt.plot(trace, label='neuron {}'.format(number))
plt.grid(True)
plt.legend(loc='best')
plt.xlabel('time')
plt.ylabel('membrane potential')

Test Code

In [None]:
# Fill the replay buffer with transitions collected by a random policy
env = gym.make(env_name)
# The buffer is seeded with the first run seed. `seeds` is the list created
# near the top of the notebook; no `random_seeds` variable exists in it.
memory = ReplayBuffer(replay_memory_size, batch_size, seeds[0])
for i in range(1000):
    print("Episode: {}".format(i), end='\r')
    state = env.reset()
    # At most 1000 steps per episode; stop as soon as the environment reports `done`
    for t in range(1000):
        action = random.randint(0, 1)   # pick one of the two actions uniformly at random
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state
        if done:
            break


In [None]:
import sunblaze_envs

In [None]:
# Create the randomized CartPole variant from sunblaze_envs.
# `random_env` is not referenced by any other cell visible in this notebook.
random_env = sunblaze_envs.make('SunblazeCartPoleRandomNormal-v0')

In [None]:
# Overrides the `result_dir` created at the top of the notebook: every
# checkpoint load and np.save below targets this hard-coded directory instead.
# NOTE(review): confirm that this directory exists in the working directory.
result_dir = 'result_20_2021122'


In [None]:
# Evaluate the trained DSQN checkpoints with an Agent created with random=True.
# NOTE(review): `DSQN` is not imported in the import cell (only QNetwork and DSNN) - confirm.
evaluation_dsqn_random_200 = []

for i in best_runs:
    print("Run # {}".format(i))
    # The 1 sits in the position where the training cells pass batch_size
    dsqn = DSQN(architecture, 0, alpha, beta, weight_scale, 1, threshold, simulation_time)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dsqn.load_state_dict(torch.load(result_dir + '/checkpoint_DSQN_{}.pt'.format(i),
                                    map_location=device))

    # The Agent constructor requires an optimizer; bind it to the network that
    # is actually evaluated rather than to a throw-away instance
    optimizer = optim.Adam(dsqn.parameters(), lr=learning_rate)
    agent = Agent(env_name, dsqn, dsqn, architecture, batch_size,
              replay_memory_size, discount_factor, eps_start, eps_end, eps_decay,
              update_every, target_update_frequency, optimizer, learning_rate,
              num_episodes, max_steps, 0, result_dir, 0, tau, SQN=True, two_neurons=False, random=True)

    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dsqn, 100, 200, gym_evaluation_seeds)
    evaluation_dsqn_random_200.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Own file name: '/evaluation_dsqn_200' is the name used for the standard
# evaluation results and would be overwritten in this directory
np.save(result_dir + '/evaluation_dsqn_random_200', evaluation_dsqn_random_200)
print("Total Mean Reward: {}".format(np.mean(evaluation_dsqn_random_200)))
print("Total Deviation: {}".format(np.std(evaluation_dsqn_random_200)))

In [None]:
# Evaluate the trained DQN checkpoints with the Agent left over from the
# previous cell (created with random=True).
# NOTE(review): that Agent was built with SQN=True around a spiking network -
# confirm that evaluate_agent handles a QNetwork correctly in that case.
evaluation_dqn_random_200 = []

for i in best_runs:
    print("Run # {}".format(i))
    dqn = QNetwork(architecture, 1).to(device)
    # map_location lets a checkpoint that was written on a GPU be restored on
    # a CPU-only machine as well
    dqn.load_state_dict(torch.load(result_dir + '/checkpoint_DQN_{}.pt'.format(i),
                                   map_location=device))
    # 100 is presumably the number of evaluation episodes (equal to n_evaluations) - TODO confirm
    rewards = agent.evaluate_agent(dqn, 100, 200, gym_evaluation_seeds)
    evaluation_dqn_random_200.extend(rewards)
    print("Mean Rewards: {}".format(np.mean(rewards)))
    print("Deviation: {}".format(np.std(rewards)))
    print("-----------------")
# Own file name: '/evaluation_dqn_200' is the name used for the standard
# evaluation results and would be overwritten in this directory
np.save(result_dir + '/evaluation_dqn_random_200', evaluation_dqn_random_200)
print("Total Mean Reward: {}".format(np.mean(evaluation_dqn_random_200)))
print("Total Deviation: {}".format(np.std(evaluation_dqn_random_200)))

In [None]:
# Display the results directory that is currently in use
result_dir