# Twin Delayed Deep Deterministic Policy Gradients (TD3)

In [None]:
import os
import site
import torch
import random

import numpy as np
import gymnasium as gym
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline

site.addsitedir('../src/')

from datetime import date
from td3_agent import Agent
from collections import deque
from model import TD3CriticNetwork, TD3ActorDSNN

In [None]:
# Create a fresh results directory for this session.
# The running index is one more than the number of entries in the current
# working directory whose name contains 'result' (an empty list yields 1).
dirs = os.listdir('.')
results = [entry for entry in dirs if 'result' in entry]
result_id = len(results) + 1

# Append today's date as zero-padded YYYYMMDD. Without padding the suffix is
# ambiguous: 2021-01-11 and 2021-11-01 would both be rendered as "2021111".
d = date.today()
date_suffix = d.strftime('%Y%m%d')
result_dir = 'td3_result_{}_{}'.format(result_id, date_suffix)

# Counting entries can reproduce an index that is already in use (for example
# after an older result directory was deleted), which would make os.mkdir
# raise FileExistsError. Advance the index until the name is free.
while os.path.exists(result_dir):
    result_id += 1
    result_dir = 'td3_result_{}_{}'.format(result_id, date_suffix)

os.mkdir(result_dir)
print('Created Directory {} to store the results in'.format(result_dir))

In [None]:
# Experiment-wide settings.
n_runs = 10          # number of iterations of the training loop
n_timesteps = 1e6    # handed to Agent unchanged (note: this is a float)
batch_size = 128     # not referenced by the visible cells

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Seed table; the training loop reads entry i for run i.
seeds = np.load('../seeds/training_seeds.npy')

In [None]:
# Optimiser step sizes: one is passed to the actor networks, the other to
# the critic networks.
actor_learning_rate = 1e-3
critic_learning_rate = 1e-3

# Hidden-layer widths shared by the actor architecture and the critics.
layer1_size, layer2_size = 400, 300

# Settings forwarded to Agent.
# NOTE(review): their exact semantics live in td3_agent.Agent; presumably the
# usual TD3 meanings (soft-update rate, exploration noise, ...) -- confirm.
tau = 5e-3
noise = 0.1
warmup = 1000
update_actor_interval = 2
update_target_interval = 2
buffer_size = 200_000

# Observation-encoding switches. two_neuron doubles the input width and is
# checked first; pop_coding multiplies the input width by pop_size.
two_neuron = True
pop_coding = False
pop_size = 10
mutually_exclusive = False  # not referenced by the visible cells

In [None]:
# Constants passed positionally to TD3ActorDSNN.
# NOTE(review): presumably spiking-neuron dynamics (decay factors, firing
# threshold, simulation steps) -- confirm against model.TD3ActorDSNN.
alpha = beta = 0.5
threshold = 0.8
weight_scale = 1
sim_time = 5

In [None]:
# Smoothed score curve returned by each run, in run order.
smoothed_scores_all = []
#torch.autograd.set_detect_anomaly(True)

for i in range(n_runs):
    print("Run # {}".format(i))

    seed = int(seeds[i])

    env = gym.make('HalfCheetah-v3')
    try:
        # Width of the network input. two_neuron takes precedence over
        # pop_coding; with neither set, the raw observation shape is used.
        if two_neuron:
            input_dims = (env.observation_space.shape[0]*2,)
        elif pop_coding:
            input_dims = (env.observation_space.shape[0]*pop_size,)
        else:
            input_dims = env.observation_space.shape
        n_actions = env.action_space.shape[0]

        actor_architecture = [input_dims[0], layer1_size, layer2_size, n_actions]

        # Seed every RNG in use before any network is constructed.
        torch.manual_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # Networks are built in the order actor, target actor, critic 1,
        # critic 2, target critic 1, target critic 2 so that RNG consumption
        # after seeding stays the same from run to run.
        # NOTE(review): the meaning of the literal 1 is defined by
        # TD3ActorDSNN's signature -- confirm there.
        actor, target_actor = [
            TD3ActorDSNN(actor_architecture, seed, alpha, beta, weight_scale, 1,
                         threshold, sim_time, actor_learning_rate,
                         name='{}_{}'.format(role, i), device=device)
            for role in ('actor', 'target_actor')
        ]

        critic_1, critic_2, target_critic_1, target_critic_2 = [
            TD3CriticNetwork(critic_learning_rate, input_dims, layer1_size,
                             layer2_size, n_actions=n_actions,
                             name='{}_{}'.format(role, i))
            for role in ('critic_1', 'critic_2',
                         'target_critic_1', 'target_critic_2')
        ]

        agent = Agent(actor, critic_1, critic_2, target_actor, target_critic_1, target_critic_2,
                      input_dims, tau, env, n_timesteps, result_dir, n_actions=n_actions, seed=seed,
                      noise=noise, update_actor_interval=update_actor_interval, warmup=warmup,
                      update_target_interval=update_target_interval, two_neuron=two_neuron,
                      buffer_size=buffer_size, spiking=True, normalize=True)

        smoothed_scores, reward_history, best_average, best_average_after = agent.train_agent()
        print(agent.max_obs)
        smoothed_scores_all.append(smoothed_scores)
    finally:
        # A new environment is created on every run; release it even when
        # training raises so environments do not accumulate across runs.
        env.close()

In [None]:
# Collect the curves of (at most) the first n_runs runs, one entry per run.
final_smoothed_scores = smoothed_scores_all[:n_runs]

# The shaded band below is computed with np.nanpercentile, so the mean and
# standard deviation are made NaN-aware as well; otherwise a single NaN would
# blank the mean line at a position where the band is still drawn.
mean_smoothed_scores_dqn = np.nanmean(final_smoothed_scores, axis=0)
std_smoothed_scores = np.nanstd(final_smoothed_scores, axis=0)

# Shared x axis: one point per entry of the first run's curve.
x_axis = range(len(final_smoothed_scores[0]))

fig = plt.figure()
plt.plot(x_axis, mean_smoothed_scores_dqn)
# NOTE(review): the band spans the 2nd to 97th percentile, which is not
# symmetric -- confirm whether 2.5 / 97.5 was intended.
plt.fill_between(x_axis,
                 np.nanpercentile(final_smoothed_scores, 2, axis=0),
                 np.nanpercentile(final_smoothed_scores, 97, axis=0), alpha=0.25)
plt.grid(True)
plt.savefig(result_dir + '/td3_training_snn.png', dpi=300)
plt.show()