## Sloving Needle Master with Twin Delayed DDPG (TD3)
Code modified from https://github.com/nikhilbarhate99/TD3-PyTorch-BipedalWalker-v2 <br>


In [1]:
import sys
import numpy as np
import torch
import argparse
import os
import random
from environment import Environment
from environment import PID
import utils
import TD3_priorized
import TD3
import math
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
class Args:
    policy_name = "TD3"
    env_name = "Needle Master"
    seed = 1e6
    eval_freq = 5e3 # How often (time steps) we evaluate
    max_timesteps = 1e6  # Max time steps to run environment for
    save_models = "store"
    expl_noise = 1    # Std of Gaussian exploration noise
    batch_size = 100
    discount = 0.99   # Discount factor
    tau = 0.005         # Target network update rate
    policy_noise = 0.2   # Noise added to target policy during critic update
    noise_clip = 0.5
    policy_freq = 2  # Frequency of delayed policy updates
    max_size = 1e6
    pid_freq = 9e2    # How often purely random policy is run for
    pid_interval = 5e2   # How many time steps purely random policy is run for
    filename = 'environment_15'
    
args=Args()

In [3]:
# Setup
random.seed(args.seed)
torch.manual_seed(random.randint(1, 10000))
if torch.cuda.is_available():
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(random.randint(1, 10000))
    torch.backends.cudnn.enabled = False  # Disable nondeterministic ops (not sure if critical but better safe than sorry)
else:
    args.device = torch.device('cpu')

In [4]:
sys.path.insert(0, '/home/lifan/workspace/RL/needle_master_tools/data')

### Model evaluation

In [5]:
def evaluate_policy(policy, log_f):
    eval_path = './evaluate/'
    evaluation_time = 3
    if not os.path.exists(eval_path):
        os.mkdir(eval_path)

    state = env.reset(log_f)
    done = False
    env.episode_num += 1
    env.episode_reward = 0
    episode_timesteps = 0
    average_reward = 0


    while not done:
        action = policy.select_action(state)
        # print("state: " + str(state))
        # print("action: " + str(action))
        new_state, reward, done = env.step(action, log_f)
        # print("next state: " + str(next_state))
        # print("done: " +str(done))
        env.episode_reward += reward
        state = new_state
        episode_timesteps += 1
        env.total_timesteps += 1

    env.render(save_image=True, save_path=eval_path)

    print ("---------------------------------------")
    print ("Episode_num: %d: %f" % (env.episode_num, env.episode_reward))
    print ("---------------------------------------")
    return env.episode_reward



In [6]:
file_name = "%s_%s" % (args.filename, args.policy_name)
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

if not os.path.exists("./results"):
    os.makedirs("./results")
if not os.path.exists("./pytorch_models"):
    os.makedirs("./pytorch_models")

## environment set up
action_dim = 2

""" Adding the log file """
logfile = "%s_%s" % (args.filename, args.policy_name)
log_f = open("log_"+logfile+".txt","w+")
env_path = '/home/lifan/workspace/RL/needle_master_tools/data/'+ args.filename + '.txt'
env = Environment(action_dim,log_f, filename = env_path)

state_dim = len(env.gates) + 9


""""  for PID controller """
action_constrain = [10, np.pi/20]
parameter = [0.1,0.0009]
pid = PID( parameter, env.width, env.height )

""" [lower bound],[higher bound] """
# env.action_bound = np.array((-1,1)) ## for one dimension action
env.action_bound = np.array(([0, -1],[1, 1]))   ## for two dimension action
max_action = 1


""" parameters for epsilon declay """
epsilon_start = 1
epsilon_final = 0.01
decay_rate = 250000
ep_decay = []

""" beta Prioritized Experience Replay"""
beta_start = 0.4
beta_frames = 250000


### for plotting
Reward = []
save_path = './out/'
""" start straightly """
evaluations = []

# Initialize policy
# policy = TD3.TD3(state_dim, action_dim, max_action)
policy = TD3_priorized.TD3(state_dim, action_dim, max_action)
# replay_buffer = utils.ReplayBuffer(args.max_size)
replay_buffer = utils.NaivePrioritizedBuffer(int(args.max_size))

# Evaluate untrained policy
# evaluations = [evaluate_policy(policy)]


env.total_timesteps = 0
timesteps_since_eval = 0
done = True

while env.total_timesteps < args.max_timesteps:

    # Evaluate episode
    if timesteps_since_eval >= args.eval_freq:
        timesteps_since_eval %= args.eval_freq
        evaluations.append(evaluate_policy(policy, log_f))
        
        if env.last_reward > 100 and env.episode_num > 100: 
            policy.save(file_name, directory="./pytorch_models")
            np.save("./results/%s" % (file_name), evaluations)

        continue


    ## finish one episode, and train episode_times
    if done:
#         log_f.write('~~~~~~~~~~~~~~~~~~~~~~~~ iteration {} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n'.format(env.episode_num))


        ## load model
        # policy.load(file_name,"./pytorch_models")

        ## training as usual
            # if env.total_timesteps != 0 and env.episode_reward > 500:
        if env.total_timesteps != 0:
            log_f.write('Total:{}, Episode Num:{}, Eposide:{}, Reward:{}\n'.format(env.total_timesteps, env.episode_num, episode_timesteps, env.episode_reward))
            log_f.flush()
            
            if env.episode_num % 20 == 0:
                print (("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (
                env.total_timesteps, env.episode_num, episode_timesteps, env.episode_reward))
                env.render( save_image=True, save_path=save_path)

        if env.total_timesteps != 0:
            beta = min(1.0, beta_start + env.total_timesteps * (1.0 - beta_start) / beta_frames)
            policy.train(replay_buffer, episode_timesteps, beta, args.batch_size, 
                             args.discount, args.tau, args.policy_noise, args.noise_clip, args.policy_freq)

        Reward.append(env.episode_reward)

        # Reset environment
        state = env.reset(log_f)

        done = False

        env.episode_num += 1
        env.episode_reward = 0
        episode_timesteps = 0

    """ exploration rate decay """
    args.expl_noise = (epsilon_start - epsilon_final) * math.exp(-1. * env.total_timesteps / decay_rate)
    ep_decay.append(args.expl_noise)
#     log_f.write('epsilon decay:{}\n'.format(args.expl_noise))
#     if env.total_timesteps % 500 == 0 and args.expl_noise > 0:
#         args.expl_noise *= 0.9

    """ alternative between random selected action and policy selected action """
#     if env.total_timesteps % args.pid_freq < args.pid_interval:
# #     if env.total_timesteps < args.pid_interval:
#         state_pid = state[0:3]
#         action = pid.PIDcontroller( state_pid, env.next_gate, env.gates)
# #         log_f.write('PID Action:{}\n'.format(action))
# #         action = env.sample_action()
#         # log_f.write('~~~~~~~~~~~random action~~~~~~~~~~\n')
#         # log_f.write('random selected action:{}\n'.format(action))

#     else:
#         # print("state: " +str(state))
#         action = policy.select_action(state)
#         # print("select")
#         # log_f.write('~~~~~~~~~~~selected action~~~~~~~~~~\n')
#         log_f.write('Action based on policy:{}\n'.format(action))
#         # print("action based on policy:" + str(action))
#         # print("action selected: " +str(action))
        
#         if args.expl_noise != 0:
#             noise = np.random.normal(0, args.expl_noise, size=action_dim)
#             # print("noise: " + str(noise))
#             action = (action + noise).clip(-1, 1)


    """ using PID controller """
    # state_pid = state[0:3]
    # action = pid.PIDcontroller( state_pid, env.next_gate, env.gates)
    # print("action based on PID: " + str(action))

    """ action selected based on pure policy """
    action = policy.select_action(state)
    log_f.write('action based on policy:{}\n'.format(action))
    # print("action based on policy:" + str(action))
    if args.expl_noise != 0:
#         state_pid = state[0:3]
#         guidance = pid.PIDcontroller( state_pid, env.next_gate, env.gates, env.total_timesteps)
        noise = np.random.normal(0, args.expl_noise, size=action_dim)
        # print("noise: " + str(noise))
#         action = ((1 - args.expl_noise) * action + args.expl_noise * guidance)
        action = action + noise
        action[0] = np.clip(action[0],0,1)
        action[1] = np.clip(action[1],-1,1)


    ### select action only based on pure RL
    # action = policy.select_action(state)
    # print("action selected: " +str(action))


    # Perform action
    new_state, reward, done = env.step(action, log_f)

    done_bool = 0 if episode_timesteps + 1 == env.max_time else float(done)
    env.episode_reward += reward

    # Store data in replay buffer
    replay_buffer.add(state, new_state, action, reward, done_bool)
    # print("state: " + str(state))
    state = new_state

    episode_timesteps += 1
    env.total_timesteps += 1
    timesteps_since_eval += 1

plt.plot(range(len(Reward)), np.array(Reward), 'b')
plt.savefig('./results/episode reward.png')

plt.plot(range(len(policy.actor_loss)), policy.actor_loss, 'b')
plt.savefig('./results/actor loss.png')

plt.plot(range(len(policy.critic_loss)), policy.critic_loss, 'b')
plt.savefig('./results/critic loss.png')

plt.plot(range(len(evaluations)), np.array(evaluations), 'b')
plt.savefig('./results/evaluation reward.png')
print(evaluations)



---------------------------------------
Settings: environment_15_TD3
---------------------------------------
Total T: 174 Episode Num: 20 Episode T: 5 Reward: -0.714488
Total T: 283 Episode Num: 40 Episode T: 4 Reward: 0.112196
Total T: 377 Episode Num: 60 Episode T: 4 Reward: -0.768431
Total T: 466 Episode Num: 80 Episode T: 6 Reward: 0.126771
Total T: 563 Episode Num: 100 Episode T: 7 Reward: -0.303387
Total T: 647 Episode Num: 120 Episode T: 4 Reward: -0.649450
Total T: 736 Episode Num: 140 Episode T: 5 Reward: -0.218081
Total T: 833 Episode Num: 160 Episode T: 8 Reward: 0.108082
Total T: 942 Episode Num: 180 Episode T: 4 Reward: 0.205640
Total T: 1041 Episode Num: 200 Episode T: 5 Reward: 0.237422
Total T: 1131 Episode Num: 220 Episode T: 5 Reward: 0.372746
Total T: 1228 Episode Num: 240 Episode T: 5 Reward: -0.220079
Total T: 1336 Episode Num: 260 Episode T: 4 Reward: 0.177994
Total T: 1437 Episode Num: 280 Episode T: 3 Reward: -0.430322
Total T: 1535 Episode Num: 300 Episode T: 4

Total T: 25997 Episode Num: 2420 Episode T: 13 Reward: 440.297943
Total T: 26244 Episode Num: 2440 Episode T: 12 Reward: 1.588981
Total T: 26508 Episode Num: 2460 Episode T: 11 Reward: 1.085616
Total T: 26759 Episode Num: 2480 Episode T: 12 Reward: 39.294495
Total T: 27006 Episode Num: 2500 Episode T: 13 Reward: 1.477102
Total T: 27264 Episode Num: 2520 Episode T: 12 Reward: -0.229936
Total T: 27528 Episode Num: 2540 Episode T: 14 Reward: 440.066812
Total T: 27783 Episode Num: 2560 Episode T: 12 Reward: 0.318469
Total T: 28036 Episode Num: 2580 Episode T: 12 Reward: 1.092425
Total T: 28296 Episode Num: 2600 Episode T: 12 Reward: -0.490624
Total T: 28549 Episode Num: 2620 Episode T: 14 Reward: 441.097484
Total T: 28801 Episode Num: 2640 Episode T: 12 Reward: 1.186365
Total T: 29056 Episode Num: 2660 Episode T: 13 Reward: 440.379821
Total T: 29309 Episode Num: 2680 Episode T: 14 Reward: 439.540512
Total T: 29561 Episode Num: 2700 Episode T: 12 Reward: 1.278114
Total T: 29811 Episode Num:

Total T: 39595 Episode Num: 4920 Episode T: 3 Reward: -0.334006
Total T: 39665 Episode Num: 4940 Episode T: 2 Reward: -0.789446
Total T: 39731 Episode Num: 4960 Episode T: 4 Reward: 0.036070
Total T: 39799 Episode Num: 4980 Episode T: 2 Reward: -0.795202
Total T: 39872 Episode Num: 5000 Episode T: 3 Reward: -0.764742
Total T: 39943 Episode Num: 5020 Episode T: 7 Reward: 0.145214
Total T: 40013 Episode Num: 5040 Episode T: 3 Reward: 0.434810
Total T: 40085 Episode Num: 5060 Episode T: 3 Reward: 0.343149
---------------------------------------
Episode_num: 5063: -0.786185
---------------------------------------
Total T: 40149 Episode Num: 5080 Episode T: 2 Reward: -0.785759
Total T: 40224 Episode Num: 5100 Episode T: 3 Reward: 0.393539
Total T: 40290 Episode Num: 5120 Episode T: 3 Reward: 0.639325
Total T: 40355 Episode Num: 5140 Episode T: 3 Reward: -0.867455
Total T: 40426 Episode Num: 5160 Episode T: 3 Reward: -1.106902
Total T: 40489 Episode Num: 5180 Episode T: 3 Reward: -0.118572
T

Total T: 48114 Episode Num: 7440 Episode T: 3 Reward: -0.696644
Total T: 48186 Episode Num: 7460 Episode T: 4 Reward: -0.363317
Total T: 48249 Episode Num: 7480 Episode T: 2 Reward: -0.723095
Total T: 48304 Episode Num: 7500 Episode T: 3 Reward: -0.167865
Total T: 48376 Episode Num: 7520 Episode T: 4 Reward: -0.668306
Total T: 48440 Episode Num: 7540 Episode T: 3 Reward: -0.615165
Total T: 48503 Episode Num: 7560 Episode T: 3 Reward: -0.125034
Total T: 48565 Episode Num: 7580 Episode T: 3 Reward: 0.765571
Total T: 48633 Episode Num: 7600 Episode T: 5 Reward: -0.528495
Total T: 48696 Episode Num: 7620 Episode T: 2 Reward: -0.734448
Total T: 48755 Episode Num: 7640 Episode T: 3 Reward: 0.240622
Total T: 48815 Episode Num: 7660 Episode T: 3 Reward: -0.013495
Total T: 48876 Episode Num: 7680 Episode T: 5 Reward: 0.217159
Total T: 48940 Episode Num: 7700 Episode T: 3 Reward: -0.175599
Total T: 48998 Episode Num: 7720 Episode T: 4 Reward: -0.583267
Total T: 49057 Episode Num: 7740 Episode T:

KeyboardInterrupt: 