## Solving Needle Master with Twin Delayed DDPG (TD3)
Code modified from https://github.com/nikhilbarhate99/TD3-PyTorch-BipedalWalker-v2 <br>


In [19]:
import numpy as np
import torch
import argparse
import os
import sys
import random
from environment import Environment
import utils
import TD3
import math
import matplotlib.pyplot as plt

# Prefer the first CUDA device when one is visible to torch; otherwise run on the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

In [20]:
class Args:
    """Run settings and TD3 hyper-parameters, exposed as plain class attributes."""

    # --- run identification -------------------------------------------------
    policy_name = "TD3"
    env_name = "Needle Master"
    filename = 'environment_14.txt'   # environment description handed to Environment(...)
    save_models = "store"
    seed = 0

    # --- schedule, counted in environment time steps -------------------------
    start_timesteps = 200000.0    # how many time steps the purely random policy is run for
    eval_freq = 5000.0            # how often (time steps) the policy is evaluated
    max_timesteps = 10000000.0    # max time steps to run the environment for

    # --- exploration ---------------------------------------------------------
    expl_noise = 1                # std of the Gaussian exploration noise

    # --- TD3 update ----------------------------------------------------------
    batch_size = 100
    discount = 0.99               # discount factor
    tau = 0.005                   # target network update rate
    policy_noise = 0.2            # noise added to the target policy during the critic update
    noise_clip = 0.5
    policy_freq = 2               # frequency of delayed policy updates

    # --- replay buffer -------------------------------------------------------
    max_size = 5000.0             # passed to utils.ReplayBuffer


args = Args()

In [21]:
# Setup
# Seed every random number generator this notebook draws from, so that a run is
# repeatable for a given args.seed.
random.seed(args.seed)
# The training loop samples its exploration noise with np.random.normal, so NumPy's
# global generator has to be seeded as well (it was previously left unseeded).
np.random.seed(args.seed)
# The torch seeds are derived from the (already seeded) `random` stream.
torch.manual_seed(random.randint(1, 10000))
if torch.cuda.is_available():
    args.device = torch.device('cuda')
    torch.cuda.manual_seed(random.randint(1, 10000))
    torch.backends.cudnn.enabled = False  # Disable nondeterministic ops (not sure if critical but better safe than sorry)
else:
    args.device = torch.device('cpu')

In [22]:
# Directory that is put at the front of the module search path.
# The default is the original author's local checkout; set the
# NEEDLE_MASTER_DATA_DIR environment variable to point somewhere else.
# NOTE(review): the import cell at the top already imports `environment`; if that
# module lives in this directory, this cell has to run before the imports on a
# fresh kernel - confirm and reorder if so.
_needle_data_dir = os.environ.get(
    'NEEDLE_MASTER_DATA_DIR', '/home/lifan/workspace/RL/needle_master_tools/data')
# Drop any earlier copy first so that re-running the cell does not keep growing sys.path.
while _needle_data_dir in sys.path:
    sys.path.remove(_needle_data_dir)
sys.path.insert(0, _needle_data_dir)

### Model evaluation

In [23]:
def evaluate_policy(policy, eval_episodes= 3):
    """Roll out `policy` for a number of episodes and report the mean episode return.

    Actions come straight from ``policy.select_action`` (no exploration noise is
    added here). The function relies on the notebook-level globals ``env`` and
    ``save_path``; it resets ``env``, overwrites ``env.episode_reward`` with the
    mean return and renders one frame into ``save_path``.

    Args:
        policy: object exposing ``select_action(state)``.
        eval_episodes: number of episodes to average over.

    Returns:
        The sum of all rewards collected, divided by ``eval_episodes``.
    """
    avg_reward = 0.
    for _ in range(eval_episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy.select_action(state)
            # Feed the new observation back into the policy. Previously the result
            # was stored in an unused variable, so every action of the episode was
            # chosen from the reset state.
            state, reward, done = env.step(action, save_image=False)
            avg_reward += reward

    avg_reward /= eval_episodes
    env.episode_reward = avg_reward
    env.render(save_image=True, save_path=save_path)

    print ("---------------------------------------")
    print ("Episode_num: %d, Evaluation over %d episodes: %f" % (env.episode_num, eval_episodes, avg_reward))
    print ("---------------------------------------")
    return avg_reward


In [None]:
file_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed),1)
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")

# Output locations. './out/' receives the rendered frames and the reward curve, so it
# has to exist before the first savefig below (it was previously never created here).
save_path = './out/'
for _directory in ("./results", "./pytorch_models", save_path):
    os.makedirs(_directory, exist_ok=True)


def _save_curve(path, y_values, x_values=None, style=None):
    """Draw one curve on a fresh figure, write it to `path`, then release the figure.

    Using a dedicated figure per call keeps the reward and loss curves from piling up
    on one shared pyplot figure, and closing it keeps memory flat over a long run.
    """
    fig, ax = plt.subplots()
    if x_values is None:
        x_values = range(len(y_values))
    if style is None:
        ax.plot(x_values, y_values)
    else:
        ax.plot(x_values, y_values, style)
    fig.savefig(path)
    plt.close(fig)


## environment set up
env = Environment(filename=args.filename)
env.GetTargetPoint()
state_dim = 10
action_dim = 2

# Settings for the PID controller (only referenced by the disabled alternative below).
action_constrain = [10, np.pi/20]
parameter = [0.1,0.0009]

# Row 0 is the lower bound, row 1 the upper bound of the action.
env.action_bound = np.array(([-1, -1],[1, 1]))  ## modified lower bound
max_action = 1.0

### for plotting
Reward = []

# Initialize policy
policy = TD3.TD3(state_dim, action_dim, max_action)
replay_buffer = utils.ReplayBuffer(args.max_size)

# The untrained policy is not evaluated; training starts straight away.
evaluations = []

env.total_timesteps = 0
timesteps_since_eval = 0
done = True

while env.total_timesteps < args.max_timesteps:

    ## finish one episode, and train episode_times
    if done:

        # Evaluate episode
        if timesteps_since_eval >= args.eval_freq:
            timesteps_since_eval %= args.eval_freq
            evaluations.append(evaluate_policy(policy))

            policy.save(file_name, directory="./pytorch_models")
            np.save("./results/%s" % (file_name), evaluations)

        # To resume from a checkpoint: policy.load(file_name, "./pytorch_models")

        ## training as usual
        # NOTE(review): because this is an `else`, an episode that triggers an
        # evaluation is not followed by a training call - confirm that is intended.
        elif env.total_timesteps != 0:
            print (("Total T: %d Episode Num: %d Episode T: %d Reward: %f") % (
                env.total_timesteps, env.episode_num, episode_timesteps, env.episode_reward))
            frame = env.render(env.episode_num, save_image=True, save_path=save_path)

            policy.train(replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau, args.policy_noise, args.noise_clip, args.policy_freq)

        Reward.append(env.episode_reward)
        # NOTE(review): the slice stops one short of the entry appended above, so the
        # curve lags the latest episode by one - kept as in the original.
        _save_curve('./out/episode reward.png',
                    Reward[1:env.episode_num],
                    x_values=np.arange(1, env.episode_num),
                    style='b')

        # Refresh the loss curves once per finished episode instead of after every
        # environment step.
        # NOTE(review): assumes policy.actor_loss / policy.critic_loss only change
        # inside policy.train - confirm against TD3.py.
        _save_curve('./results/actor loss.png', policy.actor_loss)
        _save_curve('./results/critic loss.png', policy.critic_loss)

        # Reset environment
        state = env.reset()

        done = False

        env.episode_num += 1
        env.episode_reward = 0
        episode_timesteps = 0

        # Exploration rate decay, clamped at zero: a negative standard deviation
        # would make np.random.normal raise ValueError.
        # NOTE(review): this is only checked when an episode ends, so it fires only
        # if an episode happens to finish on a multiple of 1000 steps - confirm intent.
        if env.total_timesteps % 1000 == 0 and args.expl_noise > 0:
            args.expl_noise = max(0.0, args.expl_noise - 0.001)

    # Select action randomly or according to policy
    # NOTE(review): with max_size (5e3) smaller than start_timesteps (2e5) the left
    # side is always below the right side, so the random branch is always taken and
    # the policy never acts during training - confirm whether the `% args.max_size`
    # is intended.
    if env.total_timesteps % args.max_size < args.start_timesteps:
        action = env.sample_action()
        # Alternative: action = env.PIDcontroller(action_constrain, parameter, env.t)
    else:
        action = policy.select_action(state)
        if args.expl_noise > 0:
            action = (action + np.random.normal(0, args.expl_noise, size=2)).clip(
                env.action_bound[0,:], env.action_bound[1,:])

    # Perform action
    new_state, reward, done = env.step(action, save_image=True)

    running = env.check_status()

    # An episode cut off by the time limit is stored as non-terminal.
    done_bool = 0 if episode_timesteps + 1 == env.max_time else float(done)
    env.episode_reward += reward

    # Store data in replay buffer
    replay_buffer.add((state, new_state, action, reward, done_bool))

    state = new_state

    episode_timesteps += 1
    env.total_timesteps += 1
    timesteps_since_eval += 1

---------------------------------------
Settings: TD3_Needle Master_0_1
---------------------------------------
Total T: 82 Episode Num: 1 Episode T: 82 Reward: 16.787760
Total T: 126 Episode Num: 2 Episode T: 44 Reward: 265.263533
Total T: 146 Episode Num: 3 Episode T: 20 Reward: 0.885530
Total T: 230 Episode Num: 4 Episode T: 84 Reward: 110.035524
Total T: 310 Episode Num: 5 Episode T: 80 Reward: -126.826092
Total T: 389 Episode Num: 6 Episode T: 79 Reward: 126.739138
Total T: 414 Episode Num: 7 Episode T: 25 Reward: -7.360321
Total T: 495 Episode Num: 8 Episode T: 81 Reward: -126.175066
Total T: 579 Episode Num: 9 Episode T: 84 Reward: 120.108929
Total T: 608 Episode Num: 10 Episode T: 29 Reward: -13.255395
Total T: 645 Episode Num: 11 Episode T: 37 Reward: -30.155327
Total T: 728 Episode Num: 12 Episode T: 83 Reward: 14.810490
Total T: 810 Episode Num: 13 Episode T: 82 Reward: 110.509970
Total T: 899 Episode Num: 14 Episode T: 89 Reward: 10.899278
Total T: 982 Episode Num: 15 Episo

Total T: 8058 Episode Num: 129 Episode T: 85 Reward: 118.965170
Total T: 8145 Episode Num: 130 Episode T: 87 Reward: 16.655809
Total T: 8175 Episode Num: 131 Episode T: 30 Reward: 206.044144
Total T: 8256 Episode Num: 132 Episode T: 81 Reward: 115.138813
Total T: 8337 Episode Num: 133 Episode T: 81 Reward: -127.336912
Total T: 8354 Episode Num: 134 Episode T: 17 Reward: 4.636181
Total T: 8440 Episode Num: 135 Episode T: 86 Reward: 99.243077
Total T: 8475 Episode Num: 136 Episode T: 35 Reward: -27.737042
Total T: 8497 Episode Num: 137 Episode T: 22 Reward: -0.926884
Total T: 8579 Episode Num: 138 Episode T: 82 Reward: 116.226994
Total T: 8601 Episode Num: 139 Episode T: 22 Reward: -1.867284
Total T: 8653 Episode Num: 140 Episode T: 52 Reward: -40.771620
Total T: 8716 Episode Num: 141 Episode T: 63 Reward: -85.265404
Total T: 8754 Episode Num: 142 Episode T: 38 Reward: 197.708735
Total T: 8837 Episode Num: 143 Episode T: 83 Reward: -127.507280
Total T: 8872 Episode Num: 144 Episode T: 35