In [42]:
import torch
import numpy as np
import time
import rospy
from utils import ReplayBuffer,MultiStepMemory,PER
from PER import Memory
from std_msgs.msg import Float32MultiArray
import sys
import os
# from Env.environment_stage_2 import Env
import numpy as np
log_interval = 5           # print avg reward after interval
gamma = 0.99                # discount for future rewards
batch_size = 256            # num of transitions sampled from replay buffer
lr = 5e-4
exploration_noise =0.8
polyak = 0.995              # target policy update parameter (1-tau)
policy_noise = 0.1         # target policy smoothing noise
noise_clip = 0.5
policy_delay = 2            # delayed policy updates parameter
max_episodes = 600         # max num of episodes
max_timesteps = 500        # max timesteps in one episode
pretrain_times = 0
warmup_epoch = 0
state_dim = 64
action_dim = 2
max_action = 1.0

In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# class Actor(nn.Module):
    
#     def __init__(self, state_dim, action_dim, max_action):
#         super(Actor, self).__init__()
        
#         self.l1 = nn.Linear(state_dim, 256)
#         self.l2 = nn.Linear(256, 64)
#         self.l3 = nn.Linear(64, action_dim)
        
#         self.max_action = max_action
        
#     def forward(self, state):
#         a = F.relu(self.l1(state))
#         a = F.relu(self.l2(a))
#         a = torch.tanh(self.l3(a)) * self.max_action
#         return a
        
# class Critic(nn.Module):
#     def __init__(self, state_dim, action_dim):
#         super(Critic, self).__init__()
        
#         self.l1 = nn.Linear(state_dim + action_dim, 256)
#         self.l2 = nn.Linear(256, 64)
#         self.l3 = nn.Linear(64, 1)
        
#     def forward(self, state, action):
#         state_action = torch.cat([state, action], 1)
        
#         q = F.relu(self.l1(state_action))
#         q = F.relu(self.l2(q))
#         q = self.l3(q)
#         return q

class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()
        
        self.l1 = nn.Linear(state_dim-4, 24)
        # self.l2 = nn.Linear(256, 24)
        self.l2 = nn.Linear(24+4,256)
        self.l3 = nn.Linear(256,64)
        self.l4 = nn.Linear(64, action_dim)
        
        self.max_action = max_action
        
    def forward(self, state):
        a = F.relu(self.l1(state[:,:-4]))
        a = F.relu(self.l2(torch.cat([a,state[:,-4:]],1)))
        a = F.relu(self.l3(a))
        a = torch.tanh(self.l4(a)) * self.max_action
        return a
        
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()
        
        self.l1 = nn.Linear(state_dim-4, 24)
        self.l2 = nn.Linear(24+4+action_dim, 256)
        self.l3 = nn.Linear(256,64)
        self.l4 = nn.Linear(64, 1)
        
    def forward(self, state, action):
        # state_action = torch.cat([state, action], 1)
        q = F.relu(self.l1(state[:,:-4]))
        q = F.relu(self.l2(torch.cat([q,state[:,-4:],action],1)))
        q = F.relu(self.l3(q))
        q = self.l4(q)
        return q    

class TD3:
    def __init__(self, lr, state_dim, action_dim, max_action):
        
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        
        self.critic_1 = Critic(state_dim, action_dim).to(device)
        self.critic_1_target = Critic(state_dim, action_dim).to(device)
        self.critic_1_target.load_state_dict(self.critic_1.state_dict())
        self.critic_1_optimizer = optim.Adam(self.critic_1.parameters(), lr=lr)
        
        self.critic_2 = Critic(state_dim, action_dim).to(device)
        self.critic_2_target = Critic(state_dim, action_dim).to(device)
        self.critic_2_target.load_state_dict(self.critic_2.state_dict())
        self.critic_2_optimizer = optim.Adam(self.critic_2.parameters(), lr=lr)
        self.state_dim = state_dim
        self.max_action = max_action
    
    def select_action(self, state):
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def batch_inference(self,state):
        state = torch.FloatTensor(state.reshape(-1, self.state_dim)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def analyze_action(self,state,action):
        result1 = self.critic_1(state,action)
        result2 = self.critic_2(state,action)
        return result1,result2

    # def state_estimate(self,state,action):
    #     laser = state[:-4]
    #     heading = state[-4]
    #     dis = state[-3]

    #     delta_t = 0.2
    #     max_angle_vel = 2
    #     max_linear_spd = 0.3
    #     vx = action[1] * max_linear_spd/2 + max_linear_spd/2
    #     wz = action[0] * max_angle_vel
    #     theta = wz * delta_t
    #     delta_x = vx * delta_t
    #     new_dis = dis - delta_x * math.cos(theta)
    #     new_heading = heading - theta

    #     if new_heading > math.pi:
    #         new_heading -= 2 * math.pi

    #     elif new_heading < -math.pi:
    #         new_heading += 2 * math.pi

    #     return new_dis,new_heading


    def actor_SL(self,replay_buffer,n_iter,batch_size,sample_mode=1):
        for i in range(n_iter):
            # Sample a batch of transitions from replay buffer:
            state, action_, reward, next_state, done = replay_buffer.sample(batch_size,sample_mode)
            # state, action_, reward, next_state, done,idx,weights = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(state).to(device)
            action = torch.FloatTensor(action_).to(device)
            action_inference = self.actor(state)
            actor_loss = F.mse_loss(action_inference,action).mean()
            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            if i == 0:
                print("actor_loss:%d",actor_loss.item())

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_( param.data)

    def critic_SL(self, replay_buffer, n_iter, batch_size, gamma, polyak, policy_noise, noise_clip, policy_delay,sample_mode=0):

        for i in range(n_iter):
            # Sample a batch of transitions from replay buffer:
            state, action_, reward, next_state, done = replay_buffer.sample(batch_size,sample_mode)
            # state, action_, reward, next_state, done,idx,weights = replay_buffer.sample(batch_size)
            state = torch.FloatTensor(state).to(device)
            action = torch.FloatTensor(action_).to(device)
            reward = torch.FloatTensor(reward).reshape((batch_size,1)).to(device)
            next_state = torch.FloatTensor(next_state).to(device)
            done = torch.FloatTensor(done).reshape((batch_size,1)).to(device)
            # Select next action according to target policy:
            noise = torch.FloatTensor(action_).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (self.actor_target(next_state) + noise)
            next_action = next_action.clamp(-self.max_action, self.max_action)

            # Compute target Q-value:
            target_Q1 = self.critic_1_target(next_state, next_action)
            target_Q2 = self.critic_2_target(next_state, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + ((1-done) * gamma * target_Q).detach()

            # Optimize Critic 1:
            current_Q1 = self.critic_1(state, action)
            # replay_buffer.batch_update(idx,torch.abs(target_Q-current_Q1).detach().cpu().numpy())
            # loss_Q1 = (torch.pow(current_Q1-target_Q,2) * torch.FloatTensor(weights).cuda()).mean()
            loss_Q1 = F.mse_loss(current_Q1, target_Q)
            self.critic_1_optimizer.zero_grad()
            loss_Q1.backward()
            self.critic_1_optimizer.step()
            
            # Optimize Critic 2:
            current_Q2 = self.critic_2(state, action)
            # loss_Q2 = (torch.pow(current_Q2-target_Q,2) * torch.FloatTensor(weights).cuda()).mean()
            loss_Q2 = F.mse_loss(current_Q2, target_Q)
            self.critic_2_optimizer.zero_grad()
            loss_Q2.backward()
            self.critic_2_optimizer.step()

            if i == 0:
                print("critic_loss:{}".format(loss_Q1.item()))
            
            for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()):
                target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))
            
            for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()):
                target_param.data.copy_( (polyak * target_param.data) + ((1-polyak) * param.data))

        
        
      
        


In [44]:
class importDataset(object):
    def __init__(self, state_dim, action_dim,max_size=-1):
        self.datasetPath = "/home/cmq/ljn/RL/turtlebot3/src/dqn-navigation/turtlebot3_machine_learning/turtlebot3_machine_learning/scripts/dataset_cali.npy"
        self.dataset = np.load(self.datasetPath, allow_pickle=True)
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_size = max_size

    def transfer(self):
        self.buffer = []
        for data in self.dataset:
            if len(self.buffer) == self.max_size:
                break
            state = data[0]
            action = data[1]
            reward = data[2]
            next_state = data[3]
            finish = float(data[4])
            assert len(state) == state_dim
            assert(len(action)) == action_dim
            assert len(next_state) == state_dim
            self.buffer.append((state, action, reward, next_state, finish))
        return self.buffer
policy = TD3(lr, state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()
replay_buffer.importDataset(importDataset(state_dim,action_dim).transfer())

IMPORT DATASET WITH LENGTH:20001


In [45]:
def calculate_loss():
    loss_fn = torch.nn.MSELoss(reduction='sum')
    sample = replay_buffer.sample(128,1)
    action = policy.batch_inference(sample[0])
    action = torch.FloatTensor(action).reshape(-1,2)
    print(action[1])
    gt = torch.FloatTensor(sample[1])
    print(gt[1])

    loss = loss_fn(action, gt)
    print(loss/action.size(0))
calculate_loss()

tensor([ 0.0775, -0.0457])
tensor([0.5833, 0.0000])
tensor(0.5218)


In [46]:
    pretrain_times = 100
    for i in range(pretrain_times):
        if(i%10 == 0):
            print("actor supervised learning batch:%d",i)
            policy.actor_SL(replay_buffer,500,batch_size)
    for i in range(pretrain_times):
        if(i % 10 == 0):
            print("finish batch :%d", i)
            calculate_loss()
        policy.critic_SL(replay_buffer, 500, batch_size, gamma,
                      polyak, policy_noise, noise_clip, policy_delay, 1)

('actor supervised learning batch:%d', 0)
('actor_loss:%d', 0.2973363995552063)
('actor supervised learning batch:%d', 10)
('actor_loss:%d', 0.24844270944595337)
('actor supervised learning batch:%d', 20)
('actor_loss:%d', 0.26869669556617737)
('actor supervised learning batch:%d', 30)
('actor_loss:%d', 0.23424625396728516)
('actor supervised learning batch:%d', 40)
('actor_loss:%d', 0.2623080611228943)
('actor supervised learning batch:%d', 50)
('actor_loss:%d', 0.2664201855659485)
('actor supervised learning batch:%d', 60)
('actor_loss:%d', 0.24360713362693787)
('actor supervised learning batch:%d', 70)
('actor_loss:%d', 0.23707878589630127)
('actor supervised learning batch:%d', 80)
('actor_loss:%d', 0.2589711546897888)
('actor supervised learning batch:%d', 90)
('actor_loss:%d', 0.2700992822647095)
('finish batch :%d', 0)
tensor([0.6703, 0.0177])
tensor([-0.6667, -0.2334])
tensor(0.4984)
critic_loss:164.121643066
critic_loss:303.043273926
critic_loss:143.398986816
critic_loss:393.9