In [1]:
from DroneEnv import DroneEnvClass
import math
import collections
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [2]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions with uniform minibatch sampling.

    Args:
        capacity: Maximum number of transitions kept. Once the buffer is full,
            appending drops the oldest transition.
        minibatch_size: Number of transitions returned by ``sample``.
    """

    def __init__(self, capacity = 5000, minibatch_size = 64):
        self.buffer = collections.deque(maxlen = capacity)
        self.minibatch_size = minibatch_size

    def append(self, state, action, reward, next_state, terminal):
        """Store one ``(state, action, reward, next_state, terminal)`` transition."""
        self.buffer.append([state, action, reward, next_state, terminal])

    def sample(self):
        """Draw ``minibatch_size`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, next_states, terminals)`` of
            float32 tensors placed on the module-level ``device``; the first
            dimension of each tensor indexes the sampled transitions.

        Raises:
            ValueError: If fewer than ``minibatch_size`` transitions are stored
                (raised by ``random.sample``).
        """
        mini_batch = random.sample(self.buffer, self.minibatch_size)
        # Stack every field into a single float32 ndarray before handing it to
        # torch: building a tensor straight from a list of ndarrays is
        # converted element by element and is very slow.
        # Assumes each stored field is a number or a numpy-convertible
        # array-like of consistent shape -- TODO confirm against the environment.
        return tuple(
            torch.as_tensor(np.asarray(field, dtype = np.float32), device = device)
            for field in zip(*mini_batch)
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [3]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state vector to a bounded action.

    Args:
        state_dim: Length of the input state vector.
        action_dim: Number of action components produced.
    """

    def __init__(self, state_dim = 12, action_dim = 4):
        super(Actor, self).__init__()
        self.actionNetwork = nn.Sequential(
            nn.Linear(state_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        ).to(device)

    def forward(self, state):
        """Return the tanh-squashed network output; every component lies in [-1, 1]."""
        return torch.tanh(self.actionNetwork(state))
    

class Critic(nn.Module):
    """Action-value network estimating Q(state, action) as a single scalar.

    Args:
        state_dim: Length of the state vector.
        action_dim: Length of the action vector.
    """

    def __init__(self, state_dim = 12, action_dim = 4):
        super(Critic, self).__init__()
        self.valueNetwork = nn.Sequential(
            nn.Linear(state_dim + action_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        ).to(device)

    def forward(self, state, action):
        """Return Q-values of shape ``(N, 1)``.

        Both inputs are flattened to 2-D (all leading dimensions merged into
        ``N``, last dimension kept) and concatenated along the feature axis.
        """
        # reshape (not view) so non-contiguous inputs are accepted as well.
        state = state.reshape(-1, state.shape[-1])
        action = action.reshape(-1, action.shape[-1])
        return self.valueNetwork(torch.cat([state, action], dim = -1))



class DDPG():
    """DDPG agent: actor/critic networks with slowly tracking target copies.

    Owns the environment, the replay buffer, both optimizers and the running
    reward statistics. ``train`` runs one episode and performs ``num_replay``
    minibatch updates after every environment step once the buffer holds more
    than one minibatch of transitions.
    """

    def __init__(self):
        self.actor = Actor()
        self.actor_target = Actor()
        self.critic = Critic()
        self.critic_target = Critic()
        # Targets must start as exact copies of the online networks; otherwise
        # the first TD targets come from unrelated random networks and, with a
        # small tau, stay unrelated for a very long time.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.actionOptimizer = optim.Adam(self.actor.parameters(), lr = 0.001)
        self.valueOptimizer = optim.Adam(self.critic.parameters(), lr = 0.001)
        self.criticLoss = nn.MSELoss()
        self.env = DroneEnvClass()
        self.replay_buffer = ReplayBuffer()
        self.tau = 0.001          # soft-update mixing factor
        self.gamma = 0.9          # discount factor
        self.num_replay = 15      # gradient updates per environment step
        self.actor_l2_coef = 10   # weight of the L2 penalty on the actor's last layer
        # reward/count are never reset, so the printed "Avg reward" is a running
        # average over every step of every episode so far.
        self.reward = 0
        self.count = 0
        self.last_state = None
        self.last_action = None
        self.flag = False

    def train(self, epi):
        """Run one episode, storing transitions and updating the networks.

        Args:
            epi: Episode index, used only in the end-of-episode log line.
        """
        self.last_state = self.env.reset()

        while True:
            # Acting does not need gradients; skip building the autograd graph.
            # NOTE(review): the policy is used without exploration noise --
            # confirm this is intended.
            with torch.no_grad():
                action = self.actor(torch.FloatTensor(self.last_state).view(-1,self.last_state.shape[-1]).to(device))
            print('action :',action/2+0.5, '\tHeight :', self.last_state[-1])
            print('Layer weight :', torch.square(self.actor.actionNetwork[-1].weight).sum(), 'shae :',self.actor.actionNetwork[-1].weight.shape)
            # Store the action on the same scale the actor emits and the
            # environment receives. The critic is queried with raw actor
            # outputs in optimize_network, so the buffer must match that scale.
            # Copy so later in-place changes to the tensor cannot alter it.
            stored_action = action[0].detach().cpu().numpy().copy()
            state, reward, done = self.env.step(action[0].detach().cpu())
            self.count += 1
            self.reward += reward

            self.replay_buffer.append(self.last_state, stored_action, reward, state, done)
            if self.replay_buffer.size()>self.replay_buffer.minibatch_size:
                for _ in range(self.num_replay):
                    self.optimize_network()
            else:
                print(self.replay_buffer.size())

            if done:
                print(f'Epi : {epi}   Avg reward : {self.reward/self.count}')
                break
            self.last_state = state
        torch.cuda.empty_cache()

    def soft_update(self, local_model, target_model, tau):
        """Blend ``local_model`` into ``target_model`` in place.

        Each target parameter becomes ``tau*local + (1-tau)*target``.
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau*local_param + (1.0-tau)*target_param)

    def optimize_network(self):
        """Perform one critic update, one actor update and a soft target update."""
        states, actions, rewards, next_states, terminals = self.replay_buffer.sample()

        # The TD target is a constant for the critic regression: compute it
        # without autograd so no gradients flow into the target networks.
        with torch.no_grad():
            q_next_mat = self.critic_target(next_states, self.actor_target(next_states)).view(-1)
            targetQ = rewards + q_next_mat*(1-terminals)*self.gamma

        self.valueOptimizer.zero_grad()
        q_mat = self.critic(states, actions).view(-1)
        valueLoss = self.criticLoss(q_mat,targetQ)
        valueLoss.backward()
        self.valueOptimizer.step()

        # Actor ascends the critic's value of its own actions, with an L2
        # penalty on the output-layer weights. Gradients that reach the critic
        # here are cleared by valueOptimizer.zero_grad() before its next step.
        self.actionOptimizer.zero_grad()
        q_mat = self.critic(states, self.actor(states)).view(-1)
        actionLoss = -q_mat.mean() + self.actor_l2_coef*torch.square(self.actor.actionNetwork[-1].weight).sum()
        actionLoss.backward()
        self.actionOptimizer.step()

        self.soft_update(self.critic, self.critic_target, self.tau)
        self.soft_update(self.actor, self.actor_target, self.tau)


In [4]:
# Build the agent (this also constructs its environment) and train it one
# episode at a time; train() logs progress itself.
model = DDPG()

episode_indices = range(1000)
for epi in episode_indices:
    model.train(epi)


Connected!
Client Ver:1 (Min Req: 1), Server Ver:1 (Min Req: 1)

Connected!
Client Ver:1 (Min Req: 1), Server Ver:1 (Min Req: 1)

action : tensor([[0.4985, 0.5267, 0.5296, 0.4933]], device='cuda:0',
       grad_fn=<AddBackward0>) 	Height : -0.24682579934597015
Layer weight : tensor(1.2827, device='cuda:0', grad_fn=<SumBackward0>) shae : torch.Size([4, 128])
1
action : tensor([[0.6029, 0.5249, 0.5353, 0.4817]], device='cuda:0',
       grad_fn=<AddBackward0>) 	Height : -0.017784837633371353
Layer weight : tensor(1.2827, device='cuda:0', grad_fn=<SumBackward0>) shae : torch.Size([4, 128])
2
action : tensor([[0.6853, 0.5541, 0.5955, 0.5476]], device='cuda:0',
       grad_fn=<AddBackward0>) 	Height : 0.9944005608558655
Layer weight : tensor(1.2827, device='cuda:0', grad_fn=<SumBackward0>) shae : torch.Size([4, 128])
3
action : tensor([[0.7870, 0.5573, 0.6427, 0.5362]], device='cuda:0',
       grad_fn=<AddBackward0>) 	Height : 2.402008295059204
Layer weight : tensor(1.2827, device='cuda:0', 

KeyboardInterrupt: 