In [1]:
from DroneEnv import DroneEnvClass
import math
import collections
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen = 500)
        self.minibatch_size = 16

    def append(self, state, reward, next_state, terminal):
        self.buffer.append([state, reward, next_state, terminal])

    def sample(self):
        mini_batch = random.sample(self.buffer, self.minibatch_size)
        s_lst, r_lst, s_prime_lst, done_mask_lst = map(list, zip(*mini_batch))
        return torch.FloatTensor(s_lst).to(device), torch.FloatTensor(r_lst).to(device), \
                torch.FloatTensor(s_prime_lst).to(device), torch.FloatTensor(done_mask_lst).to(device)
    
    def size(self):
        return len(self.buffer)

In [3]:
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.actionNetwork = nn.Sequential(
            nn.Linear(12, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128,4)
        )
        
    def forward(self, state):
        state = torch.FloatTensor(state).view(-1,state.shape[-1])
        return torch.sigmoid(self.actionNetwork(state))
    

class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.valueNetwork = nn.Sequential(
            nn.Linear(12+4, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128,1)
        )
        
    def forward(self, state, qValue):
        state = torch.FloatTensor(state).view(-1,state.shape[-1])
        qValue = torch.FloatTensor(qValue).view(-1,qValue.shape[-1])
        return self.valueNetwork(torch.cat([state, qValue], dim = -1))



class DDPG():
    def __init__(self):
        super(DDPG, self).__init__()
        self.actor = Actor()
        self.actor_target = Actor()
        self.critic = Critic()
        self.critic_target = Critic()
        self.actionOptimizer = optim.Adam(self.actor.parameters(), lr = 0.01)
        self.valueOptimizer = optim.Adam(self.critic.parameters(), lr = 0.01)
        self.env = DroneEnvClass()
        self.total_reward = 0
        self.tau = 0.001
        self.gamma = 0.9
        self.last_state = None
        self.last_action = None
        self.flag = False
        
    def train(self, epi):
        self.last_state = self.env.reset()
        while True:
            #print('state :', self.last_state)
            print('height :', self.last_state[-1])
            last_w = self.actor.actionNetwork[-1].weight
            
            self.actor.eval()
            action = self.actor(self.last_state)
            print('action :',action)
            state, reward, done= self.env.step(action[0])
            
            self.total_reward += reward
            #self.replay_buffer.append(self.last_state, reward, state, done)
            '''
            self.env.client.simPause(True)
            if self.replay_buffer.size()>self.replay_buffer.minibatch_size:
                print('actions : ', action, 'q : ', self.last_q)
                for _ in range(self.num_replay):
                    self.optimize_network(self.agent)
            else:
                print('batxh_size : ', self.replay_buffer.size(), 'actions : ', action)
            self.env.client.simPause(False)
            '''
            self.actor.train()
            q_next_mat = self.critic_target(state, self.actor_target(state))
            targetQ = reward + q_next_mat*(1-done)*self.gamma
            
            self.valueOptimizer.zero_grad()
            q_mat = self.critic(self.last_state, action.detach())
            valueLoss = F.smooth_l1_loss(q_mat,targetQ)
            valueLoss.backward()
            self.valueOptimizer.step()
            
            self.actionOptimizer.zero_grad()
            q_mat = self.critic(state, self.actor(state))
            actionLoss = (-q_mat).backward()
            self.actionOptimizer.step()
            w = self.actor.actionNetwork[-1].weight
            
            self.soft_update(self.critic, self.critic_target, self.tau)
            self.soft_update(self.actor, self.actor_target, self.tau)        
            #print(last_w)
            #print(w)
            if done:
                break
            self.last_state = state
        if(False):
            print('#episode : ',epi, 'avg_reward : ', self.total_reward/10, 'batch_size : ', self.replay_buffer.size())
            self.total_reward = 0
            
    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def optimize_network(self, agent):
        states, rewards, next_states, terminals = self.replay_buffer.sample()
        
        q_next_mat, _ = agent.qValue(next_states)
        v_next_vec = q_next_mat*(1-terminals.view(-1,1))
        target_vec = rewards.view(-1,1) + v_next_vec
        
        q_mat, _ = agent.qValue(states)
        loss = F.smooth_l1_loss(q_mat,target_vec)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [None]:
model = DDPG()

for epi in range(1000):
    model.train(epi)


Connected!
Client Ver:1 (Min Req: 1), Server Ver:1 (Min Req: 1)

Connected!
Client Ver:1 (Min Req: 1), Server Ver:1 (Min Req: 1)

height : -0.14649657905101776
action : tensor([[0.4800, 0.5047, 0.4819, 0.5254]], grad_fn=<SigmoidBackward>)
height : -0.08581556379795074
action : tensor([[0.5985, 0.3676, 0.6957, 0.6878]], grad_fn=<SigmoidBackward>)
height : -0.01787920668721199
action : tensor([[0.9802, 0.0197, 0.9995, 0.9999]], grad_fn=<SigmoidBackward>)
height : 0.04369944706559181
action : tensor([[1.0000e+00, 3.3132e-18, 1.0000e+00, 1.0000e+00]],
       grad_fn=<SigmoidBackward>)
height : 0.1488141566514969
action : tensor([[1., 0., 1., 1.]], grad_fn=<SigmoidBackward>)
height : 0.4242457151412964
action : tensor([[1., 0., 1., 1.]], grad_fn=<SigmoidBackward>)
height : 0.7864145040512085
action : tensor([[1., 0., 1., 1.]], grad_fn=<SigmoidBackward>)
height : 1.2540855407714844
action : tensor([[1., 0., 1., 1.]], grad_fn=<SigmoidBackward>)
height : 1.816831111907959
action : tensor([[1.,

In [5]:
a = torch.from_numpy(np.array([1,2,3]))
b = torch.Tensor(np.array([[1],[2],[3]]))
b.shape

torch.Size([3, 1])

In [7]:
c = a*torch.squeeze(b,1)
c

tensor([1., 4., 9.])