In [1]:
from DroneEnv import DroneAutomaticDrivingEnv
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import collections
import copy
import random

device = 'cuda'

In [2]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen = 500)
        self.minibatch_size = 128

    def append(self, state, action, reward, terminal, next_state):
        self.buffer.append([state, action, reward, terminal, next_state])

    def sample(self):
        mini_batch = random.sample(self.buffer, self.minibatch_size)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = map(list, zip(*mini_batch))
        return torch.FloatTensor(s_lst), np.array(a_lst, dtype = np.float32), torch.FloatTensor(r_lst), torch.FloatTensor(s_prime_lst), \
                torch.FloatTensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

In [3]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.valueNetwork = nn.Sequential(
            nn.Linear(24, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128,66),
            nn.ReLU(),
            nn.Linear(66,5)
        ).to(device)
        self.epsilon = 0.1
        
    def forward(self, x):
        output = self.valueNetwork(x.to(device))
        return output
    
    
    def sample_action(self, state):
        pi = self.forward(torch.from_numpy(state).float())
        if np.random.random() < self.epsilon:
            action = np.random.randint(5)
            return action
        else : 
            action = torch.argmax(pi).item()
            return action

In [4]:
class Agent():    
    def __init__(self):
        self.discount = 0.9
        self.step_size = 0.1
        self.last_state = None
        self.last_action = None
        self.replay_buffer = ReplayBuffer()
        self.network = Net().to(device)
        self.num_replay = 15
        self.optimizer = optim.Adam(self.network.parameters(), lr = 0.0005)
        self.env = DroneAutomaticDrivingEnv()
        self.total_reward = 0
        self.flag = False
        
    def train(self, epi):
        self.last_state = self.env.reset()
        show = True if epi%10==0 else False
        
        while True:
            self.env.render()
            self.last_action = self.network.sample_action(self.last_state)
            state, reward, done, info = self.env.step(self.last_action)
            self.total_reward += reward
            self.replay_buffer.append(self.last_state, self.last_action, reward, done, state)
            if self.replay_buffer.size()>self.replay_buffer.minibatch_size:
                for _ in range(self.num_replay):
                    self.optimize_network(self.network)
            if done:
                break
            self.last_state = state
        if(epi%10 == 0):
            print('#episode : ',epi, 'avg_reward : ', self.total_reward/10, 'batch_size : ', self.replay_buffer.size())
            self.total_reward = 0
        self.env.close()
            
            
    def optimize_network(self, network):
        states, actions, rewards, terminals, next_states = self.replay_buffer.sample()
        
        q_next_mat = network.forward(next_states)
        v_next_vec = torch.max(q_next_mat, dim = -1)[0]*(1-terminals.to(device))
        target_vec = rewards.to(device) + v_next_vec
        
        q_mat = network.forward(states)
        batch_indices = np.arange(self.replay_buffer.minibatch_size, dtype = np.float32)
        q_vec = q_mat[batch_indices,actions]
        loss = F.smooth_l1_loss(q_vec,target_vec).to(device)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [5]:
model = Agent()
for epi in range(10000):
    model.train(epi)



#episode :  0 avg_reward :  -12.4 batch_size :  105


KeyboardInterrupt: 