In [31]:
from DroneEnv import DroneAutomaticDrivingEnv
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import collections
import copy
import random
from torch.distributions import Categorical

In [32]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen = 50000)
        self.minibatch_size = 128

    def append(self, state, action, reward, next_state, done):
        self.buffer.append([state, action, reward, next_state, done])

    def sample(self):
        mini_batch = random.sample(self.buffer, self.minibatch_size)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = map(list, zip(*mini_batch))
        return torch.FloatTensor(s_lst), torch.tensor(a_lst), torch.FloatTensor(r_lst), \
                torch.FloatTensor(s_prime_lst), torch.FloatTensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

In [33]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.policyNet = nn.Sequential(
            nn.Linear(24, 128),
            nn.ReLU(),
            nn.Linear(128,66),
            nn.ReLU(),
            nn.Linear(66,5)
        )
        self.valueNet = nn.Sequential(
            nn.Linear(24, 128),
            nn.ReLU(),
            nn.Linear(128,66),
            nn.ReLU(),
            nn.Linear(66,1)
        )
        self.epsilon = torch.tensor([0.01])
        
    def policy(self, x):
        policy = self.policyNet(x)
        policy = F.softmax(policy, dim =-1)
        policy = torch.max(policy, self.epsilon)
        return policy
    
    def v(self,x):
        return self.valueNet(x).view(-1)
    
    def sample_action(self, state):
        pi = self.policy(torch.from_numpy(state).float())
        a = Categorical(pi)
        return a.sample().item()

In [36]:
class Agent():    
    def __init__(self):
        self.discount = 0.9
        self.step_size = 0.1
        self.last_state = None
        self.last_action = None
        self.replay_buffer = ReplayBuffer()
        self.network = Net()
        self.targetNet = Net()
        self.targetNet.load_state_dict(self.network.state_dict())
        self.num_replay = 10
        self.optimizer = optim.Adam(self.network.parameters(), lr = 0.0005)
        self.env = DroneAutomaticDrivingEnv()
        self.total_reward = 0
        self.flag = False
        
    def train(self, epi):
        self.last_state = self.env.reset()
        show = True if epi%10==0 else False

        while True:
            #if epi%10 == 0:
            self.env.render()
            self.last_action = self.network.sample_action(self.last_state)
            state, reward, done, info = self.env.step(self.last_action)
            self.total_reward += reward
            self.replay_buffer.append(self.last_state, self.last_action, reward, state, done)
            if reward == 10:
                self.flag = True
            if self.replay_buffer.size() >= 500:
                for _ in range(self.num_replay):
                    self.optimize_network(self.network)
            if done:
                break
            self.last_state = state
        if(epi%10 == 0):
            print('#episode : ',epi, 'avg_reward : ', self.total_reward/10, 'batch_size: ', self.replay_buffer.size(), 'flag: ',self.flag)
            self.targetNet.load_state_dict(self.network.state_dict())
            self.total_reward = 0
        self.env.close()
            
            
    def optimize_network(self, network):
        states, actions, rewards, next_states, terminals = self.replay_buffer.sample()
        
        next_value = network.v(next_states)
        target_vec = rewards + next_value*(1-terminals)
        policy = network.policy(states)
        value = network.v(states)
        policy_vec = policy.gather(1,actions.view(-1,1)).view(-1)
        delta = target_vec-value
        loss = torch.sum(-torch.log(policy_vec)*delta.detach()) + F.smooth_l1_loss(value,target_vec.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [37]:
model = Agent()
for epi in range(10000):
    model.train(epi)

#episode :  0 avg_reward :  -12.75837361725375 batch_size:  215 flag:  True
#episode :  10 avg_reward :  -225.71013072626846 batch_size:  2651 flag:  True


KeyboardInterrupt: 