In [1]:
from DroneEnv import DroneAutomaticDrivingEnv
import math
import collections
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [2]:
class DronNet(nn.Module):
    def __init__(self):
        super(DronNet, self).__init__()
        self.valueNetwork = nn.Sequential(
            nn.Linear(22, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128,64),
            nn.ReLU()
        )
        self.x_y = nn.Linear(66,5)
        self.w = Variable(torch.FloatTensor())
        self.epsilon = 0.02
        self.gamma = 0.9
        self.step_size = 0.1
        
    def forward(self, x):
        output = self.valueNetwork(x[0:22])
        new_input = torch.cat([torch.unsqueeze(output,1), torch.unsqueeze(x[22:],1)]).view(-1)
        x_y_output = self.x_y(new_input)
        return x_y_output
    
    def sample_action(self, state):
        pi = self.forward(torch.from_numpy(state).float())
        if random.random() < self.epsilon:
            action = np.random.randint(0,5)
            return action, max(pi)
        else : 
            action = torch.argmax(pi).item()
            return action, max(pi)
            

In [3]:
def main():
    env = DroneAutomaticDrivingEnv()
    model = DronNet()
    

    score_pre = 0
    optimizer = optim.Adam(model.parameters(), lr=0.005)
    for episode in range(1000):
        last_state = env.reset()
        score = 0.0  
        while True:
            last_action, pre_value = model.sample_action(last_state)
            optimizer.zero_grad()
            pre_value.backward()
            
            state, r, done, info = env.step(last_action)
            action, value = model.sample_action(state)
            delta = (r + model.gamma * value - pre_value)
            
            for i in range(4):
                model.valueNetwork[i*2].weight.grad = -delta*model.valueNetwork[i*2].weight.grad
            model.x_y.weight.grad = -delta*model.x_y.weight.grad
            optimizer.step()

            last_state = state
            last_action = action
            pre_value = value
            score += r
            if done:
                break
            env.render()

        env.close()
        
                    
        score_pre= score
        print(score)

In [4]:
main()

-176744.05456080212
-122094.61372165772
-189508.24524995527
-55948.31070040812
-2512.4629712164847
-22577.25794556573
-2032.5547456618576
-14566.404768161117
536.9045058984827
-1449.3425628545288
-661.5277639325394
-469.3993887352094
-189.0385987384102
-818.1891891770352
300.0
-1558.2521156656653
-781.5755320542727
526.3981499005532
204.58158457573825
65.17343245142024
-718.090963618647
-616.4256213743706
-825.9348601968629
300.0
-1041.2591962051767
-1108.3709900576152
-539.8632218457576
-1103.2211376142495
95.84052751085264
-844.6282457982808
-1573.7924199454985
-7241.398338259473
-1334.5143580877875
-1171.702185684131
-1352.775209309113
-533.099058459895
-792.8151496791218
-1497.0910703755126
300.0
300.0
132.6841793622552
-715.6012295318291
-717.5199885068489
-597.8286986552517
300.0
-4.601060205714873
-819.512825463218
-25.158324252968782
-599.7465259308073
-507.1886862486497
18.67771301377337
-826.2461635079728
-507.16068161351643
571.8454757435436
-618.6283490471249
128.6343804178

KeyboardInterrupt: 