In [1]:
import gym
import math
import collections
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen = 5000)
        self.minibatch_size = 63

    def append(self, state, action, reward, next_state, terminal):
        self.buffer.append([state, action, reward, next_state, terminal])

    def sample(self):
        mini_batch = random.sample(self.buffer, self.minibatch_size)
        mini_batch.append(self.buffer[-1])
        s_lst, action, r_lst, s_prime_lst, done_mask_lst = map(list, zip(*mini_batch))
        return torch.FloatTensor(s_lst).to(device), torch.FloatTensor(action).to(device), torch.FloatTensor(r_lst).to(device), \
                torch.FloatTensor(s_prime_lst).to(device), torch.FloatTensor(done_mask_lst).to(device)
    
    def size(self):
        return len(self.buffer)

In [3]:
class Actor(nn.Module):
    def __init__(self):
        super(Actor, self).__init__()
        self.actionNetwork = nn.Sequential(
            nn.Linear(3, 256),
            nn.ReLU(),
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Linear(1024, 128),
            nn.ReLU(),
            nn.Linear(128,1)
        ).to(device)
        
    def forward(self, state):
        return self.actionNetwork(state)
    

class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.valueNetwork = nn.Sequential(
            nn.Linear(3+1, 256),
            nn.ReLU(),
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Linear(1024, 128),
            nn.ReLU(),
            nn.Linear(128,1)
        ).to(device)
        
    def forward(self, state, action):
        state = state.view(-1,state.shape[-1])
        action = action.view(-1,action.shape[-1])
        return self.valueNetwork(torch.cat([state, action], dim = -1))



class DDPG():
    def __init__(self):
        super(DDPG, self).__init__()
        self.actor = Actor()
        self.actor_target = Actor()
        self.critic = Critic()
        self.critic_target = Critic()
        self.actionOptimizer = optim.Adam(self.actor.parameters(), lr = 0.001)
        self.valueOptimizer = optim.Adam(self.critic.parameters(), lr = 0.001)
        self.criticLoss = nn.MSELoss()
        self.env = gym.make('Pendulum-v0')
        self.replay_buffer = ReplayBuffer()
        self.tau = 0.001
        self.gamma = 0.9
        self.num_replay = 15
        self.reward = 0
        self.count = 0
        self.last_state = None
        self.last_action = None
        self.flag = False
        print(self.env.action_space.shape)
        
    def train(self, epi):
        self.last_state = self.env.reset()
        
        
        while True:
            self.env.render()
            action = self.actor(torch.FloatTensor(self.last_state).view(-1,self.last_state.shape[-1]).to(device))
            state, reward, done, _= self.env.step(action[0].detach().cpu())
            self.count += 1
            self.reward += reward
            #print('action :',action)
            self.replay_buffer.append(self.last_state, action.detach().cpu().numpy(), reward, state, done)
            if self.replay_buffer.size()>self.replay_buffer.minibatch_size:
                for _ in range(self.num_replay):
                    self.optimize_network()

            if done:
                print(f'Epi : {epi}   Avg reward : {self.reward/self.count}')
                break
            self.last_state = state
        torch.cuda.empty_cache()
        
            
    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
            
    def optimize_network(self):
        states, actions, rewards, next_states, terminals = self.replay_buffer.sample()
        q_next_mat = self.critic_target(next_states, self.actor_target(next_states)).view(-1)
        targetQ = rewards + q_next_mat*(1-terminals)*self.gamma
    
        self.valueOptimizer.zero_grad()
        q_mat = self.critic(states, actions).view(-1)
        valueLoss = self.criticLoss(q_mat,targetQ)
        valueLoss.backward()
        self.valueOptimizer.step()
        
        self.actionOptimizer.zero_grad()
        q_mat = self.critic(states, self.actor(states)).view(-1)
        actionLoss = (-q_mat.mean()).backward()
        self.actionOptimizer.step()
        
        self.soft_update(self.critic, self.critic_target, self.tau)
        self.soft_update(self.actor, self.actor_target, self.tau)


In [4]:
model = DDPG()

for epi in range(200):
    model.train(epi)


(1,)
Epi : 0   Avg reward : -6.7147722244262695
Epi : 1   Avg reward : -7.488945484161377
Epi : 2   Avg reward : -7.291668891906738
Epi : 3   Avg reward : -7.558082103729248
Epi : 4   Avg reward : -6.955118656158447
Epi : 5   Avg reward : -6.620588302612305
Epi : 6   Avg reward : -6.442498683929443
Epi : 7   Avg reward : -6.172084331512451
Epi : 8   Avg reward : -6.329494476318359
Epi : 9   Avg reward : -6.149030685424805
Epi : 10   Avg reward : -5.885885715484619
Epi : 11   Avg reward : -5.731205463409424
Epi : 12   Avg reward : -5.441232681274414
Epi : 13   Avg reward : -5.197465419769287
Epi : 14   Avg reward : -5.022012710571289
Epi : 15   Avg reward : -4.7904767990112305
Epi : 16   Avg reward : -4.548197269439697
Epi : 17   Avg reward : -4.412535190582275
Epi : 18   Avg reward : -4.251612186431885
Epi : 19   Avg reward : -4.073489189147949
Epi : 20   Avg reward : -3.9126720428466797
Epi : 21   Avg reward : -3.8602805137634277
Epi : 22   Avg reward : -3.81107759475708
Epi : 23   Av

KeyboardInterrupt: 

In [None]:
torch.save(model, 'C:\\ProgramStudy\\RL_Drone\\AirSim\\PythonClient\\multirotor\\parameter.pt')