In [89]:
import gym
import math
import collections
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import wandb
# Start a Weights & Biases run; later wandb.log / wandb.watch calls attach to it.
wandb.init(project="CARCLA", entity="nninept")
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Every network and sampled minibatch in this notebook is moved to this device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device  # bare expression so the notebook displays the selected device

device(type='cuda')

In [90]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions with uniform minibatch sampling.

    Args:
        capacity: maximum number of transitions kept; once full, appending
            evicts the oldest transition.
        minibatch_size: number of transitions returned by each `sample()` call.
    """

    def __init__(self, capacity=5000, minibatch_size=32):
        self.buffer = collections.deque(maxlen=capacity)
        self.minibatch_size = minibatch_size

    def append(self, state, action, reward, next_state, terminal):
        """Store one (state, action, reward, next_state, terminal) transition."""
        self.buffer.append([state, action, reward, next_state, terminal])

    def sample(self):
        """Draw `minibatch_size` distinct transitions uniformly at random.

        Returns:
            A 5-tuple of float32 tensors on the notebook-level `device`:
            (states, actions, rewards, next_states, terminals), each stacked
            along a new leading batch axis.

        Raises:
            ValueError: if fewer than `minibatch_size` transitions are stored.
        """
        if len(self.buffer) < self.minibatch_size:
            raise ValueError(
                f"cannot sample {self.minibatch_size} transitions from a buffer "
                f"holding only {len(self.buffer)}"
            )
        mini_batch = random.sample(self.buffer, self.minibatch_size)
        # Stack each field into ONE float32 ndarray before handing it to torch:
        # building a tensor directly from a Python list of ndarrays converts
        # element by element and is far slower.
        return tuple(
            torch.from_numpy(np.asarray(column, dtype=np.float32)).to(device)
            for column in zip(*mini_batch)
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [91]:
class Actor(nn.Module):
    """Deterministic policy network with Gaussian exploration.

    Maps an observation to an action through a 4-layer MLP whose tanh output
    is scaled by `action_range`.

    Args:
        action_space: shape tuple of the action space; element 0 is the number
            of action dimensions.
        observation_space: shape tuple of the observation space; element 0 is
            the observation size.
        max_action: per-dimension upper action bounds. Only element 0 is used,
            as a symmetric bound for every dimension.
            NOTE(review): this assumes all dimensions share the same bound.
        noise_std: default standard deviation of the exploration noise used by
            `select_action`.
    """

    def __init__(self, action_space, observation_space, max_action, noise_std=0.1):
        super(Actor, self).__init__()
        self.actionNetwork = nn.Sequential(
            nn.Linear(observation_space[0], 256),
            nn.ReLU(),
            nn.Linear(256, 1024),
            nn.ReLU(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256,action_space[0]),
            nn.Tanh()
        ).to(device)
        # Plain Python float so scaling/clamping never depends on a NumPy scalar type.
        self.action_range = float(max_action[0])
        self.noise_std = noise_std

    def forward(self, state):
        """Return the deterministic action, in [-action_range, action_range]."""
        return self.actionNetwork(state) * self.action_range

    def select_action(self, state, noise_std=None):
        """Return an exploratory action for `state`.

        Args:
            state: observation tensor accepted by `forward`.
            noise_std: standard deviation of the Gaussian noise added to the
                deterministic action; defaults to `self.noise_std`. A value of
                0 returns the deterministic action unchanged.

        Returns:
            A tensor without gradient history, clamped to
            [-action_range, action_range].
        """
        std = self.noise_std if noise_std is None else noise_std
        # The sampled action is never back-propagated through, so skip
        # building an autograd graph on every environment step.
        with torch.no_grad():
            action = self.forward(state)
            if std > 0:
                action = torch.normal(action, std)
            return torch.clamp(action, min=-self.action_range, max=self.action_range)

    

class Critic(nn.Module):
    """State-value network: an MLP mapping an observation to one scalar.

    Args:
        observation_space: shape tuple of the observation space; element 0 is
            the input width of the first layer.
    """

    def __init__(self, observation_space):
        super().__init__()
        # Layer widths of the hidden stack: input -> 256 -> 1024 -> 256.
        widths = (observation_space[0], 256, 1024, 256)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Final linear head emits the scalar value estimate (no activation).
        stack.append(nn.Linear(widths[-1], 1))
        self.valueNetwork = nn.Sequential(*stack).to(device)

    def forward(self, state):
        """Return the value estimate for `state` (last dimension of size 1)."""
        value = self.valueNetwork(state)
        return value



class CARCLA():
    """Actor-critic agent with replay, trained on a gym environment.

    The critic learns a state value with a one-step TD target computed from a
    slowly updated target critic. The actor is regressed towards the stored
    (noisy) actions, but only for samples whose TD error is positive.

    Args:
        env: gym environment id passed to `gym.make`.
    """

    def __init__(self, env):
        super().__init__()
        self.envName = env
        self.env = gym.make(env)
        action_shape = self.env.action_space.shape
        observation_shape = self.env.observation_space.shape
        action_high = self.env.action_space.high

        self.actor = Actor(action_shape, observation_shape, action_high)
        self.actor_target = Actor(action_shape, observation_shape, action_high)
        self.critic = Critic(observation_shape)
        self.critic_target = Critic(observation_shape)
        # Start each target network as an exact copy of its online network.
        # With independent random initialisation and a small tau, the TD
        # targets would otherwise come from an unrelated network for a long time.
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actionOptimizer = optim.RMSprop(self.actor.parameters(), lr = 0.001)
        self.valueOptimizer = optim.RMSprop(self.critic.parameters(), lr = 0.001)
        self.criticLoss = nn.MSELoss()
        self.actionLoss = nn.MSELoss()
        self.replay_buffer = ReplayBuffer()
        wandb.watch(self.actor, self.actionLoss, log="all", log_freq=10)
        wandb.watch(self.critic, self.criticLoss, log="all", log_freq=10)
        self.tau = 0.001          # soft-update interpolation factor
        self.discount = 0.9       # reward discount factor
        self.num_replay = 15      # optimisation steps per environment step
        self.last_state = None
        self.last_action = None
        self.maxStep = 0
        self.maxReward = None     # best episode return seen so far
        print("action space : ",self.env.action_space.shape)
        print("action range : ",self.env.action_space.low, self.env.action_space.high)

    def train(self, epi):
        """Run one training episode and log its statistics.

        Args:
            epi: zero-based episode index; used only for the periodic printout.

        Side effects:
            Appends transitions to the replay buffer, optimises the networks,
            logs to wandb, and writes the actor weights to
            ./checkpoint/CARCLA_best_<env>.pt whenever the episode return
            exceeds the best seen so far.
        """
        import os  # local import: only needed to create the checkpoint directory

        self.last_state = self.env.reset()
        totalReward = 0
        count = 0

        while True:
            # self.env.render()
            action = self.actor.select_action(torch.FloatTensor(self.last_state).to(device))
            action_np = action.detach().cpu().numpy()
            state, reward, done, _ = self.env.step(action_np)
            count += 1
            totalReward += reward

            self.replay_buffer.append(self.last_state, action_np, reward, state, done)
            if self.replay_buffer.size() > self.replay_buffer.minibatch_size:
                for _ in range(self.num_replay):
                    self.optimize_network()

            if done:
                break
            self.last_state = state

        wandb.log({"Accumulated Reward": totalReward, "Avg Reward":totalReward/count, "Step" : count})
        if (epi + 1) % 10 == 0:
            print(f'Epi : {epi} \t Avg reward : {totalReward/count} \t Step : {count}')

        if self.maxReward is None or self.maxReward < totalReward:
            checkpoint_path = f'./checkpoint/CARCLA_best_{self.envName}.pt'
            os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
            # Save this agent's own actor rather than a notebook global named `model`.
            torch.save(self.actor.state_dict(), checkpoint_path)
            self.maxReward = totalReward
            print("Save Best Reward : ",totalReward)

        torch.cuda.empty_cache()

    def soft_update(self, local_model, target_model, tau):
        """Blend `local_model` into `target_model` in place.

        Each target parameter becomes tau * local + (1 - tau) * target.
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)

    def optimize_network(self):
        """Perform one critic update, one conditional actor update, and soft target updates."""
        states, actions, rewards, next_states, terminals = self.replay_buffer.sample()

        # The TD target is a constant for the critic loss. Computing it under
        # no_grad stops backward() from accumulating gradients on the target
        # critic, whose .grad is never zeroed.
        with torch.no_grad():
            q_next_mat = self.critic_target(next_states).view(-1)
            targetQ = rewards + q_next_mat * (1 - terminals) * self.discount

        self.valueOptimizer.zero_grad()
        q_mat = self.critic(states).view(-1)
        valueLoss = self.criticLoss(q_mat, targetQ)
        valueLoss.backward()
        self.valueOptimizer.step()

        # Only samples whose outcome beat the critic's estimate pull the actor
        # towards the action that was taken.
        policyUpdateIdx = (targetQ - q_mat.detach()) > 0
        if policyUpdateIdx.any():
            # Skipped when nothing is selected: MSE over an empty batch is NaN.
            self.actionOptimizer.zero_grad()
            # Call the module (not .forward) so registered hooks run.
            predicted_actions = self.actor(states[policyUpdateIdx])
            policy_evaluation = self.actionLoss(predicted_actions, actions[policyUpdateIdx])
            policy_evaluation.backward()
            self.actionOptimizer.step()

        self.soft_update(self.critic, self.critic_target, self.tau)
        self.soft_update(self.actor, self.actor_target, self.tau)


        
        

In [92]:
# Record the hyperparameters on the active W&B run. Assigning a dict to
# `wandb.config` only rebinds the module attribute and records nothing, so
# update the existing config object instead. `allow_val_change` lets this cell
# be re-run with edited values.
wandb.config.update({
  "learning_rate": 0.001,
  "epochs": 1000,
  "batch_size": 32,
  "tau" : 0.001
}, allow_val_change=True)
env = "Humanoid-v4"
model = CARCLA(env)

# One call to train() runs one full episode.
for epi in range(1000):
    model.train(epi)


  "Agent's minimum action space value is -infinity. This is probably too low."
  "Agent's maximum action space value is infinity. This is probably too high"
  "We recommend you to use a symmetric and normalized Box action space (range=[-1, 1]) "


action space :  (17,)
action range :  [-0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4 -0.4
 -0.4 -0.4 -0.4] [0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4 0.4]
Save Best Reward :  236.8653802931834
Save Best Reward :  261.3112659634742
Epi : 9 	 Avg reward : 4.947486029516779 	 Step : 34
Epi : 19 	 Avg reward : 5.232825932228449 	 Step : 26
Epi : 29 	 Avg reward : 5.206878534835963 	 Step : 41
Epi : 39 	 Avg reward : 5.044311569419822 	 Step : 37
Epi : 49 	 Avg reward : 5.114286039084454 	 Step : 37
Epi : 59 	 Avg reward : 5.206456132884164 	 Step : 35
Save Best Reward :  280.70495302911564
Epi : 69 	 Avg reward : 5.090491334504302 	 Step : 31
Epi : 79 	 Avg reward : 5.143215930056289 	 Step : 29
Epi : 89 	 Avg reward : 5.277297423567667 	 Step : 27
Epi : 99 	 Avg reward : 5.1200449251898545 	 Step : 40
Epi : 109 	 Avg reward : 5.181283760892184 	 Step : 28
Epi : 119 	 Avg reward : 4.989342019432581 	 Step : 34
Epi : 129 	 Avg reward : 5.327933826810

In [77]:
# To evaluate a saved checkpoint instead of the in-memory model:
# newmodel = CARCLA(env)
# newmodel.actor.load_state_dict(torch.load(f'./checkpoint/CARCLA_best_{env}.pt'))

# Use a separate name for the environment object: rebinding `env` (the env id
# string) would make this cell fail on re-run and break `CARCLA(env)` above.
eval_env = gym.make(model.envName)
i = 0  # number of environment steps taken in this rollout

try:
    state = eval_env.reset()
    while True:
        eval_env.render()
        action = model.actor.select_action(torch.FloatTensor(state).to(device))
        state, reward, done, _ = eval_env.step(action.detach().cpu().numpy())
        i += 1
        if done:
            break
finally:
    # Release the simulator/viewer even if the rollout is interrupted.
    eval_env.close()

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [88]:
# Close the active W&B run; the notebook output below is the run summary it prints.
wandb.finish()

0,1
Accumulated Reward,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▄▃▃▄▃▂▄▃▂▂▃▄▅▃▄▅█▄▄▁▄▇▅▂
Avg Reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Step,▁▁▁▁▁▁▁▁▂▂▂▂▂▂▃▃▄▃▃▄▃▂▄▃▂▂▃▄▅▃▄▅█▄▄▁▄▇▅▂

0,1
Accumulated Reward,641.0
Avg Reward,1.0
Step,641.0
