In [1]:
%%capture
!pip install box2d-py

In [2]:
import os
import cv2
import gym
import time
import collections

import numpy as np
import torch as T
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

%matplotlib inline

In [3]:
# Report the accelerator in use. Querying the CUDA device name raises when no GPU is
# present, so fall back to 'cpu' (the networks below make the same fallback).
device_name = T.cuda.get_device_name() if T.cuda.is_available() else 'cpu'
device_name

'Tesla T4'

## **OU Noise**

In [4]:
class OUActionNoise:
    """Temporally correlated exploration noise from a discretised Ornstein-Uhlenbeck process.

    Every call advances the internal state by one step,
    ``x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)``,
    and returns the new state, so consecutive samples are correlated.
    """

    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2, x0=None):
        """
        mu    : array the process reverts towards; its ``.shape`` sizes every sample.
        sigma : scale of the Gaussian term.
        theta : mean-reversion rate.
        dt    : step size.
        x0    : optional initial state; zeros shaped like ``mu`` when omitted.
        """
        self.mu, self.sigma, self.theta = mu, sigma, theta
        self.dt, self.x0 = dt, x0
        self.x_prev = None
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        pull_to_mean = self.theta * (self.mu - self.x_prev) * self.dt
        gaussian_kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x_prev = self.x_prev + pull_to_mean + gaussian_kick
        return self.x_prev

    def reset(self):
        """Restart the process from ``x0``, or from zeros when no ``x0`` was given."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

## **Replay Buffer**

In [5]:
class ReplayBuffer:
    """Fixed-capacity circular store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, mem_size, observation_shape, n_actions):
        """Pre-allocate one array per transition field, each with ``mem_size`` rows."""
        self.mem_size = mem_size
        self.mem_counter = 0  # total number of transitions ever stored (never wraps)
        obs_dims = (mem_size, *observation_shape)
        self.states = np.zeros(obs_dims, dtype=np.float32)
        self.states_ = np.zeros(obs_dims, dtype=np.float32)
        self.actions = np.zeros((mem_size, n_actions), dtype=np.float32)
        self.rewards = np.zeros(mem_size, dtype=np.float32)
        self.terminals = np.zeros(mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition, overwriting the oldest slot once the buffer is full."""
        slot = self.mem_counter % self.mem_size
        fields = (
            (self.states, state),
            (self.actions, action),
            (self.rewards, reward),
            (self.states_, state_),
            (self.terminals, done),
        )
        for column, value in fields:
            column[slot] = value
        self.mem_counter += 1

    def sample_batch(self, batch_size):
        """Draw ``batch_size`` distinct stored transitions uniformly at random.

        Returns a tuple ``(states, actions, rewards, states_, terminals)`` of arrays.
        Sampling is without replacement, so ``batch_size`` must not exceed the number
        of transitions currently held.
        """
        filled = min(self.mem_counter, self.mem_size)
        picks = np.random.choice(filled, batch_size, replace=False)
        columns = (self.states, self.actions, self.rewards, self.states_, self.terminals)
        return tuple(column[picks] for column in columns)

## **Networks**

In [6]:
class ActorNetwork(nn.Module):
    """Deterministic policy network mapping a state to an action.

    Architecture: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm -> ReLU -> Linear -> tanh,
    so every action component lies in [-1, 1]. The module owns its Adam optimizer and the
    path of its checkpoint file (``<model_dir>/<model_name>_ACTOR``).
    """

    def __init__(self, lr, observation_shape, fc1_size, fc2_size, n_actions, model_name, model_dir):
        """
        lr                : Adam learning rate.
        observation_shape : shape tuple of a state; only ``observation_shape[0]`` is used.
        fc1_size/fc2_size : widths of the two hidden layers.
        n_actions         : number of action components produced.
        model_name        : prefix of the checkpoint file name.
        model_dir         : directory the checkpoint is written to / read from.
        """
        super().__init__()
        self.model_dir = model_dir
        self.model_file = os.path.join(self.model_dir, model_name+'_ACTOR')
        # ANN
        self.fc1 = nn.Linear(observation_shape[0], fc1_size)
        self.ln1 = nn.LayerNorm(fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.ln2 = nn.LayerNorm(fc2_size)
        self.out = nn.Linear(fc2_size, n_actions)
        # FAN INS & WEIGHT INIT
        # nn.Linear stores its weight as (out_features, in_features); the fan-in is the
        # number of inputs, i.e. in_features (weight.size()[0] would be the fan-OUT).
        f1 = 1/np.sqrt(self.fc1.in_features)
        self.fc1.weight.data.uniform_(-f1,f1)
        self.fc1.bias.data.uniform_(-f1,f1)
        f2 = 1/np.sqrt(self.fc2.in_features)
        self.fc2.weight.data.uniform_(-f2,f2)
        self.fc2.bias.data.uniform_(-f2,f2)
        # Small final-layer weights keep the initial actions close to zero.
        fo = 0.003
        self.out.weight.data.uniform_(-fo,fo)
        self.out.bias.data.uniform_(-fo,fo)
        # UTILS
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.to(self.device)

    def forward(self, state):
        """Return the action ``mu`` for a batch of states; output shape is (batch, n_actions)."""
        mu = self.ln1(self.fc1(state))
        mu = F.relu(mu)
        mu = self.ln2(self.fc2(mu))
        mu = F.relu(mu)
        mu = self.out(mu)
        # torch.tanh replaces the deprecated nn.functional.tanh alias.
        mu = T.tanh(mu)
        return mu #mu = a

    def save_model(self):
        """Write model and optimizer state to ``self.model_file``."""
        print("[INFO] ACTOR - Saving model")
        checkpoint = {
            'model_state_dict': self.state_dict(),
            'optimizer_state_dict' : self.optimizer.state_dict()
        }
        # Create the checkpoint directory on first use ('' means the current directory).
        if self.model_dir:
            os.makedirs(self.model_dir, exist_ok=True)
        T.save(checkpoint, self.model_file)

    def load_model(self, cpu=False):
        """Restore model and optimizer state from ``self.model_file``.

        cpu : force the tensors onto the CPU. Otherwise they are mapped to this
              network's own device, so a checkpoint written on a GPU machine still
              loads on a CPU-only host.
        """
        print("[INFO] ACTOR - Loading model")

        map_location = T.device('cpu') if (cpu) else self.device

        checkpoint = T.load(self.model_file, map_location=map_location)
        self.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [7]:
class CriticNetwork(nn.Module):
    """Action-value network estimating Q(state, action).

    The state passes through Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm; the action
    passes through a single Linear layer of the same width. The two are summed, passed
    through ReLU and projected to one scalar per sample. The module owns its Adam optimizer
    (with weight decay) and the path of its checkpoint file (``<model_dir>/<model_name>_CRITIC``).
    """

    def __init__(self, lr, observation_shape, fc1_size, fc2_size, n_actions, model_name, model_dir):
        """
        lr                : Adam learning rate.
        observation_shape : shape tuple of a state; only ``observation_shape[0]`` is used.
        fc1_size/fc2_size : widths of the two hidden layers of the state branch.
        n_actions         : number of action components fed to the action branch.
        model_name        : prefix of the checkpoint file name.
        model_dir         : directory the checkpoint is written to / read from.
        """
        super().__init__()
        self.model_dir = model_dir
        self.model_file = os.path.join(self.model_dir, model_name+'_CRITIC')
        # ANN
        self.fc1 = nn.Linear(observation_shape[0], fc1_size)
        self.ln1 = nn.LayerNorm(fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.ln2 = nn.LayerNorm(fc2_size)
        self.fcA = nn.Linear(n_actions, fc2_size)
        self.out = nn.Linear(fc2_size, 1)
        # FAN INS & WEIGHT INIT
        # nn.Linear stores its weight as (out_features, in_features); the fan-in is the
        # number of inputs, i.e. in_features (weight.size()[0] would be the fan-OUT).
        f1 = 1/np.sqrt(self.fc1.in_features)
        self.fc1.weight.data.uniform_(-f1,f1)
        self.fc1.bias.data.uniform_(-f1,f1)
        f2 = 1/np.sqrt(self.fc2.in_features)
        self.fc2.weight.data.uniform_(-f2,f2)
        self.fc2.bias.data.uniform_(-f2,f2)
        fA = 1/np.sqrt(self.fcA.in_features)
        self.fcA.weight.data.uniform_(-fA,fA)
        self.fcA.bias.data.uniform_(-fA,fA)
        # Small final-layer weights keep the initial Q estimates close to zero.
        fo = 0.003
        self.out.weight.data.uniform_(-fo,fo)
        self.out.bias.data.uniform_(-fo,fo)
        # UTILS
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=0.01)
        self.to(self.device)

    def forward(self, state, action):
        """Return Q(state, action) with shape (batch, 1)."""
        s = self.ln1(self.fc1(state))
        s = F.relu(s)
        s = self.ln2(self.fc2(s))
        a = self.fcA(action)
        # The action is merged into the state features before the last non-linearity.
        q = T.add(s, a)
        q = F.relu(q)
        q = self.out(q)
        return q

    def save_model(self):
        """Write model and optimizer state to ``self.model_file``."""
        print("[INFO] CRITIC - Saving model")
        checkpoint = {
            'model_state_dict': self.state_dict(),
            'optimizer_state_dict' : self.optimizer.state_dict()
        }
        # Create the checkpoint directory on first use ('' means the current directory).
        if self.model_dir:
            os.makedirs(self.model_dir, exist_ok=True)
        T.save(checkpoint, self.model_file)

    def load_model(self, cpu=False):
        """Restore model and optimizer state from ``self.model_file``.

        cpu : force the tensors onto the CPU. Otherwise they are mapped to this
              network's own device, so a checkpoint written on a GPU machine still
              loads on a CPU-only host.
        """
        print("[INFO] CRITIC - Loading model")

        map_location = T.device('cpu') if (cpu) else self.device

        checkpoint = T.load(self.model_file, map_location=map_location)
        self.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

## **Agent**

In [8]:
class DDPGAgent:
    """Deep Deterministic Policy Gradient agent.

    Holds an online ("STEP") and a target ("TARGET") copy of both the actor and the critic,
    a replay buffer and an Ornstein-Uhlenbeck noise process used for exploration.
    """

    def __init__(self, observation_shape, n_actions, actor_lr, critic_lr, fc1_size, fc2_size, tau, gamma, 
                 mem_size, batch_size, algo_name, env_name, model_dir):
        """
        observation_shape  : shape tuple of a single observation.
        n_actions          : number of action components.
        actor_lr/critic_lr : learning rates of the actor and critic optimizers.
        fc1_size/fc2_size  : hidden layer widths shared by all four networks.
        tau                : weight of the online parameters in each soft target update.
        gamma              : discount factor applied to the bootstrapped next-state value.
        mem_size           : replay buffer capacity.
        batch_size         : number of transitions sampled per learning step.
        algo_name/env_name : combined into the checkpoint file names.
        model_dir          : directory used for checkpoints.
        """
        self.observation_shape = observation_shape
        self.n_actions = n_actions
        self.A_LR = actor_lr
        self.C_LR = critic_lr
        self.TAU = tau
        self.GAMMA = gamma
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        # MEM PARAMS
        self.mem_size = mem_size
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_size, observation_shape, n_actions)

        # MODEL PARAMS
        self.algo_name = algo_name
        self.env_name = env_name
        self.model_dir = model_dir
        # A1
        self.A_STEP = ActorNetwork(actor_lr, observation_shape, fc1_size, fc2_size, n_actions,
                              model_name = env_name+'_'+algo_name+'_STEP',
                              model_dir = model_dir)
        # A2
        self.A_TARGET = ActorNetwork(actor_lr, observation_shape, fc1_size, fc2_size, n_actions,
                              model_name = env_name+'_'+algo_name+'_TARGET',
                              model_dir = model_dir)
        # C1
        self.C_STEP = CriticNetwork(critic_lr, observation_shape, fc1_size, fc2_size, n_actions,
                              model_name = env_name+'_'+algo_name+'_STEP',
                              model_dir = model_dir)
        # C2
        self.C_TARGET = CriticNetwork(critic_lr, observation_shape, fc1_size, fc2_size, n_actions,
                              model_name = env_name+'_'+algo_name+'_TARGET',
                              model_dir = model_dir)
        self.update_target_networks(tau=1) # HARD UPDATE ON INIT

    def get_action(self, observation, explore=True):
        """Return the online actor's action for one observation as a 1-D numpy array.

        observation : a single (unbatched) observation.
        explore     : when True (default, the original behaviour) one sample of the OU
                      noise process is added to the action; pass False for a purely
                      deterministic action, e.g. during evaluation.
        """
        device = self.A_STEP.device
        self.A_STEP.eval()
        # Action selection needs no gradients, so skip building an autograd graph.
        with T.no_grad():
            # OBSERVATION TO TENSOR, WITH A LEADING BATCH DIMENSION OF 1
            state = T.tensor(observation, dtype=T.float32, device=device).unsqueeze(0)
            # GET mu = A1(s)
            action = self.A_STEP(state)
            if explore:
                # SHAPE (1, n_actions) + (n_actions,) -> broadcast
                action = action + T.tensor(self.noise(), dtype=T.float32, device=device)
        self.A_STEP.train()

        return action.cpu().numpy()[0]

    def learn(self):
        """Run one critic update, one actor update and one soft target update.

        Does nothing until the replay buffer holds at least ``batch_size`` transitions.
        """
        if (self.memory.mem_counter < self.batch_size): return # return if insufficient samples present
        states, actions, rewards, states_, terminals = self.sample_batch()

        # CALC  y = R + gamma*(C2(s_, a2_)) where a2_ = A2(s_)
        # The target is a constant for the critic update: computing it under no_grad keeps
        # backward() from propagating into (and accumulating .grad on) the target networks.
        with T.no_grad():
            a2_ = self.A_TARGET(states_)
            q2_next = self.C_TARGET(states_, a2_).view(-1)
            q2_next = q2_next.masked_fill(terminals, 0.0)   # no bootstrap past a terminal state
            q2_targets = (rewards + (self.GAMMA * q2_next)).view(self.batch_size, 1)

        # CALC  Q1 = C1(s, a)
        q1_preds = self.C_STEP(states, actions)

        # CRITIC LOSS
        critic_loss = F.mse_loss(q1_preds, q2_targets)
        self.C_STEP.optimizer.zero_grad()
        critic_loss.backward()
        self.C_STEP.optimizer.step()
        # ACTOR LOSS: maximise C1(s, A1(s)) by minimising its negative mean
        actor_loss = -(self.C_STEP(states, self.A_STEP(states)))
        actor_loss = T.mean(actor_loss)
        self.A_STEP.optimizer.zero_grad()
        actor_loss.backward()
        self.A_STEP.optimizer.step()
        # UPDATE WEIGHTS OF TARGET NETWORK
        self.update_target_networks()

    def update_target_networks(self, tau=None):
        """Blend the online parameters into the target networks.

        Each target parameter becomes ``tau * online + (1 - tau) * target``.
        tau : blend weight; defaults to ``self.TAU``. ``tau=1`` copies the online
              networks into the targets (hard update).
        """
        TAU = self.TAU if tau is None else tau

        pairs = ((self.A_STEP, self.A_TARGET), (self.C_STEP, self.C_TARGET))
        # In-place update without autograd tracking; both networks of a pair share the same
        # architecture, so their parameters line up one-to-one.
        with T.no_grad():
            for online_net, target_net in pairs:
                for online_p, target_p in zip(online_net.parameters(), target_net.parameters()):
                    target_p.copy_((TAU * online_p) + ((1-TAU) * target_p))

    def store_transition(self, state, action, reward, state_, done):
        """Append one transition to the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_batch(self):
        """Sample ``batch_size`` transitions and return them as tensors on the actor's device."""
        batch = self.memory.sample_batch(self.batch_size)
        device = self.A_STEP.device
        states, actions, rewards, states_, terminals = (T.tensor(field).to(device) for field in batch)
        return states, actions, rewards, states_, terminals

    def save_models(self):
        """Checkpoint all four networks."""
        self.A_STEP.save_model()
        self.A_TARGET.save_model()
        self.C_STEP.save_model()
        self.C_TARGET.save_model()

    def load_models(self, cpu=False):
        """Restore all four networks; ``cpu`` is forwarded to each network's ``load_model``."""
        self.A_STEP.load_model(cpu)
        self.A_TARGET.load_model(cpu)
        self.C_STEP.load_model(cpu)
        self.C_TARGET.load_model(cpu)

## **Training**

In [9]:
# Number of episodes the training loop runs for.
N_EPISODES = 1000

# Environment id; it is also passed to the agent, which uses it in checkpoint file names.
env_name = "LunarLanderContinuous-v2"
env = gym.make(env_name)

In [10]:
# Build the agent from the environment's spaces and the training hyperparameters.
agent = DDPGAgent(
    # problem dimensions, read from the environment
    observation_shape=env.observation_space.shape,
    n_actions=env.action_space.shape[0],
    # network widths and optimizer learning rates
    fc1_size=400,
    fc2_size=300,
    actor_lr=1e-4,
    critic_lr=1e-3,
    # soft-update weight and discount factor
    tau=0.001,
    gamma=0.99,
    # replay buffer capacity and minibatch size
    mem_size=100000,
    batch_size=64,
    # checkpoint naming and location
    algo_name='DDPG',
    env_name=env_name,
    model_dir='./',
)

In [11]:
# Per-episode statistics; mean_rewards holds the running mean over the last 100 episodes.
episode_rewards, episode_lengths, mean_rewards = [],[],[]
best_reward = -np.inf

for episode_n in tqdm(range(N_EPISODES)):
    total_reward, total_moves = 0,0

    done = False
    agent.noise.reset()   # start every episode with a fresh noise process
    observation = env.reset()

    while not done:
        action = agent.get_action(observation)
        observation_, reward, done, info = env.step(action)

        total_reward += reward
        total_moves += 1

        # An episode cut off by gym's TimeLimit wrapper has not reached a terminal state:
        # the wrapper flags it in info['TimeLimit.truncated']. Storing such a step as
        # terminal would wrongly zero the bootstrapped value in the critic target.
        terminal = bool(done) and not info.get('TimeLimit.truncated', False)

        # STORE DATA & LEARN
        agent.store_transition(observation, action, reward, observation_, terminal)
        agent.learn()

        observation = observation_
    
    episode_rewards.append(total_reward)
    episode_lengths.append(total_moves)
    
    mean_reward = np.mean(episode_rewards[-100:])
    mean_rewards.append(mean_reward)
    # Checkpoint whenever the running mean reaches a new best.
    if(mean_reward > best_reward):
        agent.save_models()
        best_reward = mean_reward
    
    print("ITER: ",episode_n,"\tRWD: ",round(total_reward,2),"\tM_RWD: ",round(mean_reward,2),"\tLEN: ",total_moves)    

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))



[INFO] ACTOR - Saving model
[INFO] ACTOR - Saving model
[INFO] CRITIC - Saving model
[INFO] CRITIC - Saving model
ITER:  0 	RWD:  -448.57 	M_RWD:  -448.57 	LEN:  203
ITER:  1 	RWD:  -1397.77 	M_RWD:  -923.17 	LEN:  118
ITER:  2 	RWD:  -339.07 	M_RWD:  -728.47 	LEN:  50
ITER:  3 	RWD:  -219.08 	M_RWD:  -601.12 	LEN:  62
ITER:  4 	RWD:  -790.99 	M_RWD:  -639.1 	LEN:  70
ITER:  5 	RWD:  -1177.87 	M_RWD:  -728.89 	LEN:  95
ITER:  6 	RWD:  -786.21 	M_RWD:  -737.08 	LEN:  60
ITER:  7 	RWD:  -849.86 	M_RWD:  -751.18 	LEN:  78
ITER:  8 	RWD:  -745.25 	M_RWD:  -750.52 	LEN:  74
ITER:  9 	RWD:  -405.9 	M_RWD:  -716.06 	LEN:  48
ITER:  10 	RWD:  -436.62 	M_RWD:  -690.65 	LEN:  59
ITER:  11 	RWD:  -455.99 	M_RWD:  -671.1 	LEN:  62
ITER:  12 	RWD:  -803.76 	M_RWD:  -681.3 	LEN:  78
ITER:  13 	RWD:  -537.16 	M_RWD:  -671.01 	LEN:  69
ITER:  14 	RWD:  -641.22 	M_RWD:  -669.02 	LEN:  74
ITER:  15 	RWD:  -622.65 	M_RWD:  -666.12 	LEN:  77
ITER:  16 	RWD:  -801.56 	M_RWD:  -674.09 	LEN:  86
ITER:  17 	R

In [11]:
# Learning curve: running mean of the episode reward over the last 100 episodes.
fig, ax = plt.subplots()
ax.plot(mean_rewards)
ax.set(title='DDPG training', xlabel='Episode', ylabel='Mean reward (last 100 episodes)')
plt.show()

## **Testing**

In [None]:
# env_name = "LunarLanderContinuous-v2"
# env = gym.make(env_name)

# agent = DDPGAgent(observation_shape=env.observation_space.shape,
#                   n_actions=env.action_space.shape[0],
#                   actor_lr=1e-4,
#                   critic_lr=1e-3,
#                   fc1_size=400,
#                   fc2_size=300,
#                   tau=0.001,
#                   gamma=0.99,
#                   mem_size=1,
#                   batch_size=1,
#                   algo_name='DDPG',
#                   env_name=env_name,
#                   model_dir='./')

In [None]:
#model.eval()
# with T.no_grad():
#     total_reward, total_moves = 0,0
#     done = False
#     observation = env.reset()

#     while not done:
#         time.sleep(0.0001)
#         env.render()

#         # ACTION FROM THE ACTOR (get_action ALSO ADDS OU EXPLORATION NOISE)
#         action = agent.get_action(observation)
#         observation_, reward, done, _ = env.step(action)

#         total_reward += reward
#         total_moves += 1

#         observation = observation_
#     print("RWD: ",total_reward,"\tLEN: ",total_moves)
#     env.close()