In [1]:
%%capture
!pip install box2d-py

In [7]:
import os
import cv2
import gym
import time
import collections

import numpy as np
import torch as T
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

%matplotlib inline

In [3]:
# Report the accelerator in use; fall back to 'cpu' so the cell also runs on hosts without CUDA.
T.cuda.get_device_name() if T.cuda.is_available() else 'cpu'

'Tesla T4'

## **ReplayBuffer**

In [2]:
class ReplayBuffer:
    """Fixed-size circular transition store with priority-weighted sampling.

    Transitions are written round-robin into pre-allocated numpy arrays. Each
    slot carries a priority; ``sample_batch`` draws indices with probability
    proportional to ``priority ** alpha`` and returns importance weights that
    compensate for the non-uniform draw.

    Args:
        mem_size: number of transitions the buffer can hold.
        observation_shape: shape of a single observation.
        n_actions: accepted for interface compatibility; not used here.
        alpha: exponent applied to priorities when building the sampling
            distribution (0 gives uniform sampling).
    """

    def __init__(self, mem_size, observation_shape, n_actions, alpha):
        self.mem_size = mem_size
        self.mem_counter = 0
        self.ALPHA = alpha
        # DATA
        self.states = np.zeros((mem_size, *observation_shape), dtype=np.float32)
        self.actions = np.zeros(mem_size, dtype=np.int64)
        # Rewards are stored as floats: an integer array would silently
        # truncate real-valued rewards on assignment.
        self.rewards = np.zeros(mem_size, dtype=np.float32)
        self.states_ = np.zeros((mem_size, *observation_shape), dtype=np.float32)
        self.terminals = np.zeros(mem_size, dtype=bool)
        self.priorities = np.zeros(mem_size, dtype=np.float32)

    # STORE TRANSITIONS IN BUFFER
    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, overwriting the oldest when full.

        The new slot receives the current maximum priority (1.0 for the very
        first transition).
        """
        index = self.mem_counter % self.mem_size
        self.states[index] = state
        self.actions[index] = action
        self.rewards[index] = reward
        self.states_[index] = state_
        self.terminals[index] = done    # 1 if 'done' else 0
        self.priorities[index] = self.priorities.max() if (self.mem_counter > 0) else 1.0
        self.mem_counter += 1

    # UPDATE PRIORITIES LIST
    def update_priotities(self, indices, errors, offset):
        """Set the priority of ``indices`` to ``|errors| + offset``.

        The (misspelled) name is kept because existing callers use it;
        ``update_priorities`` is an alias with the correct spelling.
        """
        self.priorities[indices] = np.abs(errors) + offset

    update_priorities = update_priotities

    # PRIORITY-WEIGHTED SAMPLING OF A 'BATCH' OF batch_size FROM 'BUFFER'
    def sample_batch(self, batch_size, beta):
        """Draw ``batch_size`` transitions (with replacement) according to priority.

        Args:
            batch_size: number of transitions to draw.
            beta: exponent of the importance-sampling correction.

        Returns:
            Tuple ``(batch_indices, states, actions, rewards, states_,
            terminals, importance)`` where ``importance`` is a float32 array
            normalised so its largest entry is 1.

        Raises:
            ValueError: if no transition has been stored yet.
        """
        max_index = min(self.mem_counter, self.mem_size)
        if max_index == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        # Only the filled part of the buffer takes part in sampling.
        scaled = self.priorities[:max_index] ** self.ALPHA
        probabilities = scaled / scaled.sum()                                            # Pr = pi^a/P^a
        batch_indices = np.random.choice(max_index, batch_size, p=probabilities)

        importance = (max_index * probabilities[batch_indices]) ** (-beta)               # (1/N * 1/Pr)^b
        importance = importance / importance.max()
        importance = np.array(importance, dtype=np.float32)

        states = self.states[batch_indices]
        actions = self.actions[batch_indices]
        rewards = self.rewards[batch_indices]
        states_ = self.states_[batch_indices]
        terminals = self.terminals[batch_indices]
        return (batch_indices, states, actions, rewards, states_, terminals, importance)

## **Network**

In [3]:
class DuelingDeepQNetwork(nn.Module):
    """Fully connected dueling network producing a state value and per-action advantages.

    Three ReLU-activated linear layers feed two linear heads: ``V`` (one
    output) and ``A`` (``n_actions`` outputs). ``forward`` returns both heads
    separately; combining them into Q-values is left to the caller.

    Args:
        lr: learning rate of the RMSprop optimizer owned by the network.
        observation_shape: observation shape; only its first entry is used as
            the input width.
        n_actions: number of outputs of the advantage head.
        model_name: checkpoint file name.
        model_dir: directory the checkpoint file lives in.
    """

    def __init__(self, lr, observation_shape, n_actions, model_name, model_dir):
        super().__init__()
        self.model_dir = model_dir
        self.model_file = os.path.join(self.model_dir, model_name)
        # ANN
        self.fc1 = nn.Linear(observation_shape[0], 512)
        self.fc2 = nn.Linear(512, 1024)
        self.fc3 = nn.Linear(1024, 256)
        # DUELING
        self.V = nn.Linear(256, 1)
        self.A = nn.Linear(256, n_actions)
        # UTILS
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.to(self.device)

    def forward(self, state):
        """Return the pair ``(V, A)`` computed for ``state``."""
        t = F.relu(self.fc1(state))
        t = F.relu(self.fc2(t))
        t = F.relu(self.fc3(t))
        V = self.V(t)
        A = self.A(t)
        return V, A

    def save_model(self):
        """Write the model and optimizer state dicts to ``self.model_file``."""
        print("[INFO] Saving model")
        # T.save does not create missing parent directories, so make sure the
        # target directory exists before writing.
        target_dir = os.path.dirname(self.model_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        checkpoint = {
            'model_state_dict': self.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }
        T.save(checkpoint, self.model_file)

    def load_model(self, cpu=False):
        """Restore the model and optimizer state dicts from ``self.model_file``.

        Args:
            cpu: when True, tensors are mapped to the CPU while loading;
                otherwise they are mapped to this network's own device, so a
                checkpoint saved on a GPU can still be read on a host
                without one.
        """
        print("[INFO] Loading model")

        map_location = T.device('cpu') if (cpu) else self.device

        checkpoint = T.load(self.model_file, map_location=map_location)
        self.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

## **Agent**

In [4]:
class DuelingDDQNAgent:
    """Dueling Double-DQN agent trained from a prioritized replay buffer.

    The agent owns two ``DuelingDeepQNetwork`` instances: ``Q_STEP`` is
    optimised on every ``learn`` call, ``Q_TARGET`` is overwritten with the
    weights of ``Q_STEP`` every ``Q_TARGET_replace_interval`` learning steps.
    Exploration is epsilon-greedy; epsilon shrinks and the importance-sampling
    exponent beta grows by a fixed amount after each learning step.

    Args:
        observation_shape: shape of one observation.
        n_actions: number of discrete actions.
        lr: learning rate passed to both networks.
        gamma: discount factor.
        epsilon, epsilon_min, epsilon_decay: start value, floor and per-step
            decrement of the exploration rate.
        beta, beta_max, beta_increment: start value, ceiling and per-step
            increment of the importance-sampling exponent.
        mem_size, mem_alpha: replay-buffer capacity and priority exponent.
        batch_size: number of transitions per learning step.
        Q_TARGET_replace_interval: learning steps between target syncs.
        warmup: number of environment moves before learning starts.
        algo_name, env_name, model_dir: used to build checkpoint file names.
    """

    def __init__(self, observation_shape, n_actions, lr, gamma, epsilon, epsilon_min, epsilon_decay, beta, beta_max, beta_increment,
                 mem_size, mem_alpha, batch_size, Q_TARGET_replace_interval, warmup, algo_name, env_name, model_dir):
        self.observation_shape = observation_shape
        self.n_actions = n_actions
        self.LR = lr
        self.GAMMA = gamma
        self.EPSILON = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        # MEM PARAMS
        self.BETA = beta
        self.beta_max = beta_max
        self.beta_increment = beta_increment
        self.mem_size = mem_size
        self.batch_size = batch_size
        self.memory = ReplayBuffer(mem_size, observation_shape, n_actions, mem_alpha)

        # MODEL PARAMS
        self.warmup = warmup
        self.move_counter = 0  # incremented by the caller, once per environment move
        self.learn_counter = 0 # TO UPDATE TARGET NETWORK
        self.algo_name = algo_name
        self.env_name = env_name
        self.model_dir = model_dir
        self.Q_TARGET_replace_interval = Q_TARGET_replace_interval
        # Q1
        self.Q_STEP = DuelingDeepQNetwork(lr, observation_shape, n_actions,
                              model_name = env_name+'_'+algo_name+'_Q_STEP',
                              model_dir = model_dir)
        # Q2
        self.Q_TARGET = DuelingDeepQNetwork(lr, observation_shape, n_actions,
                              model_name = env_name+'_'+algo_name+'_Q_TARGET',
                              model_dir = model_dir)

    # e-GREEDY POLICY
    def get_action(self, observation, greedy=False):
        """Pick an action for ``observation``.

        With probability ``EPSILON`` (and ``greedy`` False) a random action is
        drawn uniformly from ``range(n_actions)``; otherwise the action with
        the largest advantage under ``Q_STEP`` is returned. V is the same for
        every action, so the argmax over A equals the argmax over Q.
        """
        if ( (np.random.uniform() >= self.EPSILON) or greedy):
            observation = T.tensor(observation, dtype=T.float32).to(self.Q_STEP.device)
            state = T.unsqueeze(observation, 0)
            # Action selection never needs gradients.
            with T.no_grad():
                _, A = self.Q_STEP(state)
            action = T.argmax(A).item()
        else:
            # Sample from the agent's own action count rather than from a
            # notebook-global environment object.
            action = int(np.random.randint(self.n_actions))
        return action

    def learn(self):
        """Run one optimisation step of ``Q_STEP`` on a prioritized batch.

        Does nothing until ``warmup`` moves have been made and the buffer
        holds at least ``batch_size`` transitions. After the step, epsilon and
        beta are updated and the sampled transitions get new priorities
        derived from their TD errors.
        """
        if (self.move_counter < self.warmup): return # return if not explored enough
        if (self.memory.mem_counter < self.batch_size): return # return if insufficient samples present
        # SYNC TARGET NETWORK (every Q_TARGET_replace_interval learning steps)
        self.update_Q_TARGET()

        self.learn_counter += 1
        batch_indices, states, actions, rewards, states_, terminals, importance = self.sample_batch()
        # PREDICT Q1(s,a)
        v1, a1 = self.Q_STEP(states)
        q1 = v1 + (a1 - a1.mean(dim=1, keepdim=True)) # q - batch_size * n_actions
        q1_preds = q1.gather(1, actions.unsqueeze(1)).squeeze(1)

        # TARGETS: the action is chosen by Q1, its value is read from Q2.
        # Targets are constants for the optimisation, so no autograd graph is
        # built and no gradients pile up on Q_TARGET's parameters.
        with T.no_grad():
            v1_, a1_ = self.Q_STEP(states_)
            v2_, a2_ = self.Q_TARGET(states_)
            q1_ = v1_ + (a1_ - a1_.mean(dim=1, keepdim=True))
            q2_ = v2_ + (a2_ - a2_.mean(dim=1, keepdim=True))
            a_ = T.argmax(q1_, dim=1)
            q2_next = q2_.gather(1, a_.unsqueeze(1)).squeeze(1)
            q2_next[terminals] = 0.0                      # Q2(s_) = 0 where terminal=1
            q2_targets = rewards + (self.GAMMA * q2_next)

        # CALC LOSS & BACKPROP (squared TD error weighted by importance)
        errors = (q1_preds - q2_targets)
        loss = ((errors ** 2) * importance).mean()

        self.Q_STEP.optimizer.zero_grad()
        loss.backward()
        self.Q_STEP.optimizer.step()

        self.decay_epsilon()
        self.increment_beta()
        self.memory.update_priotities(batch_indices, errors.detach().cpu().numpy(), offset=0.1)

    def update_Q_TARGET(self):
        """Copy ``Q_STEP`` weights into ``Q_TARGET`` when the learn counter hits the interval."""
        if ((self.learn_counter % self.Q_TARGET_replace_interval) == 0):
            self.Q_TARGET.load_state_dict(self.Q_STEP.state_dict())

    def decay_epsilon(self):
        """Lower ``EPSILON`` by ``epsilon_decay`` without ever going below ``epsilon_min``."""
        self.EPSILON = max(self.EPSILON - self.epsilon_decay, self.epsilon_min)

    def increment_beta(self):
        """Raise ``BETA`` by ``beta_increment`` without ever exceeding ``beta_max``."""
        self.BETA = min(self.BETA + self.beta_increment, self.beta_max)

    def store_transition(self, state, action, reward, state_, done):
        """Forward one transition to the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_batch(self):
        """Sample from the replay buffer and move the batch to the network device.

        Returns:
            ``(batch_indices, states, actions, rewards, states_, terminals,
            importance)``; ``batch_indices`` stays a numpy array, everything
            else is a tensor on ``Q_STEP.device``.
        """
        batch_indices, states, actions, rewards, states_, terminals, importance = self.memory.sample_batch(self.batch_size, self.BETA)
        device = self.Q_STEP.device
        states = T.tensor(states).to(device)
        actions = T.tensor(actions).to(device)
        # Cast explicitly so the TD target is float32 whatever dtype the buffer stores.
        rewards = T.tensor(rewards, dtype=T.float32).to(device)
        states_ = T.tensor(states_).to(device)
        terminals = T.tensor(terminals).to(device)
        importance = T.tensor(importance).to(device)
        return batch_indices, states, actions, rewards, states_, terminals, importance

    def save_models(self):
        """Checkpoint both networks."""
        self.Q_STEP.save_model()
        self.Q_TARGET.save_model()

    def load_models(self, cpu=False):
        """Restore both networks from their checkpoints."""
        self.Q_STEP.load_model(cpu)
        self.Q_TARGET.load_model(cpu)

## **Training**

In [17]:
# Number of episodes the training loop runs for.
N_EPISODES = 3000

# Environment used for training.
env_name = "LunarLander-v2"
env = gym.make(env_name)

In [18]:
# Training agent; keyword arguments are grouped by what they configure.
agent = DuelingDDQNAgent(
    # environment interface
    observation_shape=env.observation_space.shape,
    n_actions=env.action_space.n,
    # optimisation
    lr=1e-4,
    gamma=0.99,
    batch_size=64,
    Q_TARGET_replace_interval=1000,
    warmup=100,
    # epsilon-greedy exploration schedule
    epsilon=1.0,
    epsilon_min=0.01,
    epsilon_decay=1e-5,
    # prioritized replay
    mem_size=20000,
    mem_alpha=0.6,
    beta=0.4,
    beta_max=1.0,
    beta_increment=1e-4,
    # checkpoint naming / location
    algo_name='DuelingDDQN',
    env_name=env_name,
    model_dir='./weights',
)

In [None]:
# Per-episode statistics collected during training.
episode_rewards, episode_lengths, episode_epsilons, mean_rewards = [],[],[],[]
best_reward = -np.inf

for episode_n in tqdm(range(N_EPISODES)):
    total_reward, total_moves = 0,0

    done = False
    observation = env.reset()

    while not done:
        agent.move_counter+=1
        # e_GREEDY ACTION
        action = agent.get_action(observation)
        observation_, reward, done, info = env.step(action)

        total_reward += reward
        total_moves += 1

        # An episode cut off by the step limit is not a true terminal state:
        # the target should still bootstrap from the next state. This relies on
        # gym's TimeLimit wrapper flagging such steps under the info key
        # 'TimeLimit.truncated' (assumed for the old 4-tuple step API used
        # here - verify against the installed gym version).
        terminal = done and not info.get('TimeLimit.truncated', False)

        # STORE DATA & LEARN
        agent.store_transition(observation, action, reward, observation_, terminal)
        agent.learn()

        observation = observation_

    episode_rewards.append(total_reward)
    episode_lengths.append(total_moves)
    episode_epsilons.append(agent.EPSILON)

    # Running mean over the last 100 episodes decides when to checkpoint.
    mean_reward = np.mean(episode_rewards[-100:])
    mean_rewards.append(mean_reward)
    if(mean_reward > best_reward):
        agent.save_models()
        best_reward = mean_reward

    print("ITER: ",episode_n,"\tRWD: ",total_reward,"\tM_RWD: ",round(mean_reward,2),"\tLEN: ",total_moves,"\tEPS: ",round(agent.EPSILON,4))

In [None]:
# Learning curve: running mean of the episode reward (window of 100 episodes).
fig, ax = plt.subplots()
ax.plot(mean_rewards)
ax.set(title="Training progress", xlabel="Episode", ylabel="Mean reward (last 100 episodes)");

## Testing

In [5]:
env_name = "LunarLander-v2"
env = gym.make(env_name)

# Evaluation agent: only used for greedy action selection, so the replay
# buffer and batch size are kept minimal.
agent = DuelingDDQNAgent(observation_shape=env.observation_space.shape,
                         n_actions=env.action_space.n,
                         lr=1e-4,
                         gamma=0.99,
                         epsilon=1.0,
                         epsilon_min=0.01,
                         epsilon_decay=1e-5,
                         beta=0.4,
                         beta_max=1.0,
                         beta_increment=1e-4,
                         mem_size=1,
                         mem_alpha=0.6,
                         batch_size=1,
                         Q_TARGET_replace_interval=1000,
                         # The constructor's parameter is `warmup`; any other
                         # keyword raises TypeError on a fresh kernel.
                         warmup = 10,
                         algo_name='DuelingDDQN',
                         env_name=env_name,
                         model_dir='./weights')

agent.load_models(cpu=True)

[INFO] Loading model
[INFO] Loading model


In [16]:
with T.no_grad():
    total_reward, total_moves = 0,0
    done = False
    observation = env.reset()

    # try/finally guarantees the render window is released even if the
    # episode is interrupted or a step raises.
    try:
        while not done:
            time.sleep(0.0001)
            env.render()

            # GREEDY ACTION (no exploration during evaluation)
            action = agent.get_action(observation, greedy=True)
            observation_, reward, done, _ = env.step(action)

            total_reward += reward
            total_moves += 1

            observation = observation_
        print("RWD: ",total_reward,"\tLEN: ",total_moves)
    finally:
        env.close()

RWD:  157.91487961932543 	LEN:  1000
