In [None]:
import csv
import glob
import os
import random
from collections import defaultdict

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from scipy.stats import f_oneway
from torch import nn
from torch import optim
from tqdm import tqdm as _tqdm

def tqdm(*args, **kwargs):
    """Forward to ``_tqdm`` with a default ``mininterval`` of one second.

    All positional and keyword arguments are passed through unchanged. When the
    caller does not supply ``mininterval`` it is set to 1; an explicit value is
    respected instead of raising a duplicate-keyword ``TypeError``.
    """
    kwargs.setdefault("mininterval", 1)  # Safety, do not overflow buffer
    return _tqdm(*args, **kwargs)

## Model Definitions

In [None]:
class QNetwork(nn.Module):
    """Fully connected Q-network: input -> 24 -> ReLU -> 48 -> ReLU -> output.

    Args:
        input_dim: number of input features of the first linear layer.
        output_dim: number of outputs of the last linear layer.
        num_hidden: accepted for interface compatibility but NOT used; the
            hidden layer widths are fixed at 24 and 48.
        weight_mean: mean of the normal distribution used to initialise the
            weights of every linear layer.
        weight_std: standard deviation of that normal distribution.
        device: device the sequential layer stack is moved to.
    """

    def __init__(self, input_dim=4, output_dim=2, num_hidden=128, weight_mean = 0, weight_std = 0.1, device="cpu"):
        super().__init__()
        # Note the naming: data flows l1 -> l3 -> l2.
        self.l1 = nn.Linear(input_dim, 24)
        self.l3 = nn.Linear(24, 48)
        self.l2 = nn.Linear(48, output_dim)
        self.ReLU = nn.ReLU()

        self.init_weights(weight_mean, weight_std)
        self.model = torch.nn.Sequential(self.l1,
                                         nn.ReLU(),
                                         self.l3,
                                         self.ReLU,
                                         self.l2).to(device)

    def forward(self, x):
        """Return the output of the layer stack for input ``x``."""
        return self.model.forward(x)

    def init_weights(self, mean, std):
        """Draw the weights of all three linear layers from N(mean, std).

        Biases keep the default ``nn.Linear`` initialisation. The middle layer
        ``l3`` is included so that ``mean``/``std`` govern the whole network
        rather than only its first and last layer.
        """
        for layer in (self.l1, self.l3, self.l2):
            layer.weight.data.normal_(mean, std)
        

In [None]:
class ReplayMemory:
    """Fixed-capacity replay buffer that overwrites its oldest transition.

    Args:
        capacity: maximum number of transitions kept.
        env: environment used by ``fill_memory`` to generate a transition.
    """

    def __init__(self, capacity, env):
        self.capacity = capacity
        self.memory = []
        self.env = env
        # Index of the slot that will be overwritten once the buffer is full.
        self._next_slot = 0

    def push(self, transition):
        """Store ``transition``, evicting the oldest entry when at capacity.

        The buffer is used as a ring: while it is still growing the transition
        is appended, afterwards it replaces the oldest stored entry. (Using
        ``list.pop()`` here would discard the *newest* entry instead, so a
        full buffer would never be refreshed.)
        """
        if len(self.memory) < self.capacity:
            self.memory.append(transition)
        else:
            self.memory[self._next_slot] = transition
        self._next_slot = (self._next_slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

    def fill_memory(self):
        """Fill the buffer with ``capacity`` copies of one random transition.

        The environment is reset, a single random action is taken, and the
        resulting ``(s, a, r, s_next, done)`` tuple is pushed ``capacity``
        times. ``env.step`` is expected to return a 4-tuple.
        """
        s = self.env.reset()
        a = self.env.action_space.sample()
        s_next, r, done, _ = self.env.step(a)
        transition = (s, a, r, s_next, done)

        for i in range(self.capacity):
            self.push(transition)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
def get_epsilon(it):
    """Linearly anneal epsilon from 1 at ``it == 0`` down to a floor of 0.05.

    Args:
        it: iteration (step) counter.

    Returns:
        ``1 - 0.95 * it / 1000``, or 0.05 when that value is at or below 0.05.
    """
    annealed = 1-(.95*it/1000)
    return 0.05 if annealed <= 0.05 else annealed

class EpsilonGreedyPolicy(object):
    """
    Epsilon-greedy action selection driven by a Q-function.

    With probability ``epsilon`` a uniformly random action is taken; otherwise
    the action with the highest Q-value for the observation is chosen.
    """
    def __init__(self, Q, epsilon, num_actions, device):
        self.Q, self.epsilon = Q, epsilon
        self.num_actions, self.device = num_actions, device

    def set_epsilon(self, epsilon):
        """Replace the exploration rate used by ``sample_action``."""
        self.epsilon = epsilon

    def sample_action(self, obs):
        """
        Sample an action for the given state from this policy.

        Args:
            obs: current state

        Returns:
            An action (int).
        """
        draw = np.random.uniform()
        exploit = draw > self.epsilon

        if not exploit:
            # Explore: uniformly random action index in [0, num_actions)
            return int(np.random.randint(self.num_actions))

        # Exploit: greedy action under the current Q estimates
        with torch.no_grad():
            obs_batch = torch.tensor(obs).unsqueeze(0).float().to(self.device)
            best_action = torch.argmax(self.Q(obs_batch)).item()
        return int(best_action)

In [None]:
def compute_q_vals(Q, states, actions):
    """
    Look up the Q-value of the chosen action for every state in a batch.

    Args:
        Q: Q-net
        states: a tensor of states. Shape: batch_size x obs_dim
        actions: a tensor of actions (indices). Shape: batch_size x 1

    Returns:
        A torch tensor filled with Q values. Shape: batch_size x 1.
    """
    # Q-values of all actions, then pick the column given by each action index
    return Q.forward(states).gather(1, actions)
        
    
def compute_targets(Q, rewards, next_states, dones, discount_factor):
    """
    Compute the targets (values towards which Q-values should move).

    Args:
        Q: Q-net
        rewards: a tensor of rewards. Shape: batch_size x 1
        next_states: a tensor of states. Shape: batch_size x obs_dim
        dones: a tensor of done flags (indicates if next_state is terminal). Shape: batch_size x 1
        discount_factor: discount
    Returns:
        A torch tensor filled with target values. Shape: batch_size x 1.
    """
    # Best achievable Q-value in each next state, kept as a column vector
    best_next_q = Q.forward(next_states).max(dim=1).values.unsqueeze(dim=1)

    # Terminal next states contribute no future value
    terminal = dones == 1
    best_next_q[terminal] = 0

    return rewards + discount_factor * best_next_q
        

def train(Q, memory, optimizer, batch_size, discount_factor, full_grad, device):
    """Run one optimisation step of the Q-network on a sampled minibatch.

    Args:
        Q: Q-network being trained.
        memory: replay memory supporting ``len()`` and ``sample(batch_size)``;
            each stored item is a ``(state, action, reward, next_state, done)`` tuple.
        optimizer: optimizer holding the parameters of ``Q``.
        batch_size: number of transitions per update.
        discount_factor: discount applied to the bootstrapped next-state value.
        full_grad: if True, gradients also flow through the target
            (full gradient); if False, the target is computed under
            ``torch.no_grad()`` (semi-gradient).
        device: device the batch tensors are moved to.

    Returns:
        The loss as a Python float, or None when the memory holds fewer than
        ``batch_size`` transitions and no update was made.
    """
    # don't learn without some decent experience
    if len(memory) < batch_size:
        return None

    # random transition batch is taken from experience replay memory
    transitions = memory.sample(batch_size)

    # transitions is a list of 5-tuples, instead we want 5 vectors (as torch.Tensor's)
    state, action, reward, next_state, done = zip(*transitions)

    # convert to PyTorch and define types; the per-transition state arrays are
    # stacked into a single ndarray first because building a tensor directly
    # from a sequence of ndarrays is very slow
    state = torch.tensor(np.array(state), dtype=torch.float).to(device)
    action = torch.tensor(action, dtype=torch.int64)[:, None].to(device)  # Need 64 bit to use them as index
    next_state = torch.tensor(np.array(next_state), dtype=torch.float).to(device)
    reward = torch.tensor(reward, dtype=torch.float)[:, None].to(device)
    done = torch.tensor(done, dtype=torch.uint8)[:, None].to(device)

    # compute the q value
    q_val = compute_q_vals(Q, state, action)
    if full_grad:
        target = compute_targets(Q, reward, next_state, done, discount_factor)
    else:
        with torch.no_grad():  # Don't compute gradient info for the target (semi-gradient)
            target = compute_targets(Q, reward, next_state, done, discount_factor)

    # loss is measured from error between current and newly expected Q values
    loss = F.smooth_l1_loss(q_val, target)

    # backpropagation of loss to Neural Network (PyTorch magic)
    optimizer.zero_grad()
    loss.backward()
    # clip the total gradient norm to 0.25 before the parameter update
    torch.nn.utils.clip_grad_norm_(Q.parameters(), 0.25)

    optimizer.step()

    return loss.item()  # Returns a Python scalar, and releases history (similar to .detach())

## Model Setup and Training

In [None]:
def run_episodes(Q, optimizer, policy, memory, env, num_episodes, num_steps, batch_size, 
                 discount_factor, learn_rate, full_grad, print_bool=False, device = "cpu"):
    """Train ``Q`` by interacting with ``env`` for ``num_episodes`` episodes.

    Each step: epsilon is set from the global step count, an action is sampled
    from ``policy``, the transition is pushed to ``memory`` and one ``train``
    update is attempted. An episode ends when the environment reports ``done``
    or once more than ``num_steps`` steps have been taken.

    For "MountainCar-v0" the reward is shaped:
      * reaching the goal adds +100 and the goal transition plus the last 50
        transitions of the episode are duplicated in the replay memory;
      * running out of steps subtracts 100 and marks the transition terminal.
        This penalty is applied BEFORE the transition is stored (previously it
        was applied after storing/training and therefore never had any effect).

    Args:
        Q: Q-network to train.
        optimizer: optimizer for ``Q``.
        policy: policy exposing ``set_epsilon`` and ``sample_action``.
        memory: replay memory exposing ``push``.
        env: environment with the 4-tuple ``step`` API.
        num_episodes: number of episodes to run.
        num_steps: step budget per episode (an episode ends after the first
            step whose count exceeds this value).
        batch_size, discount_factor, full_grad, device: forwarded to ``train``.
        learn_rate: unused here; kept for interface compatibility.
        print_bool: if True, print a progress line every 10th episode.

    Returns:
        List with the number of steps taken in each episode.
    """
    mountain_car_bool = env.unwrapped.spec.id == "MountainCar-v0"

    global_steps = 0
    episode_durations = []
    for i in range(num_episodes):
        state = env.reset()

        steps = 0
        current_episode = []
        while True:
            policy.set_epsilon(get_epsilon(global_steps))

            action = policy.sample_action(state)
            next_state, reward, done, _ = env.step(action)
            steps += 1
            global_steps += 1

            # Step budget exhausted (same threshold as before: steps > num_steps)
            timed_out = steps > num_steps
            # Flag stored with the transition; only the MountainCar timeout overrides it
            stored_done = done

            if mountain_car_bool and done:
                # Giving extra reward to achieving the objective and add more duplicates
                # of this experience in the replay memory
                reward += 100
                goal_transition = (state, action, reward, next_state, done)
                for _dup in range(100):
                    memory.push(goal_transition)
                # We also add more examples of the last 50 states from this
                # fruitful episode to the replay memory
                for _rep in range(5):
                    for past_transition in current_episode[-50:]:
                        memory.push(past_transition)
            elif mountain_car_bool and timed_out:
                # Give extra negative reward if agent did not find the objective
                # in the mountain car case; must happen before the push below
                # so the penalty is actually learned from.
                reward -= 100
                stored_done = True

            transition = (state, action, reward, next_state, stored_done)
            memory.push(transition)
            train(Q, memory, optimizer, batch_size, discount_factor, full_grad, device)

            current_episode.append(transition)
            state = next_state

            if done or timed_out:
                if i % 10 == 0 and print_bool:
                    print("{2} Episode {0} finished after {1} steps"
                          .format(i, steps, '\033[92m' if steps >= 195 else '\033[99m'))
                episode_durations.append(steps)
                break
    return episode_durations


def model_setup_and_training(env, num_episodes, num_steps, batch_size, memory_capacity, 
                             discount_factor, learn_rate, num_hidden = 128, weight_std = 0.1, 
                             full_grad = True, print_bool = False, device = "cpu"):
    """Build the Q-network, policy, replay memory and optimizer, then train.

    Args:
        env: environment; it is reset once here to read the observation size.
        num_episodes, num_steps, batch_size, discount_factor, full_grad,
            print_bool: forwarded to ``run_episodes``.
        memory_capacity: capacity of the replay memory.
        learn_rate: learning rate of the Adam optimizer.
        num_hidden: forwarded to ``QNetwork``.
        weight_std: standard deviation for the network weight initialisation.
        device: device used for the network, the policy and the training
            batches.

    Returns:
        The list of episode durations returned by ``run_episodes``.
    """
    sample = env.reset()
    input_dim = len(sample)
    output_dim = env.action_space.n

    # Model and Optimizer
    Q_net = QNetwork(input_dim=input_dim, output_dim=output_dim, num_hidden = num_hidden, weight_std = weight_std).to(device)
    # The initial epsilon (0.1) is overwritten by run_episodes on every step
    policy = EpsilonGreedyPolicy(Q_net, 0.1, output_dim, device)
    memory = ReplayMemory(memory_capacity, env)
    optimizer = optim.Adam(Q_net.parameters(), learn_rate)

    # Pass the requested device through: the network lives on `device`, so the
    # training batches must be created there too (a hard-coded "cpu" here
    # causes a device mismatch when training on CUDA).
    return run_episodes(Q_net, optimizer, policy, memory, env, num_episodes, num_steps, batch_size, 
                 discount_factor, learn_rate, full_grad, print_bool, device = device)

## Plot Functions

In [None]:
# Helper used to smooth the episode-duration curves before plotting
def smooth(x, N):
    """Return the moving average of ``x`` over windows of ``N`` consecutive values.

    The result has ``len(x) - N + 1`` entries, one per full window.
    """
    running_total = np.cumsum(np.insert(x, 0, 0))
    # Difference of running totals N apart gives the sum of each window
    window_sums = running_total[N:] - running_total[:-N]
    return window_sums / float(N)

## Training different environments

In [None]:
# Seeds: seed every random number generator the notebook draws from.
# NumPy is included because the epsilon-greedy policy samples with np.random.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

#### Cartpole v1

In [None]:
# Checking for cuda: use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Env: CartPole-v1, seeded with the notebook-wide seed
cart_pole_env = gym.envs.make("CartPole-v1")
cart_pole_env.seed(seed)

# Parameters for the CartPole runs
num_hidden, batch_size = 128, 64
discount_factor, learn_rate = 0.8, 1e-3
memory_capacity = 10000
num_episodes, num_steps = 200, 1500

In [None]:
# CartPole, full gradient: train one model and plot its smoothed episode durations
episode_durations_full = model_setup_and_training(
    env=cart_pole_env,
    num_episodes=num_episodes,
    num_steps=num_steps,
    batch_size=batch_size,
    memory_capacity=memory_capacity,
    discount_factor=discount_factor,
    learn_rate=learn_rate,
    full_grad=True,
    print_bool=True,
)

# Moving average over 10 episodes
plt.plot(smooth(episode_durations_full, 10))
plt.title('Episode durations per episode')

In [None]:
# CartPole, semi gradient: train one model and plot its smoothed episode durations
episode_durations_semi = model_setup_and_training(
    env=cart_pole_env,
    num_episodes=num_episodes,
    num_steps=num_steps,
    batch_size=batch_size,
    memory_capacity=memory_capacity,
    discount_factor=discount_factor,
    learn_rate=learn_rate,
    full_grad=False,
    print_bool=True,
)

# Moving average over 10 episodes
plt.plot(smooth(episode_durations_semi, 10))
plt.title('Episode durations per episode')

#### Mountaincar v0

In [None]:
# Checking for cuda: use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Env: MountainCar-v0; `.env` takes the object wrapped by the one make() returns
# (presumably to drop gym's built-in step limit -- TODO confirm)
mount_env = gym.envs.make("MountainCar-v0").env
mount_env.seed(seed)

# Parameters for the MountainCar runs
num_hidden, batch_size = 128, 64
discount_factor, learn_rate = 0.99, 4.561407e-04
memory_capacity = 1000
# NOTE(review): a budget of 10 steps per episode looks like a leftover debug
# value (the experiment cell uses 1500) -- confirm before relying on these runs
num_episodes, num_steps = 1000, 10

In [None]:
# MountainCar, full gradient: train one model and plot its smoothed episode durations
episode_durations_full = model_setup_and_training(
    env=mount_env,
    num_episodes=num_episodes,
    num_steps=num_steps,
    batch_size=batch_size,
    memory_capacity=memory_capacity,
    discount_factor=discount_factor,
    learn_rate=learn_rate,
    full_grad=True,
    print_bool=True,
)

# Moving average over 10 episodes
plt.plot(smooth(episode_durations_full, 10))
plt.title('Episode durations per episode')

In [None]:
# MountainCar, semi gradient: train one model and plot its smoothed episode durations
episode_durations_semi = model_setup_and_training(
    env=mount_env,
    num_episodes=num_episodes,
    num_steps=num_steps,
    batch_size=batch_size,
    memory_capacity=memory_capacity,
    discount_factor=discount_factor,
    learn_rate=learn_rate,
    full_grad=False,
    print_bool=True,
)

# Moving average over 10 episodes
plt.plot(smooth(episode_durations_semi, 10))
plt.title('Episode durations per episode')

## Experiment

In [None]:
# Checking for cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Number of episodes each test is run for, the maximum allowed number of
# steps per episode, the batch size and the remaining hyperparameters.
# NOTE: num_hidden is not set here; it is read from a parameter cell above.
num_episodes=500
num_steps = 1500
batch_size = 64
memory_capacity = 10000
discount_factor = 0.99
learn_rate = 4.561407e-04


## Environments
env = gym.envs.make("MountainCar-v0").env
# env = gym.envs.make("CartPole-v1")
env.seed(seed)

# Create the output directory up front so a missing folder cannot discard
# the results after the (long) training runs have finished.
output_dir = 'test_results_dqn/mountain_final'
os.makedirs(output_dir, exist_ok=True)

results = defaultdict() 
for gradient in [True, False]: 
    name = 'full' if gradient else 'semi'
    for weight_std in [1.0]:
        seed_list = []
        # run_seed (not `seed`) so the notebook-wide `seed` is not overwritten
        for run_seed in range(5): 
            print(f"Running test for gradient={name}, weight_std={weight_std}, seed={run_seed}")
            env.seed(run_seed)
            random.seed(run_seed)
            np.random.seed(run_seed)
            torch.manual_seed(run_seed)

            result = model_setup_and_training(env, num_episodes, num_steps, batch_size, memory_capacity, 
                                              discount_factor, learn_rate, num_hidden, 
                                              weight_std, gradient, False, device)

            seed_list.append(result)

        title = f"{name}-grad_std={weight_std}_gamma={discount_factor}_learn_rate={learn_rate}"

        # One CSV row per configuration; each cell is the stringified list of
        # episode durations of one seed. newline='' is what the csv module
        # requires to avoid blank rows on some platforms.
        with open(f'{output_dir}/{title}.csv', 'w', newline='') as f:   
            write = csv.writer(f) 
            write.writerow(seed_list)  

In [None]:
# Plot the results written by the experiment cell above.
# To get the right plots, env_name must point at the directory of the
# experiment that should be plotted.

# env_name = "mountain"
env_name = "cart_pole"

gather = 10
file_list = glob.glob(f"test_results_dqn/{env_name}_final/*.csv")
semi_file_list = [file for file in file_list if "semi-grad" in file]
full_file_list = [file for file in file_list if "full-grad" in file]

# Make sure the target directory for the figures exists
os.makedirs("plots", exist_ok=True)

for semi_file in semi_file_list:
    # The matching full-gradient file differs only in the file-name prefix
    semi_name = os.path.basename(semi_file)
    full_file = os.path.join(os.path.dirname(semi_file), semi_name.replace("semi", "full"))
    # Part of the file name after "grad_" (e.g. "std=..._gamma=..._learn_rate=...");
    # take the string itself, not the list returned by split, so the saved
    # file name does not contain a list repr
    title = os.path.splitext(semi_name)[0].split("grad_")[-1]
    
    plt.figure(figsize=(6, 4))
    for file in [semi_file, full_file]:
        with open(file, 'r', newline='') as f:
            # Skip empty rows (can appear if the file was written without newline='')
            rows = [row for row in csv.reader(f, delimiter=',') if row]
        # The last row holds one stringified list of episode durations per seed
        seed_list = [[int(res) for res in cell.strip("[]").split(",")] for cell in rows[-1]]
        
        # Gather data points by taking the mean over a certain amount of steps
        # so we can smooth the plots; a trailing partial window is dropped
        gathered_data = []        
        for row in seed_list: 
            usable = len(row) - len(row) % gather
            row = list(np.mean(np.array(row[:usable]).reshape(-1, gather), axis=1))
            gathered_data.append(row)

        result = np.array(gathered_data)
        x = list(range(0, result.shape[1]*gather, gather))
        
        # Mean over seeds with a band of one standard deviation around it
        mu = result.mean(axis=0)
        min_std = mu - result.std(axis=0)
        max_std = mu + result.std(axis=0)
        plt.plot(x, mu)
        plt.fill_between(x, max_std, min_std, alpha=0.3)

    plt.xlabel("episodes")
    plt.ylabel("timesteps")
    plt.title("semi (blue) and full (orange) gradient with std of 1.0")
    plt.savefig(f'plots/{env_name}_{title}.png')
    plt.show()
    


In [None]:
#