In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import gym

In [2]:
env_name='CartPole-v0'
train_env = gym.make(env_name)
test_env = gym.make(env_name)

assert isinstance(train_env.observation_space, gym.spaces.Box)
assert isinstance(train_env.action_space, gym.spaces.Discrete)

In [3]:
SEED = 1234

train_env.seed(SEED);
test_env.seed(SEED);
np.random.seed(SEED);
torch.manual_seed(SEED);

In [4]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.0):
        super().__init__()

        self.fc_1 = nn.Linear(input_dim, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.fc_1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.fc_2(x)
        return x

In [5]:
INPUT_DIM = train_env.observation_space.shape[0]
HIDDEN_DIM = 256
OUTPUT_DIM = train_env.action_space.n

actor = MLP(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)
critic = MLP(INPUT_DIM, HIDDEN_DIM, 1)

In [6]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        m.bias.data.fill_(0)
        
actor.apply(init_weights)
critic.apply(init_weights)

MLP(
  (fc_1): Linear(in_features=4, out_features=256, bias=True)
  (fc_2): Linear(in_features=256, out_features=1, bias=True)
  (dropout): Dropout(p=0.0, inplace=False)
)

In [7]:
LEARNING_RATE = 0.01

actor_optimizer = optim.Adam(actor.parameters(), lr=3e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=3e-4)

In [8]:
def train(env, actor, critic, actor_optimizer, critic_optimizer, n_steps, discount_factor):
    
    log_prob_actions = torch.zeros(n_steps)
    entropies = torch.zeros(n_steps)
    values = torch.zeros(n_steps)
    rewards = torch.zeros(n_steps)
    masks = torch.zeros(n_steps)
    episode_reward = 0

    state = env.state

    for step in range(n_steps):

        state = torch.FloatTensor(state).unsqueeze(0)
        
        action_preds = actor(state)
        value_pred = critic(state).squeeze(-1)

        action_probs = F.softmax(action_preds, dim = -1)
                
        dist = distributions.Categorical(action_probs)

        action = dist.sample()
        
        log_prob_action = dist.log_prob(action)
        
        entropy = dist.entropy()
        
        state, reward, done, _ = env.step(action.item())

        log_prob_actions[step] = log_prob_action
        entropies[step] = entropy
        values[step] = value_pred
        rewards[step] = reward
        masks[step] = 1 - done
    
        if done:
            state = env.reset()
    
    next_value = critic(torch.FloatTensor(state).unsqueeze(0)).squeeze(-1)
    returns = calculate_returns(rewards, next_value, masks, discount_factor)
    advantages = calculate_advantages(returns, values)
    
    policy_loss, value_loss = update_policy(advantages, log_prob_actions, returns, values, entropies, actor_optimizer, critic_optimizer)

    return policy_loss, value_loss

In [9]:
def calculate_returns(rewards, next_value, masks, discount_factor, normalize = False):
    
    returns = torch.zeros_like(rewards)
    R = next_value.item()
    
    for i, (r, m) in enumerate(zip(reversed(rewards), reversed(masks))):
        R = r + R * discount_factor * m
        returns[i] = R
    
    if normalize:
        
        returns = (returns - returns.mean()) / returns.std()
        
    return returns

In [10]:
def calculate_advantages(returns, values, normalize = False):
    
    advantages = returns - values
    
    if normalize:
        
        advantages = (advantages - advantages.mean()) / advantages.std()
        
    return advantages

In [11]:
def update_policy(advantages, log_prob_actions, returns, values, entropies, actor_optimizer, critic_optimizer):
        
    advantages = advantages.detach()
    returns = returns.detach()
        
    policy_loss = - (advantages * log_prob_actions).mean() - 0.001 * entropies.mean()
    
    value_loss = 0.5 * F.smooth_l1_loss(returns, values).mean()
        
    actor_optimizer.zero_grad()
    critic_optimizer.zero_grad()
    
    policy_loss.backward()
    value_loss.backward()
    
    actor_optimizer.step()
    critic_optimizer.step()
    
    return policy_loss.item(), value_loss.item()

In [12]:
def evaluate(env, actor, critic):
    
    done = False
    episode_reward = 0
    
    state = env.reset()
    
    while not done:
        
        state = torch.FloatTensor(state).unsqueeze(0)
        
        action_preds = actor(state)
        
        action_probs = F.softmax(action_preds, dim = -1)
        
        dist = distributions.Categorical(action_probs)

        action = dist.sample() 
        
        state, reward, done, _ = env.step(action.item())
        
        episode_reward += reward
        
    return episode_reward

In [13]:
import pickle 
def save_results(rewards, std_dev, timestamps, seed, time_start, env_name, n_run, n_steps, name='A2C'):
    run_dict = {'name': name, 
                'avg_ret': rewards,
                'std_dev': std_dev,
                'time_start': time_start, 
                'timestamps': timestamps,
                'seed': seed,
                'n_run': n_run,
                'env_name': env_name,
                'n_steps': n_steps
               }
    
    filename = 'run_time_%s_%s_%i.pickle' % (name, seed, n_steps)
    with open(filename, 'wb') as handle:
        pickle.dump(run_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [14]:
import time
def train_one_agent(MAX_STEPS, N_UPDATE_STEPS, DISCOUNT_FACTOR):
    _ = train_env.reset()
    actor.apply(init_weights)
    critic.apply(init_weights)
    
    episode_rewards = []
    timestamps = []
    time_start = time.time()
    for step in range(MAX_STEPS):

        policy_loss, value_loss = train(train_env, actor, critic, actor_optimizer, critic_optimizer, N_UPDATE_STEPS, DISCOUNT_FACTOR)
        e_r = evaluate(test_env, actor, critic)
        episode_rewards.append(e_r)
        if step % 100 == 0:
            print('Step: {}, Rewards: {}'.format(step,e_r))
        timestamps.append(time.time())
        
    return time_start, timestamps, episode_rewards

In [15]:
MAX_STEPS = 500
N_UPDATE_STEPS =  32
DISCOUNT_FACTOR = 0.99

list_time_start = []
list_timestamps = []
list_rewards = []

num_agents = 10

for run in tqdm(range(num_agents)):

    time_start, timestamps, episode_rewards = train_one_agent(MAX_STEPS, N_UPDATE_STEPS, DISCOUNT_FACTOR)

    list_time_start.append(time_start)
    list_timestamps.append(timestamps)
    list_rewards.append(episode_rewards)
    
mean_trial_rewards = np.array(list_rewards).mean(axis=0).tolist()
std_trial_rewards = np.array(list_rewards).std(axis=0).tolist()
timestamps = np.array(list_timestamps).mean(axis=0).tolist()
time_start = np.array(list_time_start).mean().tolist()

save_results(mean_trial_rewards, std_trial_rewards, timestamps, SEED, time_start, env_name, num_agents, N_UPDATE_STEPS)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Step: 0, Rewards: 20.0
Step: 100, Rewards: 16.0
Step: 200, Rewards: 17.0
Step: 300, Rewards: 22.0
Step: 400, Rewards: 11.0
Step: 500, Rewards: 31.0
Step: 600, Rewards: 23.0
Step: 700, Rewards: 12.0
Step: 800, Rewards: 137.0
Step: 900, Rewards: 107.0
Step: 1000, Rewards: 28.0
Step: 1100, Rewards: 124.0
Step: 1200, Rewards: 119.0
Step: 1300, Rewards: 51.0
Step: 1400, Rewards: 200.0
Step: 1500, Rewards: 200.0
Step: 1600, Rewards: 135.0
Step: 1700, Rewards: 64.0
Step: 1800, Rewards: 200.0
Step: 1900, Rewards: 122.0
Step: 2000, Rewards: 200.0
Step: 2100, Rewards: 200.0
Step: 2200, Rewards: 200.0
Step: 2300, Rewards: 156.0
Step: 2400, Rewards: 191.0
Step: 0, Rewards: 12.0
Step: 100, Rewards: 12.0
Step: 200, Rewards: 13.0
Step: 300, Rewards: 29.0
Step: 400, Rewards: 27.0
Step: 500, Rewards: 24.0
Step: 600, Rewards: 24.0
Step: 700, Rewards: 27.0
Step: 800, Rewards: 45.0
Step: 900, Rewards: 98.0
Step: 1000, Rewards: 63.0
Step: 1100, Rewards: 18.0
Step: 1200, Rewards: 49.0
Step: 1300, Rewards: 1