https://datascience.stackexchange.com/questions/32480/how-does-generalised-advantage-estimation-work
https://lilianweng.github.io/lil-log/2018/02/19/a-long-peek-into-reinforcement-learning.html#combining-td-and-mc-learning
https://arxiv.org/pdf/1506.02438.pdf
https://github.com/higgsfield/RL-Adventure-2
http://www.breloff.com/DeepRL-OnlineGAE/
https://arxiv.org/pdf/1804.02717.pdf
https://ewrl.files.wordpress.com/2015/02/ewrl12_2015_submission_18.pdf
https://github.com/Kaixhin/Dist-A3C
https://github.com/Kaixhin/Dist-A3C/blob/master/client.py

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.distributions as distributions

import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import numpy as np
import gym

In [33]:
env_name = 'CartPole-v0'
#env_name = 'FrozenLake-v0'
env = gym.make(env_name)

#assert isinstance(env.observation_space, gym.spaces.Box)
#assert isinstance(env.action_space, gym.spaces.Discrete)

In [34]:
SEED = 43
init = True
model_path = ['actor_weights.pt', 'critic_weights.pt']
save = False
env.seed(SEED);
np.random.seed(SEED);
torch.manual_seed(SEED);

In [35]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, dropout = 0.25):
        super().__init__()

        self.fc_1 = nn.Linear(input_dim, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.fc_1(x)
        x = self.dropout(x)
        x = F.relu(x)
        x = self.fc_2(x)
        return x

In [40]:
#INPUT_DIM = env.observation_space.shape[0]
if env_name == 'FrozenLake-v0':
    INPUT_DIM = env.observation_space.n
else:
    INPUT_DIM = env.observation_space.shape[0]
print(INPUT_DIM)
HIDDEN_DIM = 128
OUTPUT_DIM = env.action_space.n

actor = MLP(INPUT_DIM, HIDDEN_DIM, OUTPUT_DIM)
critic = MLP(INPUT_DIM, HIDDEN_DIM, 1)

4


In [41]:
def init_weights(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_normal_(m.weight)
        m.bias.data.fill_(0)

actor.apply(init_weights)
critic.apply(init_weights)
if not init:
    actor.load_state_dict(torch.load(model_path[0]))
    critic.load_state_dict(torch.load(model_path[1]))

In [42]:
LEARNING_RATE = 0.01

actor_optimizer = optim.Adam(actor.parameters(), lr = LEARNING_RATE)
critic_optimizer = optim.Adam(critic.parameters(), lr = LEARNING_RATE)

In [43]:
def train(env, actor, critic, actor_optimizer, critic_optimizer, discount_factor, trace_decay, init):
    
    log_prob_actions = []
    values = []
    rewards = []
    done = False
    episode_reward = 0
    
    state = env.reset()

    while not done:

        state = torch.FloatTensor(state).unsqueeze(0)

        action_preds = actor(state)
        value_pred = critic(state)
                
        action_probs = F.softmax(action_preds, dim = -1)
                
        dist = distributions.Categorical(action_probs)

        action = dist.sample()
        
        log_prob_action = dist.log_prob(action)
        
        state, reward, done, _ = env.step(action.item())

        log_prob_actions.append(log_prob_action)
        values.append(value_pred)
        rewards.append(reward)

        episode_reward += reward
    
    log_prob_actions = torch.cat(log_prob_actions)
    values = torch.cat(values).squeeze(-1)
    
    returns = calculate_returns(rewards, discount_factor)
    advantages = calculate_advantages(rewards, values, discount_factor, trace_decay)
    
    policy_loss, value_loss = update_policy(advantages, log_prob_actions, returns, values, actor_optimizer, critic_optimizer, init)

    return policy_loss, value_loss, episode_reward

def calculate_returns(rewards, discount_factor, normalize = True):
    
    returns = []
    R = 0
    
    for r in reversed(rewards):
        R = r + R * discount_factor
        returns.insert(0, R)
        
    returns = torch.tensor(returns)
    
    if normalize:
        
        returns = (returns - returns.mean()) / returns.std()
        
    return returns

def calculate_advantages(rewards, values, discount_factor, trace_decay, normalize = True):
    
    advantages = []
    advantage = 0
    next_value = 0
    
    for r, v in zip(reversed(rewards), reversed(values)):
        td_error = r + next_value * discount_factor - v
        advantage = td_error + advantage * discount_factor * trace_decay
        next_value = v
        advantages.insert(0, advantage)
        
    advantages = torch.tensor(advantages)
    advantages *= 1 - trace_decay
    if normalize:
        advantages = (advantages - advantages.mean()) / advantages.std()
        
    return advantages

def update_policy(advantages, log_prob_actions, returns, values, actor_optimizer, critic_optimizer, init):
        
    advantages = advantages.detach()
    returns = returns.detach()
    policy_loss = - (advantages * log_prob_actions).mean()
    
    value_loss = F.smooth_l1_loss(returns, values).mean()
        
    actor_optimizer.zero_grad()
    critic_optimizer.zero_grad()
    
    policy_loss.backward(retain_graph=True)
    value_loss.backward(retain_graph=True)
    if init:
        actor_optimizer.step()
        critic_optimizer.step()
    
    return policy_loss.item(), value_loss.item()

def save_results(mean_return, std_return, timestamps, time_start, seed, env_name, init, name='gae'):
    run_dict = {'name': name, 
                'avg_ret': mean_return,
                'std_dev': std_return,
                'timestamps': timestamps,
                'time_start': time_start,
                'seed': seed,
                'env_name': env_name}
    if not init:
        filename = 'run_time_%s_%s_%s.pickle' % (name, seed, 'pretrained')
    else:
        filename = 'run_time_%s_%s.pickle' % (name, seed)
    with open(filename, 'wb') as handle:
        pickle.dump(run_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

import time
import pickle

MAX_EPISODES = 500
DISCOUNT_FACTOR = 0.99
TRACE_DECAY = 0.84
N_TRIALS = 25
PRINT_EVERY = 10
REWARD_THRESHOLD = 195

episode_rewards = []
timestamps = []
time_start = time.time()
avg_ret = []
std_dev = []
for episode in range(1, MAX_EPISODES+1):
    
    policy_loss, value_loss, episode_reward = train(env, actor, critic, actor_optimizer, critic_optimizer, DISCOUNT_FACTOR, TRACE_DECAY, init)

    episode_rewards.append(episode_reward)
    mean_trial_rewards = np.mean(episode_rewards[-N_TRIALS:])
    std_trial_rewards = np.std(episode_rewards[-N_TRIALS:])
    
    timestamps.append(time.time())
    avg_ret.append(mean_trial_rewards)
    std_dev.append(std_trial_rewards)
    
    if episode % PRINT_EVERY == 0:
    
        print(f'| Episode: {episode:4} | Mean Rewards: {episode_reward:6.2f} ')
    
    if mean_trial_rewards >= REWARD_THRESHOLD:
        
        print(f'Reached reward threshold in {episode} episodes')
        
        break

        
save_results(avg_ret, std_dev, timestamps, time_start, SEED, env_name, init)
if save == True:
    torch.save(actor.state_dict(), 'actor_weights.pt')
    torch.save(critic.state_dict(), 'critic_weights.pt')

| Episode:   10 | Mean Rewards:  13.00 
| Episode:   20 | Mean Rewards:  29.00 
| Episode:   30 | Mean Rewards:  34.00 
| Episode:   40 | Mean Rewards:  34.00 
| Episode:   50 | Mean Rewards:  74.00 
| Episode:   60 | Mean Rewards: 109.00 
| Episode:   70 | Mean Rewards:  87.00 
| Episode:   80 | Mean Rewards: 127.00 
| Episode:   90 | Mean Rewards: 200.00 
| Episode:  100 | Mean Rewards: 200.00 
Reached reward threshold in 108 episodes


In [None]:
plt.figure(figsize=(12,8))
plt.plot(episode_rewards)
plt.xlabel('Episode', fontsize=20)
plt.ylabel('Reward', fontsize=20)
plt.hlines(REWARD_THRESHOLD, 0, len(episode_rewards), color='r')
plt.grid()