In [2]:
import torch 
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import gym
import os
import time
from multiprocessing import Process
%matplotlib inline

In [3]:
class MLP(nn.Module):
    """Fully connected feed-forward network.

    The network consists of ``n_layers`` hidden ``nn.Linear`` layers of width
    ``size`` (each followed by ``activation``) and one linear output layer.

    Args:
        input_size: Number of input features.
        n_layers: Number of hidden layers. ``0`` yields a single linear map
            from ``input_size`` to ``output_size``.
        size: Width of every hidden layer.
        output_size: Number of output features.
        activation: Callable applied after every hidden layer. Defaults to
            ReLU when ``None``.
        output_activ: Optional callable applied to the output layer's result.
            ``None`` (default) leaves the output linear.

    Raises:
        ValueError: If ``n_layers`` is negative.
    """

    def __init__(self, input_size, n_layers, size, output_size,
                 activation=None, output_activ=None):
        # nn.Module must be initialised before any submodule is assigned.
        super().__init__()
        if n_layers < 0:
            raise ValueError("n_layers must be non-negative, got %r" % (n_layers,))

        self.n_layers = n_layers
        self.size = size
        self.input_size = input_size
        self.output_size = output_size

        # `relu` is kept as an attribute because the original class exposed it.
        self.relu = nn.ReLU()
        self.activation = activation if activation is not None else self.relu
        self.output_activ = output_activ

        # Create the nn Linear Layers: input -> size -> ... -> size -> output
        widths = [input_size] + [size] * n_layers
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(w_in, w_out) for w_in, w_out in zip(widths[:-1], widths[1:])]
        )
        self.output_layer = nn.Linear(widths[-1], output_size)

    def forward(self, x):
        """Apply the hidden layers, then the output layer, to ``x``."""
        for layer in self.hidden_layers:
            x = self.activation(layer(x))

        output = self.output_layer(x)
        if self.output_activ is not None:
            output = self.output_activ(output)
        return output

In [5]:
def path_length(path):
    """Return the number of entries stored under the path's 'reward' key."""
    rewards = path['reward']
    return len(rewards)

def normalize(data, mean=0.0, std=1.0):
    """Standardize ``data`` and rescale it to the requested statistics.

    The data is centred on its own mean and divided by its own standard
    deviation, then multiplied by ``std`` and shifted by ``mean``. A small
    epsilon (1e-8) is added to both standard deviations; in the divisor it
    guards against division by zero for constant input.
    """
    eps = 1e-8
    centered = data - np.mean(data)
    standardized = centered / (np.std(data) + eps)
    return standardized * (std + eps) + mean

In [7]:
# Training the policy gradient
def _bind_optimizer(optimizer, params, learning_rate):
    """Return an optimizer that updates ``params``.

    ``optimizer`` is either an already constructed ``torch.optim.Optimizer``
    (``params`` are then registered as an additional parameter group and the
    optimizer's own defaults, including its learning rate, apply) or a factory
    such as ``torch.optim.Adam``, called as ``optimizer(params, lr=learning_rate)``.
    """
    params = list(params)
    if isinstance(optimizer, torch.optim.Optimizer):
        optimizer.add_param_group({"params": params})
        return optimizer
    return optimizer(params, lr=learning_rate)


def _reset_env(env, seed=None):
    """Reset ``env`` (optionally seeding it) and return only the observation."""
    if seed is None:
        result = env.reset()
    else:
        try:
            result = env.reset(seed=seed)
        except TypeError:
            # Older gym releases: reset() accepts no seed, use the legacy seed().
            env.seed(seed)
            result = env.reset()
    # Newer gym releases return (observation, info) instead of the observation.
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Step ``env`` and return ``(observation, reward, done)`` for both gym step APIs."""
    result = env.step(action)
    if len(result) == 5:
        ob, rew, terminated, truncated, _ = result
        return ob, rew, bool(terminated or truncated)
    ob, rew, done, _ = result
    return ob, rew, done


def _sample_action(policy_network, ob, discrete, logstd):
    """Sample an action for one observation.

    Returns ``(action, logprob)`` where ``action`` is in the form ``env.step``
    expects (Python int for discrete spaces, 1-D numpy array otherwise) and
    ``logprob`` is a differentiable tensor of shape ``(1,)``.
    """
    ob_tensor = torch.as_tensor(np.asarray(ob, dtype=np.float32)).unsqueeze(0)
    if discrete:
        # The network outputs unnormalised logits over the actions.
        dist = torch.distributions.Categorical(logits=policy_network(ob_tensor))
        sampled = dist.sample()
        return int(sampled.item()), dist.log_prob(sampled)
    # Diagonal Gaussian: network outputs the mean, logstd is a free parameter.
    dist = torch.distributions.Normal(policy_network(ob_tensor), torch.exp(logstd))
    sampled = dist.sample()
    logprob = dist.log_prob(sampled).sum(dim=-1)
    return sampled[0].detach().cpu().numpy(), logprob


def _discounted_returns(rewards, gamma, reward_to_go):
    """Return per-timestep Q estimates for one path.

    With ``reward_to_go`` each entry is the discounted sum of rewards from that
    step onward; otherwise every entry is the discounted return of the whole path.
    """
    running = 0.0
    returns = []
    # Dynamic programming over the reversed path.
    for rew in reversed(rewards):
        running = rew + gamma * running
        returns.append(running)
    returns.reverse()
    if returns and not reward_to_go:
        returns = [returns[0]] * len(returns)
    return returns


def train_policy_gradient(exp_name, env_name, optimizer, critic_optimizer, n_epochs=100, gamma=1.0, min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32):
    """Train a policy with the vanilla policy-gradient (REINFORCE) estimator.

    Args:
        exp_name: Experiment name. Accepted for interface compatibility; not
            used by this function.
        env_name: Gym environment id passed to ``gym.make``.
        optimizer: Optimizer for the policy. Either a ``torch.optim.Optimizer``
            instance (the policy parameters are added as a new param group) or
            an optimizer factory/class called as ``optimizer(params, lr=learning_rate)``.
        critic_optimizer: Same as ``optimizer`` but for the value baseline.
            Only used when ``nn_baseline`` is true.
        n_epochs: Number of policy updates.
        gamma: Discount factor.
        min_timesteps_per_batch: Paths are collected until the batch holds more
            than this many timesteps.
        max_path_length: Cap on episode length; falls back to the
            environment spec's ``max_episode_steps``.
        learning_rate: Learning rate used when an optimizer factory is given.
        reward_to_go: Use reward-to-go instead of full-path returns.
        animate: Accepted for interface compatibility; not used by this function.
        logdir: Accepted for interface compatibility; not used by this function.
        normalize_advantages: Standardize advantages to zero mean, unit std.
        nn_baseline: Fit and subtract a neural-network value baseline.
        seed: Seed for torch, numpy and the environment.
        n_layers: Hidden layers of the policy/value networks.
        size: Hidden layer width of the policy/value networks.

    Returns:
        The trained policy network.
    """
    start = time.time()

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Setup the openai gym environment
    env = gym.make(env_name)

    # Is this env discrete or continuous ?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length of episodes (may be None when the env defines no limit)
    max_eps_length = max_path_length or env.spec.max_episode_steps

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    policy_network = MLP(input_size=ob_dim, n_layers=n_layers, size=size, output_size=ac_dim)
    policy_params = list(policy_network.parameters())
    logstd = None
    if not discrete:
        # Created once so it persists across steps and is trained with the policy.
        logstd = nn.Parameter(torch.zeros(ac_dim))
        policy_params.append(logstd)
    optimizer = _bind_optimizer(optimizer, policy_params, learning_rate)

    value_function_approximator = None
    baseline_loss = None
    if nn_baseline:
        # Built once up front so it exists before its first prediction and
        # keeps what it learned between epochs.
        value_function_approximator = MLP(input_size=ob_dim, n_layers=n_layers, size=size, output_size=1)
        critic_optimizer = _bind_optimizer(
            critic_optimizer, value_function_approximator.parameters(), learning_rate)
        baseline_loss = nn.MSELoss()

    total_timesteps = 0
    pending_seed = seed  # the environment is seeded on its first reset only

    # Training Loop
    for epoch in range(n_epochs):
        print("********** Iteration %i ************"%epoch)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = _reset_env(env, pending_seed)
            pending_seed = None
            obs, acs, rewards, log_probs = [], [], [], []
            steps = 0
            while True:
                obs.append(ob)
                # Get the action from the policy network
                ac, logprob = _sample_action(policy_network, ob, discrete, logstd)
                acs.append(ac)
                log_probs.append(logprob)
                # Step through the simulator to get the next state and reward
                ob, rew, done = _step_env(env, ac)
                rewards.append(rew)
                steps += 1
                if done or (max_eps_length is not None and steps >= max_eps_length):
                    break
            path = {"observation": np.array(obs, dtype=np.float32),
                    "reward": np.array(rewards),
                    "action": np.array(acs),
                    # Kept as a tensor so gradients can flow back to the policy.
                    "log_probs": torch.cat(log_probs)}
            paths.append(path)
            timesteps_this_batch += path_length(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        ob_no = np.concatenate([path["observation"] for path in paths])
        logprob_n = torch.cat([path["log_probs"] for path in paths])

        # Computing the Q Values
        q_n = np.array(
            [q for path in paths
             for q in _discounted_returns(path["reward"], gamma, reward_to_go)],
            dtype=np.float32)

        ob_tensor_no = torch.as_tensor(ob_no, dtype=torch.float32)
        if nn_baseline:
            with torch.no_grad():
                b_n = value_function_approximator(ob_tensor_no).squeeze(-1).numpy()
            # The critic is trained on standardized targets; map its
            # predictions back to the scale of the current Q values.
            b_n = normalize(b_n, np.mean(q_n), np.std(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n

        if normalize_advantages:
            adv_n = normalize(adv_n, 0.0, 1.0)

        if nn_baseline:
            critic_optimizer.zero_grad()
            q_normalized_n = torch.as_tensor(normalize(q_n), dtype=torch.float32)
            predicted = value_function_approximator(ob_tensor_no).squeeze(-1)
            loss_b = baseline_loss(predicted, q_normalized_n)
            loss_b.backward()
            critic_optimizer.step()

        # Final loss function: minimising -E[log pi(a|s) * A] ascends the
        # policy-gradient objective. Advantages are constants w.r.t. the policy.
        optimizer.zero_grad()
        adv_tensor = torch.as_tensor(np.asarray(adv_n), dtype=torch.float32)
        loss = -torch.mean(logprob_n * adv_tensor)
        loss.backward()
        optimizer.step()

        avg_return = float(np.mean([np.sum(path["reward"]) for path in paths]))
        print("AverageReturn %.3f | TimestepsThisBatch %i | TimestepsSoFar %i | Time %.1fs"
              % (avg_return, timesteps_this_batch, total_timesteps, time.time() - start))

    return policy_network