# REINFORCE in pytorch

Just like we did before for Q-learning, this time we'll design a PyTorch network to learn `MountainCar-v0` via policy gradient (REINFORCE).

Most of the code in this notebook is taken from the approximate Q-learning notebook, so you'll find it more or less familiar and even simpler.

In [1]:
%env THEANO_FLAGS = 'floatX=float32'
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    %env DISPLAY = : 1

env: THEANO_FLAGS='floatX=float32'


In [2]:
import gym
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Create the MountainCar-v0 environment and keep the `.env` attribute of what gym.make returns.
# NOTE(review): presumably `.env` drops gym's time-limit wrapper so that episode length is
# bounded only by `t_max` in generate_session — confirm for the installed gym version.
env = gym.make("MountainCar-v0").env

In [4]:
# Reset once to get an initial observation and cache the sizes the network needs.
state = env.reset()
n_actions = env.action_space.n  # number of discrete actions
state_dim = env.observation_space.shape  # observation shape tuple, shown as (2,) in the next cell

# Featurizing state

In [5]:
# Display the observation shape read from the environment.
state_dim

(2,)

# Building the network for REINFORCE

For the REINFORCE algorithm, we'll need a model that predicts action probabilities given states. Let's define such a model below.

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
class Network(nn.Module):
    """
    Policy network: one hidden ReLU layer followed by a linear layer that
    outputs unnormalized action logits.

    Earlier revisions also allocated four extra layers (dense2..dense5) that
    `forward` never called; they only added unused parameters to the optimizer
    and have been removed.

    :param in_features: length of the state vector; defaults to the
        notebook-level `state_dim[0]`
    :param out_features: number of logits (one per action); defaults to the
        notebook-level `n_actions`
    :param hidden_size: width of the hidden layer
    """

    def __init__(self, in_features=None, out_features=None, hidden_size=200):
        super(Network, self).__init__()
        # Fall back to the notebook globals so that `Network()` keeps working.
        if in_features is None:
            in_features = state_dim[0]
        if out_features is None:
            out_features = n_actions

        # Attribute names are kept from the previous version of the class.
        self.dense1 = nn.Linear(in_features, hidden_size)
        self.dense6 = nn.Linear(hidden_size, out_features)

    def forward(self, x):
        """
        :param x: float tensor of states, shape [batch, in_features]
        :returns: tensor of logits, shape [batch, out_features] (no softmax applied)
        """
        x = F.relu(self.dense1(x))
        return self.dense6(x)

In [8]:
# Instantiate the policy network; predict_proba and train_on_session read this global `agent`.
agent = Network()

#### Predict function

In [9]:
def predict_proba(states, model=None):
    """ 
    Predict action probabilities given states.

    :param states: numpy array of shape [batch, state_shape]
    :param model: callable mapping a float32 tensor of states to logits;
        defaults to the notebook-level `agent`
    :returns: numpy array of shape [batch, n_actions]
    """
    if model is None:
        model = agent

    # Go through numpy first: works for arrays as well as lists of arrays.
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)

    # Pure inference: no autograd graph is needed for sampling actions.
    with torch.no_grad():
        logits = model(states)
        # Normalize over the action axis explicitly; calling softmax without
        # `dim` is deprecated and emits a warning on every call.
        policy = F.softmax(logits, dim=-1)
    return policy.numpy()

In [10]:
# Sanity-check predict_proba on a few freshly reset states.
test_states = np.array([env.reset() for _ in range(5)])
test_probas = predict_proba(test_states)
assert isinstance(
    test_probas, np.ndarray), "you must return np array and not %s" % type(test_probas)
# The shape is wrapped in a 1-tuple: "%s" % (5, 3) would raise TypeError
# ("not all arguments converted") instead of reporting the wrong shape.
assert tuple(test_probas.shape) == (
    test_states.shape[0], n_actions), "wrong output shape: %s" % (np.shape(test_probas),)
assert np.allclose(np.sum(test_probas, axis=1),
                   1), "probabilities do not sum to 1"

  # Remove the CWD from sys.path while we load stuff.


### Play the game

We can now use our newly built agent to play the game.

In [11]:
def generate_session(t_max=1000):
    """
    Roll out a single episode in the global `env`, sampling every action from
    the agent's current policy. No weights are updated here.

    :param t_max: upper bound on the number of environment steps
    :returns: three parallel lists (states, actions, rewards); each stored
        reward is the environment reward plus sin(3 * s[0]) + 2, where s is
        the state the action was taken in
    """
    visited_states = []
    chosen_actions = []
    shaped_rewards = []

    observation = env.reset()

    for _step in range(t_max):
        # pi(a|s) for the current observation, evaluated as a batch of one
        policy = predict_proba(np.array([observation]))[0]
        action = np.random.choice(n_actions, p=policy)

        next_observation, reward, done, _info = env.step(action)

        # keep the transition for the training step that follows the episode
        visited_states.append(observation)
        chosen_actions.append(action)
        shaped_rewards.append(reward + np.sin(3*observation[0]) + 2)

        if done:
            break
        observation = next_observation

    return visited_states, chosen_actions, shaped_rewards

In [12]:
# test it: play one episode with the current agent; `rewards` is reused by a later check cell
states, actions, rewards = generate_session()

  # Remove the CWD from sys.path while we load stuff.


### Computing cumulative rewards

In [13]:
def get_cumulative_rewards(rewards,  # rewards at each step
                           gamma=0.99  # discount for reward
                           ):
    """
    Take a sequence of immediate rewards r(s,a) for the whole session and
    compute cumulative discounted returns (a.k.a. G(s,a) in Sutton '16):

        G_t = r_t + gamma*r_{t+1} + gamma^2*r_{t+2} + ...

    The returns are computed in a single backward pass using the recurrence
    G_t = r_t + gamma*G_{t+1}, so the cost is linear in the session length
    (the previous implementation re-summed the tail for every step, which is
    quadratic and noticeable for sessions of thousands of steps).

    :param rewards: indexable sequence of per-step rewards
    :param gamma: discount factor
    :returns: list of cumulative returns with as many elements as `rewards`
        (an empty list for an empty session)
    """
    n_steps = len(rewards)
    returns = [0] * n_steps

    running_return = 0  # G_{t+1}; zero past the end of the session
    for t in reversed(range(n_steps)):
        running_return = rewards[t] + gamma * running_return
        returns[t] = running_return

    return returns

In [14]:
# Smoke-test on the rewards of the session recorded earlier (result is discarded).
get_cumulative_rewards(rewards)

# Known-answer checks for a few hand-computed discounted sums.
sample_returns = get_cumulative_rewards([0, 0, 1, 0, 0, 1, 0], gamma=0.9)
print(sample_returns)

n_returns = len(get_cumulative_rewards(list(range(100))))
assert n_returns == 100, "check cumulative reward length"

assert np.allclose(
    sample_returns,
    [1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0])
assert np.allclose(
    get_cumulative_rewards([0, 0, 1, -2, 3, -4, 0], gamma=0.5),
    [0.0625, 0.125, 0.25, -1.5, 1.0, -4.0, 0.0])
assert np.allclose(
    get_cumulative_rewards([0, 0, 1, 2, 3, 4, 0], gamma=0),
    [0, 0, 1, 2, 3, 4, 0])
print("looks good!")

[1.40049, 1.5561, 1.729, 0.81, 0.9, 1.0, 0.0]
looks good!


#### Loss function and updates

We now need to define the objective and the update rule for the policy gradient.

Our objective function is

$$ J \approx  { 1 \over N } \sum  _{s_i,a_i} \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$


Following the REINFORCE algorithm, we can define our objective as follows: 

$$ \hat J \approx { 1 \over N } \sum  _{s_i,a_i} \log \pi_\theta (a_i | s_i) \cdot G(s_i,a_i) $$

When you compute the gradient of that function with respect to the network weights $ \theta $, it will become exactly the policy gradient.


In [15]:
def to_one_hot(y_tensor, n_dims=None):
    """
    Helper: turn a vector of integer class indices into a one-hot matrix.

    :param y_tensor: tensor of integer indices (flattened to one index per row)
    :param n_dims: number of columns; when None it is inferred as max index + 1
    :returns: float tensor of shape [len(y_tensor), n_dims] with a single 1 per row
    """
    indices = y_tensor.type(torch.LongTensor).view(-1, 1)

    if n_dims is None:
        n_dims = int(torch.max(indices)) + 1

    one_hot = torch.zeros(indices.size()[0], n_dims)
    # write a 1 into the column selected by each row's index
    one_hot.scatter_(1, indices, 1)
    return one_hot

In [16]:
# Your code: define optimizers
# Adam over every parameter of the global `agent`; train_on_session steps this optimizer.
# NOTE(review): lr=1e-5 is far below Adam's default learning rate (1e-3) — confirm this is intended.
opt = torch.optim.Adam(agent.parameters(), lr=1e-5)


def train_on_session(states, actions, rewards, gamma=0.99, entropy_coef=0.1):
    """
    Take the sequences of states, actions and rewards produced by
    generate_session and perform one policy-gradient update of the global
    `agent` using the global optimizer `opt`.

    The minimized loss is  -J_hat - entropy_coef * H,  where J_hat is the
    REINFORCE surrogate mean(log pi(a_i|s_i) * G_i) and H is the mean policy
    entropy, so a positive `entropy_coef` encourages exploration.

    :param states: sequence of state vectors, one per step
    :param actions: sequence of integer actions, one per step
    :param rewards: sequence of immediate rewards, one per step
    :param gamma: discount factor passed to get_cumulative_rewards
    :param entropy_coef: weight of the entropy bonus
    :returns: sum of `rewards` for the session (for logging)
    """

    # cast everything into torch tensors (via numpy, which also handles lists of arrays)
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int32)
    cumulative_returns = np.array(get_cumulative_rewards(rewards, gamma))
    cumulative_returns = torch.as_tensor(cumulative_returns, dtype=torch.float32)

    # predict logits, probas and log-probas using an agent.
    # `dim` is given explicitly: the implicit form is deprecated and warns on every call.
    logits = agent(states)
    probas = F.softmax(logits, dim=-1)
    logprobas = F.log_softmax(logits, dim=-1)

    assert all(isinstance(v, torch.Tensor) for v in [logits, probas, logprobas]), \
        "please use compute using torch tensors and don't use predict_proba function"

    # select log-probabilities for chosen actions, log pi(a_i|s_i).
    # The one-hot width is taken from the logits: inferring it from the largest
    # sampled action breaks whenever the highest-index action was never taken.
    action_mask = to_one_hot(actions, n_dims=logits.shape[1])
    logprobas_for_actions = torch.sum(logprobas * action_mask, dim=1)

    # REINFORCE objective function
    J_hat = torch.mean(logprobas_for_actions*cumulative_returns)

    # Mean entropy of the policy, H = -sum_a p(a) log p(a) averaged over steps.
    # log_softmax output is used instead of log(probas) to avoid 0 * -inf = NaN.
    entropy = -torch.mean(torch.sum(probas * logprobas, dim=1))

    # Maximize J_hat and entropy, i.e. minimize their negation.
    loss = - J_hat - entropy_coef * entropy

    # Gradient descent step. Gradients are cleared first so that a previously
    # interrupted update cannot leak stale gradients into this one.
    opt.zero_grad()
    loss.backward()
    opt.step()

    # technical: return session rewards to print them later
    return np.sum(rewards)

### The actual training

In [20]:
%%time
for i in range(100):

    rewards = [train_on_session(*generate_session(t_max=2000))
               for _ in range(100)]  # generate new sessions

    print("mean reward:%.3f" % (np.mean(rewards)))

    if np.mean(rewards) > 90:
        print("You Win!")  # but you can train even further

  # Remove the CWD from sys.path while we load stuff.


mean reward:8.024
mean reward:8.583
mean reward:7.421
mean reward:7.727
mean reward:8.366


KeyboardInterrupt: 

In [None]:
def generate_session(t_max=1000):
    """
    Roll out a single episode in the global `env`, sampling every action from
    the agent's current policy. No weights are updated here.

    NOTE(review): this cell redefines the `generate_session` already defined
    earlier in the notebook with the same logic; running it shadows that
    definition.

    :param t_max: upper bound on the number of environment steps
    :returns: three parallel lists (states, actions, rewards); each stored
        reward is the environment reward plus sin(3 * s[0]) + 2, where s is
        the state the action was taken in
    """
    visited_states = []
    chosen_actions = []
    shaped_rewards = []

    observation = env.reset()

    for _step in range(t_max):
        # pi(a|s) for the current observation, evaluated as a batch of one
        policy = predict_proba(np.array([observation]))[0]
        action = np.random.choice(n_actions, p=policy)

        next_observation, reward, done, _info = env.step(action)

        # keep the transition for the training step that follows the episode
        visited_states.append(observation)
        chosen_actions.append(action)
        shaped_rewards.append(reward + np.sin(3*observation[0]) + 2)

        if done:
            break
        observation = next_observation

    return visited_states, chosen_actions, shaped_rewards

### Video

In [18]:
# record sessions
import gym.wrappers

# Rebind the global `env` to a Monitor-wrapped MountainCar-v0 that writes into ./videos,
# so generate_session plays through the recording wrapper.
# NOTE(review): presumably force=True lets Monitor overwrite earlier recordings — confirm.
env = gym.wrappers.Monitor(gym.make("MountainCar-v0"),
                           directory="videos", force=True)
try:
    sessions = [generate_session() for _ in range(100)]
finally:
    # Close the monitor even if a rollout fails or the cell is interrupted.
    env.close()

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [19]:
# show video
from IPython.display import HTML
import os

# os.listdir returns entries in arbitrary order, so sort the names to make
# the choice of "last" video deterministic across runs.
video_names = sorted(
    name for name in os.listdir("./videos/") if name.endswith(".mp4"))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1]))  # last video by file name. Try other indices