# REINFORCE and Actor-Critic

## 0. Setups

Import required libraries

In [1]:
import numpy as np
import gym
import time
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

from sklearn.linear_model import LinearRegression, Lasso

# Set this to True when running on Google Colab: it gates the `if colab:` cells
# further down (package installation and video rendering).
colab = False

Select hardware to use - GPU or CPU

In [2]:
# The notebook is pinned to the CPU. Set `use_gpu` to True to let CUDA be
# selected automatically whenever it is available. With the flag off, the CUDA
# probe is skipped entirely (short-circuit), so no driver warning is emitted.
use_gpu = False
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(device)

cpu


  return torch._C._cuda_getDeviceCount() > 0


For rendering **[COLAB USE ONLY!]**

In [None]:
# Colab-only dependency installation; the whole block is skipped when `colab` is False.
# Most commands discard their output by redirecting it to /dev/null.
if colab:
    # Presumably the pieces needed for headless rendering / video capture
    # (virtual display, OpenGL bindings, ffmpeg) - TODO confirm.
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    # NOTE(review): the package index is refreshed only after the install above;
    # confirm this ordering is intended.
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    # Unlike its neighbours, this command only merges stderr into stdout, so its output is shown.
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1
    # NOTE(review): only 'CartPole-v1' is created in the visible notebook, so the
    # Box2D packages below look optional - confirm before removing.
    !pip3 install box2d-py
    !pip3 install gym[Box_2D]

Build Environment and check MDP size

In [3]:
# One seed for every random number generator the notebook imports, so that a
# fresh "Restart & Run All" starts from the same random state.
seed = 500

env = gym.make('CartPole-v1')
env.seed(seed)
torch.manual_seed(seed)   # drives policy initialisation and action sampling
np.random.seed(seed)      # numpy / random are imported above; seed them as well
random.seed(seed)

# Configure MDP
gamma = 0.99                                  # discount factor
state_dim = env.observation_space.low.size    # number of observation components
num_action = env.action_space.n               # number of discrete actions
print('Dimension of state space / number of actions : %d / %d'%(state_dim, num_action))

Dimension of state space / number of actions : 4 / 2


## 1. Create a policy and value function instance

 Define policy network

In [4]:
class Policy(nn.Module):
    """Stochastic policy: a two-hidden-layer MLP that outputs action probabilities.

    Args:
        state_dim: size of the state vector fed to the network.
        num_action: number of discrete actions (size of the output).
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
    """

    def __init__(self, state_dim, num_action, hidden_size1, hidden_size2):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, num_action)

    def forward(self, x):
        """Return action probabilities for `x`.

        `x` may be a batch of shape (batch, state_dim) or a single state of
        shape (state_dim,). The softmax is taken over the last dimension, so the
        probabilities of each state sum to 1 in both cases.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        action_score = self.fc3(x)
        # dim=-1 equals dim=1 for batched input and also works without a batch axis.
        return F.softmax(action_score, dim=-1)

## 2. REINFORCE loop

```python
m = Categorical(probs)
```
wraps the action probabilities produced by the network in a discrete (categorical) distribution. Because `m.log_prob(action)` stays attached to the computation graph, backpropagating through it yields $\nabla_\theta\log{\pi_\theta(a|s)}$.

In [5]:
def select_action(state):
    """Sample one action for `state` from the global policy `pi`.

    Args:
        state: numpy array holding a single observation.

    Returns:
        (action, log_prob): the sampled action as a Python int and the
        log-probability of that action as a tensor of shape (1,).
    """
    # Add a leading batch axis and move the observation to the policy's device.
    obs = torch.from_numpy(state).float().unsqueeze(0).to(device)

    dist = Categorical(pi(obs))
    sampled = dist.sample()
    return sampled.item(), dist.log_prob(sampled)

def sample_trajectory(data, T):
    """Roll out one episode with the current policy and record it into `data`.

    Args:
        data: dict with list values under the keys 'log_pi', 'state',
            'next_state' and 'reward'; one entry per step is appended to each.
        T: maximum number of environment steps to take.

    Returns:
        (r_sum, r_sum_discount): undiscounted and gamma-discounted sums of the
        rewards collected during the rollout.
    """
    state = env.reset()  # every call starts a new episode
    r_sum = 0
    r_sum_discount = 0

    for step in range(T):
        action, log_prob = select_action(state)
        next_state, reward, done, _ = env.step(action)

        r_sum += reward
        r_sum_discount += reward * (gamma ** step)

        # The log-probability is stored negated so that minimising the
        # resulting loss performs gradient ascent on the return.
        transition = (
            ('log_pi', -log_prob),
            ('state', state),
            ('next_state', next_state),
            ('reward', reward),
        )
        for key, value in transition:
            data[key].append(value)

        if done:
            break
        state = next_state

    return r_sum, r_sum_discount

The REINFORCE algorithm approximates the gradient with respect to the policy parameters $\theta$ using sampled trajectories:
$$ \nabla_\theta J(\theta) \approx \frac{1}{N} \sum^N_{i=1} \big( \sum^T_{t=0}\nabla_\theta\log{\pi_\theta}(a_t|s_t) \big) \big( \sum^T_{t=0} \gamma^t r(s_t,a_t) \big)$$

With further approximation and use of baseline,
$$ \nabla_\theta J(\theta) \approx \frac{1}{N} \sum^N_{i=1} \sum^T_{t=0} \big( \nabla_\theta\log{\pi_\theta(a_t|s_t)} \big) \big( Q(s_t,a_t) - v(s_t) \big)$$

For REINFORCE, we use $Q(s_t, a_t) \approx \sum^T_{t'=t} \gamma^{t'-t} r(s_{t'}, a_{t'})$ and the baseline $v^{\pi_\theta}(s_0) \approx \mathbb{E}_{s_0, \pi_\theta} \big[ \sum^T_{t=0} \gamma^t r_t \big]$.

In [14]:
def calculate_PG(pi_returns_discounted, dataset, use_baseline=True):
    """Build the REINFORCE surrogate loss for a batch of trajectories.

    Args:
        pi_returns_discounted: discounted returns of recent trajectories; their
            mean is used as the empirical baseline v(s_0).
        dataset: list of trajectory dicts. Each needs 'reward' (per-step
            rewards) and 'log_pi' (per-step NEGATED log-probabilities, shape (1,)).
        use_baseline: True (default) weights step t by
            `return_to_go_t - baseline`; False gives the vanilla estimator,
            where every step is weighted by the trajectory's full discounted return.

    Returns:
        The loss summed over steps and averaged over the trajectories in
        `dataset`. Because 'log_pi' is stored negated, minimising it performs
        gradient ascent on the expected return.
    """
    # The baseline is the same for every step, so compute it once.
    baseline = float(np.mean(pi_returns_discounted)) if len(pi_returns_discounted) > 0 else 0.0

    pi_loss = 0
    for data in dataset:
        if not data['log_pi']:
            continue  # empty trajectory: torch.cat would fail on an empty list

        # Walk the rewards backwards so each return-to-go costs one multiply-add:
        # G_t = r_t + gamma * G_{t+1}.
        returns_to_go, DCR = [], 0.0
        for r in reversed(data['reward']):
            DCR = r + gamma * DCR
            returns_to_go.append(DCR)
        returns_to_go.reverse()  # back to chronological order

        if use_baseline:
            # Q(s,a) is replaced with the discounted return-to-go,
            # v(s) is replaced with the empirical v(s_0).
            weights = [float(g - baseline) for g in returns_to_go]
        else:
            # After the backward pass DCR holds the return from t = 0.
            weights = [float(DCR)] * len(data['log_pi'])

        pi_loss += torch.cat([log_pi * w for log_pi, w in zip(data['log_pi'], weights)]).sum()

    # Average over the trajectories actually supplied (guarded against an empty batch).
    return pi_loss / max(len(dataset), 1)

In [15]:
# ---- REINFORCE training configuration ----
num_epochs = 100      # number of policy-gradient updates
num_trajs = 100       # trajectories sampled per update
T = 10000             # upper bound on steps per trajectory
log_interval = 5      # print progress every `log_interval` epochs
total_time = []       # wall-clock seconds spent in each epoch

pi = Policy(state_dim, num_action, 128, 128).to(device)
optimizer_pi = optim.Adam(pi.parameters(), lr=1e-3)

# Sliding windows over the latest `num_trajs` returns (raw / discounted)
pi_returns, pi_returns_discounted = [], []

for epoch in range(num_epochs):
    start_epoch = time.time()

    # Gather a fresh on-policy batch with the current policy
    dataset = []
    for traj_idx in range(num_trajs):
        data = {'log_pi':[], 'state':[], 'next_state':[], 'reward':[]}
        r_sum, r_sum_discount = sample_trajectory(data, T)
        dataset.append(data)

        # Keep only the newest `num_trajs` entries in the logging windows
        pi_returns.append(r_sum)
        pi_returns_discounted.append(r_sum_discount)
        if len(pi_returns) > num_trajs:
            del pi_returns[0]
            del pi_returns_discounted[0]

    # One policy-gradient step on the collected batch
    optimizer_pi.zero_grad()
    pi_loss = calculate_PG(pi_returns_discounted, dataset)
    pi_loss.backward()
    optimizer_pi.step()

    # Timing bookkeeping happens every epoch; printing only every `log_interval`
    epoch_time = time.time() - start_epoch
    total_time.append(epoch_time)
    if epoch % log_interval != 0:
        continue

    time_elapsed = np.sum(total_time)
    # Remaining time = mean epoch duration * total epochs - time already spent
    time_remain = np.mean(total_time) * num_epochs - time_elapsed
    print('Epoch {}\tReturn_mean: {:.2f}\tReturn_std: {:.2f}\tTime(Elapsed/Remain): {:.2f}/{:.2f} (mins)'.format(
        epoch, np.mean(pi_returns), np.std(pi_returns), time_elapsed/60, time_remain/60))

Epoch 0	Return_mean: 19.76	Return_std: 8.49	Time(Elapsed/Remain): 0.05/5.17 (mins)
Epoch 5	Return_mean: 29.37	Return_std: 18.32	Time(Elapsed/Remain): 0.24/3.73 (mins)
Epoch 10	Return_mean: 40.56	Return_std: 25.14	Time(Elapsed/Remain): 0.46/3.71 (mins)
Epoch 15	Return_mean: 53.07	Return_std: 26.29	Time(Elapsed/Remain): 0.75/3.95 (mins)
Epoch 20	Return_mean: 76.31	Return_std: 44.41	Time(Elapsed/Remain): 1.15/4.32 (mins)
Epoch 25	Return_mean: 131.40	Return_std: 71.74	Time(Elapsed/Remain): 1.77/5.05 (mins)
Epoch 30	Return_mean: 192.59	Return_std: 94.73	Time(Elapsed/Remain): 2.79/6.21 (mins)
Epoch 35	Return_mean: 279.06	Return_std: 117.43	Time(Elapsed/Remain): 4.17/7.41 (mins)
Epoch 40	Return_mean: 325.63	Return_std: 113.61	Time(Elapsed/Remain): 6.04/8.69 (mins)
Epoch 45	Return_mean: 399.72	Return_std: 122.64	Time(Elapsed/Remain): 8.45/9.92 (mins)
Epoch 50	Return_mean: 429.90	Return_std: 97.34	Time(Elapsed/Remain): 11.10/10.66 (mins)
Epoch 55	Return_mean: 441.48	Return_std: 86.38	Time(Elaps

In [None]:
from numpy import linalg as LA

# Calculate feature vector
#state[0] : Cart pos
#state[1] : Cart speed
#state[2] : Pole angle
#state[3] : Pole velocity at tip

# Grid values for the last two state components (pole angle, pole velocity at tip)
state2 = [-0.12, 0, 0.12] # termination condition
state3 = [-1, 0, 1]

# Radial-basis centres: every (state2, state3) combination, state2 varying slowest
mu = [[s2, s3] for s2 in state2 for s3 in state3]

def state2feature(state):
    """Map a state to radial-basis features, one per centre in `mu`.

    Only the last two components of `state` are used. Feature k equals
    exp(-0.5 * ||state[-2:] - mu[k]||^2).

    Returns:
        1-D numpy array of length len(mu).
    """
    tail = np.array(state[-2:])
    squared_dists = [LA.norm(tail - np.array(centre)) ** 2 for centre in mu]
    return np.array([np.exp(-0.5 * d) for d in squared_dists])


def calculate_vf(dataset, vf):
    """Assemble the regression problem used to fit the linear critic.

    Args:
        dataset: list of trajectory dicts with 'state', 'next_state' and
            'reward' lists of equal length.
        vf: fitted regressor exposing `predict`, or None on the first call.

    Returns:
        (X, y): X holds the feature vector of every visited state; y holds the
        matching targets - the reward alone when `vf` is None, otherwise the
        bootstrapped target `r + gamma * vf(next_state)`.
    """
    X, rewards, next_features = [], [], []
    for data in dataset:
        for s, next_s, r in zip(data['state'], data['next_state'], data['reward']):
            X.append(state2feature(s))
            rewards.append(r)
            if vf is not None:
                next_features.append(state2feature(next_s))

    # No critic yet (or nothing collected): the targets are the raw rewards.
    if vf is None or not X:
        return X, rewards

    # A single batched prediction replaces one `predict` call per transition.
    # NOTE(review): the final transition of a trajectory is bootstrapped like any
    # other because no 'done' flag is stored - confirm this is intended.
    v_next = np.asarray(vf.predict(np.asarray(next_features))).reshape(-1)
    y = [r + gamma * v for r, v in zip(rewards, v_next)]

    return X, y


def get_advantage(data, vf):
    """Compute one-step TD advantages and critic baselines for one trajectory.

    Args:
        data: trajectory dict with 'state', 'next_state' and 'reward' lists.
        vf: fitted regressor exposing `predict` on feature matrices.

    Returns:
        (advantage, baseline): two lists with one float per step, where
        baseline[t] = v(s_t) and
        advantage[t] = r_t + gamma * v(s_{t+1}) - v(s_t).
    """
    features, next_features, rewards = [], [], []
    for s, next_s, r in zip(data['state'], data['next_state'], data['reward']):
        features.append(state2feature(s))
        next_features.append(state2feature(next_s))
        rewards.append(r)

    if not rewards:
        return [], []  # empty trajectory: avoid calling predict on an empty batch

    # Two batched predictions instead of two `predict` calls per step.
    v = np.asarray(vf.predict(np.asarray(features)), dtype=float).reshape(-1)
    v_next = np.asarray(vf.predict(np.asarray(next_features)), dtype=float).reshape(-1)

    # Q(s_t, a_t) is estimated by the one-step TD target.
    Q = np.asarray(rewards, dtype=float) + gamma * v_next
    A = Q - v

    return A.tolist(), v.tolist()


def calculate_AC_PG(vf, pi_returns_discounted, dataset):
    """Build the policy loss that uses the linear critic as a per-state baseline.

    Args:
        vf: fitted critic, forwarded to `get_advantage` to obtain v(s_t).
        pi_returns_discounted: unused; kept so the signature parallels
            `calculate_PG`.
        dataset: list of trajectory dicts with 'reward', 'state', 'next_state'
            and 'log_pi' (per-step NEGATED log-probabilities).

    Returns:
        Sum over steps of `log_pi_t * (return_to_go_t - v(s_t))`, averaged over
        the trajectories in `dataset`.
    """
    pi_loss = 0
    for data in dataset:
        if not data['log_pi']:
            continue  # empty trajectory: torch.cat would fail on an empty list

        # Only the baselines are used here; the TD advantages are discarded.
        _, v = get_advantage(data, vf)

        rewards = data['reward']
        advantage = [0.0] * len(rewards)
        DCR = 0
        # Iterate over time indices backwards so that the return-to-go of step t
        # is paired with the baseline of the SAME step, v[t].
        for t in reversed(range(len(rewards))):
            DCR = rewards[t] + gamma * DCR
            advantage[t] = DCR - v[t]  # For practical algorithm, we just adopt baseline

        # Compute each element of gradient
        pi_loss_linear_vf = [log_pi * a for log_pi, a in zip(data['log_pi'], advantage)]

        # Sums up log_prob * weight
        pi_loss += torch.cat(pi_loss_linear_vf).sum()

    # Average over the trajectories actually supplied (guarded against an empty batch).
    return pi_loss / max(len(dataset), 1)

In [None]:
# ---- Actor-critic training configuration ----
num_epochs = 100      # number of policy-gradient updates
num_trajs = 100       # trajectories sampled per update
T = 10000             # upper bound on steps per trajectory
log_interval = 5      # print progress every `log_interval` epochs
total_time = []       # wall-clock seconds spent in each epoch

pi = Policy(state_dim, num_action, 128, 128).to(device)
optimizer_pi = optim.Adam(pi.parameters(), lr=1e-3)
vf = None             # no critic until the first regression fit

# Sliding windows over the latest `num_trajs` returns (raw / discounted)
pi_returns, pi_returns_discounted = [], []

# Trajectories used to fit the critic; emptied on every logging epoch (see below)
dataset_vf = []
for epoch in range(num_epochs):
    start_epoch = time.time()

    # Gather a fresh on-policy batch with the current policy
    dataset = []
    for traj_idx in range(num_trajs):
        data = {'log_pi':[], 'state':[], 'next_state':[], 'reward':[]}
        r_sum, r_sum_discount = sample_trajectory(data, T)
        dataset.append(data)
        dataset_vf.append(data)

        # Keep only the newest `num_trajs` entries in the logging windows
        pi_returns.append(r_sum)
        pi_returns_discounted.append(r_sum_discount)
        if len(pi_returns) > num_trajs:
            del pi_returns[0]
            del pi_returns_discounted[0]

    # Critic update: refit the linear value function on the accumulated data
    X, y = calculate_vf(dataset_vf, vf)
    vf = LinearRegression().fit(X, y)

    # Actor update: one policy-gradient step on the current batch
    optimizer_pi.zero_grad()
    pi_loss = calculate_AC_PG(vf, pi_returns_discounted, dataset)
    pi_loss.backward()
    optimizer_pi.step()

    # Timing bookkeeping happens every epoch; the rest only every `log_interval`
    epoch_time = time.time() - start_epoch
    total_time.append(epoch_time)
    if epoch % log_interval != 0:
        continue

    # The critic's training set is cleared on the same schedule as the logging
    dataset_vf = []
    time_elapsed = np.sum(total_time)
    time_remain = np.mean(total_time) * num_epochs - time_elapsed
    print('Epoch {}\tReturn_mean: {:.2f}\tReturn_std: {:.2f}\tTime(Elapsed/Remain): {:.2f}/{:.2f} (mins)'.format(
        epoch, np.mean(pi_returns), np.std(pi_returns), time_elapsed/60, time_remain/60))

## Visualize result

For rendering **[COLAB USE ONLY!]**

In [None]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

def show_video():
    """Embed the first recording found in ./video into the notebook output.

    Looks for 'video/*.mp4'. If at least one file exists, the first match is
    base64-encoded and displayed as an inline, looping HTML <video> element.
    Otherwise a short message is printed.
    """
    mp4list = glob.glob('video/*.mp4')
    if not mp4list:
        print("Could not find video")
        return

    # Read-only access is sufficient; the context manager closes the file handle.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())

    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
            loop controls style="height: 400px;">
            <source src="data:video/mp4;base64,{0}" type="video/mp4" />
            </video>'''.format(encoded.decode('ascii'))))
    
def wrap_env(env):
    """Return `env` wrapped in a gym `Monitor` that writes to './video'.

    `force=True` is passed through to `Monitor`.
    """
    return Monitor(env, './video', force=True)

# Colab only: start a virtual display (invisible, 1400x900) and replace `env`
# with a Monitor-wrapped version that writes its recordings to ./video.
if colab:
    display = Display(visible=0, size=(1400, 900))
    display.start()

    env = wrap_env(env)

In [None]:
# Play one episode with the trained policy and render every frame.
state = env.reset()

try:
    done = False
    while not done:
        env.render()
        time.sleep(0.01)  # slow playback down so the animation is watchable
        # Pure evaluation: no gradients are needed, so skip graph construction.
        with torch.no_grad():
            action, log_prob = select_action(state)
        state, reward, done, info = env.step(action)
finally:
    # Always release the viewer / recorder, even if the cell is interrupted.
    env.close()

if colab:
    show_video()