# REINFORCE

This is an implementation of REINFORCE (also known as Monte Carlo Policy Gradients) with OpenAI Gym's CartPole environment.

### 1. Import the Necessary Packages

In [3]:
import gym
gym.logger.set_level(40)
import numpy as np
from collections import deque
import time
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

### 2. Define the Architecture of the Policy

In [5]:
# Create the CartPole environment and seed it so rollouts are repeatable.
env = gym.make('CartPole-v0')
env.seed(0)

# Show the observation and action spaces the policy has to match.
print('Observation space:', env.observation_space)
print('Action space:', env.action_space)

# Use the first CUDA device when one is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Observation space: Box(4,)
Action space: Discrete(2)


In [None]:
class Policy(nn.Module):
    """Two-layer softmax policy over a discrete action space.

    The network maps a state vector to a probability for each action:
    ``Linear(s_size, h_size) -> ReLU -> Linear(h_size, a_size) -> softmax``.

    Args:
        s_size: Number of features in a state vector.
        h_size: Number of units in the hidden layer.
        a_size: Number of discrete actions.
    """

    def __init__(self, s_size=4, h_size=16, a_size=2):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return action probabilities for a batch of states.

        Args:
            x: Float tensor of shape ``(batch, s_size)``.

        Returns:
            Tensor of shape ``(batch, a_size)`` whose rows each sum to 1.
        """
        x = F.relu(self.fc1(x))
        # The output layer must be *called* on the activations; assigning the
        # layer object itself would hand a module to softmax.
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        """Sample an action for a single state.

        Args:
            state: 1-D NumPy array holding one state (``s_size`` values).

        Returns:
            A tuple ``(action, log_prob)`` where ``action`` is the sampled
            action index as a Python int and ``log_prob`` is a tensor of
            shape ``(1,)`` holding the log-probability of that action; it
            stays attached to the autograd graph so a loss can be built
            from it.
        """
        # Put the input on the same device as the network's weights, so the
        # method works wherever the module has been moved.
        param_device = next(self.parameters()).device
        # unsqueeze(0) adds the batch dimension expected by forward().
        state = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

### 3. Train the Agent with REINFORCE

In [None]:
# Build the policy network with its default layer sizes and move it to `device`.
policy = Policy()
policy = policy.to(device)
# Adam over every parameter of the policy, with a learning rate of 0.01.
optimizer = optim.Adam(params=policy.parameters(), lr=0.01)

In [None]:
def reinforce(n_episodes=1000, max_t=1000, gamma=1.0, print_every=100):
    """Train the module-level ``policy`` on ``env`` with REINFORCE.

    Each episode is rolled out with actions sampled from ``policy``. The
    discounted return of the whole episode then weights the negative
    log-probability of every action taken, and ``optimizer`` takes one
    gradient step on the summed loss.

    Training stops early once the mean score over the most recent episodes
    (a window of at most 100) reaches 195.0; at that point the policy
    weights are written to ``checkpoint.pth`` in the working directory.

    Args:
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of steps per episode.
        gamma: Discount factor applied to the rewards.
        print_every: Print the running average score every this many
            episodes.

    Returns:
        List with the undiscounted total reward of every episode played.
    """
    start = time.time()
    scores_deque = deque(maxlen=100)  # sliding window for the running average
    scores = []
    for i_episode in range(1, n_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = env.reset()
        for _ in range(max_t):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the full trajectory: sum_i gamma^i * r_i.
        R = sum(gamma ** i * r for i, r in enumerate(rewards))

        # Every action's log-probability is weighted by the same return R;
        # the sign is flipped because the optimizer minimizes.
        policy_loss = torch.cat(
            [-log_prob * R for log_prob in saved_log_probs]
        ).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        avg_score = np.mean(scores_deque)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, avg_score))
        if avg_score >= 195.0:
            # The threshold can be reached before 100 episodes have been
            # played, so clamp the reported count to avoid a negative number.
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(
                max(i_episode - 100, 0), avg_score))
            # Persist only the learned weights, not the whole module object.
            torch.save(policy.state_dict(), 'checkpoint.pth')
            break

    print()
    print('Training Time: {:.2f} minutes'.format((time.time() - start) / 60))

    return scores
        