# REINFORCE

---

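In this notebook, we will train a REINFORCE agent on OpenAI Gym's CartPole environment. Instead of the environment's built-in 4-dimensional observation, the agent's state is built from rendered screen frames that are compressed with PCA.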

### 1. Import the Necessary Packages

In [1]:
import gym
gym.logger.set_level(40) # suppress warnings (please remove if gives error)
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import torch
torch.manual_seed(0) # set random seed
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical


from PIL import Image
import uuid
import glob
from sklearn.decomposition import PCA

In [2]:
def train_pca(pattern='game_screens/*.jpeg', n_components=10):
    """Fit a PCA model on the images matched by ``pattern``.

    Every matching file is opened with PIL, converted to a NumPy array and
    flattened into one row of the training matrix.

    Args:
        pattern: glob pattern selecting the image files to train on.
        n_components: number of principal components to keep.

    Returns:
        The fitted ``PCA`` instance.

    Raises:
        ValueError: if ``pattern`` matches no files.
    """
    # Sort so the row order does not depend on the directory listing order.
    filenames = sorted(glob.glob(pattern))
    if not filenames:
        # Fail early with a clear message instead of an opaque shape error
        # from PCA.fit on an empty array.
        raise ValueError('no images found matching {!r}'.format(pattern))

    image_list = []
    for filename in filenames:
        # The context manager closes the file even if the conversion raises.
        with Image.open(filename) as im:
            image_list.append(np.array(im).flatten())

    # NOTE(review): assumes every image flattens to the same length, and
    # presumably to the 120x80 grayscale layout used when frames are
    # transformed during training -- confirm against the saved screenshots.
    return PCA(n_components).fit(np.array(image_list))

# Fit the PCA once at notebook level; reinforce() reads this global to
# compress every rendered frame before it is fed to the policy.
pca = train_pca()

### 2. Define the Architecture of the Policy

In [3]:
# Create the CartPole environment and seed it.
env = gym.make('CartPole-v0')
env.seed(0)
# Print the environment's native spaces. Note that during training the policy
# is fed PCA features of rendered frames, not these native observations.
print('observation space:', env.observation_space)
print('action space:', env.action_space)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class Policy(nn.Module):
    """Two-layer softmax policy over a discrete action set.

    Args:
        s_size: number of input features per state.
        h_size: width of the hidden layer.
        a_size: number of actions.
    """

    def __init__(self, s_size=20, h_size=16, a_size=2):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """Return action probabilities for a batch of states.

        Args:
            x: float tensor of shape (batch, s_size).

        Returns:
            Tensor of shape (batch, a_size) whose rows are softmax
            probabilities.
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        """Sample one action for a single state.

        Args:
            state: 1-D NumPy array with ``s_size`` entries.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python int
            and its log-probability as a tensor of shape (1,).
        """
        # Take the device from the module's own weights rather than from a
        # notebook-level global, so the input always lands where the
        # parameters live and the class works on its own.
        param_device = self.fc1.weight.device
        state = torch.from_numpy(state).float().unsqueeze(0).to(param_device)
        # Call the module itself (not .forward) so registered hooks run.
        probs = self(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

observation space: Box(4,)
action space: Discrete(2)


### 3. Train the Agent with REINFORCE

In [4]:
# Build the policy with its default layer sizes on the selected device and
# optimise its parameters with Adam; reinforce() uses both as globals.
policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

def _render_features(frame_size=(120, 80)):
    """Render the current frame of the global `env` and project it with the global `pca`."""
    frame = Image.fromarray(env.render(mode='rgb_array'))
    frame = frame.resize(frame_size).convert(mode='L')
    return pca.transform([np.array(frame).flatten()])[0]


def reinforce(n_episodes=10000, max_t=1000, gamma=1.0, print_every=100):
    """Train the global `policy` with REINFORCE on rendered, PCA-compressed frames.

    Uses the notebook-level `env`, `policy`, `optimizer` and `pca`. At each
    step the state given to the policy is the PCA features of the current
    frame concatenated with those of the previous frame (the first step
    repeats the current frame). After each episode, every step's
    log-probability is weighted by the same whole-episode discounted return
    and one optimizer step is taken.

    Args:
        n_episodes: maximum number of training episodes.
        max_t: maximum number of steps per episode.
        gamma: discount factor applied to the episode's rewards.
        print_every: print the running average every this many episodes.

    Returns:
        List with the undiscounted total reward of every episode played.
    """
    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):
        saved_log_probs = []
        rewards = []
        # The observation returned by reset()/step() is not used; the state
        # is built from rendered pixels instead.
        env.reset()
        prev_features = None
        for _ in range(max_t):
            features = _render_features()
            if prev_features is None:
                # First step of the episode: no earlier frame exists yet.
                prev_features = features
            state = np.concatenate((features, prev_features))
            prev_features = features

            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            _, reward, done, _ = env.step(action)
            rewards.append(reward)
            if done:
                break
        episode_score = sum(rewards)
        scores_deque.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole episode.
        R = sum(gamma**i * r for i, r in enumerate(rewards))

        # Skip the update for a zero-step episode (max_t == 0), where
        # torch.cat would fail on an empty list.
        if saved_log_probs:
            policy_loss = torch.cat([-log_prob * R for log_prob in saved_log_probs]).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        average_score = np.mean(scores_deque)
        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, average_score))
        # Only declare success once the window holds a full 100 episodes;
        # otherwise a few lucky early episodes would report a negative
        # "solved in" episode count.
        if len(scores_deque) == scores_deque.maxlen and average_score >= 195.0:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, average_score))
            break

    return scores
    
# Train with the default hyper-parameters; `scores` receives one total reward
# per episode played.
scores = reinforce()

Episode 100	Average Score: 16.93
Episode 200	Average Score: 13.16
Episode 300	Average Score: 14.93
Episode 400	Average Score: 15.39
Episode 500	Average Score: 16.92
Episode 600	Average Score: 16.08
Episode 700	Average Score: 12.19
Episode 800	Average Score: 9.67
Episode 900	Average Score: 10.05
Episode 1000	Average Score: 9.75
Episode 1100	Average Score: 9.50
Episode 1200	Average Score: 9.44
Episode 1300	Average Score: 9.24
Episode 1400	Average Score: 9.48
Episode 1500	Average Score: 9.38
Episode 1600	Average Score: 9.45
Episode 1700	Average Score: 9.38
Episode 1800	Average Score: 9.30
Episode 1900	Average Score: 9.45
Episode 2000	Average Score: 9.42
Episode 2100	Average Score: 9.24
Episode 2200	Average Score: 9.40
Episode 2300	Average Score: 9.52
Episode 2400	Average Score: 9.37
Episode 2500	Average Score: 9.40
Episode 2600	Average Score: 9.37
Episode 2700	Average Score: 9.39
Episode 2800	Average Score: 9.43
Episode 2900	Average Score: 9.57
Episode 3000	Average Score: 9.53
Episode 310

KeyboardInterrupt: 

### 4. Plot the Scores

In [None]:
# Draw the per-episode score curve on an explicit Axes object.
fig, ax = plt.subplots()
episode_numbers = np.arange(1, len(scores)+1)
ax.plot(episode_numbers, scores)
ax.set_ylabel('Score')
ax.set_xlabel('Episode #')
plt.show()

### 5. Watch a Smart Agent!

In [None]:
# Watch the trained policy play one episode.
#
# The policy was trained on 20 features (PCA of the current and previous
# rendered frame), so the raw 4-dimensional observation returned by the
# environment cannot be passed to policy.act(); the same pixel features as in
# training are rebuilt here.
env = gym.make('CartPole-v0')

env.reset()
prev_features = None
try:
    for t in range(1000):
        # NOTE(review): with the classic-control renderer this call presumably
        # also draws the on-screen window, as it does during training -- confirm.
        frame = Image.fromarray(env.render(mode='rgb_array'))
        frame = frame.resize((120, 80)).convert(mode='L')
        features = pca.transform([np.array(frame).flatten()])[0]
        if prev_features is None:
            # First step: no earlier frame exists yet.
            prev_features = features
        state = np.concatenate((features, prev_features))
        prev_features = features

        action, _ = policy.act(state)
        _, reward, done, _ = env.step(action)
        if done:
            break
finally:
    # Release the render window even if the loop raises.
    env.close()