# Reinforce (Monte Carlo Policy Gradient) Method

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import deque

In [None]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

In [None]:
# Apply matplotlib's 'ggplot' style sheet to every figure drawn in this notebook.
plt.style.use('ggplot')

## Set Configs

In [None]:
# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [None]:
GAMMA = 1.0  # discount factor for returns (1.0 = no discounting); same value as train_agent's default `gamma`
PRINT_EVERY = 100  # train_agent prints a persistent progress line every PRINT_EVERY episodes
LR = 1e-2  # learning rate handed to the Adam optimizer

## Set Environment

In [None]:
ENV_NAME = 'CartPole-v0'

# `.unwrapped` hands back the base environment without the wrappers that
# gym.make adds, so episode length is bounded only by the step caps used in
# the loops of this notebook (presumably intentional -- confirm).
env = gym.make(ENV_NAME).unwrapped
# Fixed seed for the environment; the trailing ';' keeps the notebook from
# echoing the return value.
env.seed(90);

In [None]:
print('Environment Display:')
env.reset()  # start from a fresh, random initial state before drawing it
env.render()

# Report the observation and action spaces the policy has to handle.
print(f'State space {env.observation_space}')
print(f'Action space {env.action_space}')

## Define The Policy

In [None]:
class Policy(nn.Module):
    """Stochastic policy: a two-layer MLP mapping a state to action probabilities.

    Args:
        env: Environment-like object; ``env.observation_space.shape[0]`` gives
            the input size and ``env.action_space.n`` the number of discrete
            actions.
        hidden_size: Width of the single hidden layer.
    """

    def __init__(self, env, hidden_size=16):
        super(Policy, self).__init__()

        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        self.fc1_layer = nn.Linear(state_size, hidden_size)
        self.fc2_layer = nn.Linear(hidden_size, action_size)

    def act(self, state):
        """Sample one action for a single observation.

        Args:
            state: 1-D NumPy array holding one observation.

        Returns:
            tuple: ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is a tensor of shape ``(1,)`` holding the log
            probability of that action (it carries gradients back to the
            network parameters).
        """
        # Tensor.to() returns a new tensor rather than moving in place, so the
        # result must be kept. Using the device of the model's own weights
        # keeps input and parameters together wherever the module was moved.
        target_device = self.fc1_layer.weight.device
        state = torch.from_numpy(state).float().unsqueeze(0).to(target_device)

        probs = self(state)
        # Sampling is done on the CPU so .item() needs no extra transfer.
        m = Categorical(probs.cpu())

        action = m.sample()
        return action.item(), m.log_prob(action)

    def forward(self, x):
        """Return action probabilities for a batch of states.

        Args:
            x: Float tensor of shape ``(batch, state_size)``.

        Returns:
            Tensor of shape ``(batch, action_size)`` whose rows sum to 1
            (softmax over ``dim=1``).
        """
        x = F.relu(self.fc1_layer(x))
        x = self.fc2_layer(x)
        return F.softmax(x, dim=1)

In [None]:
# Build the policy network sized from the environment's observation/action
# spaces and move its parameters to the selected device.
policy = Policy(env).to(device)

## Set Optimizer

In [None]:
# Adam over all policy parameters, using the learning rate LR.
optimizer = optim.Adam(policy.parameters(), lr=LR)

## Train The Agent

In [None]:
def train_agent(num_episodes=1000, max_time=1000, gamma=1.0):
    """Train the module-level ``policy`` with REINFORCE.

    Each episode is rolled out with the current policy. The discounted
    episode return ``R = sum_t gamma**t * r_t`` then weights the summed
    negative log-probabilities of the actions taken, and one optimizer step
    is made on that loss.

    Relies on the module-level ``env``, ``policy``, ``optimizer``,
    ``PRINT_EVERY`` and ``ENV_NAME``.

    Args:
        num_episodes: Maximum number of episodes to run.
        max_time: Maximum number of steps per episode.
        gamma: Discount factor used for the episode return.

    Returns:
        list: Undiscounted total reward of every episode that was run.

    Side effects:
        Prints progress, and once the 100-episode average reaches 195.0
        saves the policy weights to ``./agents/REINFORCE_{ENV_NAME}.pth``
        and stops early.
    """
    window_size = 100
    solved_score = 195.0

    scores = []
    scores_window = deque(maxlen=window_size)

    for i_episode in range(1, num_episodes + 1):

        state = env.reset()

        saved_log_probs = []
        rewards = []

        for _step in range(max_time):
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)

            state, reward, done, _info = env.step(action)
            rewards.append(reward)
            if done:
                break

        episode_score = sum(rewards)
        scores_window.append(episode_score)
        scores.append(episode_score)

        # Discounted return of the whole episode; it weights every step's
        # log-probability equally.
        R = sum(gamma ** t * r for t, r in enumerate(rewards))

        # An episode with no steps (max_time == 0) has nothing to learn from,
        # and torch.cat would raise on an empty list.
        if saved_log_probs:
            policy_loss = (-torch.cat(saved_log_probs) * R).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        avg_score = np.mean(scores_window)
        print(f'\rEpisode: {i_episode}, Average Score: {avg_score:.2f}', end='')

        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode: {i_episode}, Average Score: {avg_score:.2f}')

        # Only declare success once the window holds a full 100 episodes;
        # otherwise the average covers too few episodes and the reported
        # episode count (i_episode - 100) would be negative.
        window_full = len(scores_window) == window_size
        if window_full and avg_score >= solved_score:
            print(f'\nEnvironment solved in {i_episode-100:d} episodes! Average Score: {avg_score:.2f}')
            os.makedirs('./agents/', exist_ok=True)
            torch.save(policy.state_dict(), f'./agents/REINFORCE_{ENV_NAME}.pth')
            break

    print('Training completed.')
    return scores

In [None]:
# Pass the configured discount factor explicitly so that changing GAMMA in the
# config cell actually affects training.
scores = train_agent(num_episodes=1000, max_time=1000, gamma=GAMMA)

## Evaluate The Agent

In [None]:
# Plot the per-episode score returned by training.
plt.figure(figsize=(10, 5))
plt.plot(np.arange(len(scores)), scores, color='green')
plt.xlabel('Num of episodes')
plt.ylabel('Score')

# exist_ok=True creates the folder if needed without a separate existence check.
os.makedirs('./images/', exist_ok=True)
# NOTE(review): the file name says "hill_climbing" although this notebook
# trains REINFORCE -- looks like a copy-paste leftover. Kept unchanged so
# anything already pointing at this image keeps working.
plt.savefig('./images/plot_of_hill_climbing_policy_evaluation.png')
plt.show()

## 🎬 Watch The Smart Agent

In [None]:
# Load the weights of the smart agent. The checkpoint is written by
# train_agent only when the solve threshold is reached, so it may be absent.
# map_location remaps tensors saved on another device (e.g. CUDA) onto the
# device used in this session, so the file also loads on a CPU-only machine.
policy.load_state_dict(torch.load(f'./agents/REINFORCE_{ENV_NAME}.pth', map_location=device));

In [None]:
num_episodes = 5

# Roll out a few rendered episodes with the trained policy. The environment
# is closed in `finally` so the render window is released even if an episode
# raises.
try:
    for i_episode in range(1, num_episodes+1):

        state = env.reset()
        rewards = []
        for time_step in range(1000):

            env.render() # render the screen

            # act() returns (action, log_prob); env.step() needs only the
            # action. No gradients are needed when just watching the agent.
            with torch.no_grad():
                action, _ = policy.act(state) # select an action
            next_state, reward, done, _ = env.step(action)

            state = next_state
            rewards.append(reward)
            if done:
                break

        print(f'\rEpisode: {i_episode}, Average Score: {sum(rewards):.2f}')
finally:
    env.close()

---