# Trial implementation of REINFORCE from scratch
- Env - Carpole

### Imports

In [32]:
import numpy as np
import gym
import matplotlib.pyplot as plt
%matplotlib inline

import torch
torch.manual_seed(0) # set initial seed
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

from collections import deque

## Define the environment

In [33]:
env = gym.make('CartPole-v0')
env.seed(0)
print('Observation space:',env.observation_space)
print('Action space:',env.action_space)

Observation space: Box(4,)
Action space: Discrete(2)


## Lets try first with a random policy

In [65]:
class Policy():
    def act(self,state):
        return np.random.choice(2),None

### Implement a basic episode

In [66]:
policy = Policy()
def reinforce(n_episodes = 1000, max_t = 1000, epsilon = 1.0, deque_len = 100):
    
    scores = []
    scores_deque = deque(maxlen=deque_len) # This is there to track the scores achieved in the last deque_len episodes only
    
    for i_episode in range(1,n_episodes+1):
        state = env.reset()
        rewards = []
        for t in range(max_t):
            action, action_probs = policy.act(state)
            next_state,reward,done,info = env.step(action)
            rewards.append(reward)
            if done==True:
                break
        scores.append(sum(rewards))
        scores_deque.append(sum(rewards))
        
        if i_episode%100==0:
            print('Episodes: {}\t Sum of rewards in the past {} episodes = {}'.format(i_episode, deque_len, np.mean(scores_deque)))
            
    return scores

In [67]:
scores = reinforce()

Episodes: 100	 Sum of rewards in the past 100 episodes = 22.67
Episodes: 200	 Sum of rewards in the past 100 episodes = 21.69
Episodes: 300	 Sum of rewards in the past 100 episodes = 21.91
Episodes: 400	 Sum of rewards in the past 100 episodes = 23.27
Episodes: 500	 Sum of rewards in the past 100 episodes = 24.01
Episodes: 600	 Sum of rewards in the past 100 episodes = 23.24
Episodes: 700	 Sum of rewards in the past 100 episodes = 22.35
Episodes: 800	 Sum of rewards in the past 100 episodes = 21.75
Episodes: 900	 Sum of rewards in the past 100 episodes = 22.04
Episodes: 1000	 Sum of rewards in the past 100 episodes = 22.92


# Lets write a better policy

In [70]:
class Policy(nn.Module):
    """
    This class implements a Neural Network as a function approximator for a policy.
    """
    
    def __init__(self, s_size=4, h_size=16, a_size=2):
        super(Policy, self).__init__() # TODO: Write what this does?
        self.fc1 = nn.Linear(s_size, h_size) # First layer
        self.fc2 = nn.Linear(h_size, a_size) # Second layer
        
    def forward(self, input_state):
        h_state = F.relu(self.fc1(input_state))
        out = F.softmax(self.fc2(h_state), dim=1) # TODO: Why dim=1
        return out
    
    def act(self, state):
        state_in_torch = torch.from_numpy(state).float().unsqueeze(0).to(device) # Convert the state (as a numpy array) into a torch tensor       
        action_probs = self.forward(state_in_torch).cpu() # Pass the input state from the network and get action probs
        m = Categorical(action_probs) # TODO: Write what this does?
        action = m.sample() # TODO: Write what this does?
        
        return action.item(), m.log_prob(action) # TODO: Write what this does?

### Lets try without training the policy how this works
Should work exactly similar to a random policy.

In [71]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

scores = reinforce()


Episodes: 100	 Sum of rewards in the past 100 episodes = 20.48
Episodes: 200	 Sum of rewards in the past 100 episodes = 21.21
Episodes: 300	 Sum of rewards in the past 100 episodes = 22.53
Episodes: 400	 Sum of rewards in the past 100 episodes = 22.41
Episodes: 500	 Sum of rewards in the past 100 episodes = 21.72
Episodes: 600	 Sum of rewards in the past 100 episodes = 20.94
Episodes: 700	 Sum of rewards in the past 100 episodes = 21.72
Episodes: 800	 Sum of rewards in the past 100 episodes = 23.32
Episodes: 900	 Sum of rewards in the past 100 episodes = 22.57
Episodes: 1000	 Sum of rewards in the past 100 episodes = 23.02


## Implement with training the policy
As seen above, an untrained network behaves exactly like a random policy.
Now lets write the code for training the policy