## Policy gradient
### REINFORCE ALGORITHM - MONTE CARLO APPROACH

In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from matplotlib import pyplot as plt
from tqdm import tqdm

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
def running_mean(x, N):
    """Return the simple moving average of ``x`` over a sliding window of ``N`` samples.

    Uses the cumulative-sum trick: the sum of any window is the difference of
    two prefix sums, so all windows are evaluated in one vectorised pass.

    Args:
        x: sequence (list or array) of numbers.
        N: window length; must be a positive integer.

    Returns:
        NumPy array with one entry per full window (``len(x) - N + 1`` entries
        for 1-D input); empty when ``x`` has fewer than ``N`` elements.

    Raises:
        ValueError: if ``N`` is smaller than 1. Without this check a negative
            ``N`` would slice from the wrong ends and silently return
            meaningless numbers.
    """
    if N < 1:
        raise ValueError(f"window size N must be a positive integer, got {N}")
    # Prepend 0 so that prefix[i] is the sum of the first i elements of x.
    prefix = np.cumsum(np.insert(x, 0, 0))
    return (prefix[N:] - prefix[:-N]) / float(N)

First, let's define our policy network $\pi$:

In [None]:
class PolicyNetwork(nn.Module):
    """Feed-forward policy π(a | s, θ) over a discrete action space.

    Architecture: one fully connected hidden layer with ReLU, followed by a
    linear layer whose outputs are turned into action probabilities by softmax.
    """

    def __init__(self, n_inputs, n_outputs, hidden_dim_size):
        """Build the layers.

        Args:
            n_inputs: number of features in a state vector.
            n_outputs: number of discrete actions.
            hidden_dim_size: width of the hidden layer.
        """
        super().__init__()
        self.hidden = nn.Linear(n_inputs, hidden_dim_size)
        self.output = nn.Linear(hidden_dim_size, n_outputs)

    def forward(self, state):
        """Evaluate the policy for ``state``.

        Args:
            state: float tensor whose last dimension has size ``n_inputs``
                (a single state or a batch of states).

        Returns:
            Tuple ``(log_probs, probs)``: ``probs`` holds π(· | state, θ) along
            the last dimension (sums to 1) and ``log_probs`` is ``torch.log(probs)``.
        """
        logits = self.output(F.relu(self.hidden(state)))
        # Softmax over the action dimension so each state gets a distribution.
        probs = F.softmax(logits, dim=-1)
        return torch.log(probs), probs

In [None]:
# TODO: Define the model and the optimizer

In [6]:
# Instantiate the CartPole-v1 task that the rest of the notebook trains on.
# Environment reference:
# https://gymnasium.farama.org/environments/classic_control/cart_pole/
env = gym.make(id='CartPole-v1')

Final loop:

- generate a trajectory $\tau$ following policy $\pi(\cdot | \cdot, \theta)$
- for each $t$ in $\tau$:
    - $G_t \leftarrow \sum_{k=t+1}^{T} \gamma^{k-t-1} r_k$
    - $\theta \leftarrow \theta + \alpha \gamma^t G_t \nabla \ln \pi (a_t|s_t, \theta)$

with $G_t$ being the discounted return from timestep $t$ onward, $\theta$ the policy network parameters, and the trajectory $\tau$ the sequence of states, actions and rewards $(s_0, a_0, r_1, s_1, a_1, r_2, \ldots, s_{T-1}, a_{T-1}, r_T)$, where $r_{k+1}$ is the reward received after taking action $a_k$ in state $s_k$.

In [None]:
# Training hyperparameters.
NUM_TRAJECTORIES = 2000   # number of episodes (trajectories) sampled by the outer loop
MAX_EPISODE_LENGTH = 500  # upper bound on `t` in the inner episode loop
gamma = 0.9               # discount factor γ (see the G_t formula in the markdown above)
# per-episode logs; `rewards` receives one entry per trajectory below
rewards = []
losses = []
# one iteration = one trajectory τ; the policy update belongs at the end of the body
for tau in tqdm(range(NUM_TRAJECTORIES)):
    # start a fresh episode; reset() yields the initial state and an info value
    state, info = env.reset()
    # loop flag for the episode loop; the episode-playing code must set it when the episode ends
    done = False
    # storage for the trajectory and for logπ(a_t|s_t, θ) of the actions taken
    transition_buffer = []
    log_probs = []

    t = 0
    # NOTE: as written, nothing in the body changes `done` or `t`, so this loop
    # never exits. The implementation must advance `t` on every step and set
    # `done` once the environment reports that the episode is over.
    while done == False and t < MAX_EPISODE_LENGTH:
        # TODO: Play the episode and collect data
        pass
    # logging the episode length as a cumulative reward
    rewards.append(t)
    returns = []
    # one pass per time step of the episode that was just played
    for t_prime in range(t):
        # TODO: Compute G
        pass
    # TODO: Normalize the returns and perform an update


In [None]:
# Plot the learning curve: 100-episode moving average of the values logged in
# `rewards` (the training loop stores each episode's length there).
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(running_mean(rewards, 100))
ax.set_xlabel("Episode (start of 100-episode window)")
ax.set_ylabel("Mean episode length")
# Enable the grid explicitly; a bare grid() call only toggles the current state.
ax.grid(True)
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_title("REINFORCE cumulative rewards");

In [9]:
# Persist the per-episode rewards so this run can be compared with later experiments.
# (pandas is imported as `pd` in the notebook's top import cell.)
df = pd.DataFrame(rewards, columns=['reward'])
df.to_csv('REINFORCE.csv')

### Not normalizing returns

In [None]:
# TODO: Define the model and the optimizer again

In [None]:
# TODO: Repeat training without normalizing G

In [None]:
# Compare the learning curves obtained with and without return normalization
# (100-episode moving averages of the logged episode lengths).
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(running_mean(rewards, 100), label="normalized")
ax.plot(running_mean(rewards_non_norm, 100), label="not normalized")
ax.set_xlabel("Episode (start of 100-episode window)")
ax.set_ylabel("Mean episode length")
# Enable the grid explicitly; a bare grid() call only toggles the current state.
ax.grid(True)
ax.legend()
# Trailing semicolon keeps the Text repr out of the cell output.
ax.set_title("REINFORCE cumulative rewards");

See Section 13.3 ("REINFORCE: Monte Carlo Policy Gradient") of Sutton & Barto, *Reinforcement Learning: An Introduction*.

Additional information about normalizing the discounted rewards:
 - http://karpathy.github.io/2016/05/31/rl/, paragraph *More general advantage functions*