# ENN585 - Advanced Machine Learning - Week 3

Welcome to Week 3 of ENN585!

This week's notebook lets you explore simple Policy Gradient concepts, including a simple implementation of REINFORCE.

In [None]:
# As usual, we start by importing the necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import animation

from IPython.display import display, Image


# Helper that lets us plot rewards and action distributions later on.
def draw_plots(actions, rewards, total_rewards):
    """Plot action probabilities, per-step rewards and total rewards.

    The action-probability plot is drawn into the current figure; the reward
    plot and the total-reward plot each open a new figure.

    Args:
        actions: action probabilities collected per episode. After conversion
            with np.array the data is averaged over the first axis and then
            indexed as [time step, action] with three actions (L, C, R).
        rewards: rewards collected per episode. After conversion with
            np.array the data is averaged over the first axis.
        total_rewards: sequence of values that is plotted as-is.
    """
    # action probabilities: mean over the first axis with a +/- one std band
    action_arr = np.array(actions)
    action_mean = action_arr.mean(axis=0)
    action_std = action_arr.std(axis=0)
    for idx, name in enumerate(['L', 'C', 'R']):
        centre = action_mean[:, idx]
        spread = action_std[:, idx]
        plt.plot(centre, label=f'Action {name}')
        plt.fill_between(range(action_mean.shape[0]), centre - spread, centre + spread, alpha=0.2)
    plt.ylim(0, 1.05)
    plt.xlabel('Time Step')
    plt.ylabel('Action Probabilities')
    plt.legend()
    plt.grid()
    plt.tight_layout()

    # rewards: mean over the first axis with a +/- one std band, in a new figure
    plt.figure()
    reward_arr = np.array(rewards)
    reward_mean = reward_arr.mean(axis=0)
    reward_std = reward_arr.std(axis=0)
    plt.plot(reward_mean, color='black')
    plt.fill_between(
        range(reward_mean.shape[0]),
        reward_mean - reward_std,
        reward_mean + reward_std,
        alpha=0.2,
        color='black',
    )
    plt.grid()
    plt.xlabel('Time Step')
    plt.ylabel('Reward')
    plt.tight_layout()

    # total rewards, in a new figure
    plt.figure()
    plt.plot(total_rewards)


# Helper that draws a single frame of the animations we used in the lecture slides.
def plot_animation(i):
    """Draw the action distribution stored at index i of the global trajectory.

    Args:
        i: index into the module-level `trajectory`; the first element of
            trajectory[i] is flattened and used as the bar heights. The same
            value is shown in the plot title.
    """
    plt.clf()
    positions = [0, 1, 2]
    probabilities = trajectory[i][0].flatten()
    plt.bar(positions, probabilities)
    plt.ylim(0, 1.05)
    plt.xticks(positions, ['L', 'C', 'R'])
    plt.xlabel('Action')
    plt.ylabel('Probability')
    plt.title(f'Action Distribution at Time {i}')

# fig, ax = plt.subplots()
# ani = animation.FuncAnimation(fig, plot_animation, frames=list(range(0, len(trajectory[0:150]))), blit=False)
# filename='ani-06.gif'
# ani.save(filename, writer='imagemagick', fps=40)
# plt.close()
# display(Image(filename))   

## Towards Policy Optimisation

We used a very simple environment and policy in the lecture to illustrate the basic concepts of policy optimisation. The policy is essentially just a lookup table that outputs a probability distribution over the actions, but is independent of the state.

In the Prac, explore the different concepts by recreating what we demonstrated during the lecture.

In [None]:

# A simple policy class. It does not depend on the state; it always returns the same (learnable) distribution.
# There are three actions, and the policy returns a probability distribution over these actions.
#
# Three parameters (the logits) are used to define the policy.
# The softmax function is used to convert the logits to a probability distribution.
class Policy:
    """State-independent softmax policy over three actions.

    Attributes:
        logits: numpy array of shape (3, 1) holding one logit per action.
        grad: value of grad_softmax(logits) at the time backward() was last
            called (backward() is called once in __init__).
    """

    def __init__(self):
        # random initial logits in [0, 1)
        self.logits = np.random.rand(3,1)
        self.backward()

    def softmax(self, x):
        """Return the softmax of x along axis 0.

        The maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but keeps np.exp from overflowing to
        inf (and the division from producing nan) for large inputs.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def grad_softmax(self, x):
        """Return p * (1 - p) for p = softmax(x).

        This is the derivative of each probability with respect to its own
        logit, i.e. only the diagonal of the softmax Jacobian; the
        cross-terms between different actions are not included.
        """
        p = self.softmax(x)
        return p * (1 - p)

    def forward(self, state):
        """Return the action probabilities; `state` is accepted but ignored."""
        # clamp the logits to [-100, 100] to avoid numerical overflow
        self.logits = np.clip(self.logits, -100, 100)
        return self.softmax(self.logits)

    def backward(self):
        """Store grad_softmax of the current logits in self.grad."""
        self.grad = self.grad_softmax(self.logits)

    def __call__(self, state):
        return self.forward(state)


# The environment is super simple. The state never changes, and the reward depends only on the action.
class EnvironmentSimple:
    """Stateless toy environment that rewards action 0 and never terminates."""

    def __init__(self) -> None:
        self.state = 0

    def step(self, action):
        """Execute `action` and return the tuple (state, reward, done).

        The reward is 1 for action 0 and 0 for any other action. The state is
        returned unchanged and done is always False.
        """
        # only action 0 is rewarded
        reward = 1 if action == 0 else 0

        # the episode never stops and the state never changes
        return self.state, reward, False

**YOUR TURN!**

Below is the simple policy we used in the lecture. It's a simple lookup table that outputs a probability distribution over the actions, but is independent of the state. 

Change the update rule to the Policy Gradient update and recreate the steps we used in the lecture. Feel free to explore the different concepts by changing the environment and policy, or parameters like the learning rate.

In [None]:
# Results collected over all episodes, used for plotting at the end of the cell:
# the action probabilities, the rewards, and the summed reward of every episode.
actions = []
rewards = []
total_rewards = []

# learning rate. It is not used by the placeholder update below; it is provided for your own update rule.
lr = 0.2

# we run the policy for 100 episodes
for episode in range(100):    
    # the trajectory stores [action probabilities, reward, sampled action] for every time step of this episode
    trajectory = []    
    env = EnvironmentSimple()
    state = 0

    # We re-initialise the policy for every episode, forgetting what we learned.
    # Notice that this is NOT how RL algorithms would work in practice: you would want to keep learning after completing an episode and
    # complete a large number of episodes, what we might call a "run". For the sake of this simple example, we reset the policy at the beginning of each episode.
    policy = Policy()           
    
    # each episode has 500 time steps
    for i in range(500):        

        # run the policy to get the action probabilities
        action = policy(state)        
        
        # sample an action index (0, 1 or 2) from the action distribution
        a = np.random.choice([0,1,2], p=action.flatten())
        
        # execute the sampled action in the environment
        state, reward, done = env.step(a)        
        trajectory.append([action, reward, a])        

        # update the policy parameters (the logits)
        # YOUR TURN! Change the update rule here to the policy gradient update. Retrace the steps we demonstrated in the lecture.
        # As provided, the line below adds 0 to the logit of the sampled action, so the policy does not change.
        policy.logits[a] += 0 
                

    # some housekeeping for later plotting
    actions.append(np.array([x[0] for x in trajectory]).squeeze())
    rewards.append(np.array([x[1] for x in trajectory]).squeeze())
    total_rewards.append(rewards[-1].sum())

# plot the action probabilities, the rewards and the total reward per episode
draw_plots(actions, rewards, total_rewards)

## Policy Update with a Neural Network

Now we use an actual neural network to represent the policy. We will use the same environment as in the lecture, but now the policy is a neural network that takes the state as input and outputs a probability distribution over the actions. The network is still very simple, but it's a step up from the lookup table. 

Notice how we use PyTorch's abilities to calculate the gradients and perform parameter update steps for us. This is a very simple example, but it's a good illustration of how PyTorch can be used to implement more complex algorithms.

**YOUR TURN!**

Explore the different concepts by changing the network architecture, learning rate, or other parameters.

In [None]:
class PolicyNetwork(nn.Module):
    """Minimal policy network: one linear layer followed by a softmax over three actions."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 3)
        # YOUR TURN! Feel free to change the network architecture. Make sure you also update the forward() function accordingly.
        # self.fc2 = nn.Linear(10, 3)       

    def forward(self, x):
        """Return action probabilities for x, whose last dimension has size 1."""
        x = self.fc1(x)
        # x = F.relu(x)
        # x = self.fc2(x)
        # Normalise over the last (action) dimension. For a single input of shape (1,)
        # this is identical to dim=0, but it also yields one distribution per row
        # for batched inputs of shape (N, 1) instead of normalising across the batch.
        return F.softmax(x, dim=-1)
        
# Results collected over all episodes, used for plotting at the end of the cell.
actions = []
rewards = []
total_rewards = []

for episode in range(50):
    # the trajectory stores [action probabilities, reward, sampled action] for every time step of this episode
    trajectory = []
    env = EnvironmentSimple()
    # start every episode from the initial state of the freshly created environment
    state = env.state

    # as in the previous cell, the policy (and its optimiser) is re-initialised for every episode
    policy = PolicyNetwork()
    optimiser = torch.optim.Adam(policy.parameters(), lr=1)

    for i in range(100):
        optimiser.zero_grad()

        # get the action probabilities
        action = policy(torch.tensor([state], dtype=torch.float32))

        # sample from the action distribution
        a = torch.distributions.Categorical(action).sample().item()

        # execute the action
        state, reward, done = env.step(a)

        # YOUR TURN! update the policy network with the loss and a gradient step
        # Replace the placeholder below with your policy gradient loss.
        # The placeholder is a zero-valued tensor that is still connected to the network output,
        # so backward() and step() run without error (a plain Python `0` has no backward() method),
        # but all gradients are zero and the policy does not learn until you change it.
        loss = 0 * action.sum()
        loss.backward()
        optimiser.step()

        # remember the action probabilities, reward and the executed action
        trajectory.append([action, reward, a])

    # some housekeeping to remember the results for later plotting
    actions.append([x[0].detach().numpy() for x in trajectory])
    rewards.append(np.array([x[1] for x in trajectory]).squeeze())
    total_rewards.append(rewards[-1].sum())

# plot the action probabilities, the rewards and the total reward per episode
draw_plots(actions, rewards, total_rewards)


## REINFORCE: A Simple Policy Gradient Algorithm

Now we are ready to explore REINFORCE, an actual policy gradient algorithm. 

We will introduce a more complicated environment, consisting of 3 states. The agent can transition back and forth between them. 

[ State A ] <--> [ State B ] <--> [ State C ]

The agent can choose from the same actions as before, [Left, Center, Right]. This time, the agent always starts in the left-most state, has to move Right twice, and then invoke the Center action. When it does so, it receives a positive reward and the episode ends; in every other time step it receives a small negative reward.


**YOUR TURN!**
Experiment freely here to familiarise yourself with the concepts.

- Experiment with different settings of gamma and learning rate.
- Try to implement a baseline.
- Try different network architectures.




In [None]:
class EnvironmentSequence:
    """Chain of three states (-1, 0, 1) in which the agent moves left, stays, or moves right.

    The episode ends with reward 1 when action 1 ('center') is executed in the
    rightmost state; every other step yields a reward of -0.05.
    """

    def __init__(self) -> None:
        # The state is an integer: -1 is the leftmost state, 0 the middle state, and 1 the rightmost state
        self.state = -1

    def step(self, action):
        """Apply `action` and return the tuple (state, reward, done).

        Action 0 moves left, action 2 moves right, and action 1 ('center')
        keeps the state; any other value also leaves the state unchanged.
        """
        # translate the action into a displacement along the chain
        if action == 0:
            shift = -1
        elif action == 2:
            shift = 1
        else:
            shift = 0

        # move, making sure the state always stays between -1 and 1
        self.state = np.clip(self.state + shift, -1, 1)

        # 'center' in the rightmost state is the only rewarded step, and it ends the episode
        if self.state == 1 and action == 1:
            return self.state, 1, True

        # otherwise the agent gets a small negative reward for taking actions and moving around
        return self.state, -0.05, False
    
#  =========================================================
# A very simple policy network. It takes the state as input and returns a probability distribution over the 3 actions.
# YOUR TURN! Change the network architecture. Make sure you also update the forward() function accordingly.

class PolicyNetwork(nn.Module):
    """Single linear layer mapping the scalar state to a distribution over the 3 actions."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(1, 3)

    def forward(self, x):
        """Return the softmax over the last dimension of fc1(x)."""
        logits = self.fc1(x)
        return F.softmax(logits, dim=-1)
    
#  =========================================================
# Containers for the results gathered across all runs.
action_probs_to_plot = []
rewards_to_plot = []
total_rewards = []   # total (undiscounted) reward of every episode, across all runs
total_loss = []      # one list of per-episode losses for every run

# The Discount Factor. YOUR TURN! Experiment with the discount factor. What happens if you decrease it or increase it to 0.99?
gamma = 0.6

for runs in range(10):

    # change the random seed for every run, but it will be repeatable every time you run this notebook cell
    torch.manual_seed(runs)

    # initialise the agent with every seed
    policy = PolicyNetwork()
    optimiser = torch.optim.AdamW(policy.parameters(), lr=0.01)

    episode_reward = []   # total reward of every episode in this run
    episode_loss = []     # loss of every episode in this run, stored as plain floats
    for episode in range(500):
        env = EnvironmentSequence()
        state = env.state

        trajectory = {'state': [], 'action': [], 'reward': [], 'prob': [], 'log_prob': []}

        # unroll the episode, i.e. execute the policy until the episode terminates
        # notice how we remember the trajectory (states, actions, rewards, action probabilities) for later
        done = False

        while not done:
            # get the action probabilities from the policy
            action = policy(torch.tensor([state], dtype=torch.float32))

            # sample from the action distribution
            dist = torch.distributions.Categorical(action)
            sampled = dist.sample()
            a = sampled.item()

            # remember the state the action was taken in, the executed action,
            # its probability and its log-probability (the latter is what the loss needs)
            trajectory['state'].append(state)
            trajectory['action'].append(a)
            trajectory['prob'].append(action[a])
            trajectory['log_prob'].append(dist.log_prob(sampled))

            # execute the action and remember the reward
            state, reward, done = env.step(a)
            trajectory['reward'].append(reward)

        # After the episode ends, update the policy network with the policy gradient loss:

        # compute the future discounted return for each time step in the episode,
        # walking backwards through the rewards and reversing the result at the end
        running_g = 0
        gs = []
        for R in reversed(trajectory['reward']):
            running_g = R + gamma * running_g
            gs.append(running_g)
        gs.reverse()

        # compute the policy gradient loss
        deltas = torch.tensor(gs, dtype=torch.float32)
        log_probs = trajectory['log_prob']   # all the log-probabilities we stored in the trajectory

        # add up all the individual losses for each time step in the episode.
        # REINFORCE weights the gradient of the LOG-probability of the executed action with the return,
        # so the loss is -log(pi(a|s)) * G rather than -pi(a|s) * G.
        loss = 0
        for log_prob, delta in zip(log_probs, deltas):
            loss -= log_prob * delta    # YOUR TURN! Make changes to the loss function, e.g. explore the influence of a baseline term

        # perform a gradient update step on the policy network, using all of the data gathered in the episode
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        # some housekeeping to remember the results for later plotting
        episode_reward.append(np.sum(trajectory['reward']))
        total_rewards.append(episode_reward[-1])
        episode_loss.append(loss.item())


    # after finishing a run, print some information
    total_loss.append(episode_loss)
    print(f'\nRun {runs} - Total Reward over all Episodes: {np.sum(episode_reward)} -- Mean Loss: {np.mean(episode_loss)}')
    print(f'Policy in state -1: {policy(torch.tensor([-1], dtype=torch.float32)).detach().tolist()}')
    print(f'Policy in state 0: {policy(torch.tensor([0], dtype=torch.float32)).detach().tolist()}')
    print(f'Policy in state 1: {policy(torch.tensor([1], dtype=torch.float32)).detach().tolist()}')
    print(f'Action sequence in the final episode: {trajectory["action"]}')


# after all the runs, plot how the loss evolved across all episodes, by plotting the mean loss and standard deviations across all runs
mean_loss = np.mean(total_loss, axis=0)
std_loss = np.std(total_loss, axis=0)
plt.plot(mean_loss)
plt.fill_between(range(mean_loss.shape[0]), mean_loss - std_loss, mean_loss + std_loss, alpha=0.2)
plt.xlabel('Episode'); plt.ylabel('Episode loss'); plt.grid(); plt.tight_layout()
plt.show()

## Explore Further

Head to https://gymnasium.farama.org/tutorials/training_agents/reinforce_invpend_gym_v26/#sphx-glr-tutorials-training-agents-reinforce-invpend-gym-v26-py to explore a more complex environment (CartPole) for the REINFORCE algorithm.

You can download a demo notebook with the code from the bottom of the website.


## Connection to the Assessment 1 Project

The concepts you learn in this week's prac are directly applicable to the first assessment project. You can implement a policy gradient algorithm to solve the Fetch Slide environment. Try adapting REINFORCE to this environment and see how it performs. Try initialising the policy network from your hand-crafted policy using the imitation learning techniques from Week 2 and see if it helps the learning process.




