# Deep Learning Applications: Laboratory #3

In this laboratory session we worked to understand some implementations of Deep Reinforcement Learning algorithms.

## Exercise 1: `REINFORCE` Implementation 

In this exercise we start with a `REINFORCE` implementation and train an agent to balance a pole on a cart in the CartPole-v1 environment from OpenAI Gym.

* Policy Network (PolicyNet) - we used a simple feedforward neural network with one hidden layer that outputs a probability distribution over actions using softmax.

* Action Selection (select_action) uses the policy network to sample an action.

* Computing Returns (compute_returns). It computes rewards to estimate how valuable past actions were.

* Running an Episode (run_episode) simulates one complete run of the environment, executing actions sampled from the policy and collecting the rewards.

Training with REINFORCE uses Monte Carlo Policy Gradient to update the policy network evaluating performance every eval_interval episodes.
At first, the agent will take random actions. But over time, the policy network improves by assigning higher probabilities to better actions. Eventually the agent should be able to balance the pole for longer and achieve higher rewards.

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical
import matplotlib.pyplot as plt

In [None]:
class PolicyNet(nn.Module):
    """Feedforward policy network with a single 16-unit hidden layer.

    The input width is read from ``env.observation_space.shape[0]`` and the
    number of outputs from ``env.action_space.n``. The forward pass returns
    a probability for each action (softmax over the last dimension).
    """

    def __init__(self, env):
        super().__init__()
        n_inputs = env.observation_space.shape[0]
        n_actions = env.action_space.n
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, n_actions)
        # Registered as a submodule; forward() applies the functional ReLU.
        self.relu = nn.ReLU()

    def forward(self, s):
        """Map observation tensor ``s`` to action probabilities."""
        hidden = F.relu(self.fc1(s))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=-1)
    
def select_action(env, obs, policy):
    """Sample an action from the policy's distribution for ``obs``.

    Args:
        env: Not used by this function; kept for the callers' signature.
        obs: Observation passed straight to ``policy``.
        policy: Callable returning action probabilities for ``obs``.

    Returns:
        A pair ``(action, log_prob)``: the sampled action as a Python
        number, and its log-probability reshaped to a 1-element tensor.
    """
    action_probs = policy(obs)
    sampler = Categorical(action_probs)
    chosen = sampler.sample()
    chosen_log_prob = sampler.log_prob(chosen).reshape(1)
    return chosen.item(), chosen_log_prob

def compute_returns(rewards, gamma=0.99):
    """Compute the discounted return following every step of an episode.

    Entry ``t`` of the result is
    ``G_t = rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``,
    i.e. the discount exponent is counted from step ``t`` itself rather than
    from the start of the episode.

    Args:
        rewards: Sequence of per-step rewards, in the order they were received.
        gamma: Discount factor.

    Returns:
        A float64 NumPy array with one return per step (empty for an empty
        ``rewards``).
    """
    returns = np.zeros(len(rewards), dtype=np.float64)
    future = 0.0
    # Walk backwards so each return reuses the one after it:
    # G_t = r_t + gamma * G_{t+1}.
    for t in range(len(rewards) - 1, -1, -1):
        future = rewards[t] + gamma * future
        returns[t] = future
    return returns

def run_episode(env, policy, maxlen=500):
    """Roll out one episode in ``env`` by sampling actions from ``policy``.

    The rollout stops when the environment reports termination or
    truncation, or after ``maxlen`` steps.

    Returns:
        A tuple ``(observations, actions, log_probs, rewards)``: the list of
        observation tensors, the list of sampled actions, the concatenated
        tensor of action log-probabilities, and the list of rewards.
    """
    obs_history, action_history, logp_history, reward_history = [], [], [], []

    state, _info = env.reset()
    for _ in range(maxlen):
        state = torch.tensor(state)
        chosen, logp = select_action(env, state, policy)
        obs_history.append(state)
        action_history.append(chosen)
        logp_history.append(logp)

        state, reward, terminated, truncated, _info = env.step(chosen)
        reward_history.append(reward)
        if terminated or truncated:
            break

    return (obs_history, action_history, torch.cat(logp_history), reward_history)

def reinforce(policy, env, env_render = None, gamma = 0.99, num_episodes = 10, eval_interval = 100, eval_episodes = 10, standardize_returns = True):
    """Train ``policy`` with REINFORCE (Monte Carlo policy gradient).

    One gradient step is taken per episode. Every ``eval_interval`` episodes
    the policy is rolled out for ``eval_episodes`` extra episodes and the mean
    total reward is printed and recorded.

    Args:
        policy: Module mapping an observation tensor to action probabilities.
        env: Environment used for both training and evaluation rollouts.
        env_render: Not used by this function; kept for the callers' signature.
        gamma: Discount factor passed to ``compute_returns``.
        num_episodes: Number of training episodes.
        eval_interval: Evaluate after every this many training episodes.
        eval_episodes: Number of rollouts averaged per evaluation.
        standardize_returns: If true, centre and scale the returns of each
            episode before using them as policy-gradient weights.

    Returns:
        ``(running_rewards, avg_rewards)``: the exponential moving average of
        the first-step return (starting from 0.0, one entry per episode), and
        the list of mean evaluation rewards.
    """
    opt = optim.Adam(policy.parameters(), lr=1e-2)
    running_rewards = [0.0]

    avg_rewards = []

    policy.train()
    for episode in range(num_episodes):
        (_, _, log_probs, rewards) = run_episode(env, policy)
        returns = torch.tensor(compute_returns(rewards, gamma), dtype=torch.float32)
        running_rewards.append(0.05 * returns[0].item() + 0.95 * running_rewards[-1])

        if standardize_returns:
            centered = returns - returns.mean()
            # std() of a single-element tensor is NaN, which would poison the
            # weights on the next step; only rescale when it is defined, and
            # add a small epsilon so near-constant returns cannot divide by 0.
            if returns.numel() > 1:
                centered = centered / (returns.std() + 1e-8)
            returns = centered

        targets = returns

        opt.zero_grad()
        loss = (-log_probs * targets).mean()
        loss.backward()
        opt.step()

        if (episode+1) % eval_interval == 0:
            eval_totals = []
            policy.eval()
            # Evaluation never backpropagates, so skip building autograd graphs.
            with torch.no_grad():
                for _ in range(eval_episodes):
                    _, _, _, r = run_episode(env, policy)
                    eval_totals.append(np.sum(r))
            print(f"Episode {episode+1}, average reward: {np.mean(eval_totals)}")
            avg_rewards.append(np.mean(eval_totals))
            policy.train()
    policy.eval()
    return running_rewards, avg_rewards

In [None]:
# CartPole-v1 experiment for plain REINFORCE: build the environments, train a
# fresh policy, then plot the training curves.
env_render = gym.make('CartPole-v1', render_mode='human')
env = gym.make('CartPole-v1')

# Print one observation and the spaces as a quick sanity check.
(obs, info) = env.reset()
print("Observation: ",obs)
print("Observation shape: ", obs.shape)
print("Observation space: ",env.observation_space)
print("Action space: ",env.action_space)

policy = PolicyNet(env)

# 600 episodes matches the run reported below and the baseline experiment it
# is compared against; it also gives the average-reward plot six points
# instead of a single one.
num_episodes = 600
eval_interval = 100
eval_episodes = 10
running_rewards, avg_rewards = reinforce(policy, env, env_render, num_episodes=num_episodes, eval_interval=eval_interval, eval_episodes=eval_episodes)

# Exponential moving average recorded after every training episode.
plt.figure()
plt.plot(running_rewards)
plt.title('Running rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

# Mean evaluation reward, one point per evaluation.
plt.figure()
plt.plot(range(eval_interval, num_episodes+1, eval_interval), avg_rewards)
plt.title('Average rewards')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

We trained the policy using reinforce() for 600 episodes. Here are the results.

![Alt Text](plots/cp_rr1.png) 

![Alt Text](plots/cp_ar1.png) 

It is quite clear that after only about 500 episodes the agent has been trained successfully, having a stable running reward.

## Exercise 2: `REINFORCE` with a Value Baseline

In this exercise we modified REINFORCE to subtract a baseline from the target in the update equation, in order to stabilize and speed up convergence, still in the CartPole environment.

We used a state-value function V(s) as a baseline subtracting it from the returns, stabilizing learning.
A separate network, ValueNet, is used to approximate V(s); it is trained alongside the policy network but optimized separately, using its own optimizer and a mean-squared-error loss against the returns.

Afterwards, we want to compare the results of the two runs with the same number of episodes for training to observe improvements in stability and convergence speed.

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical
import matplotlib.pyplot as plt

In [None]:
class ValueNet(nn.Module):
    """Feedforward state-value network with a single 16-unit hidden layer.

    The input width is read from ``env.observation_space.shape[0]``; the
    network outputs one unbounded scalar per observation.
    """

    def __init__(self, env):
        super().__init__()
        n_inputs = env.observation_space.shape[0]
        self.fc1 = nn.Linear(n_inputs, 16)
        self.fc2 = nn.Linear(16, 1)
        # Registered as a submodule; forward() applies the functional ReLU.
        self.relu = nn.ReLU()

    def forward(self, s):
        """Map observation tensor ``s`` to a value estimate (last dim of size 1)."""
        hidden = F.relu(self.fc1(s))
        return self.fc2(hidden)
        
def run_episode(env, policy, value_net, maxlen=500):
    """Roll out one episode, recording the value estimate of every state visited.

    Actions are sampled from ``policy``; ``value_net`` is evaluated on each
    observation before the action is executed. The rollout stops when the
    environment reports termination or truncation, or after ``maxlen`` steps.

    Returns:
        A tuple ``(observations, actions, log_probs, rewards, values)``: the
        list of observation tensors, the list of sampled actions, the
        concatenated tensor of action log-probabilities, the list of rewards,
        and the list of ``value_net`` outputs.
    """
    obs_history, action_history, logp_history = [], [], []
    reward_history, value_history = [], []

    state, _info = env.reset()
    for _ in range(maxlen):
        state = torch.tensor(state)
        chosen, logp = select_action(env, state, policy)
        estimate = value_net(state)
        obs_history.append(state)
        action_history.append(chosen)
        logp_history.append(logp)
        value_history.append(estimate)

        state, reward, terminated, truncated, _info = env.step(chosen)
        reward_history.append(reward)
        if terminated or truncated:
            break

    return (obs_history, action_history, torch.cat(logp_history), reward_history, value_history)
    
def reinforce(policy, value_net, env, env_render = None, gamma = 0.99, num_episodes = 10, eval_interval = 100, eval_episodes = 10, standardize_returns = True):
    """Train ``policy`` with REINFORCE using ``value_net`` as a learned baseline.

    Per episode, the policy is updated with the returns minus the value
    estimates as weights, and the value network is regressed onto the same
    returns with a mean-squared-error loss. Each network has its own Adam
    optimizer.

    Args:
        policy: Module mapping an observation tensor to action probabilities.
        value_net: Module mapping an observation tensor to a value estimate.
        env: Environment used for both training and evaluation rollouts.
        env_render: Not used by this function; kept for the callers' signature.
        gamma: Discount factor passed to ``compute_returns``.
        num_episodes: Number of training episodes.
        eval_interval: Evaluate after every this many training episodes.
        eval_episodes: Number of rollouts averaged per evaluation.
        standardize_returns: If true, centre and scale the returns of each
            episode; the value network is then trained on the scaled returns.

    Returns:
        ``(running_rewards, avg_rewards)``: the exponential moving average of
        the first-step return (starting from 0.0, one entry per episode), and
        the list of mean evaluation rewards.
    """
    opt = optim.Adam(policy.parameters(), lr=1e-2)
    opt_value = optim.Adam(value_net.parameters(), lr=1e-2)
    running_rewards = [0.0]

    avg_rewards = []

    policy.train()
    value_net.train()
    for episode in range(num_episodes):
        (_, _, log_probs, rewards, values) = run_episode(env, policy, value_net)
        # Stack (rather than rebuild with torch.tensor) so the predictions
        # stay attached to value_net's graph and the value loss can train it.
        values = torch.stack(values).reshape(-1)
        returns = torch.tensor(compute_returns(rewards, gamma), dtype=torch.float32)
        running_rewards.append(0.05 * returns[0].item() + 0.95 * running_rewards[-1])

        if standardize_returns:
            centered = returns - returns.mean()
            # std() of a single-element tensor is NaN, which would poison the
            # weights on the next step; only rescale when it is defined.
            if returns.numel() > 1:
                centered = centered / (returns.std() + 1e-8)
            returns = centered

        # The baseline is a constant w.r.t. the policy update: detach it so
        # the policy loss does not push gradients into the value network.
        targets = returns - values.detach()

        opt.zero_grad()
        loss = (-log_probs * targets).mean()
        loss.backward()
        opt.step()

        opt_value.zero_grad()
        loss_value = F.mse_loss(values, returns)
        loss_value.backward()
        opt_value.step()

        if (episode+1) % eval_interval == 0:
            eval_totals = []
            policy.eval()
            value_net.eval()
            # Evaluation never backpropagates, so skip building autograd graphs.
            with torch.no_grad():
                for _ in range(eval_episodes):
                    _, _, _, r, _ = run_episode(env, policy, value_net)
                    eval_totals.append(np.sum(r))
            print(f"Episode {episode+1}, average reward: {np.mean(eval_totals)}")
            avg_rewards.append(np.mean(eval_totals))
            policy.train()
            value_net.train()
    policy.eval()
    value_net.eval()
    return running_rewards, avg_rewards

In [None]:
# CartPole-v1 experiment for REINFORCE with a value baseline: build the
# environments, train fresh policy and value networks, then plot the curves.
env_render = gym.make('CartPole-v1', render_mode='human')
env = gym.make('CartPole-v1')

# Print one observation and the spaces as a quick sanity check.
obs, info = env.reset()
for label, shown in (
    ("Observation: ", obs),
    ("Observation shape: ", obs.shape),
    ("Observation space: ", env.observation_space),
    ("Action space: ", env.action_space),
):
    print(label, shown)

policy = PolicyNet(env)
value_net = ValueNet(env)

num_episodes = 600
eval_interval = 100
eval_episodes = 10
running_rewards, avg_rewards = reinforce(
    policy,
    value_net,
    env,
    env_render,
    num_episodes=num_episodes,
    eval_interval=eval_interval,
    eval_episodes=eval_episodes,
)

# One figure per curve: the per-episode running reward, then the mean
# evaluation reward plotted against the episode at which it was measured.
eval_points = range(eval_interval, num_episodes+1, eval_interval)
for title, series in (
    ('Running rewards', (running_rewards,)),
    ('Average rewards', (eval_points, avg_rewards)),
):
    plt.figure()
    plt.plot(*series)
    plt.title(title)
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.show()

Here are the results of the training.

![Alt Text](plots/cp_rr2.png) 

![Alt Text](plots/cp_ar2.png) 

It is quite clear that after only about 500 episodes the agent has been trained successfully, having a stable running reward.

## Exercise 3: Solving Lunar Lander with `REINFORCE`

In this exercise we implemented policy gradient reinforcement learning using the REINFORCE algorithm with a baseline value network to stabilize training. It is applied to the LunarLander-v3 environment from OpenAI Gym.

The agent is trained on LunarLander-v3, a continuous-state, discrete-action environment where a lander must navigate to a landing pad.

At the beginning we expect the agent to perform poorly as it explores the environment, then improve over time as it learns a better strategy, with higher rewards and more stable landing behavior.

In [None]:
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from torch.distributions import Categorical
import matplotlib.pyplot as plt

In [None]:
def run_episode(env, policy, value_net, maxlen=1000):
    """Roll out one episode, recording the value estimate of every state visited.

    Actions are sampled from ``policy``; ``value_net`` is evaluated on each
    observation before the action is executed. The rollout stops when the
    environment reports termination or truncation, or after ``maxlen`` steps.

    Returns:
        A tuple ``(observations, actions, log_probs, rewards, values)``: the
        list of observation tensors, the list of sampled actions, the
        concatenated tensor of action log-probabilities, the list of rewards,
        and the list of ``value_net`` outputs.
    """
    obs_history, action_history, logp_history = [], [], []
    reward_history, value_history = [], []

    state, _info = env.reset()
    for _ in range(maxlen):
        state = torch.tensor(state)
        chosen, logp = select_action(env, state, policy)
        estimate = value_net(state)
        obs_history.append(state)
        action_history.append(chosen)
        logp_history.append(logp)
        value_history.append(estimate)

        state, reward, terminated, truncated, _info = env.step(chosen)
        reward_history.append(reward)
        if terminated or truncated:
            break

    return (obs_history, action_history, torch.cat(logp_history), reward_history, value_history)

def reinforce(policy, value_net, env, env_render = None, gamma = 0.99, num_episodes = 10, eval_interval = 100, eval_episodes = 10, standardize_returns = True):
    """Train ``policy`` with REINFORCE using ``value_net`` as a learned baseline.

    Per episode, the policy is updated with the returns minus the value
    estimates as weights, and the value network is regressed onto the same
    returns with a mean-squared-error loss. Each network has its own Adam
    optimizer.

    Args:
        policy: Module mapping an observation tensor to action probabilities.
        value_net: Module mapping an observation tensor to a value estimate.
        env: Environment used for both training and evaluation rollouts.
        env_render: Not used by this function; kept for the callers' signature.
        gamma: Discount factor passed to ``compute_returns``.
        num_episodes: Number of training episodes.
        eval_interval: Evaluate after every this many training episodes.
        eval_episodes: Number of rollouts averaged per evaluation.
        standardize_returns: If true, centre and scale the returns of each
            episode; the value network is then trained on the scaled returns.

    Returns:
        ``(running_rewards, avg_rewards)``: the exponential moving average of
        the first-step return (starting from 0.0, one entry per episode), and
        the list of mean evaluation rewards.
    """
    opt = optim.Adam(policy.parameters(), lr=1e-2)
    opt_value = optim.Adam(value_net.parameters(), lr=1e-2)
    running_rewards = [0.0]

    avg_rewards = []

    policy.train()
    value_net.train()
    for episode in range(num_episodes):
        (_, _, log_probs, rewards, values) = run_episode(env, policy, value_net)
        # Stack (rather than rebuild with torch.tensor) so the predictions
        # stay attached to value_net's graph and the value loss can train it.
        values = torch.stack(values).reshape(-1)
        returns = torch.tensor(compute_returns(rewards, gamma), dtype=torch.float32)
        running_rewards.append(0.05 * returns[0].item() + 0.95 * running_rewards[-1])

        if standardize_returns:
            centered = returns - returns.mean()
            # std() of a single-element tensor is NaN, which would poison the
            # weights on the next step; only rescale when it is defined.
            if returns.numel() > 1:
                centered = centered / (returns.std() + 1e-8)
            returns = centered

        # The baseline is a constant w.r.t. the policy update: detach it so
        # the policy loss does not push gradients into the value network.
        targets = returns - values.detach()

        opt.zero_grad()
        loss = (-log_probs * targets).mean()
        loss.backward()
        opt.step()

        opt_value.zero_grad()
        loss_value = F.mse_loss(values, returns)
        loss_value.backward()
        opt_value.step()

        if (episode+1) % eval_interval == 0:
            eval_totals = []
            policy.eval()
            value_net.eval()
            # Evaluation never backpropagates, so skip building autograd graphs.
            with torch.no_grad():
                for _ in range(eval_episodes):
                    _, _, _, r, _ = run_episode(env, policy, value_net)
                    eval_totals.append(np.sum(r))
            print(f"Episode {episode+1}, average reward: {np.mean(eval_totals)}")
            avg_rewards.append(np.mean(eval_totals))
            policy.train()
            value_net.train()
    policy.eval()
    value_net.eval()
    return running_rewards, avg_rewards

In [None]:
# LunarLander-v3 experiment for REINFORCE with a value baseline: build the
# environments, train fresh policy and value networks, then plot the curves.
env_render = gym.make('LunarLander-v3', render_mode='human')
env = gym.make('LunarLander-v3')

# Print one observation and the spaces as a quick sanity check.
obs, info = env.reset()
for label, shown in (
    ("Observation: ", obs),
    ("Observation shape: ", obs.shape),
    ("Observation space: ", env.observation_space),
    ("Action space: ", env.action_space),
):
    print(label, shown)

policy = PolicyNet(env)
value_net = ValueNet(env)

num_episodes = 1000
eval_interval = 100
eval_episodes = 10
running_rewards, avg_rewards = reinforce(
    policy,
    value_net,
    env,
    env_render,
    num_episodes=num_episodes,
    eval_interval=eval_interval,
    eval_episodes=eval_episodes,
)

# One figure per curve: the per-episode running reward, then the mean
# evaluation reward plotted against the episode at which it was measured.
eval_points = range(eval_interval, num_episodes+1, eval_interval)
for title, series in (
    ('Running rewards', (running_rewards,)),
    ('Average rewards', (eval_points, avg_rewards)),
):
    plt.figure()
    plt.plot(*series)
    plt.title(title)
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.show()

Here are the results of the training for 1000 episodes. 

![Alt Text](plots/ll-rr.png) 

![Alt Text](plots/ll-ar.png) 

After about 800 episodes the running-rewards graph shows that the model has improved significantly, but the average reward was still around 0 at best. It is also important to note that this required more training than the easier CartPole environment and still gave worse results.

In some other runs, training for 2000 episodes brought the maximum average reward as high as 12, and the trend was for it to keep increasing with more iterations.