# PPO-GAE

### Import Libraries

In [1]:
# Imports and Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import gym
import random
from collections import deque
import matplotlib.pyplot as plt

### Environment Setup

In [2]:
# Identifier of the Gym environment that the vectorized env is built from.
env_name = 'LunarLanderContinuous-v2'
# Number of environment copies stepped together; also sizes the per-env
# bookkeeping arrays used later in the notebook.
nenvs = 6  # Adjusted for computational efficiency
# Vectorized environment with `nenvs` copies of `env_name`.
# NOTE(review): asynchronous=False presumably selects the in-process
# (synchronous) vector env - confirm against the installed gym version.
env = gym.vector.make(env_name, num_envs=nenvs, asynchronous=False)

### Define Actor and Critic Networks

In [3]:
class ActorNet(nn.Module):
    """Policy network producing the parameters of a diagonal Gaussian.

    A shared two-layer tanh trunk feeds two linear heads: one for the mean
    and one for the log standard deviation of every action dimension.

    Args:
        input_size: Number of features in one observation.
        hidden_units: Width of the first hidden layer; the second hidden
            layer uses ``int(hidden_units / 2)`` units.
        output_size: Number of action dimensions.
    """

    def __init__(self, input_size, hidden_units=64, output_size=2):
        super().__init__()
        half_width = int(hidden_units / 2)
        # Layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        trunk_layers = [
            nn.Linear(input_size, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, half_width),
            nn.Tanh(),
        ]
        self.model = nn.Sequential(*trunk_layers)
        self.mu_head = nn.Linear(half_width, output_size)
        self.logstd_head = nn.Linear(half_width, output_size)

    def forward(self, x):
        """Return ``(loc, scale)`` for the action distribution at ``x``.

        ``loc`` is squashed by tanh and scaled to lie in ``[-2, 2]``;
        ``scale`` is the exponential of the log-std head and is therefore
        strictly positive.
        """
        features = self.model(x)
        mean = 2 * torch.tanh(self.mu_head(features))  # Adjust the range if needed
        std = self.logstd_head(features).exp()
        return mean, std
    
class CriticNet(nn.Module):
    """State-value network: maps an observation to a single scalar value.

    Args:
        input_size: Number of features in one observation.
        hidden_units: Width of the first hidden layer; the second hidden
            layer uses ``int(hidden_units / 2)`` units.
        output_size: Ignored. The value head always has exactly one output;
            the parameter is kept only so existing call sites keep working.
    """

    def __init__(self, input_size, hidden_units=64, output_size=2):
        super().__init__()
        half_width = int(hidden_units / 2)
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_units),
            nn.Tanh(),
            nn.Linear(hidden_units, half_width),
            nn.Tanh()
        )
        self.value_head = nn.Linear(half_width, 1)

    def forward(self, x):
        """Return the estimated value of ``x`` with a trailing dimension of 1."""
        return self.value_head(self.model(x))

    # NOTE: ``__call__`` is intentionally NOT overridden. The previous override
    # called ``forward`` directly and thereby bypassed ``nn.Module.__call__``,
    # which is what runs registered forward hooks. Calling ``critic(x)`` now
    # goes through the standard module dispatch and returns the same tensor.

### Initialize Networks, Optimizers, and other variables

In [4]:
# Initialize Networks
# Observation and action sizes are read from the per-environment (single)
# spaces of the vectorized env, i.e. without the leading env dimension.
obs_dim = env.single_observation_space.shape[0]
n_acts = env.single_action_space.shape[0]
hidden_sizes = 64  # Can be tuned

# The actor outputs one (loc, scale) pair per action dimension.
actor_net = ActorNet(obs_dim, hidden_sizes, n_acts)
# The critic's third argument is its `output_size`; CriticNet builds a
# single-output value head regardless of that argument.
critic_net = CriticNet(obs_dim, hidden_sizes, 1)

# Initialize Optimizers
# Separate Adam optimizers, one per network, with the same learning rate.
actor_optimizer = optim.Adam(actor_net.parameters(), lr=0.0003)
critic_optimizer = optim.Adam(critic_net.parameters(), lr=0.0003)

# Tensor Conversion Functions
def T(x):
    """Convert ``x`` to a ``torch.float32`` tensor via ``torch.as_tensor``."""
    return torch.as_tensor(x, dtype=torch.float32)


def Ti(x):
    """Convert ``x`` to a ``torch.int64`` tensor via ``torch.as_tensor``."""
    return torch.as_tensor(x, dtype=torch.int64)

### Training Function and Other Helper Functions

In [5]:
class RunningMem():
    """Rollout buffer that stores transitions and serves shuffled minibatches.

    Every ``store`` call appends one time step for all parallel environments.
    ``batches`` stacks the stored steps, computes Generalized Advantage
    Estimation (GAE) backwards through time and yields random minibatches
    over the flattened ``steps * envs`` samples.
    """

    def __init__(self):
        self.reset()

    def store(self, obs, action, logprob, reward, done, obs_, values, values_):
        """Append one time step of transitions (one row per environment).

        ``action``, ``reward`` and ``done`` get a trailing singleton
        dimension so that per-step rewards/dones line up with the
        ``(..., 1)``-shaped value estimates used in the GAE recursion.
        """
        self.obs.append(obs)
        self.actions.append(action.unsqueeze(-1))
        self.logprobs.append(logprob)
        self.rewards.append(reward.unsqueeze(-1))
        self.dones.append(done.unsqueeze(-1))
        self.obs_.append(obs_)
        self.values.append(values)
        self.values_.append(values_)

    def batches(self, batchsize, gamma=None, lmbda=None):
        """Yield shuffled minibatches including GAE advantages.

        Args:
            batchsize: Maximum number of samples per minibatch; the final
                minibatch may be smaller.
            gamma: Discount factor. When ``None`` the module-level ``gamma``
                is used, which is what this method always read before the
                parameter existed.
            lmbda: GAE lambda. When ``None`` the module-level ``lmbda`` is
                used, for the same reason.

        Yields:
            Tuples ``(obs, actions, logprobs, rewards, dones, obs_, values,
            values_, gae)`` of 2-D tensors whose first dimension is the
            minibatch size. Nothing is yielded when the buffer is empty.
        """
        if not self.obs:
            return

        if gamma is None:
            gamma = globals()["gamma"]
        if lmbda is None:
            lmbda = globals()["lmbda"]

        b_obs = torch.stack(self.obs)
        # Sizes come from what was actually stored instead of the globals
        # ``nenvs`` / ``memsteps`` so a partially filled buffer also works.
        steps, n_envs = b_obs.shape[0], b_obs.shape[1]
        size = steps * n_envs

        idx = list(range(size))
        random.shuffle(idx)

        b_actions = torch.stack(self.actions)
        b_logprobs = torch.stack(self.logprobs)
        b_rewards = torch.stack(self.rewards)
        b_dones = torch.stack(self.dones)
        b_obs_ = torch.stack(self.obs_)
        b_values = torch.stack(self.values)
        b_values_ = torch.stack(self.values_)

        # 1 where the episode continues, 0 where it ended: this both removes
        # the bootstrap value and stops the GAE recursion at episode ends.
        not_done = 1.0 - b_dones.to(b_rewards.dtype)
        deltas = b_rewards + gamma * b_values_ * not_done - b_values

        gaes = []
        gae = torch.zeros_like(deltas[0])
        for t in reversed(range(steps)):
            gae = deltas[t] + gamma * lmbda * not_done[t] * gae
            gaes.append(gae)
        # Built back-to-front with O(1) appends; restore chronological order.
        gaes.reverse()

        flat = tuple(
            tensor.view(size, -1)
            for tensor in (
                b_obs, b_actions, b_logprobs, b_rewards, b_dones,
                b_obs_, b_values, b_values_, torch.stack(gaes),
            )
        )

        for start in range(0, size, batchsize):
            batchidx = torch.as_tensor(idx[start:start + batchsize], dtype=torch.int64)
            yield tuple(torch.index_select(tensor, 0, batchidx) for tensor in flat)

    def reset(self):
        """Drop all stored transitions."""
        self.obs = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.dones = []
        self.obs_ = []
        self.values = []
        self.values_ = []
        # Not read anywhere in this class; kept so the attribute still exists.
        self.gae = []
        
@torch.no_grad()
def sim_action(policy, obs):
    """Sample an action from ``policy`` for ``obs`` without tracking gradients.

    Args:
        policy: Callable returning ``(loc, scale)`` for a float tensor input.
        obs: Observation(s) accepted by the ``T`` float-tensor helper.

    Returns:
        ``(action, log_prob)`` where ``log_prob`` is the log-density of the
        sampled action summed over the last (action) dimension, keeping that
        dimension with size 1.
    """
    mean, spread = policy(T(obs))
    # The small constant keeps the scale strictly positive.
    gaussian = Normal(loc=mean, scale=spread + 1e-6)
    sampled = gaussian.sample()
    joint_log_prob = gaussian.log_prob(sampled).sum(dim=-1, keepdim=True)
    return sampled, joint_log_prob

def train(mem, gamma=0.99, batchsize=10, epoch_repeat=20, epsilon=0.2, lmbda=0.95):
    """Run clipped-surrogate PPO updates over the transitions held in ``mem``.

    Uses the module-level ``actor_net``, ``critic_net``, ``actor_optimizer``
    and ``critic_optimizer``.

    Args:
        mem: Buffer whose ``batches(batchsize=...)`` yields 9-tuples ending
            with the GAE advantages.
        gamma: Not read inside this function; accepted so existing callers
            keep working (advantages are computed by ``mem.batches``).
        batchsize: Minibatch size forwarded to ``mem.batches``.
        epoch_repeat: Number of full passes over the buffer.
        epsilon: PPO clipping range for the probability ratio.
        lmbda: Not read inside this function; see ``gamma``.
    """
    for _ in range(epoch_repeat):
        for batch in mem.batches(batchsize=batchsize):
            obs, actions, logprobs, _rewards, _dones, _next_obs, values, _next_values, gae = batch

            # The critic regresses onto the return estimate, i.e. the RAW
            # advantage plus the old value. Building the target from the
            # normalised advantage (as was done before) rescales and shifts
            # it per minibatch and no longer estimates a return.
            target = gae + values
            state_values = critic_net(obs)
            critic_loss = F.smooth_l1_loss(state_values, target)

            # Advantage normalisation is applied to the policy term only.
            # std() of a single element is NaN, so skip it for such batches.
            if gae.numel() > 1:
                advantages = (gae - torch.mean(gae)) / (torch.std(gae) + 1e-6)
            else:
                advantages = gae

            new_loc, new_scale = actor_net(obs)
            dist = Normal(loc=new_loc, scale=new_scale+1e-6)
            new_logprobs = torch.sum(dist.log_prob(actions), dim=-1, keepdim=True)
            # Probability ratio between the current and the behaviour policy.
            rho = torch.exp(new_logprobs - logprobs)
            surrgt1 = rho * advantages
            surrgt2 = rho.clamp(1-epsilon, 1+epsilon) * advantages
            policy_loss = -torch.minimum(surrgt1, surrgt2).mean()

            # One backward pass over the summed loss: the two networks share
            # no parameters, so each optimizer only sees its own gradients.
            loss = policy_loss + 0.5*critic_loss
            actor_optimizer.zero_grad()
            critic_optimizer.zero_grad()
            loss.backward()
            actor_optimizer.step()
            critic_optimizer.step()

### Main Training Loop

In [6]:
# Episode returns of the most recent finished episodes (bounded at 50).
results = deque(maxlen=50)
average_rewards = []  # List to store average rewards per epoch
memsteps = 500  # Environment steps collected between two training phases
mem = RunningMem()
gamma = 0.99
lmbda = 0.95
epsilon = 0.2
batchsize = 64
epoch_repeat = 10
solved = False  # Flag to indicate whether the problem is solved

totreward = np.zeros(nenvs)  # Running return of the current episode, per env
stepcount = 0
epoc = 0
obs, _ = env.reset()

# try/finally guarantees the environment is closed even when the loop is
# interrupted (e.g. KeyboardInterrupt) or an error is raised.
try:
    while True:
        stepcount += 1
        action, action_log_prob = sim_action(actor_net, obs)
        next_obs, reward, terminated, truncated, _ = env.step(action.numpy())
        done = terminated | truncated
        with torch.no_grad():
            values = critic_net(T(obs))
            values_ = critic_net(T(next_obs))
        mem.store(T(obs), action, action_log_prob, T(reward), Ti(done), T(next_obs), values, values_)
        obs = next_obs
        totreward += reward

        # Record and reset the return of every env whose episode just ended.
        for k in np.flatnonzero(done):
            results.append(totreward[k])
            totreward[k] = 0

        if stepcount % memsteps == 0:
            epoc += 1
            train(mem, gamma=gamma, batchsize=batchsize, epoch_repeat=epoch_repeat, epsilon=epsilon, lmbda=lmbda)
            mem.reset()

            # `results` never holds more than 50 entries, so its mean is the
            # average over the last (up to) 50 episodes. With no finished
            # episode yet the average is NaN, without a NumPy warning.
            avg_reward = np.mean(results) if results else float('nan')

            average_rewards.append(avg_reward)  # Store the average reward
            print(f'Epoc: {epoc} Avg Result: {avg_reward}')  # Print every epoch

            # Check if the average of the last 50 episodes is >= 195
            # NOTE(review): confirm 195 is the intended "solved" score for
            # this environment.
            if avg_reward >= 195 and not solved:
                print("*" * 125)
                print(f'Solved! Epoc: {epoc}, Average Score Last 50 Episodes: {avg_reward}')
                print("*" * 125)
                solved = True  # Set the flag to indicate the task is solved
                # Save the model or any additional actions here
                break
finally:
    # Optional: Save the average_rewards for analysis
    # np.save('<path_to_save>/average_rewards.npy', np.array(average_rewards))

    # Close the environment
    env.close()

Epoc: 1 Avg Result: -210.9527172138875
Epoc: 2 Avg Result: -173.79107506392208


KeyboardInterrupt: 

### Visualisations

In [None]:
# Creating the plot

# `average_rewards` holds one entry per training epoch (it is appended to
# once after every training phase), so the x-axis counts epochs, not
# episodes. The variable keeps its original name for compatibility.
episode_numbers = list(range(1, len(average_rewards) + 1))

plt.figure(figsize=(12, 6))

# Plot the rolling average episode reward recorded after each epoch
plt.plot(episode_numbers, average_rewards, label='Average Reward PPO-GAE', color='blue')

# Add a horizontal line representing the solved threshold
solved_score = 195
plt.axhline(y=solved_score, color='green', linestyle='--', label='Solved Threshold (195)')

# Set labels for x and y axes (labels fixed to match what is plotted)
plt.xlabel('Training Epoch')
plt.ylabel('Average Reward (last 50 episodes)')

# Display the plot
plt.title('Average Reward per Training Epoch in LunarLanderContinuous-v2 (PPO-GAE)')
plt.legend()
plt.show()