In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gymnasium as gym
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install swig first, then gymnasium with its box2d extra (LunarLander is created later in this notebook).
# NOTE(review): versions are not pinned, so a fresh run may pull a different gymnasium release.
!pip install swig 
!pip install gymnasium[box2d]

In [None]:
import random
import sys
from time import time
from collections import deque, defaultdict, namedtuple
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.distributions import Categorical

In [None]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"
device = torch.device(_device_name)



In [None]:
# Create the LunarLander environment and print its action and observation spaces.
env = gym.make('LunarLander-v2')
print(env.action_space)
print("")
print(env.observation_space)

In [None]:
#QNetwork of my DQN implementation
class QNetwork(nn.Module):
    """Fully connected network that maps a state vector to one Q-value per action."""

    def __init__(self, state_size, action_size, seed):
        """Build a three-layer MLP with two hidden layers of 128 units.

        Args:
            state_size: Number of input features.
            action_size: Number of outputs (one Q-value per action).
            seed: Passed to ``torch.manual_seed`` before the layers are created.
        """
        super().__init__()
        # Seeding happens before layer construction, so two networks built with
        # the same seed start from identical weights.
        self.seed = torch.manual_seed(seed)
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the output of the final linear layer (no activation) for input ``x``."""
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)
        

In [None]:
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random mini-batch sampling."""

    def __init__(self, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Args:
            buffer_size: Maximum number of stored transitions; the oldest are dropped first.
            batch_size: Number of transitions returned by ``sample``.
            seed: Seed applied to Python's global ``random`` module.
        """
        self.batch_size = batch_size
        # random.seed() returns None, so self.seed is None; the call is kept for
        # its side effect of seeding the global RNG used by sample().
        self.seed = random.seed(seed)
        self.memory = deque(maxlen=buffer_size)
        self.experience = namedtuple(
            "Experience",
            field_names=["observation", "action", "reward", "next_state", "terminated"],
        )

    def add(self, observation, action, reward, next_state, terminated):
        """Append one transition; a tuple observation/next_state is reduced to its first item."""
        observation = observation[0] if isinstance(observation, tuple) else observation
        next_state = next_state[0] if isinstance(next_state, tuple) else next_state
        self.memory.append(
            self.experience(observation, action, reward, next_state, terminated)
        )

    def sample(self):
        """Draw ``batch_size`` transitions without replacement and return them as tensors.

        Returns:
            Tuple ``(observations, actions, rewards, next_states, terminateds)`` of
            tensors on ``device``. Actions are ``long``; the rest are ``float``.
            Actions, rewards and terminateds are reshaped to a single column.
        """
        batch = random.sample(self.memory, k=self.batch_size)

        def column(field):
            # Stack one field of every sampled transition row-wise.
            return np.vstack([getattr(item, field) for item in batch])

        observations = torch.from_numpy(column("observation")).float().to(device)
        actions = torch.from_numpy(column("action").reshape(-1, 1)).long().to(device)
        rewards = torch.from_numpy(column("reward").reshape(-1, 1)).float().to(device)
        next_states = torch.from_numpy(column("next_state")).float().to(device)
        terminateds = torch.from_numpy(
            column("terminated").astype(np.uint8).reshape(-1, 1)
        ).float().to(device)

        return (observations, actions, rewards, next_states, terminateds)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
BUFFER_SIZE = int(1e5) # Replay memory size
BATCH_SIZE = 64         # Number of experiences to sample from memory
GAMMA = 0.99            # Discount factor
TAU = 1e-3              # Soft update parameter for updating fixed q network
LR = 1e-4               # Q Network learning rate
UPDATE_EVERY = 4        # Run one learning step every UPDATE_EVERY environment steps

class DQNAgent:
    """DQN agent with experience replay and a softly-updated target ("fixed") network.

    Depends on the notebook-level ``QNetwork``, ``ReplayBuffer`` and ``device``.
    """

    def __init__(self, state_size, action_size, seed):
        """Create the online and target networks, the optimizer and the replay memory.

        Args:
            state_size: Length of the observation vector.
            action_size: Number of discrete actions.
            seed: Seed for Python's ``random`` module; also forwarded to the
                networks and the replay buffer.
        """
        self.state_size = state_size
        self.action_size = action_size
        # random.seed() returns None; the call matters only for its side effect.
        self.seed = random.seed(seed)
        # Initialize Q and Fixed Q networks
        self.q_network = QNetwork(state_size, action_size, seed).to(device)
        self.fixed_network = QNetwork(state_size, action_size, seed).to(device)
        # Pass the configured learning rate explicitly. LR was previously defined
        # but never used, so Adam silently ran with its own default instead.
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=LR)
        # Initialise memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
        self.timestep = 0

    def step(self, observation, action, reward, next_state, terminated):
        """Store one transition and learn from a sampled batch every UPDATE_EVERY calls.

        Learning is skipped until the memory holds more than BATCH_SIZE transitions.
        """
        self.memory.add(observation, action, reward, next_state, terminated)
        self.timestep += 1
        if self.timestep % UPDATE_EVERY == 0 and len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample())

    def learn(self, experiences):
        """Run one gradient step on the online network, then soft-update the target network.

        Args:
            experiences: Tuple ``(states, actions, rewards, next_states, terminateds)``
                of tensors as returned by ``ReplayBuffer.sample``.
        """
        states, actions, rewards, next_states, terminateds = experiences

        # Bootstrap from the target network; no gradient flows through the target.
        with torch.no_grad():
            max_action_values = self.fixed_network(next_states).max(1)[0].unsqueeze(1)

        # (1 - terminateds) removes the bootstrap term for terminal transitions.
        Q_target = rewards + (GAMMA * max_action_values * (1 - terminateds))
        Q_expected = self.q_network(states).gather(1, actions)

        loss = F.mse_loss(Q_expected, Q_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update fixed weights
        self.update_fixed_network(self.q_network, self.fixed_network)

    def update_fixed_network(self, q_network, fixed_network):
        """Soft update: target <- TAU * online + (1 - TAU) * target, parameter by parameter."""
        for source_parameters, target_parameters in zip(q_network.parameters(), fixed_network.parameters()):
            target_parameters.data.copy_(TAU * source_parameters.data + (1.0 - TAU) * target_parameters.data)

    def act(self, observation, eps=0.0):
        """Choose an action epsilon-greedily.

        Args:
            observation: Current state as a NumPy array, or a tuple whose first
                item is that array.
            eps: Probability of picking a uniformly random action.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if isinstance(observation, tuple):
            observation = observation[0]
        if random.random() < eps:
            # Draw from the `random` module seeded in __init__, so exploration is
            # reproducible. The numpy global RNG used before was never seeded.
            return random.randrange(self.action_size)

        observation = torch.from_numpy(observation).float().unsqueeze(0).to(device)
        # Evaluate in eval mode without gradients, then restore training mode.
        self.q_network.eval()
        with torch.no_grad():
            action_values = self.q_network(observation)
        self.q_network.train()
        return int(np.argmax(action_values.cpu().data.numpy()))

    def checkpoint(self, filename):
        """Save the online network's ``state_dict`` to ``filename``."""
        torch.save(self.q_network.state_dict(), filename)

In [None]:
# Training-loop settings for the LunarLander runs.
MAX_EPISODES = 3000  # Max number of episodes to play
MAX_STEPS = 1000     # Max steps allowed in a single episode/play
ENV_SOLVED = 200     # Mean score (over the recent-scores window) at which the environment counts as solved
PRINT_EVERY = 100    # How often (in episodes) to print the progress


# Epsilon-greedy exploration schedule for DQN.
EPS_START = 1.0      # Default/starting value of eps
EPS_DECAY = 0.999    # Multiplicative epsilon decay rate
EPS_MIN = 0.01       # Lower bound on epsilon

In [None]:
# Preview how epsilon evolves over MAX_EPISODES multiplicative decay steps
# for several candidate decay rates (floored at EPS_MIN).
EPS_DECAY_RATES = [0.9, 0.99, 0.999, 0.9999]


def _epsilon_schedule(decay_rate):
    """Return the list of epsilon values after each of MAX_EPISODES decay steps."""
    current = EPS_START
    schedule = []
    for _ in range(MAX_EPISODES):
        current = max(current * decay_rate, EPS_MIN)
        schedule.append(current)
    return schedule


plt.figure(figsize=(10,6))
for decay_rate in EPS_DECAY_RATES:
    plt.plot(_epsilon_schedule(decay_rate), label='decay rate: {}'.format(decay_rate))

plt.title('Effect of various decay rates')
plt.legend(loc='best')
plt.xlabel('# of episodes')
plt.ylabel('epsilon')
plt.show()

In [None]:
# Read the network input/output sizes from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print('State size: {}, action size: {}'.format(state_size, action_size))

In [None]:
# Fresh DQN agent for LunarLander with a fixed seed.
dqn_agent = DQNAgent(state_size, action_size, seed=0)

In [None]:
# Train the DQN agent on LunarLander, recording per-episode scores and actions.
dqn_scores = []
# Scores of the most recent 100 episodes, used for the "solved" criterion
dqn_scores_window = deque(maxlen=100)
eps = EPS_START
start = time()
dqn_actions = []
for episode in range(1, MAX_EPISODES + 1):
    # Seed the environment on the first reset only. Re-seeding with the same
    # value on every reset would replay the identical initial state each episode.
    state, _ = env.reset(seed=42 if episode == 1 else None)
    score = 0
    episode_actions = []
    for t in range(MAX_STEPS):
        action = dqn_agent.act(state, eps)
        episode_actions.append(action)

        # note that observation is the state after taking the action
        observation, reward, terminated, truncated, info = env.step(action)

        # Only `terminated` is stored: a time-limit truncation must not zero
        # the bootstrap term in the Q-target.
        dqn_agent.step(state, action, reward, observation, terminated)
        state = observation
        score += reward
        # End the episode on truncation as well, instead of stepping a finished env.
        if terminated or truncated:
            break

        # Epsilon is decayed once per environment step (not per episode).
        eps = max(eps * EPS_DECAY, EPS_MIN)

    dqn_scores_window.append(score)
    dqn_scores.append(score)
    dqn_actions.append(episode_actions)

    # Progress reporting and the solved check run once per episode. They used to
    # sit inside the step loop, where `break` only ended the current episode.
    mean_score = np.mean(dqn_scores_window)
    if episode % PRINT_EVERY == 0:
        print('\r Progress {}/{}, average score:{:.2f}'.format(episode, MAX_EPISODES, mean_score), end="")
    # Require a full window so a few lucky early episodes cannot count as solved.
    if len(dqn_scores_window) == dqn_scores_window.maxlen and mean_score >= ENV_SOLVED:
        print('\rEnvironment solved in {} episodes, average score: {:.2f}'.format(episode, mean_score), end="")
        sys.stdout.flush()
        dqn_agent.checkpoint('solved_200.pth')
        break

end = time()
print('Took {} seconds'.format(end - start))
dqn_time = end-start

In [None]:
# Release the environment used for DQN training.
env.close()


In [None]:
# Actor Network
class Actor(nn.Module):
    """Policy network: maps a state to a probability distribution over discrete actions."""

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        """Build two tanh hidden layers of ``hidden_dim`` units and an action head."""
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.action_head = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        """Return action probabilities (softmax over the last dimension)."""
        hidden = self.fc1(state).tanh()
        hidden = self.fc2(hidden).tanh()
        logits = self.action_head(hidden)
        return logits.softmax(dim=-1)

# Critic Network
class Critic(nn.Module):
    """Value network: maps a state to a single scalar state-value estimate."""

    def __init__(self, state_dim, hidden_dim=256):
        """Build two tanh hidden layers of ``hidden_dim`` units and a one-unit value head."""
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.value_head = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        """Return the value head's output; the last dimension has size 1."""
        hidden = self.fc1(state).tanh()
        hidden = self.fc2(hidden).tanh()
        return self.value_head(hidden)

# Memory for storing experience
class Memory:
    """Rollout storage: parallel lists of states, actions, log-probs, rewards and terminal flags."""

    _FIELDS = ("states", "actions", "logprobs", "rewards", "is_terminals")

    def __init__(self):
        """Start with every list empty."""
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for field in self._FIELDS:
            getattr(self, field).clear()


In [None]:
class PPOAgent:
    """PPO agent with a clipped surrogate objective for discrete action spaces.

    Uses the notebook-level ``Actor`` and ``Critic`` networks. ``policy_old`` is a
    frozen copy of the actor used to collect rollouts; it is synchronised with
    the actor at the end of every ``update``.
    """

    def __init__(self, state_dim, action_dim, lr=0.0003, gamma=0.99, K_epochs=4, eps_clip=0.2):
        """Create the networks and the shared optimizer.

        Args:
            state_dim: Length of the state vector.
            action_dim: Number of discrete actions.
            lr: Learning rate for both the actor and the critic.
            gamma: Discount factor for the returns.
            K_epochs: Optimisation passes over each collected rollout.
            eps_clip: Clipping range for the probability ratio.
        """
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.actor = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim)

        self.optimizer = optim.Adam([
            {'params': self.actor.parameters(), 'lr': lr},
            {'params': self.critic.parameters(), 'lr': lr}
        ])

        self.policy_old = Actor(state_dim, action_dim)
        self.policy_old.load_state_dict(self.actor.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        """Sample an action from the old policy and record it in ``memory``.

        Appends the state tensor, the sampled action and its log-probability to
        ``memory.states``, ``memory.actions`` and ``memory.logprobs``.

        Returns:
            The sampled action as a Python ``int``.
        """
        state = torch.FloatTensor(state)
        with torch.no_grad():
            action_probs = self.policy_old(state)
        dist = Categorical(action_probs)
        action = dist.sample()

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        return action.item()

    def _discounted_returns(self, rewards, is_terminals):
        """Return the discounted reward-to-go per step, restarting at each terminal flag."""
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (self.gamma * running)
            returns.append(running)
        # Built back-to-front with append + reverse to avoid quadratic insert(0, ...).
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)
        # Normalise only when a standard deviation exists: std() of a single
        # element is NaN and would propagate into every network weight.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-5)
        return returns

    def update(self, memory):
        """Optimise the actor and critic for ``K_epochs`` on the rollout in ``memory``.

        Does nothing when ``memory`` holds no transitions. The caller is expected
        to clear ``memory`` afterwards.
        """
        if not memory.states:
            return

        # Convert the rollout lists to tensors
        old_states = torch.stack(memory.states).detach()
        old_actions = torch.stack(memory.actions).detach()
        old_logprobs = torch.stack(memory.logprobs).detach()

        returns = self._discounted_returns(memory.rewards, memory.is_terminals)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            dist = Categorical(self.actor(old_states))
            logprobs = dist.log_prob(old_actions)
            # squeeze(-1) drops only the value dimension, so a batch of one keeps its shape.
            state_values = self.critic(old_states).squeeze(-1)

            # Probability ratio between the current and the rollout policy
            ratios = torch.exp(logprobs - old_logprobs)

            # Advantage estimate: return minus the critic's (detached) value
            advantages = returns - state_values.detach()

            # Clipped surrogate objective
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

            policy_loss = -torch.min(surr1, surr2).mean()
            value_loss = self.MseLoss(state_values, returns)
            entropy_bonus = dist.entropy().mean()
            loss = policy_loss + 0.5 * value_loss - 0.01 * entropy_bonus

            # Take gradient step
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.actor.state_dict())

In [None]:
# Fresh LunarLander environment for the PPO run (the previous env was closed above).
env = gym.make('LunarLander-v2')

# Get the size of the state and action spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Instantiate the PPO agent with its default hyperparameters
ppo = PPOAgent(state_dim, action_dim)

In [None]:
# Parameters
max_episodes = 3000  # Number of episodes to train
max_timesteps = 1000  # Max timesteps per episode
update_timestep = 1024  # Timesteps after which to update the policy
timestep = 0

# To track scores
ppo_scores = []  # Store total reward per episode
scores_window_ppo = deque(maxlen=100)  # Last 100 episode scores for moving average
ppo_actions = []
memory = Memory()
# Run training loop
start = time()
for episode in range(1, max_episodes + 1):
    state, _ = env.reset()
    total_reward = 0
    episode_actions = []
    for t in range(max_timesteps):
        timestep += 1
        action = ppo.select_action(state, memory)
        episode_actions.append(action)
        state, reward, terminated, truncated, _ = env.step(action)

        # Flag every episode boundary (termination, truncation, or the step cap).
        # Otherwise the discounted return of this episode's last steps would
        # absorb rewards from the next episode stored in the same rollout.
        episode_over = terminated or truncated or t == max_timesteps - 1
        memory.rewards.append(reward)
        memory.is_terminals.append(episode_over)
        total_reward += reward

        # Update PPO after reaching update timestep
        if timestep % update_timestep == 0:
            ppo.update(memory)
            memory.clear()
            timestep = 0

        if episode_over:
            break

    # Store the score for the current episode
    ppo_scores.append(total_reward)
    scores_window_ppo.append(total_reward)  # Save the most recent score
    ppo_actions.append(episode_actions)

    # Evaluate the moving average every episode, so "solved" is detected when it
    # happens rather than only at multiples of PRINT_EVERY.
    mean_score = np.mean(scores_window_ppo)
    if episode % PRINT_EVERY == 0:
        print('\r Progress {}/{}, average score:{:.2f}'.format(episode, max_episodes, mean_score), end="")
    if len(scores_window_ppo) == scores_window_ppo.maxlen and mean_score >= ENV_SOLVED:
        print('\rEnvironment solved in {} episodes, average score: {:.2f}'.format(episode, mean_score), end="")
        sys.stdout.flush()
        break

end = time()
print('Took {} seconds'.format(end - start))
ppo_time = end-start

env.close()

In [None]:
# Raw DQN episode scores with their 100-episode rolling mean; saved as a PNG.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(dqn_scores)
ax.plot(pd.Series(dqn_scores).rolling(100).mean())
ax.set_title('DQN Training for Lunar Landing')
ax.set_xlabel('# of episodes')
ax.set_ylabel('score')
fig.savefig('dqn_lunar.png', dpi=300)

plt.show()



In [None]:
# Raw PPO episode scores with their 100-episode rolling mean; saved as a PNG.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(ppo_scores)
ax.plot(pd.Series(ppo_scores).rolling(100).mean())
ax.set_title('PPO Training for Lunar Landing')
ax.set_xlabel('# of episodes')
ax.set_ylabel('score')
fig.savefig('ppo_lunar.png', dpi=300)

plt.show()

In [None]:
def plot_rewards(dqn_rewards, ppo_rewards):
    """Plot both per-episode reward curves on the current pyplot figure and show it."""
    labelled_curves = ((dqn_rewards, 'DQN'), (ppo_rewards, 'PPO'))
    for curve, algorithm in labelled_curves:
        plt.plot(curve, label=algorithm)
    plt.xlabel('Episodes')
    plt.ylabel('Cumulative Reward')
    plt.title('Reward Comparison: DQN vs PPO')
    plt.legend()
    plt.show()

# Compare the raw per-episode scores of both agents.
plot_rewards(dqn_scores, ppo_scores)

In [None]:
def plot_reward_variance(dqn_rewards, ppo_rewards, window=100):
    """Plot the trailing-window variance of both reward series and show the figure.

    Args:
        dqn_rewards: Sequence of per-episode rewards for DQN.
        ppo_rewards: Sequence of per-episode rewards for PPO.
        window: Number of most recent episodes (including the current one) in
            each variance estimate; early episodes use as many as are available.
    """
    def trailing_variance(rewards):
        # `i - window + 1` makes the slice hold exactly `window` episodes; the
        # previous `i - window` start included window + 1 of them.
        return [np.var(rewards[max(0, i - window + 1):i + 1]) for i in range(len(rewards))]

    dqn_variance = trailing_variance(dqn_rewards)
    ppo_variance = trailing_variance(ppo_rewards)

    plt.plot(dqn_variance, label='DQN Reward Variance')
    plt.plot(ppo_variance, label='PPO Reward Variance')
    plt.xlabel('Episodes')
    plt.ylabel('Variance in Rewards')
    plt.title('Reward Variance: DQN vs PPO')
    plt.legend()
    plt.show()

# Compare reward variance of both agents with the default window.
plot_reward_variance(dqn_scores, ppo_scores)

In [None]:

# 100-episode rolling means of both agents on one figure, with a text label
# placed near the end of each curve.
ppo_rolling_mean = pd.Series(ppo_scores).rolling(100).mean()
dqn_rolling_mean = pd.Series(dqn_scores).rolling(100).mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(ppo_rolling_mean, label="ppo")
ax.plot(dqn_rolling_mean, label="dqn")

ax.set_title('DQN vs PPO scores averaged per 100 episodes for Lunar Landing')
ax.set_xlabel('# of episodes')
ax.set_ylabel('score')

# Label anchor: x is the number of episodes, y is the final rolling-mean value.
ppo_label_x, ppo_label_y = len(ppo_scores), ppo_rolling_mean.iloc[-1]
dqn_label_x, dqn_label_y = len(dqn_scores), dqn_rolling_mean.iloc[-1]

# Shift the labels 50 episodes left of the curve ends.
ax.text(ppo_label_x - 50, ppo_label_y, 'PPO', color='blue', fontsize=12, fontweight='bold')
ax.text(dqn_label_x - 50, dqn_label_y, 'DQN', color='orange', fontsize=12, fontweight='bold')

ax.legend()
plt.show()



In [None]:
# Flatten the per-episode action lists into one list of actions per agent.
dqn_flattened_actions = [action for episode in dqn_actions for action in episode]
ppo_flattened_actions = [action for episode in ppo_actions for action in episode]

def plot_action_distribution(dqn_actions, ppo_actions):
    """Draw side-by-side bars of how often each agent chose actions 0-3 and show the figure.

    Both arguments are flat lists of action indices; counts use ``list.count``.
    """
    action_ids = range(4)  # Assuming 4 actions (0-3)
    bar_width = 0.4
    dqn_action_counts = list(map(dqn_actions.count, action_ids))
    ppo_action_counts = list(map(ppo_actions.count, action_ids))

    # PPO bars are shifted right by one bar width so the pairs sit next to each other.
    shifted_positions = [action_id + bar_width for action_id in action_ids]
    plt.bar(action_ids, dqn_action_counts, alpha=0.6, label='DQN', width=bar_width)
    plt.bar(shifted_positions, ppo_action_counts, alpha=0.6, label='PPO', width=bar_width)
    plt.xlabel('Actions')
    plt.ylabel('Frequency')
    plt.title('Action Distribution: DQN vs PPO')
    plt.legend()
    plt.show()

# Compare how often each agent picked each action over the whole run.
plot_action_distribution(dqn_flattened_actions, ppo_flattened_actions)


In [None]:
# Report the wall-clock training time with the number of episodes actually run.
# A hard-coded "3000" would be wrong whenever training stops early on "solved".
print(f"Time taken by DQN for {len(dqn_scores)} episodes : {dqn_time}")
print(f"Time taken by PPO for {len(ppo_scores)} episodes : {ppo_time}")


In [None]:
# Close the current env before switching environments (the PPO cell above already calls env.close()).
env.close()

In [None]:
# New environment: rebind `env` to CartPole for the second DQN experiment.

env = gym.make("CartPole-v1")

In [None]:
# Training-loop settings for the CartPole runs (these rebind the LunarLander values).
MAX_EPISODES = 1000  # Max number of episodes to play
MAX_STEPS = 1000     # Max steps allowed in a single episode/play
ENV_SOLVED = 195     # Mean score (over the recent-scores window) at which the environment counts as solved
PRINT_EVERY = 100    # How often (in episodes) to print the progress


# Epsilon-greedy exploration schedule for DQN.
EPS_START = 1.0      # Default/starting value of eps
EPS_DECAY = 0.999    # Multiplicative epsilon decay rate
EPS_MIN = 0.01       # Lower bound on epsilon

In [None]:
# Read the network input/output sizes from the CartPole spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print('State size for CartPole : {}, action size: {}'.format(state_size, action_size))

# Fresh DQN agent sized for CartPole (replaces the LunarLander agent).
dqn_agent = DQNAgent(state_size, action_size, seed=0)

In [None]:
#DQN for CartPole

# Train the DQN agent on CartPole, recording per-episode scores and actions.
dqn_scores = []
# Scores of the most recent 100 episodes, used for the "solved" criterion
dqn_scores_window = deque(maxlen=100)
eps = EPS_START
start = time()
dqn_actions = []
for episode in range(1, MAX_EPISODES + 1):
    # Seed the environment on the first reset only. Re-seeding with the same
    # value on every reset would replay the identical initial state each episode.
    state, _ = env.reset(seed=42 if episode == 1 else None)
    score = 0
    episode_actions = []
    for t in range(MAX_STEPS):
        action = dqn_agent.act(state, eps)
        episode_actions.append(action)

        # note that observation is the state after taking the action
        observation, reward, terminated, truncated, info = env.step(action)

        # Only `terminated` is stored: a time-limit truncation must not zero
        # the bootstrap term in the Q-target.
        dqn_agent.step(state, action, reward, observation, terminated)
        state = observation
        score += reward
        # End the episode on truncation as well, instead of stepping a finished env.
        if terminated or truncated:
            break

        # Epsilon is decayed once per environment step (not per episode).
        eps = max(eps * EPS_DECAY, EPS_MIN)

    dqn_scores_window.append(score)
    dqn_scores.append(score)
    dqn_actions.append(episode_actions)

    # Progress reporting and the solved check run once per episode. They used to
    # sit inside the step loop, where `break` only ended the current episode.
    mean_score = np.mean(dqn_scores_window)
    if episode % PRINT_EVERY == 0:
        print('\r Progress {}/{}, average score:{:.2f}'.format(episode, MAX_EPISODES, mean_score), end="")
    # Require a full window so a few lucky early episodes cannot count as solved.
    if len(dqn_scores_window) == dqn_scores_window.maxlen and mean_score >= ENV_SOLVED:
        print('\rEnvironment solved in {} episodes, average score: {:.2f}'.format(episode, mean_score), end="")
        sys.stdout.flush()
        # Separate file name so the LunarLander checkpoint ('solved_200.pth') is not overwritten.
        dqn_agent.checkpoint('cartpole_solved_195.pth')
        break

end = time()
print('Took {} seconds'.format(end - start))
dqn_time = end-start

In [None]:
# Release the CartPole environment used for DQN training.
env.close()

In [None]:
# Fresh CartPole environment for the PPO run (the previous env was closed above).
env = gym.make('CartPole-v1')

# Get the size of the state and action spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

# Instantiate a new PPO agent with its default hyperparameters
ppo = PPOAgent(state_dim, action_dim)

In [None]:
# Parameters
max_episodes = 1000  # Number of episodes to train
max_timesteps = 300  # Max timesteps per episode
update_timestep = 500  # Timesteps after which to update the policy
timestep = 0

# To track scores
ppo_scores = []  # Store total reward per episode
scores_window_ppo = deque(maxlen=100)  # Last 100 episode scores for moving average
ppo_actions = []
memory = Memory()
# Run training loop
start = time()
for episode in range(1, max_episodes + 1):
    state, _ = env.reset()
    total_reward = 0
    episode_actions = []
    for t in range(max_timesteps):
        timestep += 1
        action = ppo.select_action(state, memory)
        episode_actions.append(action)
        state, reward, terminated, truncated, _ = env.step(action)

        # Flag every episode boundary (termination, truncation, or the step cap).
        # Otherwise the discounted return of this episode's last steps would
        # absorb rewards from the next episode stored in the same rollout.
        episode_over = terminated or truncated or t == max_timesteps - 1
        memory.rewards.append(reward)
        memory.is_terminals.append(episode_over)
        total_reward += reward

        # Update PPO after reaching update timestep
        if timestep % update_timestep == 0:
            ppo.update(memory)
            memory.clear()
            timestep = 0

        if episode_over:
            break

    # Store the score for the current episode
    ppo_scores.append(total_reward)
    scores_window_ppo.append(total_reward)  # Save the most recent score
    ppo_actions.append(episode_actions)

    # Evaluate the moving average every episode, so "solved" is detected when it
    # happens rather than only at multiples of PRINT_EVERY.
    mean_score = np.mean(scores_window_ppo)
    if episode % PRINT_EVERY == 0:
        print('\r Progress {}/{}, average score:{:.2f}'.format(episode, max_episodes, mean_score), end="")
    if len(scores_window_ppo) == scores_window_ppo.maxlen and mean_score >= ENV_SOLVED:
        print('\rEnvironment solved in {} episodes, average score: {:.2f}'.format(episode, mean_score), end="")
        sys.stdout.flush()
        break

end = time()
print('Took {} seconds'.format(end - start))
ppo_time = end-start

env.close()

In [None]:
# Raw DQN episode scores on CartPole with their 100-episode rolling mean; saved as a PNG.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(dqn_scores)
ax.plot(pd.Series(dqn_scores).rolling(100).mean())
ax.set_title('DQN Training for CartPole')
ax.set_xlabel('# of episodes')
ax.set_ylabel('score')
fig.savefig('dqn_Cart_Pole.png', dpi=300)

plt.show()



In [None]:
# Raw PPO episode scores on CartPole with their 100-episode rolling mean; saved as a PNG.
fig, ax = plt.subplots(figsize=(10,6))
ax.plot(ppo_scores)
ax.plot(pd.Series(ppo_scores).rolling(100).mean())
ax.set_title('PPO Training for cart pole')
ax.set_xlabel('# of episodes')
ax.set_ylabel('score')
fig.savefig('ppo_cart_pole.png', dpi=300)

plt.show()

In [None]:
def plot_reward_variance(dqn_rewards, ppo_rewards, window=100):
    """Plot the trailing-window variance of both reward series and show the figure.

    Args:
        dqn_rewards: Sequence of per-episode rewards for DQN.
        ppo_rewards: Sequence of per-episode rewards for PPO.
        window: Number of most recent episodes (including the current one) in
            each variance estimate; early episodes use as many as are available.
    """
    def trailing_variance(rewards):
        # `i - window + 1` makes the slice hold exactly `window` episodes; the
        # previous `i - window` start included window + 1 of them.
        return [np.var(rewards[max(0, i - window + 1):i + 1]) for i in range(len(rewards))]

    dqn_variance = trailing_variance(dqn_rewards)
    ppo_variance = trailing_variance(ppo_rewards)

    plt.plot(dqn_variance, label='DQN Reward Variance')
    plt.plot(ppo_variance, label='PPO Reward Variance')
    plt.xlabel('Episodes')
    plt.ylabel('Variance in Rewards')
    plt.title('Reward Variance: DQN vs PPO')
    plt.legend()
    plt.show()

# Compare reward variance of both agents on CartPole with the default window.
plot_reward_variance(dqn_scores, ppo_scores)

In [None]:
# Flatten the per-episode action lists into one list of actions per agent.
dqn_flattened_actions = [action for episode in dqn_actions for action in episode]
ppo_flattened_actions = [action for episode in ppo_actions for action in episode]

def plot_action_distribution(dqn_actions, ppo_actions):
    """Draw side-by-side bars of how often each agent chose actions 0 and 1 and show the figure.

    Both arguments are flat lists of action indices; counts use ``list.count``.
    """
    action_ids = range(2)  # CartPole has 2 actions (0 and 1)
    bar_width = 0.4
    dqn_action_counts = list(map(dqn_actions.count, action_ids))
    ppo_action_counts = list(map(ppo_actions.count, action_ids))

    # PPO bars are shifted right by one bar width so the pairs sit next to each other.
    shifted_positions = [action_id + bar_width for action_id in action_ids]
    plt.bar(action_ids, dqn_action_counts, alpha=0.6, label='DQN', width=bar_width, align='center')
    plt.bar(shifted_positions, ppo_action_counts, alpha=0.6, label='PPO', width=bar_width, align='center')
    plt.xlabel('Actions (0: Left, 1: Right)')
    plt.ylabel('Frequency')
    plt.title('Action Distribution: DQN vs PPO on CartPole')
    # Ticks sit midway between each DQN/PPO bar pair.
    plt.xticks([0.2, 1.2], ['Left (0)', 'Right (1)'])
    plt.legend()
    plt.show()

# Compare how often each agent picked each CartPole action over the whole run.
plot_action_distribution(dqn_flattened_actions, ppo_flattened_actions)


In [None]:
# Wall-clock training time of each agent on CartPole, as measured by the two training cells.
print(f"Time taken by DQN for solving env : {dqn_time}")
print(f"Time taken by PPO for solving env : {ppo_time}")


# Environment - Lunar Landing V2

**DQN vs PPO training performance** 

DQN :

* DQN shows significant instability before episode 1000. However, after that, the performance stabilizes, and DQN achieves rewards above 200 after ~1200 episodes.
* By the end of training, DQN comes close to solving the environment.

PPO : 

* PPO demonstrates more consistent learning in the early episodes, achieving steady improvement up until around 1000 episodes.
* Despite reaching a decent level of performance, PPO does not reach the same level of reward as DQN by the end of training.

**Reward Variance**

* DQN's rewards have a much higher variance than PPO's, so its training is less stable. PPO's low variance suggests a more stable learning process.

**Average scores over time**

* DQN has better long-term performance, though PPO is better early on in training.

**Action Distribution**

* DQN shows a preference for taking action 2, while PPO distributes more evenly.
* DQN is more exploitative of certain actions, while PPO maintains broader exploration

**Conclusion**
* DQN achieves better final performance, reaching higher rewards, but with much higher variance and volatility. It is less sample-efficient early in training but surpasses PPO after enough episodes.
* PPO is more sample-efficient early, stabilizes faster, and its reward variance is much lower. However, it converges to a lower score compared to DQN overall. 
* PPO is much more sensitive to hyperparameters, so further hyperparameter tuning is needed.



# Environment - Cart Pole

**DQN vs PPO training performance** 

* DQN starts slowly, with low rewards for the first 400-500 episodes. Performance sharply improves around episode 600, nearing a score of 200 by episode 1000. However, performance dips toward the end, stabilizing just below 200.
* PPO learns very quickly, solving the environment in about 300 episodes.

**Reward Variance**

* PPO demonstrates better consistency with much lower variance in rewards, while DQN exhibits large fluctuations throughout training, reflecting its instability.

**Action Distribution**

* Both seem to have an equal split in their choice of actions though DQN does favour action 1 slightly more.

**Conclusion**
* PPO outperforms DQN in the CartPole environment in terms of both sample efficiency and performance efficiency, solving the environment faster and with greater stability.


***Final*** -

My implementation of DQN uses a fully connected neural network instead of the CNN suggested in the paper, and is based on this implementation:

https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

My PPO implementation was inspired by several sites and by my own attempt at interpreting the algorithm. The results clearly demonstrate a need for more experimentation with the PPO agent's hyperparameters, given how sensitive it is to them.

Overall, the PPO agent was consistently faster in wall-clock time, completing the same number of episodes as DQN in less time.

