In [1]:
#importing dependencies
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
!apt-get install swig
!pip install box2d box2d-py

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,565 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 126308 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

In [3]:
!pip install gymnasium



In [2]:
# this is our environment that will train our ppo agent on
import gymnasium as gym

env = gym.make("LunarLander-v3",render_mode="human")
observation, info = env.reset()

episode_over = False
while not episode_over:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)


    episode_over = terminated or truncated

env.close()


print('observation space observation_space',env.observation_space.shape)
print('action space',env.action_space.n)

observation space observation_space (8,)
action space 4


In [3]:
# check for cuda availability
device  =  "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [4]:

# this class define the storage buffer of the environment

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode

    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0

    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward

        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))

    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }

    def get_statistics(self):

        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}

        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.fc(x)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 120),
            nn.ReLU(),
            nn.Linear(120, 120),
            nn.ReLU(),
            nn.Linear(120, 1)
        )

    def forward(self, x):
        return self.fc(x)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
from torch.distributions import Categorical
from torch.utils.data import TensorDataset, DataLoader
from gymnasium.vector import SyncVectorEnv

class PPOAgent:
    def __init__(self, state_dim, action_dim,device="cpu", lr=3e-4, gamma=0.99, eps_clip=0.2, k_epochs=10, entropy_coef=0.01):
        self.policy = PolicyNetwork(state_dim, action_dim).to(device)
        self.old_policy = PolicyNetwork(state_dim, action_dim).to(device)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.value_function = ValueNetwork(state_dim).to(device)

        self.policy_optimizer = optim.AdamW(self.policy.parameters(), lr=lr)
        self.value_optimizer = optim.AdamW(self.value_function.parameters(), lr=lr)

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.entropy_coef = entropy_coef
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).to(device)
        with torch.no_grad():
            action_prob = self.old_policy(state)
        dist = Categorical(action_prob)
        action = dist.sample()
        return action, dist.log_prob(action), dist.entropy()

    def compute_advantages(self, rewards, values, next_value, dones, lambda_gae=0.95):
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * lambda_gae * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
            next_value = values[t]
        advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def update(self, buffer, batch_size=120):
    # Collect episode data
        data = buffer.get_episode_data()
        states = data['states'].clone().detach().float().to(device)
        actions = data['actions'].clone().detach().long().to(device)
        log_probs_old = data['log_probs'].clone().detach().float().to(device)
        rewards = data['rewards']
        dones = data['dones']

        values = self.value_function(states).squeeze().detach()
        next_value = self.value_function(torch.tensor(data['next_state'], dtype=torch.float32).to(device)).item() if data['next_state'] else 0.0
        advantages = self.compute_advantages(rewards, values, next_value, dones)
        targets = advantages + values

        # Create dataset and dataloader
        dataset = TensorDataset(states, actions, log_probs_old, advantages, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        total_loss_actor = 0
        total_loss_critic = 0
        total_entropy = 0
        num_batches = 0

        for _ in range(self.k_epochs):
            for batch_states, batch_actions, batch_log_probs_old, batch_advantages, batch_targets in dataloader:
                action_probs = self.policy(batch_states)
                dist = Categorical(action_probs)
                log_probs = dist.log_prob(batch_actions)

                # we add this entropy to encourage exploration
                entropy = dist.entropy().mean()

                ratios = torch.exp(log_probs - batch_log_probs_old)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                loss_actor = -torch.min(surr1, surr2).mean() - self.entropy_coef * entropy

                values_pred = self.value_function(batch_states).squeeze()
                loss_critic = self.mse_loss(values_pred, batch_targets)

                self.policy_optimizer.zero_grad()
                loss_actor.backward()
                self.policy_optimizer.step()

                self.value_optimizer.zero_grad()
                loss_critic.backward()
                self.value_optimizer.step()

                total_loss_actor += loss_actor.item()
                total_loss_critic += loss_critic.item()
                total_entropy += entropy.item()
                num_batches += 1

        self.old_policy.load_state_dict(self.policy.state_dict())

        return total_loss_actor / num_batches, total_loss_critic / num_batches, total_entropy / num_batches



    def save_model(self, path):
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'value_state_dict': self.value_function.state_dict(),
            'policy_optimizer': self.policy_optimizer.state_dict(),
            'value_optimizer': self.value_optimizer.state_dict()
        }, path)

    def load_model(self, path):

        checkpoint = torch.load(path, map_location=device)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.old_policy.load_state_dict(checkpoint['policy_state_dict'])
        self.value_function.load_state_dict(checkpoint['value_state_dict'])

        if 'policy_optimizer' in checkpoint:
            self.policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        if 'value_optimizer' in checkpoint:
            self.value_optimizer.load_state_dict(checkpoint['value_optimizer'])


def train_ppo(num_episodes=5000, max_time_steps=500,model_name = None):

    # runing multiple environments at once so that our model can learn faster and from multiple environments

    num_envs = 5

    env = SyncVectorEnv([lambda: gym.make("LunarLander-v3") for _ in range(num_envs)])
    state_dim = env.single_observation_space.shape[0]
    action_dim = env.single_action_space.n

    agent = PPOAgent(state_dim, action_dim,device)
    if model_name is not None:
        agent.load_model(model_name)

    buffer = StorageBuffer()
    best_reward = -float('inf')

    for episode in range(num_episodes):
        states, _ = env.reset()
        buffer.reset()
        episode_rewards = np.zeros(num_envs)

        for t in range(max_time_steps):
            actions, log_probs, entropys = agent.select_action(states)
            next_states, rewards, terminateds, truncateds, _ = env.step(actions.tolist())
            dones = terminateds | truncateds
            for i in range(num_envs):
                episode_rewards[i] += rewards[i]
                buffer.add_step(states[i], actions[i], rewards[i], log_probs[i], dones[i], next_states[i] if not dones[i] else None)
            states = next_states
            if all(dones):
                break

        loss_actor, loss_critic, entropy_val = agent.update(buffer,500)

        if episode_rewards.mean() > best_reward:
            best_reward = episode_rewards.mean()
            agent.save_model("ppo_lunarlander3.pth")

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1},AVG Reward: {float(episode_rewards.mean()):.2f}, Best: {float(best_reward):.2f}, Loss Actor: {float(loss_actor):.4f}, Loss Critic: {float(loss_critic):.4f}, Entropy: {float(entropy_val):.4f}")


    env.close()
    # wandb.finish()
    return agent, buffer


def test_model(model_path, num_episodes=10):
    env = gym.make("LunarLander-v3" ,render_mode="human")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPOAgent(state_dim, action_dim,device)
    agent.load_model(model_path)
    buffer = StorageBuffer()

    for episode in range(num_episodes):
        state, _ = env.reset()
        buffer.reset()
        done = False
        total_reward = 0

        while not done:
            action, _, _ = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action.tolist())
            done = terminated or truncated
            total_reward += reward
            state = next_state

        print(f"Test Episode {episode + 1}, Total Reward: {total_reward:.2f}")

    env.close()


In [None]:
if __name__ == "__main__":
    # this is for training and testing the model
    # note that this environment is bit complex for that it will take a lot more training if you have gpu access that can speed up the training
    num_episodes = 10000  # You can change this number
    print(f"Starting training for {num_episodes} episodes...")

    model_name="ppo_lunarlander3.pth"
    # model_name = None

    agent, train_storage = train_ppo(num_episodes=num_episodes,model_name=model_name)

    print("\nTraining completed! Training statistics:")

    train_stats = train_storage.get_statistics()
    print(f"Mean reward: {train_stats['mean_reward']:.2f}")
    print(f"Max reward: {train_stats['max_reward']:.2f}")
    print(f"Mean episode length: {train_stats['mean_length']:.2f}")

    model_path = "ppo_lunarlander3.pth"

    print("\nStarting model testing...")
    test_storage = test_model(model_path, num_episodes=5)


Starting training for 10000 episodes...


  'dones': torch.tensor(self.dones),


Episode 10,AVG Reward: -40.01, Best: 33.69, Loss Actor: -0.0077, Loss Critic: 0.8539, Entropy: 0.5286
Episode 20,AVG Reward: -83.77, Best: 33.69, Loss Actor: -0.0089, Loss Critic: 0.8380, Entropy: 0.6228
Episode 30,AVG Reward: -68.32, Best: 33.69, Loss Actor: -0.0091, Loss Critic: 0.9375, Entropy: 0.5776
Episode 40,AVG Reward: -44.27, Best: 33.69, Loss Actor: -0.0134, Loss Critic: 0.8813, Entropy: 0.6671
Episode 50,AVG Reward: 80.01, Best: 80.01, Loss Actor: -0.0097, Loss Critic: 0.9102, Entropy: 0.4941
Episode 60,AVG Reward: 70.59, Best: 106.55, Loss Actor: -0.0095, Loss Critic: 0.8736, Entropy: 0.4214
Episode 70,AVG Reward: 39.38, Best: 106.55, Loss Actor: -0.0103, Loss Critic: 0.8915, Entropy: 0.5117
Episode 80,AVG Reward: -42.75, Best: 106.55, Loss Actor: -0.0091, Loss Critic: 0.8761, Entropy: 0.5553
Episode 90,AVG Reward: -76.25, Best: 106.55, Loss Actor: -0.0093, Loss Critic: 0.8665, Entropy: 0.5180
Episode 100,AVG Reward: -47.08, Best: 106.55, Loss Actor: -0.0088, Loss Critic: 0

KeyboardInterrupt: 

In [6]:
# testing the model with the saved params

model_path = "ppo_lunarlander3.pth"

print("\nStarting model testing...")
test_storage = test_model(model_path, num_episodes=5)



Starting model testing...
Test Episode 1, Total Reward: 256.98
Test Episode 2, Total Reward: 205.33
Test Episode 3, Total Reward: 42.90
Test Episode 4, Total Reward: 238.27
Test Episode 5, Total Reward: -34.29
