In [1]:
!apt-get install swig
!pip install box2d box2d-py

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 21 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,407 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 124926 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

In [2]:
pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2351229 sha256=37450f9bef3ae7173773be85d9b2d0397b5aa2420c3cf59f8d348c7cc834e9cc
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daa

In [2]:
#importing dependencies
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# this class define the storage buffer of the environment

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode

    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0

    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward

        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))

    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }

    def get_statistics(self):

        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}

        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks
# i used small model since it simple game but you can make it bigger if you want

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.fc(x)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.fc(x)


In [3]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()

        # Actor (Mean and Std Dev for continuous actions)
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
             nn.Tanh(),
            nn.Linear(64, action_dim)  # Outputs mean of action
        )

        self.log_std = nn.Parameter(torch.zeros(action_dim))  # Learnable log standard deviation

        # Critic (State Value)
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Single value output
        )

    def forward(self, state):
        mean = self.actor(state)
        std = self.log_std.exp()  # Convert log_std to std
        value = self.critic(state)
        return mean, std, value


In [4]:
from torch.distributions import Normal
class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, eps_clip=0.2, k_epochs=10):
        self.policy = ActorCritic(state_dim, action_dim)
        self.old_policy = ActorCritic(state_dim, action_dim)
        self.old_policy.load_state_dict(self.policy.state_dict())

        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            mean, std, _ = self.old_policy(state)

        dist = Normal(mean, std)
        action = dist.sample()
        action_log_prob = dist.log_prob(action).sum(dim=-1)  # Sum for multi-dimensional actions
        return action.numpy()[0], action_log_prob.numpy()[0]

    def compute_advantages(self, rewards, values, next_value, dones, lambda_gae=0.95):
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * lambda_gae * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
            next_value = values[t]
        advantages = torch.tensor(advantages, dtype=torch.float32)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def save_model(self, filepath):
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }, filepath)

    def load_model(self, filepath):
        checkpoint = torch.load(filepath)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.old_policy.load_state_dict(self.policy.state_dict())


    def update(self, buffer, batch_size=64):
        data = buffer.get_episode_data()
        states = torch.tensor(data['states'], dtype=torch.float32)
        actions = torch.tensor(data['actions'], dtype=torch.float32)
        log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)
        rewards = data['rewards']
        dones = data['dones']

        values = self.policy.critic(states).squeeze().detach()
        next_value = self.policy.critic(torch.tensor(data['next_state'], dtype=torch.float32)).item() if data['next_state'] else 0.0
        advantages = self.compute_advantages(rewards, values, next_value, dones)
        targets = advantages + values

        dataset = torch.utils.data.TensorDataset(states, actions, log_probs_old, advantages, targets)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for _ in range(self.k_epochs):
            for batch_states, batch_actions, batch_log_probs_old, batch_advantages, batch_targets in dataloader:

                mean, std, values_pred = self.policy(batch_states)
                dist = Normal(mean, std)
                log_probs = dist.log_prob(batch_actions).sum(dim=-1)

                ratios = torch.exp(log_probs - batch_log_probs_old)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                loss_actor = -torch.min(surr1, surr2).mean()

                loss_critic = self.mse_loss(values_pred.squeeze(), batch_targets)

                loss = loss_actor + 0.5 * loss_critic

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        self.old_policy.load_state_dict(self.policy.state_dict())




In [7]:
import statistics
def train_ppo(env_name="LunarLanderContinuous-v2", num_episodes=1000, max_timesteps=2000):
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPOAgent(state_dim, action_dim)
    buffer = StorageBuffer()
    best_reward = -float('inf')

    mean_raward = []
    for episode in range(num_episodes):
        state, _ = env.reset()
        buffer.reset()
        episode_reward = 0

        for t in range(max_timesteps):
            action, log_prob = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward

            buffer.add_step(state, action, reward, log_prob, done, next_state if not done else None)
            state = next_state
            if done:
                break
        mean_raward.append(episode_reward)

        agent.update(buffer)
        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save_model("lunar_lander_continuous.pth")

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}, mean raward: {statistics.mean(mean_raward)}, Best: {best_reward:.2f}")
            mean_raward = []

    env.close()
    return agent


def test_model(model_path, env_name="LunarLanderContinuous-v2", num_episodes=10):
    env = gym.make(env_name, render_mode="human")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPOAgent(state_dim, action_dim)
    agent.load_model(model_path)

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, _ = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            state = next_state

        print(f"Test Episode {episode + 1}, Total Reward: {total_reward:.2f}")

    env.close()


In [25]:
train_ppo()

  states = torch.tensor(data['states'], dtype=torch.float32)
  actions = torch.tensor(data['actions'], dtype=torch.float32)
  log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)


Episode 10, mean raward: -183.58017089277988, Best: -58.78
Episode 20, mean raward: -351.9370443369463, Best: -58.78
Episode 30, mean raward: -177.95139493568928, Best: -56.87
Episode 40, mean raward: -176.55515385063444, Best: -23.65
Episode 50, mean raward: -135.39923742339832, Best: 43.18
Episode 60, mean raward: -406.1119892217437, Best: 43.18
Episode 70, mean raward: -525.9344330712152, Best: 43.18
Episode 80, mean raward: -597.6216824474468, Best: 43.18
Episode 90, mean raward: -83.53261962996963, Best: 43.18
Episode 100, mean raward: -67.01166763324015, Best: 43.18
Episode 110, mean raward: -9.84739522050436, Best: 46.97
Episode 120, mean raward: -18.632398736876596, Best: 84.00
Episode 130, mean raward: 70.75937320406683, Best: 167.41
Episode 140, mean raward: 94.64526832527706, Best: 175.06
Episode 150, mean raward: 142.01454435530368, Best: 196.29
Episode 160, mean raward: 7.780285618314126, Best: 196.29
Episode 170, mean raward: 39.39317996904693, Best: 196.29
Episode 180, m

<__main__.PPOAgent at 0x7b87c4a971d0>

In [8]:
model_path  = "lunar_lander_continuous.pth"
test_model(model_path)

  checkpoint = torch.load(filepath)


Test Episode 1, Total Reward: 54.34
Test Episode 2, Total Reward: -0.86
Test Episode 3, Total Reward: 26.17
Test Episode 4, Total Reward: 38.09
Test Episode 5, Total Reward: 8.93
Test Episode 6, Total Reward: 107.00
Test Episode 7, Total Reward: 246.87
Test Episode 8, Total Reward: 121.78
Test Episode 9, Total Reward: 88.87
Test Episode 10, Total Reward: 232.88
