In [4]:
!apt-get install swig
!pip install box2d box2d-py

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
swig is already the newest version (4.0.2-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.


In [None]:
pip install gymnasium[box2d]

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting swig==4.* (from gymnasium[box2d])
  Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.0-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp311-cp311-linux_x86_64.whl size=2351226 sha256=1bfc5b5ce94a0104732ba94c74ced1f747a916c57d975efd1d150c4af923fd61
  Stored in directory: /root/.cache/pip/wheels/ab/f1/0c/d56f4a2bdd12bae0a0693ec33f2f0daa

In [7]:
pip install wandb



In [1]:
#importing dependencies
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# this class define the storage buffer of the environment

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode

    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0

    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward

        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))

    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }

    def get_statistics(self):

        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}

        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks
# i used small model since it simple game but you can make it bigger if you want

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
        )
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        mean =   self.fc(x)
        std = self.log_std.exp()
        return mean,std

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.fc(x)


In [14]:
import wandb
# wandb.login()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohamedrxo4[0m ([33mmohamedrxo4-netflix[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [38]:
wandb.finish()

0,1
episode_reward,▅▆▇█▇▄▅▄▆▇▇▆▆▆▅▅▃▃▃▅▅▅▆▁▄▄▂█▅▆▇▅▇▅▇▅▅▃▇▇

0,1
episode_reward,-47.98077


In [39]:
run = wandb.init(
    project="PPO-LunarLande_continous",

)

In [2]:
from torch.distributions import Normal
class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, eps_clip=0.2, k_epochs=10):

        self.policy = PolicyNetwork(state_dim, action_dim)
        self.old_policy = PolicyNetwork(state_dim, action_dim)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.value_net = ValueNetwork(state_dim)

        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=lr)

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            mean, std = self.old_policy(state)

        dist = Normal(mean, std)
        action = dist.sample()
        action_log_prob = dist.log_prob(action).sum(dim=-1)  # Sum for multi-dimensional actions
        return action.numpy()[0], action_log_prob.numpy()[0]

    def compute_advantages(self, rewards, values, next_value, dones, lambda_gae=0.95):
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * lambda_gae * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
            next_value = values[t]
        advantages = torch.tensor(advantages, dtype=torch.float32)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    # def save_model(self, filepath):
    #     torch.save({
    #         'policy_state_dict': self.policy.state_dict(),
    #         'value_state_dict': self.value_net.state_dict()
    #     }, filepath)

    # def load_model(self, filepath):
    #     checkpoint = torch.load(filepath)
    #     self.policy.load_state_dict(checkpoint['policy_state_dict'])
    #     self.value_net.load_state_dict(checkpoint['value_state_dict'])

    def save_model(self, filepath):
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'old_policy_state_dict': self.old_policy.state_dict(),
            'value_state_dict': self.value_net.state_dict(),
            'policy_optimizer_state_dict': self.policy_optimizer.state_dict(),
            'value_optimizer_state_dict': self.value_optimizer.state_dict()
        }, filepath)

    def load_model(self, filepath):
        checkpoint = torch.load(filepath)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.old_policy.load_state_dict(checkpoint['old_policy_state_dict'])
        self.value_net.load_state_dict(checkpoint['value_state_dict'])
        self.policy_optimizer.load_state_dict(checkpoint['policy_optimizer_state_dict'])
        self.value_optimizer.load_state_dict(checkpoint['value_optimizer_state_dict'])

        # Ensure all models are in eval mode for testing
        self.policy.eval()
        self.old_policy.eval()
        self.value_net.eval()

    def update(self, buffer, batch_size=64):
        data = buffer.get_episode_data()
        states = torch.tensor(data['states'], dtype=torch.float32)
        actions = torch.tensor(data['actions'], dtype=torch.float32)
        log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)
        rewards = data['rewards']
        dones = data['dones']

        values = self.value_net(states).squeeze().detach()
        next_value = self.value_net(torch.tensor(data['next_state'], dtype=torch.float32)).item() if data['next_state'] else 0.0
        advantages = self.compute_advantages(rewards, values, next_value, dones)
        targets = advantages + values

        dataset = torch.utils.data.TensorDataset(states, actions, log_probs_old, advantages, targets)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

        for _ in range(self.k_epochs):
            for batch_states, batch_actions, batch_log_probs_old, batch_advantages, batch_targets in dataloader:

                mean, std = self.policy(batch_states)
                values_pred  =  self.value_net(batch_states)
                dist = Normal(mean, std)
                log_probs = dist.log_prob(batch_actions).sum(dim=-1)

                ratios = torch.exp(log_probs - batch_log_probs_old)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                loss_actor = -torch.min(surr1, surr2).mean()

                loss_critic = self.mse_loss(values_pred.squeeze(), batch_targets)

                self.policy_optimizer.zero_grad()
                loss_actor.backward(retain_graph=True)
                self.policy_optimizer.step()

                self.value_optimizer.zero_grad()
                loss_critic.backward()
                self.value_optimizer.step()

        self.old_policy.load_state_dict(self.policy.state_dict())


In [5]:
import statistics
def train_ppo(env_name="LunarLanderContinuous-v2", num_episodes=1000, max_timesteps=2000):
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPOAgent(state_dim, action_dim)
    buffer = StorageBuffer()
    best_reward = -float('inf')

    mean_raward = []
    for episode in range(num_episodes):
        state, _ = env.reset()
        buffer.reset()
        episode_reward = 0

        for t in range(max_timesteps):
            action, log_prob = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward

            buffer.add_step(state, action, reward, log_prob, done, next_state if not done else None)
            state = next_state
            if done:
                break
        mean_raward.append(episode_reward)
        wandb.log({"episode_reward":episode_reward})

        agent.update(buffer)
        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save_model("lunar_lander_continuous2.pth")

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}, mean raward: {statistics.mean(mean_raward)}, Best: {best_reward:.2f}")
            mean_raward = []

    env.close()
    return agent


def test_model(model_path, env_name="LunarLanderContinuous-v2", num_episodes=10):
    env = gym.make(env_name, render_mode="human")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    agent = PPOAgent(state_dim, action_dim)
    agent.load_model(model_path)

    for episode in range(num_episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, _ = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            state = next_state

        print(f"Test Episode {episode + 1}, Total Reward: {total_reward:.2f}")

    env.close()


In [43]:
train_ppo()

  states = torch.tensor(data['states'], dtype=torch.float32)
  actions = torch.tensor(data['actions'], dtype=torch.float32)
  log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)


Episode 10, mean raward: -316.90505711824824, Best: -142.09
Episode 20, mean raward: -202.02425145012722, Best: -72.09
Episode 30, mean raward: -222.1707567527065, Best: -66.79
Episode 40, mean raward: -243.20076068485295, Best: -48.63
Episode 50, mean raward: -173.2686520546552, Best: -13.43
Episode 60, mean raward: -212.9633876995138, Best: -13.43
Episode 70, mean raward: -157.76324666839693, Best: -13.43
Episode 80, mean raward: -91.78177700931286, Best: 27.85
Episode 90, mean raward: -302.2741856110952, Best: 27.85
Episode 100, mean raward: -358.05486627674827, Best: 27.85
Episode 110, mean raward: -400.5408956523221, Best: 27.85
Episode 120, mean raward: -133.89960364438488, Best: 28.66
Episode 130, mean raward: -233.49998975943487, Best: 28.66
Episode 140, mean raward: -147.71557081395295, Best: 30.93
Episode 150, mean raward: -66.35643860595002, Best: 30.93
Episode 160, mean raward: -124.29937165323702, Best: 30.93
Episode 170, mean raward: -133.16533601493427, Best: 30.93
Episo

<__main__.PPOAgent at 0x7f1348b75790>

In [7]:
model_path  = "lunar_lander_continuous2.pth"
test_model(model_path)

  checkpoint = torch.load(filepath)


Test Episode 1, Total Reward: 77.36
Test Episode 2, Total Reward: -68.43
Test Episode 3, Total Reward: -96.00
Test Episode 4, Total Reward: -88.84
Test Episode 5, Total Reward: -92.62
Test Episode 6, Total Reward: -203.78
Test Episode 7, Total Reward: -67.98
Test Episode 8, Total Reward: 72.53
Test Episode 9, Total Reward: -97.86
Test Episode 10, Total Reward: -91.42


False
False
