In [None]:
#importing dependencies
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# this class define the storage buffer of the environment

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode

    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0

    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward


        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))

    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones,dtype=torch.bool),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }

    def get_statistics(self):
        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}

        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks
# i used small model since it simple game but you can make it bigger if you want

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.fc(x)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.fc(x)

In [2]:
pip install swig

Collecting swig
  Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1.post0


In [3]:
pip install "gymnasium[box2d]"

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp312-cp312-linux_x86_64.whl size=2409496 sha256=50489730bf7fc8f014c5b19c0ef371e99a5a8d84bf29778ee00da847f4a163d5
  Stored in directory: /root/.cache/pip/wheels/2a/e9/60/774da0bcd07f7dc7761a8590fa2d065e4069568e78dcdc3318
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.5


In [2]:
# this is our environment that will train our ppo agent on
import gymnasium as gym

env = gym.make("LunarLander-v3", render_mode="human")
observation, info = env.reset()

episode_over = False
while not episode_over:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)

    episode_over = terminated or truncated

env.close()
print('observation space observation_space',env.observation_space.shape[0])
print('action space',env.action_space.n)

observation space observation_space 8
action space 4


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from torch.utils.data import TensorDataset, DataLoader
from gymnasium.vector import SyncVectorEnv
import numpy as np
import torch
import torch.nn as nn

class PPOAgent():
    def __init__(self, state_dim, action_dim, lr=1e-4, gamma=0.99, eps_clip=0.2, k_epochs=10, gae_lambda=0.95, entropy_coef=0.01, value_coef=0.5, device=None):
        # Set device
        if device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.device = torch.device(device)

        self.policy = PolicyNetwork(state_dim, action_dim).to(self.device)
        self.old_policy = PolicyNetwork(state_dim, action_dim).to(self.device)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.value_function = ValueNetwork(state_dim).to(self.device)
        self.policy_optimizer = optim.AdamW(self.policy.parameters(), lr=lr)
        self.value_optimizer = optim.AdamW(self.value_function.parameters(), lr=lr)
        self.gamma = gamma
        self.gae_lambda = gae_lambda
        self.k_epochs = k_epochs
        self.eps_clip = eps_clip
        self.entropy_coef=entropy_coef
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        # this function used to select action by our old policy and returning the log_prob of it
        state = torch.tensor(state, dtype=torch.float32).to(self.device)
        with torch.no_grad():
            action_prob = self.old_policy(state)

        disc = Categorical(action_prob)
        action = disc.sample()

        return action.cpu(), disc.log_prob(action).cpu()

    def compute_advantages(self, rewards, values, next_value, dones):
        # this function calculate the advantage function
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * self.gae_lambda * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
            next_value = values[t]
        advantages = torch.tensor(advantages, dtype=torch.float32).to(self.device)

        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def save_model(self, path):
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'value_state_dict': self.value_function.state_dict(),
        }, path)

    def load_model(self, path):
        checkpoint = torch.load(path, map_location=self.device)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.old_policy.load_state_dict(checkpoint['policy_state_dict'])
        self.value_function.load_state_dict(checkpoint['value_state_dict'])

    def update(self, buffer,batch_size=32):
        data = buffer.get_episode_data()
        states = data['states'].to(self.device)
        actions = data['actions'].to(self.device)
        rewards = data['rewards']
        dones = data['dones']
        log_probs_old = data['log_probs'].to(self.device)

        with torch.no_grad():
            values = self.value_function(states).squeeze()
            if data['next_state'] is not None:
                next_state_tensor = data['next_state'].to(self.device)
                next_values = self.value_function(next_state_tensor).item()
            else:
                next_values = 0.0

        advantages = self.compute_advantages(rewards, values.cpu(), next_values, dones)
        advantages = advantages.to(self.device)
        targets = advantages + values

        # Normalize advantages across the entire batch

        dataset = TensorDataset(states, actions, log_probs_old, advantages, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        policy_losses = []
        value_losses = []
        entropy_losses = []

        for _ in range(self.k_epochs):
            for batch in dataloader:
                batch_states, batch_actions, batch_old_log_probs, batch_advantages, batch_targets = batch

                # Policy loss with entropy bonus
                action_probs = self.policy(batch_states)
                dist = Categorical(action_probs)
                log_probs = dist.log_prob(batch_actions)
                entropy = dist.entropy().mean()

                ratios = torch.exp(log_probs - batch_old_log_probs)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages

                loss_actor = -torch.min(surr1, surr2).mean() - self.entropy_coef * entropy

                # Value loss
                values_pred = self.value_function(batch_states).squeeze()
                loss_critic = self.mse_loss(values_pred, batch_targets)

                self.policy_optimizer.zero_grad()
                loss_actor.backward()
                torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
                self.policy_optimizer.step()

                self.value_optimizer.zero_grad()
                loss_critic.backward()
                torch.nn.utils.clip_grad_norm_(self.value_function.parameters(), 0.5)
                self.value_optimizer.step()

                policy_losses.append(loss_actor.item())
                value_losses.append(loss_critic.item())
                entropy_losses.append(entropy.item())

        self.old_policy.load_state_dict(self.policy.state_dict())

        return {
            'policy_loss': np.mean(policy_losses),
            'value_loss': np.mean(value_losses),
            'entropy': np.mean(entropy_losses),
            'average_value': values.mean().item()
        }

def train_ppo(num_episodes=5000, max_time_steps=200, num_env=5, model_path=None, device=None):
    # this function uses the interact with the environment and passes data to the update function to apply ppo on the model
    import numpy as np
    env = SyncVectorEnv([lambda: gym.make("LunarLander-v3") for _ in range(num_env)])
    state_dim = env.single_observation_space.shape[0]
    action_dim = env.single_action_space.n
    best_reward = -float('inf')

    agent = PPOAgent(state_dim, action_dim, device=device)
    if model_path is not None:
      agent.load_model(model_path)
    reward_progress = []

    print(f"Training on device: {agent.device}")

    for episode in range(num_episodes):
        storages = [StorageBuffer() for _ in range(num_env)]
        states, _ = env.reset()
        states = np.array(states)
        episode_active = [True] * num_env
        episode_rewards = np.zeros(num_env)

        for t in range(max_time_steps):
            actions, old_log_probs = agent.select_action(states)

            next_states, rewards, terminateds, truncateds, _ = env.step(actions.tolist())

            dones = terminateds | truncateds
            for i in range(num_env):
                if episode_active[i]:

                    episode_rewards[i]+=rewards[i]

                    storages[i].add_step(
                        state=states[i],
                        action=actions[i],
                        reward=rewards[i],
                        log_prob=old_log_probs[i],
                        done=dones[i],
                        next_state=next_states[i] if not dones[i] else None
                    )

                    if dones[i]:
                        episode_active[i] = False

            states = next_states
            if all(dones):
                break

        for storage in storages:
            if len(storage.rewards) > 0:
                agent.update(storage,batch_size=300)

        avg_reward = episode_rewards.mean()
        reward_progress.append(avg_reward)

        if avg_reward >= best_reward:
            agent.save_model(model_path)

        if episode_rewards.max().tolist() >= best_reward:
              best_reward = episode_rewards.max().tolist()

        if (episode + 1) % 10 == 0:
          print(f"Episode {episode + 1}, AVG Reward: {avg_reward:.3f}, Best Reward: {best_reward:.3f}")


    env.close()
    return agent, storages, reward_progress


def test_model(model_path, num_episodes=10, device=None):
    env = gym.make("LunarLander-v3", render_mode="human")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPOAgent(state_dim, action_dim, device=device)
    agent.load_model(model_path)

    print(f"Testing on device: {agent.device}")

    for episode in range(num_episodes):
        state, _ = env.reset()
        storage = StorageBuffer()
        done = False

        while not done:
            action, log_prob = agent.select_action(torch.tensor(state))
            next_state, reward, terminated, truncated, _ = env.step(action.item())
            done = terminated or truncated

            storage.add_step(
                state,
                action,
                reward,
                log_prob,
                done,
                next_state
            )
            state = next_state

        stats = storage.get_statistics()
        print(f"Test Episode {episode + 1}, Total Reward: {stats['current_reward']}")

    env.close()
    return storage

In [44]:
import matplotlib.pyplot as plt
if __name__ == "__main__":
    # this is for training and testing the model
    # since this environment only gives reward at the end of the episode we need to give it more training
    num_episodes = 1000  # You can change this number
    model_path = "lunar_lander4.pth"
    print(f"Starting training for {num_episodes} episodes...")
    agent, train_storage,reward_progress = train_ppo(num_episodes=num_episodes,num_env=5,model_path=model_path)

    # print("\nTraining completed! Training statistics:")
    # train_stats = train_storage.get_statistics()
    # print(f"Mean reward: {train_stats['mean_reward']:.2f}")
    # print(f"Max reward: {train_stats['max_reward']:.2f}")
    # print(f"Mean episode length: {train_stats['mean_length']:.2f}")

    plt.plot(reward_progress, label='Episode Reward')
    plt.xlabel('Episode')
    plt.ylabel('Reward')
    plt.title('Reward Progress Over Time')
    plt.legend()
    plt.grid(True)
    plt.show()



Starting training for 1000 episodes...
Training on device: cuda


  'dones': torch.tensor(self.dones,dtype=torch.bool),


Episode 10, AVG Reward: -73.797, Best Reward: 5.042
Episode 20, AVG Reward: -19.064, Best Reward: 130.020
Episode 30, AVG Reward: 0.134, Best Reward: 144.317
Episode 40, AVG Reward: 28.630, Best Reward: 144.317
Episode 50, AVG Reward: -57.312, Best Reward: 144.317
Episode 60, AVG Reward: 9.908, Best Reward: 207.247
Episode 70, AVG Reward: 50.604, Best Reward: 207.247
Episode 80, AVG Reward: 38.036, Best Reward: 207.247
Episode 90, AVG Reward: -12.603, Best Reward: 245.182
Episode 100, AVG Reward: 46.934, Best Reward: 245.182
Episode 110, AVG Reward: -10.207, Best Reward: 245.182
Episode 120, AVG Reward: 66.138, Best Reward: 245.182
Episode 130, AVG Reward: 65.649, Best Reward: 245.182
Episode 140, AVG Reward: -47.814, Best Reward: 245.182
Episode 150, AVG Reward: -3.030, Best Reward: 245.182
Episode 160, AVG Reward: 22.760, Best Reward: 245.182
Episode 170, AVG Reward: 3.099, Best Reward: 245.182
Episode 180, AVG Reward: -25.166, Best Reward: 245.182
Episode 190, AVG Reward: 107.725, B

KeyboardInterrupt: 

In [None]:
print("\nStarting model testing...")
model_path = "lunar_lander4.pth"
test_storage = test_model(model_path, num_episodes=5)

print("\nTesting completed! Test statistics:")
test_stats = test_storage.get_statistics()



Starting model testing...
Testing on device: cpu


  state = torch.tensor(state, dtype=torch.float32).to(self.device)


Test Episode 1, Total Reward: 118.8628495310367
Test Episode 2, Total Reward: -23.99068718786036
Test Episode 3, Total Reward: -52.94602981307331
Test Episode 4, Total Reward: 0.3140470418869654
Test Episode 5, Total Reward: 157.2568481800966

Testing completed! Test statistics:
Mean test reward: 157.26
Max test reward: 157.26
Min test reward: 157.26
