In [None]:
print('hello world')

hello world


In [None]:
!apt-get install swig
!pip install box2d box2d-py


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 20 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,552 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 124926 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

In [None]:
!pip install gymnasium




In [3]:
# this is our environment that will train our ppo agent on
import gymnasium as gym

env = gym.make("LunarLander-v2",render_mode="human")
observation, info = env.reset()

episode_over = False
while not episode_over:
    action = env.action_space.sample()
    observation, reward, terminated, truncated, info = env.step(action)


    episode_over = terminated or truncated

env.close()


print('observation space observation_space',env.observation_space.shape)
print('action space',env.action_space.n)

observation space observation_space (8,)
action space 4


In [4]:
#importing dependencies
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# this class define the storage buffer of the environment

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode

    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0

    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward

        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))

    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }

    def get_statistics(self):

        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}

        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks
# i used small model since it simple game but you can make it bigger if you want

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.fc(x)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.fc(x)

i used  weight and biases for visualizing the reward and losses

In [None]:
pip install wandb




In [None]:
import wandb


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
from torch.distributions import Categorical
from torch.utils.data import TensorDataset, DataLoader

class PPOAgent:
    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, eps_clip=0.2, k_epochs=10, entropy_coef=0.01):
        self.policy = PolicyNetwork(state_dim, action_dim)
        self.old_policy = PolicyNetwork(state_dim, action_dim)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.value_function = ValueNetwork(state_dim)

        self.policy_optimizer = optim.AdamW(self.policy.parameters(), lr=lr)
        self.value_optimizer = optim.AdamW(self.value_function.parameters(), lr=lr)

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.entropy_coef = entropy_coef
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action_prob = self.old_policy(state)
        dist = Categorical(action_prob)
        action = dist.sample()
        return action.item(), dist.log_prob(action), dist.entropy()

    def compute_advantages(self, rewards, values, next_value, dones, lambda_gae=0.95):
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * lambda_gae * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
            next_value = values[t]
        advantages = torch.tensor(advantages, dtype=torch.float32)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def update(self, buffer, batch_size=64):
    # Collect episode data
        data = buffer.get_episode_data()
        states = torch.tensor(data['states'], dtype=torch.float32)
        actions = torch.tensor(data['actions'], dtype=torch.int64)
        log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)
        rewards = data['rewards']
        dones = data['dones']

        values = self.value_function(states).squeeze().detach()
        next_value = self.value_function(torch.tensor(data['next_state'], dtype=torch.float32)).item() if data['next_state'] else 0.0
        advantages = self.compute_advantages(rewards, values, next_value, dones)
        targets = advantages + values

        dataset = TensorDataset(states, actions, log_probs_old, advantages, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        total_loss_actor = 0
        total_loss_critic = 0
        total_entropy = 0
        num_batches = 0

        for _ in range(self.k_epochs):
            for batch_states, batch_actions, batch_log_probs_old, batch_advantages, batch_targets in dataloader:
                action_probs = self.policy(batch_states)
                dist = Categorical(action_probs)
                log_probs = dist.log_prob(batch_actions)
                
                # we add this entropy to encourage exploration
                entropy = dist.entropy().mean()

                ratios = torch.exp(log_probs - batch_log_probs_old)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                loss_actor = -torch.min(surr1, surr2).mean() - self.entropy_coef * entropy

                values_pred = self.value_function(batch_states).squeeze()
                loss_critic = self.mse_loss(values_pred, batch_targets)

                self.policy_optimizer.zero_grad()
                loss_actor.backward()
                self.policy_optimizer.step()

                self.value_optimizer.zero_grad()
                loss_critic.backward()
                self.value_optimizer.step()

                total_loss_actor += loss_actor.item()
                total_loss_critic += loss_critic.item()
                total_entropy += entropy.item()
                num_batches += 1

        self.old_policy.load_state_dict(self.policy.state_dict())

        return total_loss_actor / num_batches, total_loss_critic / num_batches, total_entropy / num_batches


    def save_model(self, path):
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'value_state_dict': self.value_function.state_dict(),
        }, path)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.old_policy.load_state_dict(checkpoint['policy_state_dict'])
        self.value_function.load_state_dict(checkpoint['value_state_dict'])


def train_ppo(num_episodes=5000, max_time_steps=200):
    # init a new wandb parameters
    wandb.init(project="PPO-LunarLander", config={
        "learning_rate": 3e-4,
        "gamma": 0.99,
        "eps_clip": 0.2,
        "entropy_coef": 0.01,
        "batch_size": 64,
        "k_epochs": 10
    })

    env = gym.make("LunarLander-v2")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPOAgent(state_dim, action_dim)
    buffer = StorageBuffer()
    best_reward = -float('inf')

    for episode in range(num_episodes):
        state, _ = env.reset()
        buffer.reset()
        episode_reward = 0

        for t in range(max_time_steps):
            action, log_prob, entropy = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            buffer.add_step(state, action, reward, log_prob, done, next_state if not done else None)
            state = next_state
            if done:
                break

        loss_actor, loss_critic, entropy_val = agent.update(buffer)
        #sending data and visualizing it on  weight and biases official website
        wandb.log({
            "episode": episode,
            "reward": episode_reward,
            "actor_loss": loss_actor,
            "critic_loss": loss_critic,
            "entropy": entropy_val
        })

        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save_model("ppo_lunar_lander.pth")

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}, Reward: {episode_reward:.2f}, Best: {best_reward:.2f}")

    env.close()
    wandb.finish()
    return agent, buffer



def test_model(model_path, num_episodes=10):
    env = gym.make("LunarLander-v2", render_mode="human")
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPOAgent(state_dim, action_dim)
    agent.load_model(model_path)
    buffer = StorageBuffer()

    for episode in range(num_episodes):
        state, _ = env.reset()
        buffer.reset()
        done = False
        total_reward = 0

        while not done:
            action, _, _ = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            state = next_state

        print(f"Test Episode {episode + 1}, Total Reward: {total_reward:.2f}")

    env.close()


In [14]:
if __name__ == "__main__":
    # this is for training and testing the model
    # note that this environment is bit complex for that it will take a lot more training if you have gpu access that can speed up the training
    num_episodes = 10000  # You can change this number
    print(f"Starting training for {num_episodes} episodes...")
    agent, train_storage = train_ppo(num_episodes=num_episodes)

    print("\nTraining completed! Training statistics:")

    train_stats = train_storage.get_statistics()
    print(f"Mean reward: {train_stats['mean_reward']:.2f}")
    print(f"Max reward: {train_stats['max_reward']:.2f}")
    print(f"Mean episode length: {train_stats['mean_length']:.2f}")

    model_path = "ppo_lunar_lander.pth"

    print("\nStarting model testing...")
    test_storage = test_model(model_path, num_episodes=5)


Starting training for 10000 episodes...


  states = torch.tensor(data['states'], dtype=torch.float32)
  actions = torch.tensor(data['actions'], dtype=torch.int64)
  log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)


Episode 10, Reward: -260.68, Best: -28.68
Episode 20, Reward: -165.22, Best: -28.68
Episode 30, Reward: -124.42, Best: -28.68
Episode 40, Reward: -149.49, Best: -28.68
Episode 50, Reward: -158.68, Best: -28.68
Episode 60, Reward: -216.27, Best: -28.68
Episode 70, Reward: -128.80, Best: -26.71
Episode 80, Reward: 102.56, Best: 102.56
Episode 90, Reward: -276.89, Best: 102.56
Episode 100, Reward: -60.11, Best: 107.19
Episode 110, Reward: -8.88, Best: 107.19
Episode 120, Reward: -49.17, Best: 107.19
Episode 130, Reward: 37.54, Best: 107.19
Episode 140, Reward: -37.25, Best: 107.19
Episode 150, Reward: 27.99, Best: 107.19
Episode 160, Reward: 26.72, Best: 107.19
Episode 170, Reward: 34.51, Best: 107.19
Episode 180, Reward: 46.21, Best: 107.19
Episode 190, Reward: 29.58, Best: 107.19
Episode 200, Reward: 53.18, Best: 107.19
Episode 210, Reward: 21.02, Best: 107.19
Episode 220, Reward: 69.26, Best: 107.19
Episode 230, Reward: 20.28, Best: 107.19
Episode 240, Reward: 54.54, Best: 107.19
Episo

  return F.mse_loss(input, target, reduction=self.reduction)


Episode 1330, Reward: -50.58, Best: 107.19
Episode 1340, Reward: 22.77, Best: 107.19
Episode 1350, Reward: -17.77, Best: 107.19
Episode 1360, Reward: -0.55, Best: 107.19
Episode 1370, Reward: 7.50, Best: 107.19
Episode 1380, Reward: 18.84, Best: 107.19
Episode 1390, Reward: 8.82, Best: 107.19
Episode 1400, Reward: -5.44, Best: 107.19
Episode 1410, Reward: 16.71, Best: 107.19
Episode 1420, Reward: 23.72, Best: 107.19
Episode 1430, Reward: 33.02, Best: 107.19
Episode 1440, Reward: 58.27, Best: 107.19
Episode 1450, Reward: 57.60, Best: 107.19
Episode 1460, Reward: 15.33, Best: 107.19
Episode 1470, Reward: 50.69, Best: 107.19
Episode 1480, Reward: 56.11, Best: 107.19
Episode 1490, Reward: 31.17, Best: 107.19
Episode 1500, Reward: 6.16, Best: 107.19
Episode 1510, Reward: 19.87, Best: 107.19
Episode 1520, Reward: -11.48, Best: 107.19
Episode 1530, Reward: 55.14, Best: 107.19
Episode 1540, Reward: 5.86, Best: 107.19
Episode 1550, Reward: 21.33, Best: 107.19
Episode 1560, Reward: -5.49, Best: 

0,1
actor_loss,▂▂▂▁▄▃▂▃▂▅▂▇▂▃▃▄▆▄▃▄▃▃▂▃▃▇▂▆▂▄▄█▂▅▁▃▂▇▄▂
critic_loss,▄▃▅▃▅▇▇▇▇█▅▆▄▅▄▂▁▇▅▂▁▆▆▂▂▁▅▁█▅▃▁▃▆▁▁▄▄▇▄
entropy,▆█▇▆▇▄▄▇▆▇▅▅▅▇▇▆▇▅▇▄▆▄▃▃▄▅▃▃▅▃▇▁▂▅▂▄▃▂▄▂
episode,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇▇███
reward,▅▆▅▅▅▅▆█▇▁▆▅▆▆▃▆▆▆▇█▃▇▇▇█▇█▇█▇▃▆▇▇▇▇▇▇▇▅

0,1
actor_loss,-0.03034
critic_loss,0.36869
entropy,1.00773
episode,9999.0
reward,-47.8511



Training completed! Training statistics:
Mean reward: -29.41
Max reward: 284.25
Mean episode length: 103.88

Starting model testing...


  checkpoint = torch.load(path)


FileNotFoundError: [Errno 2] No such file or directory: 'lunar_lander_ppo.pth'

In [11]:
# testing the model with the saved params

model_path = "ppo_lunar_lander.pth"

print("\nStarting model testing...")
test_storage = test_model(model_path, num_episodes=5)



Starting model testing...


  checkpoint = torch.load(path)


Test Episode 1, Total Reward: 199.12
Test Episode 2, Total Reward: -15.60
Test Episode 3, Total Reward: -75.68
Test Episode 4, Total Reward: 190.00
Test Episode 5, Total Reward: 194.63
