In [64]:
#importing dependencies
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# this class define the storage buffer of the environment  

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode
        
    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0
        
    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward
        
        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))
            
    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }
    
    def get_statistics(self):
        
        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}
        
        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks
# i used small model since it simple game but you can make it bigger if you want

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 20),
            nn.ReLU(),
            nn.Linear(20, 20),
            nn.ReLU(),
            nn.Linear(20, action_dim),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, x):
        return self.fc(x)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 20),
            nn.ReLU(),
            nn.Linear(20, 20),
            nn.ReLU(),
            nn.Linear(20, 1)
        )
    
    def forward(self, x):
        return self.fc(x)

In [69]:
# this is our environment that will train our ppo agent on 
import gymnasium as gym

env = gym.make("CliffWalking-v0",render_mode="human")
observation, info = env.reset()

episode_over = False
while not episode_over:
    action = env.action_space.sample()  
    observation, reward, terminated, truncated, info = env.step(action)


    episode_over = terminated or truncated

env.close()


print('observation space observation_space',env.observation_space.shape)
print('action space',env.action_space.n)

In [67]:
# the ppo agent class uses the value and policy network and adamW optimizer and MSELoss
from torch.utils.data import TensorDataset, DataLoader

# we use one hot encoding to encode the states giving one to the index of the state and 0 otherwise
def one_hot_state(state, state_dim):
    one_hot = np.zeros(state_dim, dtype=np.float32)
    one_hot[state] = 1.0
    return one_hot

# since it hard to reach the goal because the environment  do not provide reward the until we reach the goal which is hard to reach 
# which mean that if the agent never reach the goal it will never learn so i write this function to help direction the agent 
# and it do that by giving reward when it closer to the goal

def manhattan_distance(state, goal=47):
    x1, y1 = divmod(state, 12)
    x2, y2 = divmod(goal, 12)
    return abs(x1 - x2) + abs(y1 - y2)

class PPOAgent():
    def __init__(self,state_dim,action_dim,lr=3e-4,gamma = 0.9,eps_clip =  0.2 , k_epochs=10):
        self.policy  = PolicyNetwork(state_dim,action_dim)
        self.old_policy  = PolicyNetwork(state_dim,action_dim)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.value_function = ValueNetwork(state_dim)
        self.policy_optimizer =  optim.AdamW(self.policy.parameters(),lr=lr)
        self.value_optimizer =  optim.AdamW(self.value_function.parameters(),lr)
        self.gamma = gamma
        self.k_epochs = k_epochs 
        self.eps_clip  = eps_clip
        self.mse_loss = nn.MSELoss()
    def select_action(self,state):
        # this function used to select action by our old policy and returning the log_prob of it 
        state = torch.tensor(state,dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            action_prob  =  self.old_policy(state)
        disc =  Categorical(action_prob)
        action  = disc.sample()
        return action.item(),disc.log_prob(action)
    def compute_advantages(self, rewards, values, next_value, dones):
        advantages = []
        gae = 0
        lambda_gae = 0.95  # Missing lambda parameter for GAE
        for t in reversed(range(len(rewards))):
            if t == len(rewards) - 1:
                next_value = next_value
            else:
                next_value = values[t + 1]
            
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * lambda_gae * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
        advantages = torch.tensor(advantages, dtype=torch.float32)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    
    def save_model(self, path):

        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'value_state_dict': self.value_function.state_dict(),
        }, path)

    def load_model(self, path):

        checkpoint = torch.load(path)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.old_policy.load_state_dict(checkpoint['policy_state_dict'])
        self.value_function.load_state_dict(checkpoint['value_state_dict'])
    

    def update(self, buffer, batch_size=64):
            
 

            # Get episode data
            data = buffer.get_episode_data()
            states = torch.tensor(data['states'], dtype=torch.float32)
            actions = torch.tensor(data['actions'], dtype=torch.int64)
            log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)
            rewards = data['rewards']
            dones = data['dones']

            # Compute Value function and Advantage Estimates
            values = self.value_function(states).squeeze().detach()

            if data['next_state'] is not None:
                next_value = self.value_function(torch.tensor(data['next_state'], dtype=torch.float32)).item()
            else:
                next_value = 0.0

            advantages = self.compute_advantages(rewards, values, next_value, dones)
            targets = advantages + values  # Target for critic (value function)

            # Create a dataset and DataLoader

            dataset = TensorDataset(states, actions, log_probs_old, advantages, targets)
            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

            policy_losses = []
            value_losses = []
            
            # here we train on batches of data instead of training on the entire data at once at  we split them into batches for better learning

            for _ in range(self.k_epochs):  # PPO iterates over the same data multiple times
                
                for batch_states, batch_actions, batch_log_probs_old, batch_advantages, batch_targets in dataloader:
                    
                    # Compute new policy distribution
                    action_probs = self.policy(batch_states)
                    dist = Categorical(action_probs)
                    log_probs = dist.log_prob(batch_actions)

                    # PPO Clipped Objective
                    ratios = torch.exp(log_probs - batch_log_probs_old)
                    surr1 = ratios * batch_advantages
                    surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                    loss_actor = -torch.min(surr1, surr2).mean()

                    # Value Function Loss
                    values_pred = self.value_function(batch_states).squeeze()
                    loss_critic = self.mse_loss(values_pred, batch_targets)

                    policy_losses.append(loss_actor.item())
                    value_losses.append(loss_critic.item())

                    # Backpropagation for Policy (Actor)
                    self.policy_optimizer.zero_grad()
                    loss_actor.backward()
                    self.policy_optimizer.step()

                    # Backpropagation for Value Function (Critic)
                    self.value_optimizer.zero_grad()
                    loss_critic.backward()
                    self.value_optimizer.step()

            # Update the old policy
            self.old_policy.load_state_dict(self.policy.state_dict())

            return {
                'policy_loss': np.mean(policy_losses),
                'value_loss': np.mean(value_losses),
                'average_value': values.mean().item()
            }


def train_ppo(num_episodes=5000, max_time_steps=100):
    env = gym.make("CliffWalking-v0")
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    best_reward = -float('inf')

    agent = PPOAgent(state_dim, action_dim)
    storage = StorageBuffer()

    for episode in range(num_episodes):
        state, _ = env.reset()
        storage.reset()
        episode_reward = 0  # Track total reward for this episode

        for t in range(max_time_steps):
            action, old_log_prob = agent.select_action(one_hot_state(state, state_dim))
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Reward Shaping
            if next_state == 47:  # Reached the goal
                reward = 10
            elif reward == -100:  # Fell off the cliff
                reward = -1
            else:  # Normal step
                reward = -0.1

            if manhattan_distance(next_state) < manhattan_distance(state):
                    reward += 0.1  # Encourage moving toward goal

            episode_reward += reward  # Track episode reward

            storage.add_step(
                one_hot_state(state, state_dim),
                action,
                reward,
                old_log_prob,
                done,
                one_hot_state(next_state, state_dim) if not done else None
            )

            state = next_state
            if done:
                break

        agent.update(storage)

        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save_model("ppo_cliffwalking.pth")

        print(f"Episode {episode + 1}, Reward: {episode_reward}, Best: {best_reward}")

    env.close()
    return agent, storage
    
def test_model(model_path, num_episodes = 10):
    # this function allow us to view the model performance on the environment
    env = gym.make("CliffWalking-v0",render_mode="human")
    state_dim  =  env.observation_space.n
    action_dim =   env.action_space.n
    best_reward = 0
    agent  =  PPOAgent(state_dim,action_dim)
    agent.load_model(model_path)

    storage = StorageBuffer()
    
    for episode in range(num_episodes):
        state, _ = env.reset()
        state = np.array(state)
        storage.reset()  # Reset storage for new episode
        done = False
        
        while not done:
            action, log_prob = agent.select_action(one_hot_state(state, state_dim))
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            next_state = np.array(next_state)

            if next_state == 47:  # Reached the goal
                reward = 10
            elif reward == -100:  # Fell off the cliff
                reward = -1
            else:  # Normal step
                reward = -0.1
            
            
            storage.add_step(one_hot_state(state, state_dim), action, reward, log_prob, done, one_hot_state(next_state, state_dim) if done else None)
            state = next_state
        
        stats = storage.get_statistics()
        print(f"Test Episode {episode + 1}, Total Reward: {stats['current_reward']}")
    
    env.close()
    return storage
            

In [68]:
if __name__ == "__main__":
    # this is for training and testing the model
    # since this environment only gives reward at the end of the episode we need to give it more training
    num_episodes = 300  # You can change this number
    print(f"Starting training for {num_episodes} episodes...")
    agent, train_storage = train_ppo(num_episodes=num_episodes)
    
    print("\nTraining completed! Training statistics:")
    
    train_stats = train_storage.get_statistics()
    print(f"Mean reward: {train_stats['mean_reward']:.2f}")
    print(f"Max reward: {train_stats['max_reward']:.2f}")
    print(f"Mean episode length: {train_stats['mean_length']:.2f}")
    model_path = "ppo_cliffwalking.pth"
    
    print("\nStarting model testing...")
    test_storage = test_model(model_path, num_episodes=5)
    
    print("\nTesting completed! Test statistics:")
    test_stats = test_storage.get_statistics()
    print(f"Mean test reward: {test_stats['mean_reward']:.2f}")
    print(f"Max test reward: {test_stats['max_reward']:.2f}")
    print(f"Min test reward: {test_stats['min_reward']:.2f}")

Starting training for 300 episodes...


  states = torch.tensor(data['states'], dtype=torch.float32)
  actions = torch.tensor(data['actions'], dtype=torch.int64)
  log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32)


Episode 1, Reward: -14.899999999999991, Best: -14.899999999999991
Episode 2, Reward: -7.899999999999997, Best: -7.899999999999997
Episode 3, Reward: -5.4, Best: -5.4
Episode 4, Reward: -7.499999999999995, Best: -5.4
Episode 5, Reward: -5.099999999999999, Best: -5.099999999999999
Episode 6, Reward: -5.7999999999999945, Best: -5.099999999999999
Episode 7, Reward: -13.699999999999994, Best: -5.099999999999999
Episode 8, Reward: -5.799999999999999, Best: -5.099999999999999
Episode 9, Reward: -8.299999999999992, Best: -5.099999999999999
Episode 10, Reward: -4.100000000000002, Best: -4.100000000000002
Episode 11, Reward: -4.499999999999999, Best: -4.100000000000002
Episode 12, Reward: -10.899999999999993, Best: -4.100000000000002
Episode 13, Reward: -9.999999999999993, Best: -4.100000000000002
Episode 14, Reward: -8.399999999999997, Best: -4.100000000000002
Episode 15, Reward: -11.899999999999993, Best: -4.100000000000002
Episode 16, Reward: -10.799999999999992, Best: -4.100000000000002
Epis

  checkpoint = torch.load(path)


Test Episode 1, Total Reward: 4.800000000000003
Test Episode 2, Total Reward: 6.599999999999999
Test Episode 3, Total Reward: 3.600000000000004
Test Episode 4, Total Reward: 5.3000000000000025
Test Episode 5, Total Reward: 8.8

Testing completed! Test statistics:
Mean test reward: 5.82
Max test reward: 8.80
Min test reward: 3.60


In [36]:
# testing the model with the saved params

model_path = "ppo_cliffwalking.pth"
    
print("\nStarting model testing...")
test_storage = test_model(model_path, num_episodes=5)

print("\nTesting completed! Test statistics:")
test_stats = test_storage.get_statistics()
print(f"Mean test reward: {test_stats['mean_reward']:.2f}")
print(f"Max test reward: {test_stats['max_reward']:.2f}")
print(f"Min test reward: {test_stats['min_reward']:.2f}")


Starting model testing...


  checkpoint = torch.load(path)


Test Episode 1, Total Reward: 1.0
Test Episode 2, Total Reward: 1.0
Test Episode 3, Total Reward: 0.0
Test Episode 4, Total Reward: 1.0
Test Episode 5, Total Reward: 1.0

Testing completed! Test statistics:
Mean test reward: 0.80
Max test reward: 1.00
Min test reward: 0.00
