In [1]:
print("test test")

test test


In [10]:
import flappy_bird_gymnasium
import gymnasium
env = gymnasium.make("FlappyBird-v0", render_mode="human", use_lidar=False)

obs, _ = env.reset()
while True:
    # Next action:
    # (feed the observation to your agent here)
    action = env.action_space.sample()

    # Processing:
    obs, reward, terminated, trancated, info = env.step(action)
    
    print("obs",obs,"reward", reward, "terminated",terminated," trancated", trancated)
    # Checking if the player is still alive
    if terminated:
        break

env.close()

obs [ 0.98611111  0.2734375   0.46875     1.          0.          1.
  1.          0.          1.          0.45898438 -0.9         0.5       ] reward 0.1 terminated False  trancated False
obs [ 0.97222222  0.2734375   0.46875     1.          0.          1.
  1.          0.          1.          0.44335938 -0.8         0.46666667] reward 0.1 terminated False  trancated False
obs [ 0.95833333  0.2734375   0.46875     1.          0.          1.
  1.          0.          1.          0.4296875  -0.7         0.43333333] reward 0.1 terminated False  trancated False
obs [ 0.94444444  0.2734375   0.46875     1.          0.          1.
  1.          0.          1.          0.41210938 -0.9         0.5       ] reward 0.1 terminated False  trancated False
obs [ 0.93055556  0.2734375   0.46875     1.          0.          1.
  1.          0.          1.          0.39648438 -0.8         0.46666667] reward 0.1 terminated False  trancated False
obs [ 0.91666667  0.2734375   0.46875     1.          0.    

In [45]:
device  =  "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cpu


In [2]:
#importing dependencies
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# this class define the storage buffer of the environment

class StorageBuffer:
    def __init__(self):
        self.reset()
        self.episode_rewards = []  # Store rewards for each episode
        self.episode_lengths = []  # Store lengths for each episode

    def reset(self):
        # Current episode storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []
        self.dones = []
        self.next_state = None
        self.current_reward = 0

    def add_step(self, state, action, reward, log_prob, done, next_state=None):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.log_probs.append(log_prob)
        self.dones.append(done)
        self.current_reward += reward

        if done:
            self.next_state = next_state
            self.episode_rewards.append(self.current_reward)
            self.episode_lengths.append(len(self.rewards))

    def get_episode_data(self):
        return {
            'states': torch.FloatTensor(np.array(self.states)),
            'actions': torch.tensor(self.actions),
            'rewards': torch.tensor(self.rewards),
            'log_probs': torch.tensor(self.log_probs),
            'dones': torch.tensor(self.dones),
            'next_state': torch.FloatTensor(self.next_state).unsqueeze(0) if self.next_state is not None else None
        }

    def get_statistics(self):

        if not self.episode_rewards:
            return {"mean_reward": 0, "max_reward": 0, "min_reward": 0, "mean_length": 0}

        return {
            "mean_reward": np.mean(self.episode_rewards),
            "max_reward": np.max(self.episode_rewards),
            "min_reward": np.min(self.episode_rewards),
            "mean_length": np.mean(self.episode_lengths),
            "current_reward": self.current_reward,
            "current_length": len(self.rewards)
        }

#the policy and value networks
# i used small model since it simple game but you can make it bigger if you want

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 200),
            nn.ReLU(),
            nn.Linear(200, 200),
            nn.ReLU(),
            nn.Linear(200, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        return self.fc(x)

class ValueNetwork(nn.Module):
    def __init__(self, state_dim):
        super(ValueNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 120),
            nn.ReLU(),
            nn.Linear(120, 120),
            nn.ReLU(),
            nn.Linear(120, 1)
        )

    def forward(self, x):
        return self.fc(x)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import gymnasium as gym
from torch.distributions import Categorical
from torch.utils.data import TensorDataset, DataLoader
from gymnasium.vector import SyncVectorEnv

class PPOAgent:
    def __init__(self, state_dim, action_dim,device="cpu", lr=3e-4, gamma=0.99, eps_clip=0.2, k_epochs=10, entropy_coef=0.01):
        self.policy = PolicyNetwork(state_dim, action_dim).to(device)
        self.old_policy = PolicyNetwork(state_dim, action_dim).to(device)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.value_function = ValueNetwork(state_dim).to(device)

        self.policy_optimizer = optim.AdamW(self.policy.parameters(), lr=lr)
        self.value_optimizer = optim.AdamW(self.value_function.parameters(), lr=lr)

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.k_epochs = k_epochs
        self.entropy_coef = entropy_coef
        self.mse_loss = nn.MSELoss()

    def select_action(self, state):
        state = torch.tensor(state, dtype=torch.float32).to(device)
        with torch.no_grad():
            action_prob = self.old_policy(state)
        dist = Categorical(action_prob)
        action = dist.sample()
        return action, dist.log_prob(action), dist.entropy()

    def compute_advantages(self, rewards, values, next_value, dones, lambda_gae=0.95):
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + self.gamma * (1 - float(dones[t])) * next_value - values[t]
            gae = delta + self.gamma * lambda_gae * (1 - float(dones[t])) * gae
            advantages.insert(0, gae)
            next_value = values[t]
        advantages = torch.tensor(advantages, dtype=torch.float32).to(device)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    def update(self, buffer, batch_size=120):
    # Collect episode data
        data = buffer.get_episode_data()
        # states = torch.tensor(data['states'], dtype=torch.float32).to(device)
        # actions = torch.tensor(data['actions'], dtype=torch.int64).to(device)
        # log_probs_old = torch.tensor(data['log_probs'], dtype=torch.float32).to(device)
        states = data['states'].clone().detach().float().to(device)
        actions = data['actions'].clone().detach().long().to(device)
        log_probs_old = data['log_probs'].clone().detach().float().to(device)
        rewards = data['rewards']
        dones = data['dones']

        values = self.value_function(states).squeeze().detach()
        next_value = self.value_function(torch.tensor(data['next_state'], dtype=torch.float32).to(device)).item() if data['next_state'] else 0.0
        advantages = self.compute_advantages(rewards, values, next_value, dones)
        targets = advantages + values


        dataset = TensorDataset(states, actions, log_probs_old, advantages, targets)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        total_loss_actor = 0
        total_loss_critic = 0
        total_entropy = 0
        num_batches = 0

        for _ in range(self.k_epochs):
            for batch_states, batch_actions, batch_log_probs_old, batch_advantages, batch_targets in dataloader:
                action_probs = self.policy(batch_states)
                dist = Categorical(action_probs)
                log_probs = dist.log_prob(batch_actions)

                # we add this entropy to encourage exploration
                entropy = dist.entropy().mean()

                ratios = torch.exp(log_probs - batch_log_probs_old)
                surr1 = ratios * batch_advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages
                loss_actor = -torch.min(surr1, surr2).mean() - self.entropy_coef * entropy

                values_pred = self.value_function(batch_states).squeeze()
                loss_critic = self.mse_loss(values_pred, batch_targets)

                self.policy_optimizer.zero_grad()
                loss_actor.backward()
                self.policy_optimizer.step()

                self.value_optimizer.zero_grad()
                loss_critic.backward()
                self.value_optimizer.step()

                total_loss_actor += loss_actor.item()
                total_loss_critic += loss_critic.item()
                total_entropy += entropy.item()
                num_batches += 1

        self.old_policy.load_state_dict(self.policy.state_dict())

        return total_loss_actor / num_batches, total_loss_critic / num_batches, total_entropy / num_batches


    def save_model(self, path):
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'value_state_dict': self.value_function.state_dict(),
            'policy_optimizer': self.policy_optimizer.state_dict(),
            'value_optimizer': self.value_optimizer.state_dict()
        }, path)

    def load_model(self, path):
            
        checkpoint = torch.load(path, map_location=device)
        self.policy.load_state_dict(checkpoint['policy_state_dict'])
        self.old_policy.load_state_dict(checkpoint['policy_state_dict'])
        self.value_function.load_state_dict(checkpoint['value_state_dict'])
        
        if 'policy_optimizer' in checkpoint:
            self.policy_optimizer.load_state_dict(checkpoint['policy_optimizer'])
        if 'value_optimizer' in checkpoint:
            self.value_optimizer.load_state_dict(checkpoint['value_optimizer'])


def train_ppo(num_episodes=5000, max_time_steps=200,model_name = None):

    # env = gymnasium.make("FlappyBird-v0",  use_lidar=False)
    # state_dim = env.observation_space.shape[0]
    # action_dim = env.action_space.n

    num_envs = 5
    
    env = SyncVectorEnv([lambda: gymnasium.make("FlappyBird-v0",  use_lidar=False) for _ in range(num_envs)]) 
    state_dim = env.single_observation_space.shape[0]
    action_dim = env.single_action_space.n

    agent = PPOAgent(state_dim, action_dim,device)
    if model_name is not None:
        agent.load_model(model_name)

    buffer = StorageBuffer()
    best_reward = -float('inf')

    for episode in range(num_episodes):
        states, _ = env.reset()
        buffer.reset()
        episode_rewards = np.zeros(num_envs)

        for t in range(max_time_steps):
            actions, log_probs, entropys = agent.select_action(states)
            next_states, rewards, terminateds, truncateds, _ = env.step(actions)
            dones = terminateds | truncateds
            for i in range(num_envs):
                episode_rewards[i] += rewards[i]
                buffer.add_step(states[i], actions[i], rewards[i], log_probs[i], dones[i], next_states[i] if not dones[i] else None)
            states = next_states
            if all(dones):
                break

        loss_actor, loss_critic, entropy_val = agent.update(buffer,250)

        if episode_rewards.mean() > best_reward:
            best_reward = episode_rewards.mean()
            agent.save_model("ppo_flappy_bird.pth")

        if (episode + 1) % 10 == 0:
            print(f"Episode {episode + 1}, Reward: {float(episode_rewards.mean()):.2f}, Best: {float(best_reward):.2f}, Loss Actor: {float(loss_actor):.4f}, Loss Critic: {float(loss_critic):.4f}, Entropy: {float(entropy_val):.4f}")


    env.close()
    # wandb.finish()
    return agent, buffer


def test_model(model_path, num_episodes=10):
    env = gymnasium.make("FlappyBird-v0" ,render_mode="human",use_lidar=False)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    agent = PPOAgent(state_dim, action_dim,device)
    agent.load_model(model_path)
    buffer = StorageBuffer()

    for episode in range(num_episodes):
        state, _ = env.reset()
        buffer.reset()
        done = False
        total_reward = 0

        while not done:
            action, _, _ = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            state = next_state

        print(f"Test Episode {episode + 1}, Total Reward: {total_reward:.2f}")

    env.close()


In [40]:
if __name__ == "__main__":
    # this is for training and testing the model
    # note that this environment is bit complex for that it will take a lot more training if you have gpu access that can speed up the training
    num_episodes = 500  # You can change this number
    print(f"Starting training for {num_episodes} episodes...")

    model_name="ppo_flappy_bird.pth"
    # model_name = None

    agent, train_storage = train_ppo(num_episodes=num_episodes,model_name=model_name)

    print("\nTraining completed! Training statistics:")

    train_stats = train_storage.get_statistics()
    print(f"Mean reward: {train_stats['mean_reward']:.2f}")
    print(f"Max reward: {train_stats['max_reward']:.2f}")
    print(f"Mean episode length: {train_stats['mean_length']:.2f}")

    model_path = "ppo_flappy_bird.pth"

    print("\nStarting model testing...")
    test_storage = test_model(model_path, num_episodes=5)


Starting training for 500 episodes...


  'dones': torch.tensor(self.dones),


Episode 10, Reward: 68.20, Best: 68.20, Loss Actor: -0.0067, Loss Critic: 1.0076, Entropy: 0.4243
Episode 20, Reward: 15.30, Best: 68.20, Loss Actor: -0.0045, Loss Critic: 0.8866, Entropy: 0.3673
Episode 30, Reward: 82.00, Best: 82.90, Loss Actor: -0.0030, Loss Critic: 0.9992, Entropy: 0.2664
Episode 40, Reward: 84.70, Best: 85.90, Loss Actor: -0.0044, Loss Critic: 0.9605, Entropy: 0.1922
Episode 50, Reward: 87.70, Best: 88.60, Loss Actor: -0.0038, Loss Critic: 0.9040, Entropy: 0.1680
Episode 60, Reward: 19.50, Best: 88.60, Loss Actor: -0.0042, Loss Critic: 0.9837, Entropy: 0.2290
Episode 70, Reward: 18.90, Best: 88.60, Loss Actor: -0.0028, Loss Critic: 0.9894, Entropy: 0.2131
Episode 80, Reward: 19.50, Best: 88.60, Loss Actor: -0.0054, Loss Critic: 0.7441, Entropy: 0.2101
Episode 90, Reward: 18.90, Best: 88.60, Loss Actor: -0.0053, Loss Critic: 0.7042, Entropy: 0.2678
Episode 100, Reward: 11.70, Best: 88.60, Loss Actor: -0.0087, Loss Critic: 1.0033, Entropy: 0.2888
Episode 110, Reward

KeyboardInterrupt: 

In [51]:
# testing the model with the saved params

model_path = "ppo_flappy_bird.pth"

print("\nStarting model testing...")
test_storage = test_model(model_path, num_episodes=5)



Starting model testing...
Test Episode 1, Total Reward: 12.90
Test Episode 2, Total Reward: 8.40
Test Episode 3, Total Reward: 50.40
Test Episode 4, Total Reward: 12.90
Test Episode 5, Total Reward: 12.90
