In [None]:
from PPO_RLHF.ppo_rlhf import PPORLHF
from PPO_RLHF.ppo import PPO
from PPO_RLHF.networks import ActorNetwork
import gymnasium as gym
import torch
from torch.distributions import Categorical
import numpy as np
import matplotlib.pyplot as plt
import os
import time

In [None]:
# Keyword arguments unpacked into PPORLHF(...) by run_experiment for CartPole.
cartpole_hyperparameters = dict(
    # Number of environment timesteps collected per training batch.
    timesteps_per_batch=2048,
    # Per-episode step cap (CartPole episodes typically end within 500 steps).
    max_timesteps_per_episode=500,
    # Discount factor.
    gamma=0.99,
    # Update passes per training iteration.
    n_updates_per_iteration=10,
    # Optimizer learning rate.
    lr=0.0003,
    # PPO clipping parameter (standard value).
    clip=0.2,
    # Rendering is disabled; render_every_i is presumably the render period
    # when it is enabled -- confirm against PPORLHF.
    render=False,
    render_every_i=10,
    # Presumably the number of reward-model training epochs -- confirm against PPORLHF.
    reward_model_epochs=50,
)

In [None]:
# --- Experiment grid -------------------------------------------------------
# Candidate environment ids; the cells in this notebook only use ENV_NAME[0].
ENV_NAME = ['CartPole-v1', 'MountainCar-v0']
# One full train + evaluate run is performed per seed.
SEEDS = [42, 43, 44]
# Preference-dataset sizes to compare (each is a prefix of the full dataset).
PREFERENCE_DATA_SIZE = [10, 50, 100, 300, 500, 700, 1000]

# --- Training / evaluation budget -------------------------------------------
# Passed to model.learn(...) for every run.
TOTAL_TIMESTEPS = 100_000
# Number of evaluation episodes rolled out after training.
EVAL_N_EPISODES = 10
# Passed as `alpha` when the evaluation ActorNetwork is constructed.
EVAL_LEARNING_RATE = 0.005



In [None]:
# Reshape the raw preference data into per-pair records and cut it into
# prefixes of the sizes listed in PREFERENCE_DATA_SIZE.
#
# NOTE(review): `dataset` is not defined in any cell visible in this notebook,
# so this cell relies on hidden kernel state. Load or generate it explicitly in
# an earlier cell so that Restart Kernel -> Run All works.
raw_data = dataset

# raw_data is read as five parallel sequences, one entry per trajectory pair:
#   [0] states from policy 1     [1] actions from policy 1
#   [2] states from policy 2     [3] actions from policy 2
#   [4] preference probability for the pair
raw_columns = [raw_data[k] for k in range(5)]
n_pairs = len(raw_columns[0])
column_lengths = [len(column) for column in raw_columns]
if any(length != n_pairs for length in column_lengths):
    # Fail loudly instead of an opaque IndexError (shorter column) or silently
    # ignoring trailing entries (longer column).
    raise ValueError(
        f"Preference data columns have mismatched lengths: {column_lengths}"
    )

# Each record is [(states_1, actions_1), (states_2, actions_2), preference].
preference_dataset = [
    [(states_1, actions_1), (states_2, actions_2), preference]
    for states_1, actions_1, states_2, actions_2, preference in zip(*raw_columns)
]

# Create different sized datasets: one prefix of the full list per requested size.
formatted_datasets = []
for size in PREFERENCE_DATA_SIZE:
    if size > n_pairs:
        # Slicing past the end truncates silently; surface it so results that
        # are labelled with `size` are not misread.
        print(
            f"Warning: requested {size} preference pairs but only "
            f"{n_pairs} are available; using {n_pairs}."
        )
    formatted_datasets.append(preference_dataset[:size])

In [None]:
def run_experiment(formatted_preference_dataset, seed):
    """Train PPO-RLHF on one preference dataset, then roll out the saved actor.

    A fresh ``ENV_NAME[0]`` environment is created, a ``PPORLHF`` model is
    trained on it for ``TOTAL_TIMESTEPS`` using ``cartpole_hyperparameters``,
    and the actor weights found at ``PPO_RLHF/models/ppo-rlhf_actor.pth`` are
    then evaluated for ``EVAL_N_EPISODES`` episodes on the same environment.

    NOTE(review): this assumes ``model.learn`` writes the actor checkpoint to
    that path; the saving code is not visible from this notebook -- confirm.

    Args:
        formatted_preference_dataset: Preference records passed to ``PPORLHF``
            as ``preference_data``.
        seed: Seed forwarded to ``PPORLHF``. Evaluation resets are not seeded
            here.

    Returns:
        A one-element list ``[rewards_per_episode]`` where
        ``rewards_per_episode`` holds the total reward of each evaluation
        episode. The extra nesting is kept because the statistics cell reads
        element ``[0]`` of this return value.
    """
    env = gym.make(ENV_NAME[0])
    try:
        # ---- Train ----
        model = PPORLHF(
            env=env,
            preference_data=formatted_preference_dataset,
            seed=seed,
            **cartpole_hyperparameters
        )
        model.learn(TOTAL_TIMESTEPS)

        # ---- Rebuild the actor and load the trained weights ----
        # The action-space type is fixed for the environment, so decide once.
        is_discrete = isinstance(env.action_space, gym.spaces.Discrete)
        if is_discrete:
            action_dim = env.action_space.n
        else:
            action_dim = env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]

        actor = ActorNetwork(action_dim, obs_dim, alpha=EVAL_LEARNING_RATE)
        device = actor.device
        checkpoint_path = os.path.join("PPO_RLHF", "models", 'ppo-rlhf_actor.pth')
        # map_location lets a checkpoint saved on another device (e.g. GPU)
        # load onto whatever device this actor lives on.
        actor.load_state_dict(torch.load(checkpoint_path, map_location=device))
        actor.eval()

        # ---- Run evaluation ----
        print("Evaluating the model...")
        rewards_per_episode = []

        for ep in range(EVAL_N_EPISODES):
            obs, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                obs_tensor = torch.tensor(obs, dtype=torch.float32).to(device)

                with torch.no_grad():
                    if is_discrete:
                        # Actor output is used as categorical probabilities;
                        # sample an action and convert it to a Python scalar.
                        action = Categorical(actor(obs_tensor)).sample().item()
                    else:
                        # Actor output is used directly as the action, clipped
                        # to the environment's bounds.
                        action = np.clip(
                            actor(obs_tensor).cpu().numpy(),
                            env.action_space.low,
                            env.action_space.high,
                        )

                obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                total_reward += reward

            rewards_per_episode.append(total_reward)
            print(f"Episode {ep+1}: Reward = {total_reward}")
    finally:
        # Release the environment even if training or evaluation raised.
        env.close()

    return [rewards_per_episode]

In [None]:
# Sweep every preference-dataset size over every seed.
# Layout: preference_rew[size_index][seed_index] is the value returned by
# run_experiment for that (size, seed) combination.
preference_rew = []

for size_idx, n_preferences in enumerate(PREFERENCE_DATA_SIZE):
    per_seed_results = []
    print(f"===== Running for Preference Data size {n_preferences}... =====")
    for seed in SEEDS:
        print(f"---- Running for seed number {seed}... ----")
        per_seed_results.append(run_experiment(formatted_datasets[size_idx], seed))

    # Keep the raw per-seed results; aggregation happens in a later cell.
    preference_rew.append(per_seed_results)

In [None]:
import scipy.stats as stats

# For each preference-dataset size, compute the mean evaluation reward and a
# t-distribution confidence interval, then plot mean and interval against size.
# NOTE(review): episodes are pooled across seeds and treated as independent
# samples; episodes from the same trained policy are likely correlated, so the
# interval may be optimistic -- consider computing it over per-seed means.
confidence_level = 0.95  # 95% confidence interval
means = []
lower_ci = []
upper_ci = []

for size_rewards in preference_rew:
    # Flatten rewards across seeds and episodes. run_experiment returns a
    # one-element list, hence seed_rewards[0].
    all_rewards = [reward for seed_rewards in size_rewards for reward in seed_rewards[0]]

    n = len(all_rewards)
    mean = np.mean(all_rewards)

    if n > 1:
        # Sample standard deviation (ddof=1): the t-interval with n-1 degrees
        # of freedom assumes the unbiased estimate, not the population one.
        std = np.std(all_rewards, ddof=1)
        t_value = stats.t.ppf((1 + confidence_level) / 2, n - 1)
        margin_of_error = t_value * (std / np.sqrt(n))
    else:
        # An interval is undefined for fewer than two samples.
        margin_of_error = float('nan')

    means.append(mean)
    lower_ci.append(mean - margin_of_error)
    upper_ci.append(mean + margin_of_error)

# Create the plot
plt.figure(figsize=(10, 6))

# Mean line plus the shaded confidence band computed above.
plt.plot(PREFERENCE_DATA_SIZE, means, label='Mean', marker='o')
plt.fill_between(
    PREFERENCE_DATA_SIZE,
    lower_ci,
    upper_ci,
    alpha=0.2,
    label=f'{confidence_level:.0%} confidence interval',
)

plt.xlabel('Preference Dataset Size')
plt.ylabel('Average Reward')
plt.title(f'PPO-RLHF Performance for environment {ENV_NAME[0]}')
plt.grid(True)
plt.legend()
plt.savefig(f'ppo-rlhf_performance_{ENV_NAME[0]}_Final2.png', dpi=300)
plt.show()