In [None]:
from PPO_RLHF.ppo_rlhf import PPORLHF
from PPO_RLHF.ppo import PPO
from PPO_RLHF.networks import ActorNetwork
import gymnasium as gym
import torch
from torch.distributions import Categorical
import numpy as np
import matplotlib.pyplot as plt
import os
import time

## Preference dataset

In [None]:
# Directory the notebook is run from; the saved policies are resolved relative to it.
PATH = os.path.abspath(os.getcwd())

# TO COMPLETE: set the two file names below to the policy checkpoints you want
# to compare (both are looked up inside the `saved_policies` folder).
pi1_path, pi2_path = (
    os.path.join(PATH, 'saved_policies', checkpoint_name)
    for checkpoint_name in ('pi1_cartpole_1.pt', 'pi2_cartpole_1.pt')
)

In [None]:
from utils.generate_preference_dataset import dataset

# Rollout settings handed to `dataset` (their exact meaning is defined in
# utils.generate_preference_dataset — names suggest count / horizon / RNG seed).
n_trajectories = 1000
max_t = 500
seed = 0
dim_state = 4  # NOTE(review): presumably CartPole's observation size — confirm
env = gym.make('CartPole-v1')

# `dataset` returns six collections, unpacked here in the order it yields them:
# states and actions for pi1, states and actions for pi2, then rewards for each.
(
    trajectories_states_pi1,
    trajectories_actions_pi1,
    trajectories_states_pi2,
    trajectories_actions_pi2,
    trajectories_rewards_pi1,
    trajectories_rewards_pi2,
) = dataset(n_trajectories, max_t, seed, pi1_path, pi2_path, env, dim_state)


In [None]:
# Generate K trajectory pairs with a preference label.
# Each pair holds one trajectory sampled from pi1's rollouts and one sampled from
# pi2's rollouts; the label is the probability that the pi1 trajectory is preferred.
K = 100  # Number of preference pairs to generate

# Dedicated, seeded generator so that Restart & Run All draws the same pairs.
PAIR_SAMPLING_SEED = 0
pair_rng = np.random.default_rng(PAIR_SAMPLING_SEED)

preferences = []
trajectory_pairs = []

for k in range(K):
    # Independently pick one trajectory index from each policy's pool
    # (the two indices address different pools, so they may coincide).
    idx1 = int(pair_rng.integers(len(trajectories_rewards_pi1)))
    idx2 = int(pair_rng.integers(len(trajectories_rewards_pi2)))

    # Store the (states, actions) of both trajectories as one pair
    trajectory_pairs.append([
        (trajectories_states_pi1[idx1], trajectories_actions_pi1[idx1]),
        (trajectories_states_pi2[idx2], trajectories_actions_pi2[idx2])
    ])

    # NOTE(review): assumes each rewards entry is a single scalar return per
    # trajectory — confirm against utils.generate_preference_dataset.
    reward1 = trajectories_rewards_pi1[idx1]
    reward2 = trajectories_rewards_pi2[idx2]

    # P(τ1) = exp(R(τ1)) / (exp(R(τ1)) + exp(R(τ2)))
    #       = 1 / (1 + exp(R(τ2) - R(τ1)))
    # Evaluated as exp(-log(1 + exp(R(τ2) - R(τ1)))) via logaddexp, which never
    # forms exp(R) directly and therefore cannot overflow to inf/inf = nan when
    # the returns are large.
    prob_traj1 = np.exp(-np.logaddexp(0.0, reward2 - reward1))
    preferences.append(prob_traj1)

preferences = np.array(preferences)
print(f"Generated {K} preference pairs")
print(f"Average preference probability for first trajectory: {np.mean(preferences):.3f}")

In [None]:
# Flatten every pair and its label into the record layout passed to the reward
# model: [(states1, actions1), (states2, actions2), P(trajectory 1 preferred)].
formatted_preference_dataset = [
    [first_trajectory, second_trajectory, preferences[pair_index]]
    for pair_index, (first_trajectory, second_trajectory) in enumerate(trajectory_pairs)
]

n_formatted_pairs = len(formatted_preference_dataset)
print("\nFormatted preference dataset:")
print(f"Number of trajectory pairs: {n_formatted_pairs}")

## PPO-RLHF

### Parameters and magic numbers

In [None]:
# Keyword arguments forwarded unchanged (via **hyperparameters) to both the
# PPORLHF and the PPO constructors, so the two runs share one configuration.
# NOTE(review): the meaning of each key is defined by those classes — confirm there.
hyperparameters = dict(
    timesteps_per_batch=4080,
    max_timesteps_per_episode=1600,
    gamma=0.95,
    n_updates_per_iteration=5,
    lr=0.005,
    clip=0.2,
    render=True,
    render_every_i=10,
)

In [None]:
# Environments available to the experiments; the cells below use ENV_NAME[0].
ENV_NAME = ['CartPole-v1', 'MountainCar-v0']

# One full train + evaluate run is performed for every seed listed here.
SEEDS = [42, 123, 456]

# Value passed to `model.learn(...)` for each training run.
TOTAL_TIMESTEPS = 50_000

# Evaluation: episodes rolled out per seed, and the `alpha` given to the
# ActorNetwork that is rebuilt to load the trained weights.
EVAL_N_EPISODES = 5
EVAL_LEARNING_RATE = 0.005


### Run the PPO-RLHF algorithm

In [None]:
# Train PPO-RLHF once per seed, then replay the saved actor for EVAL_N_EPISODES
# rendered episodes and record the total reward collected in each episode.
rewards_per_episode = []  # every evaluation episode, all seeds, in run order
rewards_per_seed = []     # rewards_per_seed[i] = episode rewards obtained for SEEDS[i]

for seed in SEEDS:
    # ---- Train ----
    env = gym.make(ENV_NAME[0])
    try:
        model = PPORLHF(
            env=env,
            preference_data=formatted_preference_dataset,
            seed=seed,
            **hyperparameters
        )
        model.learn(TOTAL_TIMESTEPS)
    finally:
        # Release the training environment even if learning fails.
        env.close()

    # ---- Rebuild the actor and evaluate it in a rendered environment ----
    env = gym.make(ENV_NAME[0], render_mode="human")
    try:
        is_discrete = isinstance(env.action_space, gym.spaces.Discrete)
        action_dim = env.action_space.n if is_discrete else env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]

        actor = ActorNetwork(action_dim, obs_dim, alpha=EVAL_LEARNING_RATE)
        device = actor.device
        # NOTE(review): assumes model.learn() wrote this checkpoint — confirm in PPORLHF.
        # map_location makes the load work even if the weights were saved from a
        # different device than the one this actor lives on.
        actor.load_state_dict(torch.load(
            os.path.join("PPO_RLHF", "models", 'ppo-rlhf_actor.pth'),
            map_location=device,
        ))
        actor.eval()

        # Fresh list per seed: sharing one list across seeds would make every
        # entry of rewards_per_seed point at the same, ever-growing object.
        episode_rewards = []

        for ep in range(EVAL_N_EPISODES):
            obs, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                obs_tensor = torch.tensor(obs, dtype=torch.float32).to(device)

                with torch.no_grad():
                    if is_discrete:
                        # Discrete actions: treat the actor output as action
                        # probabilities and sample one action from them.
                        action_probs = actor(obs_tensor)
                        action = Categorical(action_probs).sample().item()
                    else:
                        # Continuous actions: use the actor output directly,
                        # clipped to the bounds of the action space.
                        action = actor(obs_tensor).cpu().numpy()
                        action = np.clip(action, env.action_space.low, env.action_space.high)

                obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                total_reward += reward

                # Slow the loop down so the rendered window is watchable.
                time.sleep(0.02)

            episode_rewards.append(total_reward)
            print(f"Episode {ep+1}: Reward = {total_reward}")
    finally:
        env.close()

    rewards_per_episode.extend(episode_rewards)
    rewards_per_seed.append(episode_rewards)

# ---- Plot reward results ----
# One point per seed: the mean total reward over that seed's evaluation episodes.
mean_reward_per_seed = [float(np.mean(seed_rewards)) for seed_rewards in rewards_per_seed]
plt.plot([str(s) for s in SEEDS], mean_reward_per_seed, marker='o')
plt.title("PPO-RLHF - Evaluation: Seed Rewards")
plt.xlabel("Seeds")
plt.ylabel("Mean Total Reward")
plt.grid(True)
plt.show()

## PPO

In [None]:
# Train plain PPO once per seed, then replay the saved actor for EVAL_N_EPISODES
# rendered episodes and record the total reward collected in each episode.
rewards_per_episode = []  # every evaluation episode, all seeds, in run order
rewards_per_seed = []     # rewards_per_seed[i] = episode rewards obtained for SEEDS[i]

for seed in SEEDS:
    # ---- Train ----
    env = gym.make(ENV_NAME[0])
    try:
        model = PPO(
            env=env,
            seed=seed,
            **hyperparameters
        )
        model.learn(TOTAL_TIMESTEPS)
    finally:
        # Release the training environment even if learning fails.
        env.close()

    # ---- Rebuild the actor and evaluate it in a rendered environment ----
    env = gym.make(ENV_NAME[0], render_mode="human")
    try:
        is_discrete = isinstance(env.action_space, gym.spaces.Discrete)
        action_dim = env.action_space.n if is_discrete else env.action_space.shape[0]
        obs_dim = env.observation_space.shape[0]

        actor = ActorNetwork(action_dim, obs_dim, alpha=EVAL_LEARNING_RATE)
        device = actor.device
        # NOTE(review): assumes model.learn() wrote this checkpoint — confirm in PPO.
        # map_location makes the load work even if the weights were saved from a
        # different device than the one this actor lives on.
        actor.load_state_dict(torch.load(
            os.path.join("PPO_RLHF", "models", 'ppo_actor.pth'),
            map_location=device,
        ))
        actor.eval()

        # Fresh list per seed: sharing one list across seeds would make every
        # entry of rewards_per_seed point at the same, ever-growing object.
        episode_rewards = []

        for ep in range(EVAL_N_EPISODES):
            obs, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                obs_tensor = torch.tensor(obs, dtype=torch.float32).to(device)

                with torch.no_grad():
                    if is_discrete:
                        # Discrete actions: treat the actor output as action
                        # probabilities and sample one action from them.
                        action_probs = actor(obs_tensor)
                        action = Categorical(action_probs).sample().item()
                    else:
                        # Continuous actions: use the actor output directly,
                        # clipped to the bounds of the action space.
                        action = actor(obs_tensor).cpu().numpy()
                        action = np.clip(action, env.action_space.low, env.action_space.high)

                obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                total_reward += reward

                # Slow the loop down so the rendered window is watchable.
                time.sleep(0.02)

            episode_rewards.append(total_reward)
            print(f"Episode {ep+1}: Reward = {total_reward}")
    finally:
        env.close()

    rewards_per_episode.extend(episode_rewards)
    rewards_per_seed.append(episode_rewards)

# ---- Plot reward results ----
# One point per seed: the mean total reward over that seed's evaluation episodes.
mean_reward_per_seed = [float(np.mean(seed_rewards)) for seed_rewards in rewards_per_seed]
plt.plot([str(s) for s in SEEDS], mean_reward_per_seed, marker='o')
plt.title("PPO - Evaluation: Seed Rewards")
plt.xlabel("Seeds")
plt.ylabel("Mean Total Reward")
plt.grid(True)
plt.show()