In [None]:
import os
import json
import pandas as pd
import numpy as np
import gymnasium as gym
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import VecMonitor

# User defined imports
import tools_RLHF


In [34]:
# Location of the trajectory-pair CSV under the sibling PPO_Original training directory.
# Built with os.path.join so the separator follows the host OS instead of being
# hard-coded to Windows backslashes (on Windows the resulting string is unchanged).
path = os.path.join(
    "..", "PPO_Original", "Training", "2025-05-15_22-16-35", "RLHF_trajectory_pairs.csv"
)

**Check the total reward of each preferred and rejected trajectory (in the CartPole environment, the total reward equals the episode length).**

In [None]:
# Alternative input file: path = "trajectory_pairs.csv"
Data = tools_RLHF.Data_Class(path)

# Show, for the first 10 pairs, the length of the preferred and of the rejected trajectory.
for pair_idx in range(10):
    preferred_traj = Data.trajs_prefer_list.get_single_traj_json(pair_idx)
    rejected_traj = Data.trajs_reject_list.get_single_traj_json(pair_idx)
    print(pair_idx, len(preferred_traj), len(rejected_traj))

Data loaded successfully
0 500 44
1 500 175
2 500 78
3 500 41
4 500 116
5 500 103
6 500 60
7 500 60
8 500 91
9 500 13


## **Reward Model Training**
Use `trajectory_pairs.csv` to train the reward model.

In [None]:

# ---- Hyperparameters ----
gamma = 0.99      # discount factor applied to predicted per-step rewards when summing a trajectory's return
lr = 1e-4          # learning rate passed to the Adam optimizer
batch_size = 16    # number of preference pairs per DataLoader batch
num_epochs = 100   # number of passes over the preference dataset

# ---- Load Data ----
# Reads the preference pairs from a CSV in the current working directory.
# Note: this rebinds `path` and `Data`, replacing whatever an earlier cell assigned.
path = "trajectory_pairs.csv"
Data = tools_RLHF.Data_Class(path)

# Custom collate_fn that keeps variable-length sequences as-is (no padding, no stacking)
def variable_collate(batch):
    """Regroup a batch of preference samples field by field.

    Args:
        batch: list of 4-tuples ``(s_pref, a_pref, s_rej, a_rej)``, one per sample.

    Returns:
        Tuple of four lists (preferred states, preferred actions, rejected states,
        rejected actions), each holding one entry per sample in batch order.

    Raises:
        ValueError: if the samples do not transpose into exactly four fields
            (this includes an empty batch).
    """
    # Transpose samples -> fields; the unpacking enforces exactly four fields.
    pref_states, pref_actions, rej_states, rej_actions = (
        list(field) for field in zip(*batch)
    )
    return pref_states, pref_actions, rej_states, rej_actions

# ---- Training preparation ----
dataset = tools_RLHF.PreferenceDataset(
    Data.traj_prefer_list_list_tensor,
    Data.traj_reject_list_list_tensor,
    gamma
)
loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=variable_collate
)

reward_net = tools_RLHF.RewardMLP(Data.dim_state, Data.dim_action, hidden_dim=64)  # instantiate the reward MLP
optimizer  = optim.Adam(reward_net.parameters(), lr=lr)
loss_fn    = nn.BCEWithLogitsLoss()

# Discount vectors depend only on (gamma, length, device), so they are built once and
# reused instead of being regenerated for every trajectory of every batch and epoch.
# The dict is recreated whenever this cell is re-run.
_discount_cache = {}


def _discount_vector(length, device):
    """Return the tensor [gamma**0, ..., gamma**(length-1)] on `device` (memoised)."""
    key = (gamma, length, device)
    cached = _discount_cache.get(key)
    if cached is None:
        cached = torch.tensor([gamma**t for t in range(length)], device=device)
        _discount_cache[key] = cached
    return cached


def _discounted_return(states, actions):
    """Sum of `reward_net`'s per-step predictions for one trajectory, discounted by gamma."""
    step_rewards = reward_net(states, actions)  # [L] per the original annotation
    discounts = _discount_vector(step_rewards.size(0), step_rewards.device)
    return (step_rewards * discounts).sum()


# ---- Training loop ----
for epoch in range(1, num_epochs + 1):
    total_loss = 0.0
    for s_pref_list, a_pref_list, s_rej_list, a_rej_list in loader:
        # Predicted discounted return of every preferred trajectory in the batch
        R_pref = torch.stack(
            [_discounted_return(s_pf, a_pf) for s_pf, a_pf in zip(s_pref_list, a_pref_list)]
        )
        # Predicted discounted return of every rejected trajectory in the batch
        R_rej = torch.stack(
            [_discounted_return(s_rj, a_rj) for s_rj, a_rj in zip(s_rej_list, a_rej_list)]
        )

        # With an all-ones target, BCE-with-logits on (R_pref - R_rej) equals
        # -log(sigmoid(R_pref - R_rej)): the loss falls as the preferred trajectory
        # out-scores the rejected one.
        logits = R_pref - R_rej
        targets = torch.ones_like(logits)
        loss = loss_fn(logits, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight it by the batch size so the epoch average
        # stays correct when the last batch is smaller.
        total_loss += loss.item() * R_pref.size(0)

    avg_loss = total_loss / len(dataset)
    print(f"Epoch {epoch}/{num_epochs} ‚Äî Avg Loss: {avg_loss:.4f}")

# ---- Save model ----
torch.save(reward_net.state_dict(), 'reward_net.pth')
print("üéâ Model has been saved to: reward_net.pth")


Data loaded successfully
Epoch 1/100 ‚Äî Avg Loss: 0.7466
Epoch 2/100 ‚Äî Avg Loss: 0.6167
Epoch 3/100 ‚Äî Avg Loss: 0.5279
Epoch 4/100 ‚Äî Avg Loss: 0.4635
Epoch 5/100 ‚Äî Avg Loss: 0.4184
Epoch 6/100 ‚Äî Avg Loss: 0.3800
Epoch 7/100 ‚Äî Avg Loss: 0.3549
Epoch 8/100 ‚Äî Avg Loss: 0.3379
Epoch 9/100 ‚Äî Avg Loss: 0.3243
Epoch 10/100 ‚Äî Avg Loss: 0.3150
Epoch 11/100 ‚Äî Avg Loss: 0.3068
Epoch 12/100 ‚Äî Avg Loss: 0.3004
Epoch 13/100 ‚Äî Avg Loss: 0.2949
Epoch 14/100 ‚Äî Avg Loss: 0.2888
Epoch 15/100 ‚Äî Avg Loss: 0.2868
Epoch 16/100 ‚Äî Avg Loss: 0.2829
Epoch 17/100 ‚Äî Avg Loss: 0.2776
Epoch 18/100 ‚Äî Avg Loss: 0.2744
Epoch 19/100 ‚Äî Avg Loss: 0.2735
Epoch 20/100 ‚Äî Avg Loss: 0.2711
Epoch 21/100 ‚Äî Avg Loss: 0.2682
Epoch 22/100 ‚Äî Avg Loss: 0.2658
Epoch 23/100 ‚Äî Avg Loss: 0.2644
Epoch 24/100 ‚Äî Avg Loss: 0.2608
Epoch 25/100 ‚Äî Avg Loss: 0.2589
Epoch 26/100 ‚Äî Avg Loss: 0.2576
Epoch 27/100 ‚Äî Avg Loss: 0.2553
Epoch 28/100 ‚Äî Avg Loss: 0.2530
Epoch 29/100 ‚Äî Avg Loss: 0.252

## **Reward Model Testing**
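Load `trajectory_pairs.csv` and compare the discounted return that the MLP reward model predicts for each preferred trajectory with the one it predicts for the paired rejected trajectory; the preferred trajectory should receive the higher value.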

In [None]:
# ---- Load the trained MLP reward model ----
reward_net_loaded = tools_RLHF.RewardMLP(Data.dim_state, Data.dim_action, hidden_dim=64)
reward_net_loaded.load_state_dict(torch.load('reward_net.pth', weights_only=True))
reward_net_loaded.eval()


def _stack_field(traj_json, key):
    """Stack one per-step field of a trajectory into a float tensor, one flattened row per step."""
    rows = [torch.from_numpy(np.array(step[key])).float().view(-1) for step in traj_json]
    return torch.stack(rows, dim=0)  # [L, dim]


def _predicted_return(traj_json):
    """Discounted sum of the loaded reward model's per-step predictions for one trajectory."""
    states = _stack_field(traj_json, 'state')    # [L, s_dim]
    actions = _stack_field(traj_json, 'action')  # [L, a_dim]
    with torch.no_grad():
        step_rewards = reward_net_loaded(states, actions)
    discounts = torch.tensor([gamma**t for t in range(step_rewards.size(0))])
    return (step_rewards * discounts).sum()


# Compare predicted returns for the first 10 preference pairs from the loaded CSV data
for i in range(10):
    traj_prefer_json = Data.trajs_prefer_list.get_single_traj_json(i)
    traj_reject_json = Data.trajs_reject_list.get_single_traj_json(i)

    total_return_prefer = _predicted_return(traj_prefer_json)
    total_return_reject = _predicted_return(traj_reject_json)

    print(i, total_return_prefer, total_return_reject)



0 tensor(20.7971) tensor(19.7983)
1 tensor(19.7432) tensor(16.6396)
2 tensor(21.2318) tensor(19.4247)
3 tensor(20.6215) tensor(18.8928)
4 tensor(21.0973) tensor(12.7524)
5 tensor(21.1557) tensor(13.4750)
6 tensor(19.6617) tensor(11.9227)
7 tensor(20.5343) tensor(11.9880)
8 tensor(19.8585) tensor(18.6292)
9 tensor(19.8084) tensor(13.6240)


## **PPO-RLHF Training**

In [None]:
# --------------------------------------------------------------------------------------------------
# 2. Define a Wrapper that uses your MLP to calculate the reward in step
# --------------------------------------------------------------------------------------------------
class CustomRewardWrapper(gym.Wrapper):
    """Environment wrapper that substitutes a learned reward for the native one.

    Every ``step`` runs the wrapped environment, discards the reward it returns and
    instead reports the scalar predicted by a ``tools_RLHF.RewardMLP`` for the
    observation produced by that step and the action that was taken.

    Args:
        env: environment to wrap; ``observation_space.shape[0]`` is taken as the
            state dimension.
        reward_model_path: path to a ``state_dict`` checkpoint of the reward MLP
            (it must contain the key ``'net.0.weight'``).
        device: torch device used for the reward model and its input tensors.
    """

    def __init__(self, env, reward_model_path, device="cpu"):
        super().__init__(env)

        # state dimension
        self.dim_state = env.observation_space.shape[0]

        # action dimension; a space whose shape is () (e.g. Discrete) raises
        # IndexError on [0] and is treated as one-dimensional
        try:
            self.dim_action = env.action_space.shape[0]
        except IndexError:
            self.dim_action = 1

        self.device = device

        # Read the checkpoint once. It is a plain state_dict, so weights_only=True
        # is sufficient and avoids unpickling arbitrary objects.
        checkpoint = torch.load(reward_model_path, map_location=device, weights_only=True)

        # Size the network from the checkpoint so its input width matches the saved
        # weights: columns of the first layer's weight = state dim + action dim.
        input_dim = checkpoint['net.0.weight'].size(1)
        # NOTE(review): RewardMLP is built with its default hidden size here, while the
        # training cell passes hidden_dim=64 - confirm the default matches the checkpoint.
        self.reward_model = tools_RLHF.RewardMLP(input_dim - self.dim_action, self.dim_action).to(device)
        self.reward_model.load_state_dict(checkpoint)
        self.reward_model.eval()

    def step(self, action):
        """Step the wrapped env and return its transition with the learned reward.

        Returns:
            ``(obs, reward, terminated, truncated, info)`` where everything except
            ``reward`` comes from the wrapped environment and ``reward`` is the
            Python float predicted by the reward model.
        """
        # Execute the original env, without the original reward
        obs, _, terminated, truncated, info = self.env.step(action)

        state_tensor = torch.tensor(obs, dtype=torch.float32, device=self.device).unsqueeze(0)

        if isinstance(self.env.action_space, gym.spaces.Discrete):
            # NOTE(review): discrete actions are passed as int64 while the reward-model
            # test cell feeds float actions - confirm RewardMLP handles both the same way.
            action_tensor = torch.tensor([action], dtype=torch.long, device=self.device)
        else:
            action_tensor = torch.tensor(action, dtype=torch.float32, device=self.device).unsqueeze(0)

        # Modify action_tensor shape: a 1-D tensor becomes a single row
        if action_tensor.ndim == 1:
            action_tensor = action_tensor.view(1, -1)

        # Calculate reward
        with torch.no_grad():
            reward_tensor = self.reward_model(state_tensor,
                                              action_tensor)
        reward = reward_tensor.item()
        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """Reset the wrapped environment, forwarding all keyword arguments unchanged."""
        return self.env.reset(**kwargs)


# --------------------------------------------------------------------------------------------------
# 3. Construct vectorized environment and apply custom Wrapper
# --------------------------------------------------------------------------------------------------

# Log path: one timestamped directory per run under ./Training
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_path = os.path.join("Training", current_time)
os.makedirs(log_path, exist_ok=True)

# Reward Model path
MODEL_PATH = "reward_net.pth"


def _wrap_with_learned_reward(env):
    """Wrap one environment so its reward comes from the saved reward model (CPU)."""
    return CustomRewardWrapper(env, MODEL_PATH, device="cpu")


#### Build env
# monitor_dir is passed to make_vec_env directly, so no separate VecMonitor is applied.
vec_env = make_vec_env(
    env_id="CartPole-v1",
    n_envs=8,
    wrapper_class=_wrap_with_learned_reward,
    monitor_dir=log_path,
)

#### Build PPO Model
ppo_settings = dict(
    policy="MlpPolicy",
    env=vec_env,
    n_steps=256,
    device="cpu",
    verbose=1,
    tensorboard_log=log_path,
)
model = PPO(**ppo_settings)

#### Training (no evaluation/checkpoint callbacks are attached)
model.learn(total_timesteps=50000)

#### Save Model
final_model_file = os.path.join(log_path, "model_full_training")
model.save(final_model_file)

Using cpu device
Logging to Training\2025-05-15_22-30-31\PPO_1




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.2     |
|    ep_rew_mean     | 5.1      |
| time/              |          |
|    fps             | 1804     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 28.5         |
|    ep_rew_mean          | 6.54         |
| time/                   |              |
|    fps                  | 989          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0116936825 |
|    clip_fraction        | 0.142        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.684       |
|    explained_variance   | 0.0034       |
|    learning_r

## **PPO-RLHF Testing**
You can find the recorded video in `Training\2025-xx-xx\video`  
你可以在这个位置找到录像 `Training\2025-xx-xx\video`

In [39]:
import os
import sys
# Add the parent of the current working directory to sys.path so that the
# PPO_Original package can be imported below. This must run before the import.
cur_dir = os.getcwd()
pkg_dir = os.path.dirname(cur_dir)
if pkg_dir not in sys.path:
    sys.path.append(pkg_dir)
from PPO_Original import tools


# PPO-RLHF model testing
# `log_path` is taken from the training cell above. To test a different run,
# uncomment the next line and point it at that run's directory:
# log_path = "Training\\2025-05-08_19-59-36"
print("log_path:", log_path)
# Path of the model saved by the training cell (model.save was given this same name)
PPO_Model_Path = os.path.join(log_path, "model_full_training")
tools.test_model("PPO", PPO_Model_Path, n_episodes=2, render = True, record=True)

log_path: Training\2025-05-15_22-30-31
Training\2025-05-15_22-30-31\model_full_training



  logger.warn(


Episode: 1 Score: 500.0
Episode: 2 Score: 500.0
