In [None]:
import os
import sys


# Make modules under ./src importable from this notebook (presumably where the
# data_generation / data_loading / reward_learning / utils packages live).
# NOTE: "__file__" is a string literal here, not the __file__ variable, so
# dirname() returns "" and the path resolves to <current working directory>/src.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname("__file__"), "src")))

In [None]:
import torch


# Experiment configuration shared by the cells below.
env_name = "button-press-topdown-v2"
# env_name = "box-close-v2"
# Base experiment name; a two-digit run suffix (e.g. "-08") is appended later.
exp_name = "AESPA-17"
# Name of the initial pair set that the augmentation rounds start from.
pair_algo = "ternary-100"
reward_model_algo = "MR-linear"

# Expose only GPU index 4 to this process and fall back to CPU when CUDA is
# unavailable.
# NOTE(review): the GPU index is hard-coded — adjust it for the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "4" 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of steps in each segment that is scored and compared.
TRAJECTORY_LENGTH = 25

import numpy as np
from data_generation.utils import save_feedbacks_npz
from data_loading.load_data import load_pair
from data_loading.load_data import load_dataset
from reward_learning.get_model import get_reward_model
from utils.path import get_reward_model_path

import csv
import os

# Output location for the feedback label statistics: one CSV per environment,
# created on demand.
save_dir = "feedback_stats"
os.makedirs(save_dir, exist_ok=True)
csv_path = os.path.join(save_dir, f"feedback_summary_{env_name}.csv")

In [None]:
import glob



def predict_rewards(
    env_name,
    exp_name,
    pair_algo,
    reward_model_algo,
):
    """
    Predict rewards for every sample of the dataset with a reward-model ensemble.

    Every model file that matches the experiment (any ``reward_model_tag``) is
    loaded, each model is evaluated on the dataset in batches, and the
    per-model outputs are averaged.

    Args:
        env_name: Environment name, passed to ``load_dataset`` and used to
            locate the model files.
        exp_name: Experiment name used to locate the model files.
        pair_algo: Pair-set name used to locate the model files.
        reward_model_algo: Reward-model type; used both for the file path and
            for constructing the model.

    Returns:
        np.ndarray: Ensemble-mean predictions, concatenated over all batches
        and squeezed (presumably one value per dataset sample — this depends
        on the output shape of the model's ``batched_forward_trajectory``).

    Raises:
        FileNotFoundError: If no model file matches the path pattern.
    """
    # Load the dataset once and reuse it for both the dimensions and inference.
    dataset = load_dataset(env_name)
    observations = dataset["observations"]
    actions = dataset["actions"]
    obs_dim = observations.shape[1]
    act_dim = actions.shape[1]

    print("obs_dim:", obs_dim, "act_dim:", act_dim)
    model_path_pattern = get_reward_model_path(
        env_name=env_name,
        exp_name=exp_name,
        pair_algo=pair_algo,
        reward_model_algo=reward_model_algo,
        reward_model_tag="*",
    )
    # Sorted so the ensemble is assembled in a stable order across runs.
    model_files = sorted(glob.glob(model_path_pattern))
    if not model_files:
        # Fail early with a clear message instead of an obscure numpy error
        # when averaging an empty ensemble.
        raise FileNotFoundError(
            f"No reward model files match pattern: {model_path_pattern}"
        )

    model_list = []
    for model_file in model_files:
        model, _ = get_reward_model(
            reward_model_algo=reward_model_algo,
            obs_dim=obs_dim,
            act_dim=act_dim,
            model_path=model_file,
            allow_existing=True,
        )
        model_list.append(model)

    num_samples = len(observations)
    # Roughly 20 batches; never 0, which would make range() raise for tiny datasets.
    batch_size = max(1, num_samples // 20)
    model_outputs = []

    # Pure inference: disabling autograd avoids building graphs for every batch.
    with torch.no_grad():
        for start_idx in range(0, num_samples, batch_size):
            end_idx = min(start_idx + batch_size, num_samples)

            obs_batch = torch.tensor(
                observations[start_idx:end_idx], dtype=torch.float32
            ).to(device)
            act_batch = torch.tensor(
                actions[start_idx:end_idx], dtype=torch.float32
            ).to(device)

            batch_model_outputs = []
            for model in model_list:
                rewards = model.batched_forward_trajectory(
                    obs_batch=obs_batch, act_batch=act_batch
                )
                batch_model_outputs.append(rewards.detach().cpu().numpy())

            # Average across ensemble members for this batch.
            batch_predicted_rewards = np.mean(batch_model_outputs, axis=0)
            model_outputs.append(batch_predicted_rewards)

    predicted_rewards = np.concatenate(model_outputs, axis=0).squeeze()

    return predicted_rewards


In [None]:
def fill_feedback_from_raw_dataset(average_reward, cumulative_rewards, pairs):
    """
    Label segment pairs with a preference value ``mu`` based on their returns.

    Each segment is a ``(start, end)`` index pair; its return is obtained from
    the prefix sums as ``cumulative_rewards[end - 1] - cumulative_rewards[start - 1]``
    (with 0 subtracted when ``start`` is 0).

    Labels:
        0.5 -- the two returns differ by less than
               ``average_reward * TRAJECTORY_LENGTH * 0.1``
        0   -- the first segment has the strictly larger return
        1   -- otherwise

    NOTE(review): when ``average_reward`` is <= 0 the threshold is not
    positive, so no pair can receive the 0.5 label.

    Returns:
        list of ``(segment_0, segment_1, mu)`` tuples, in the order of ``pairs``.
    """

    def _segment_return(segment):
        # Return over [start, end) as a difference of two prefix sums.
        begin, end = segment[0], segment[1]
        upto_end = cumulative_rewards[end - 1]
        before_begin = cumulative_rewards[begin - 1] if begin > 0 else 0
        return upto_end - before_begin

    labelled = []

    for seg_a, seg_b in pairs:
        return_a = _segment_return(seg_a)
        return_b = _segment_return(seg_b)
        gap = np.abs(return_a - return_b)

        if gap < average_reward * TRAJECTORY_LENGTH * 0.1:
            # Returns are too close to call: mark as equally preferred.
            preference = 0.5
        elif return_a > return_b:
            preference = 0
        else:
            preference = 1

        labelled.append((seg_a, seg_b, preference))

    return labelled


In [None]:
from reward_learning.train_model import train_reward_model


# Upper bound on the number of newly selected pairs per augmentation round.
feedback_count_to_add = 100

# Ground-truth rewards of the dataset. The prefix sums allow the return of any
# index range to be computed as a difference of two entries.
raw_dataset = load_dataset(env_name)
average_reward = np.mean(raw_dataset["rewards"])
cumulative_rewards = np.cumsum(raw_dataset["rewards"])



# Iterative feedback augmentation for experiment runs 08 and 09. In every round
# the current reward-model ensemble scores candidate segments, neighbouring
# high-scoring segments are paired and labelled from the ground-truth rewards,
# the enlarged pair set is saved, and a new ensemble is trained on it.
for i in range(8, 10):
    exp_name_with_suffix = f"{exp_name}-{i:02d}"
    train_all_pairs_with_mu = load_pair(
            env_name=env_name,
            exp_name=exp_name_with_suffix,
            pair_type="train_all",
            pair_algo="raw",
    )

    # Flatten both members of every pair into one candidate list. The position
    # in this list is later used as the source id of a segment.
    # NOTE(review): despite the name this is a list, so a segment that occurs
    # in several pairs appears several times with different ids.
    all_traj_set = []

    for p in train_all_pairs_with_mu:
        all_traj_set.append(p[0])
        all_traj_set.append(p[1])

    for j in range(10):
        # Round 0 starts from the base pair set; later rounds build on the pair
        # set written by the previous round.
        if j == 0:
            previous_pair_name = pair_algo
        else:
            previous_pair_name = current_pair_name

        current_pair_name = f"{pair_algo}-aug-high-{j:02d}"

        print(previous_pair_name)

        # Score the dataset with the ensemble trained on the previous pair set.
        predicted_rewards = predict_rewards(
            env_name=env_name,
            exp_name=exp_name_with_suffix,
            pair_algo=previous_pair_name,
            reward_model_algo=reward_model_algo,
        )

        predicted_rewards = np.array(predicted_rewards)
        predicted_cumsum = np.cumsum(predicted_rewards)

        # Store each sub-trajectory together with the index of its source trajectory
        traj_with_reward = []
        for traj_idx, traj in enumerate(all_traj_set):
            # Slide a TRAJECTORY_LENGTH window over the candidate's index range.
            for start in range(traj[0], traj[1] - TRAJECTORY_LENGTH + 1):
                traj_start = start
                traj_end = start + TRAJECTORY_LENGTH

                # Predicted return of the window via prefix-sum difference.
                reward_sum = predicted_cumsum[traj_end - 1] - (
                    predicted_cumsum[traj_start - 1] if traj_start > 0 else 0
                )
                sub_traj = (traj_start, traj_end)
                traj_with_reward.append((sub_traj, reward_sum, traj_idx))   

        # Highest predicted return first.
        traj_with_reward.sort(key=lambda x: x[1], reverse=True)

        pairs = []
        # NOTE(review): num_pairs is only referenced by the disabled code below.
        num_pairs = feedback_count_to_add
        # Disabled alternative pairing strategies, kept for reference.
        # stride = len(traj_with_reward) / (num_pairs * 2)

        # for i in range(num_pairs):
        #     idx1 = int(i * 2 * stride)
        #     idx2 = idx1 + 1
        #     if idx2 >= len(traj_with_reward):
        #         break

        #     traj1 = traj_with_reward[idx1][0]
        #     traj2 = traj_with_reward[idx2][0]
        #     pairs.append((traj1, traj2))

        # for i in range(num_pairs):
        #     idx1 = i
        #     idx2 = len(traj_with_reward) - i - 1

        #     traj1 = traj_with_reward[idx1][0]
        #     traj2 = traj_with_reward[idx2][0]
        #     pairs.append((traj1, traj2))

        

        # Greedily pair neighbours in the sorted list, starting from the top.
        # Neighbours with the same source id are not paired; the scan then
        # advances by one. Stops once feedback_count_to_add pairs are collected
        # or the list is exhausted.
        k = 0
        while k < len(traj_with_reward) - 1:
            traj1, _, idx1 = traj_with_reward[k]
            traj2, _, idx2 = traj_with_reward[k + 1]

            if idx1 != idx2:
                pairs.append((traj1, traj2))
                k += 2  
            else:
                k += 1  

            if len(pairs) >= feedback_count_to_add:
                break
        
        # Label the selected pairs using the ground-truth cumulative rewards.
        feedbacks = fill_feedback_from_raw_dataset(
            average_reward=average_reward,
            cumulative_rewards=cumulative_rewards,
            pairs=pairs,
        )

        # Tally the label distribution of this round for the summary CSV.
        count_0 = 0
        count_0_5 = 0
        count_1 = 0

        for _, _, mu in feedbacks:
            if mu == 0:
                count_0 += 1
            elif mu == 0.5:
                count_0_5 += 1
            elif mu == 1:
                count_1 += 1

        # Append one row per round; the header is written only when the file is new.
        write_header = not os.path.exists(csv_path)

        with open(csv_path, mode="a", newline="") as file:
            writer = csv.writer(file)

            if write_header:
                writer.writerow(["exp_name", "pair_name", "feedback_count", "mu=0", "mu=0.5", "mu=1"])

            writer.writerow([
                exp_name_with_suffix,
                current_pair_name,
                len(feedbacks),
                count_0,
                count_0_5,
                count_1
            ])
        
        # add expected feedbacks
        # For the idx-th selected pair, take the idx-th lowest predicted-return
        # segment and add two comparisons against the pair's members with mu=1.
        # These labels are assigned directly, not computed from ground-truth
        # rewards, and they are not part of the CSV counts written above.
        for idx, (s0, s1) in enumerate(pairs):
            low_traj = traj_with_reward[len(traj_with_reward) - 1 - idx]

            feedbacks.append(
                (low_traj[0], s0, 1)
            )
            feedbacks.append(
                (low_traj[0], s1, 1)
            )



        # The new pair set is the previous round's training pairs plus this
        # round's additions.
        previous_feedback = load_pair(
            env_name=env_name,
            exp_name=exp_name_with_suffix,
            pair_type="train",
            pair_algo=previous_pair_name,
        ).tolist()

        print(len(previous_feedback+feedbacks))

        save_feedbacks_npz(
            env_name=env_name,
            exp_name=exp_name_with_suffix,
            feedbacks=previous_feedback + feedbacks,
            pair_type="train",
            pair_name=current_pair_name
        )

        # Train 7 reward models (tags "00".."06") on the new pair set; they form
        # the ensemble that the next round loads. Reuses k as the loop variable.
        for k in range(7):
            train_reward_model(
                env_name=env_name,
                exp_name=exp_name_with_suffix,
                pair_algo=current_pair_name,
                reward_model_algo=reward_model_algo,
                reward_model_tag=f"{k:02d}",
                num_epoch=100,
        )
  
    

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import csv

# Evaluate how well each reward-model ensemble reproduces the ground-truth
# rewards: Pearson correlation (PCC) and pairwise order agreement, appended
# as one CSV row per (run, pair set).
#
# Dedicated names are used for the output location so that this cell does not
# overwrite the ``save_dir`` / ``csv_path`` globals that the feedback
# augmentation cell appends to. Otherwise re-running that cell afterwards
# would write its rows into the evaluation CSV.
dataset = load_dataset(env_name)

# Ground truth is identical for every configuration, so prepare it once.
true_rewards = np.array(dataset["rewards"]).flatten()

eval_save_dir = "reward_eval_stats"
os.makedirs(eval_save_dir, exist_ok=True)

eval_csv_path = os.path.join(eval_save_dir, "reward_eval_summary.csv")

# Create the file with a header only when it does not exist yet.
write_header = not os.path.exists(eval_csv_path)
if write_header:
    with open(eval_csv_path, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["env_name", "exp_name", "pair_algo", "pcc", "order_agreement"])

# Number of random index pairs used for the order-agreement estimate.
num_samples = 100000
# Fixed seed so the Monte-Carlo estimate is reproducible across runs.
order_rng = np.random.default_rng(0)

for i in range(10):
    exp_name_with_suffix = f"{exp_name}-{i:02d}"
    for j in range(10):
        # NOTE(review): the pair-set name is hard-coded and does not follow the
        # ``pair_algo`` setting, and the augmentation cell writes
        # "...-aug-high-XX" sets. Confirm "aug-similar" is the intended set.
        current_pair_algo = f"ternary-100-aug-similar-{j:02d}"
        predicted_rewards = predict_rewards(
            env_name=env_name,
            exp_name=exp_name_with_suffix,
            pair_algo=current_pair_algo,
            reward_model_algo=reward_model_algo,
        )
        predicted_rewards = np.array(predicted_rewards).flatten()

        # Scatter plot of predicted vs. true reward. The figure is closed
        # explicitly so up to 100 figures do not accumulate in memory.
        fig, ax = plt.subplots(figsize=(6, 6))
        ax.scatter(true_rewards, predicted_rewards, alpha=0.1, s=10, label="Samples")
        ax.set_xlabel("True Reward")
        ax.set_ylabel("Predicted Reward")
        ax.set_title(f"Predicted vs True Rewards ({exp_name_with_suffix} / {current_pair_algo})")
        ax.grid(True)
        fig.tight_layout()
        plt.show()
        plt.close(fig)

        try:
            pcc = np.corrcoef(true_rewards, predicted_rewards)[0, 1]
        except Exception as e:
            print(f"[{exp_name_with_suffix}] PCC 계산 중 오류 발생: {e}")
            pcc = float('nan')

        # Order agreement: fraction of random pairs of distinct samples whose
        # predicted ordering matches the true ordering (a tie counts only when
        # both differences are exactly zero).
        n = len(true_rewards)
        first_idx = order_rng.integers(0, n, size=num_samples)
        # Draw from n - 1 values and shift past first_idx: this yields a second
        # index that is uniform over all indices different from the first.
        second_idx = order_rng.integers(0, n - 1, size=num_samples)
        second_idx = second_idx + (second_idx >= first_idx)

        gt_diff = true_rewards[first_idx] - true_rewards[second_idx]
        pred_diff = predicted_rewards[first_idx] - predicted_rewards[second_idx]
        agree = int(
            np.count_nonzero(
                (gt_diff * pred_diff > 0) | ((gt_diff == 0) & (pred_diff == 0))
            )
        )

        order_agreement = agree / num_samples

        with open(eval_csv_path, mode="a", newline="") as file:
            writer = csv.writer(file)
            writer.writerow([
                env_name,
                exp_name_with_suffix,
                current_pair_algo,
                f"{pcc:.6f}",
                f"{order_agreement:.6f}",
            ])

        print(f"[{exp_name_with_suffix}]")
        print(f"  PCC (Pearson Correlation): {pcc:.4f}")
        print(f"  Order Agreement ({num_samples} pairs): {order_agreement:.4f}")
        print()