In [1]:
import os
import sys


# Make the local "src" directory importable for the project imports below.
# "__file__" is a plain string literal here, so os.path.dirname() yields ""
# and the appended entry is simply "<current working directory>/src".
sys.path.append(
    os.path.abspath(
        os.path.join(os.path.dirname("__file__"), "src"),
    )
)


In [2]:
import torch
import numpy as np


# --- Compute device ---------------------------------------------------------
# Expose only GPU index 7 to CUDA for this process, then use CUDA when it is
# available and fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Experiment identifiers -------------------------------------------------
# Alternatives for env_name: "button-press-topdown-v2", "box-close-v2"
env_name = "dial-turn-v2"
exp_name = "AESPA-22-test"
pair_algo = "ternary-500"
reward_model_algo = "MR-exp"

# --- Constants --------------------------------------------------------------
# NOTE(review): presumably the number of steps per trajectory segment - confirm.
TRAJECTORY_LENGTH = 25

In [3]:
from data_loading.load_data import load_dataset


dataset = load_dataset(env_name=env_name)

# Mean of all reward entries in the dataset.
average_reward = np.mean(dataset["rewards"])
# Running total of the rewards, accumulated in float64. For a 1-D reward array,
# entry i holds rewards[0] + ... + rewards[i]; get_reward_from_cumsum() reads it.
cumsum = np.cumsum(dataset["rewards"], dtype=np.float64)


def get_reward_from_cumsum(start, end):
    """
    Return the summed reward over the half-open index range [start, end).

    The value is read from the module-level ``cumsum`` prefix-sum array, i.e.
    it equals ``rewards[start] + ... + rewards[end - 1]``.

    Args:
        start: index of the first step included in the sum.
        end: index one past the last step included in the sum.

    Returns:
        The reward sum of the range; ``0.0`` for the empty range ``(0, 0)``.
    """
    # A boundary at index 0 has an empty prefix. Guarding both ends keeps
    # ``end == 0`` from turning into ``cumsum[-1]`` (the whole-dataset total).
    upper = cumsum[end - 1] if end > 0 else 0.0
    lower = cumsum[start - 1] if start > 0 else 0.0
    return upper - lower


In [4]:

import itertools

from tqdm import tqdm
from data_loading.load_data import load_pair
import matplotlib.pyplot as plt
import seaborn as sns


def augment_with_bucket(
    env_name,
    exp_name,
    result,
    k=20,
    select_ratio=0.005,
    z=10,
    unlabeled_pair_algo="ternary-10000",
):
    """
    Bucket candidate segments by predicted return, build cross-bucket pairs and
    plot two diagnostics about them.

    Steps:
      1. Build prefix sums of the per-step predicted mean and variance from
         ``result`` and score every segment found in the ``unlabeled_pair_algo``
         pair set with its predicted return and standard deviation.
      2. Sort the segments by predicted return and split them into ``k``
         buckets of (nearly) equal size.
      3. In every bucket keep the ``select_ratio`` fraction of segments with
         the lowest standard deviation (at least one). For every bucket pair
         ``(i, j)`` with ``j - i >= 3``, pair each kept segment of bucket ``i``
         with each kept segment of bucket ``j`` and label it ``1.0``. The label
         is compared with the one derived from the dataset rewards, and the
         accuracy per bucket pair is shown as a heatmap.
      4. Separately, keep the loaded pairs whose sigmoid of the predicted
         return difference is above 0.999 or below 0.001 and show, as a second
         heatmap, how many of them fall into each (bucket, bucket) cell.

    The function relies on the module-level ``get_reward_from_cumsum`` and
    ``average_reward`` for the dataset-reward labels.

    Args:
        env_name: forwarded to ``load_pair``.
        exp_name: forwarded to ``load_pair``.
        result: array-like converted with ``np.array``; column 1 is used as the
            per-step predicted reward mean and column 2 as its standard
            deviation (variances are summed over a segment).
            NOTE(review): presumably one row per dataset step - confirm.
        k: number of buckets.
        select_ratio: fraction of each bucket that is kept in step 3.
        z: currently unused; kept so existing callers keep working. The
            confidence-interval based filtering mentioned in the original
            description is not implemented here.
        unlabeled_pair_algo: name of the pair set to load (``pair_algo``
            argument of ``load_pair``). Defaults to the previously hard-coded
            ``"ternary-10000"``.

    Returns:
        None. Results are printed and plotted.

    Raises:
        ValueError: if fewer than ``k`` segments are available, because at
            least one bucket would then be empty.
    """

    def segment_sum(prefix, start, end):
        """Sum over [start, end) of the series whose prefix sums are ``prefix``."""
        return prefix[end - 1] - (prefix[start - 1] if start > 0 else 0)

    # ----------- 1. Basic setup: score every candidate segment -----------
    data = np.array(result)
    mean_cum = np.cumsum(data[:, 1], dtype=np.float64)
    var_cum = np.cumsum(data[:, 2] ** 2, dtype=np.float64)

    # Loaded once and materialized so that both the bucketing stage and the
    # confidence filter below can iterate over the same pairs.
    unlabeled_pairs = list(
        load_pair(
            env_name=env_name,
            exp_name=exp_name,
            pair_type="train",
            pair_algo=unlabeled_pair_algo,
        )
    )

    # One entry per segment occurrence: ((start, end), predicted return, std).
    trajs = []
    for pair in unlabeled_pairs:
        for s, e in (pair[0], pair[1]):
            predicted_return = segment_sum(mean_cum, s, e)
            predicted_std = np.sqrt(segment_sum(var_cum, s, e))
            trajs.append(((s, e), predicted_return, predicted_std))

    # ----------- 2. Split into buckets -----------
    trajs.sort(key=lambda t: t[1])  # ascending predicted return
    n = len(trajs)
    if n < k:
        raise ValueError(
            f"Need at least k={k} segments to fill every bucket, but only {n} were loaded"
        )
    buckets = [trajs[n * i // k : n * (i + 1) // k] for i in range(k)]
    bucket_ranges = [(bucket[0][1], bucket[-1][1]) for bucket in buckets]

    for i, (low, high) in enumerate(bucket_ranges):
        print(f"Bucket {i}: Range = {low:.2f} to {high:.2f}")

    # ----------- 3. Keep the lowest-std segments, pair across buckets -----------
    selected_per_bucket = []
    for bucket in buckets:
        num_select = max(1, int(len(bucket) * select_ratio))
        selected_per_bucket.append(sorted(bucket, key=lambda t: t[2])[:num_select])

    # Margin below which two true returns would be labeled 0.5. The 0.0 factor
    # makes it zero, so the 0.5 label is effectively disabled.
    tie_margin = average_reward * 0.0 * 25
    # Every generated pair is labeled 1.0: bucket j (> i) holds the segments
    # with the higher predicted return.
    predicted = 1.0

    feedbacks_bucket_1 = []
    success_rate_i_j = []

    # itertools.combinations yields i < j, so j - i is the bucket distance.
    for i, j in itertools.combinations(range(k), 2):
        if j - i < 3:
            continue
        correct = 0
        total = 0
        for (seg0, _, _), (seg1, _, _) in itertools.product(
            selected_per_bucket[i], selected_per_bucket[j]
        ):
            feedbacks_bucket_1.append((seg0, seg1, 1.0))

            rewards_sum_0 = get_reward_from_cumsum(seg0[0], seg0[1])
            rewards_sum_1 = get_reward_from_cumsum(seg1[0], seg1[1])

            # Label derived from the dataset rewards: 0.5 within the margin,
            # 0 when the first segment is better, 1 otherwise.
            if np.abs(rewards_sum_0 - rewards_sum_1) < tie_margin:
                mu_value = 0.5
            elif rewards_sum_0 > rewards_sum_1:
                mu_value = 0
            else:
                mu_value = 1

            if predicted == mu_value:
                correct += 1
            total += 1

        acc = correct / total if total > 0 else 0.0
        success_rate_i_j.append((i, j, acc))

    print(f"[bucket_1] Generated {len(feedbacks_bucket_1)} confident pairs")

    # Accuracy matrix; cells for skipped bucket pairs stay NaN.
    heat = np.full((k, k), np.nan)
    for i, j, acc in success_rate_i_j:
        heat[i, j] = acc

    # Unweighted mean of the per-bucket-pair accuracies.
    all_accs = [acc for _, _, acc in success_rate_i_j]
    overall_accuracy = sum(all_accs) / len(all_accs) if all_accs else 0.0
    print(f"Overall Pairwise Accuracy: {overall_accuracy:.4f}")

    plt.figure(figsize=(5, 4))
    sns.heatmap(heat, annot=True, fmt=".2f", cmap="YlGnBu", cbar=True,
                xticklabels=range(k), yticklabels=range(k), annot_kws={"size": 4})
    plt.title("Pairwise Accuracy Heatmap")
    plt.xlabel("Bucket j")
    plt.ylabel("Bucket i")
    plt.tight_layout()
    plt.show()

    # ----------- 4. Distribution of sigmoid-confident pairs over buckets -----------
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    def get_bucket_index(reward, bucket_ranges):
        """Index of the first bucket whose [low, high] range contains ``reward``."""
        for i, (low, high) in enumerate(bucket_ranges):
            if low <= reward <= high:
                return i
        return None  # reward lies outside every bucket range

    threshold = 0.999

    # Predicted returns (r0, r1) of the pairs that pass the confidence filter.
    conf_pairs = []
    for p in tqdm(unlabeled_pairs, desc="Filtering pairs using uncertainty"):
        (s0, e0), (s1, e1), _ = p

        r0 = segment_sum(mean_cum, s0, e0)
        r1 = segment_sum(mean_cum, s1, e1)

        mu = sigmoid(r1 - r0)

        # Keep the pair only when the preference is confident in either direction.
        if mu > threshold or mu < 1 - threshold:
            conf_pairs.append((r0, r1))

    bucket_matrix = np.zeros((k, k), dtype=int)
    for r0, r1 in conf_pairs:
        b0 = get_bucket_index(r0, bucket_ranges)
        b1 = get_bucket_index(r1, bucket_ranges)

        if b0 is not None and b1 is not None:
            bucket_matrix[b0, b1] += 1

    plt.figure(figsize=(6, 5))
    sns.heatmap(bucket_matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=[f"B{j}" for j in range(k)],
                yticklabels=[f"B{i}" for i in range(k)])
    plt.xlabel("Bucket j (r1)")
    plt.ylabel("Bucket i (r0)")
    plt.title("Confident Pair Distribution across Buckets")
    plt.tight_layout()
    plt.show()




In [5]:
from reward_learning.train_model import train_reward_model


def train_mr(
    env_name,
    exp_name,
    pair_algo="ternary-500",
    reward_model_algo="MR-exp",
    num_models=7,
    num_epoch=200,
):
    """
    Train several reward models that differ only in their tag.

    ``train_reward_model`` is called ``num_models`` times with identical
    settings; the i-th call uses the zero-padded tag ``"00"``, ``"01"``, ...

    Args:
        env_name: forwarded to ``train_reward_model``.
        exp_name: forwarded to ``train_reward_model``.
        pair_algo: pair set to train on (previously hard-coded ``"ternary-500"``).
        reward_model_algo: reward-model variant (previously hard-coded ``"MR-exp"``).
        num_models: how many models to train (previously hard-coded ``7``).
        num_epoch: epochs per model (previously hard-coded ``200``).
    """
    for model_idx in range(num_models):
        train_reward_model(
            env_name=env_name,
            exp_name=exp_name,
            pair_algo=pair_algo,
            reward_model_algo=reward_model_algo,
            reward_model_tag=f"{model_idx:02d}",
            num_epoch=num_epoch,
        )

In [6]:
from data_generation.data_research import calculate_from_mr, train_mr_and_surf


def main():
    """
    Run the bucket analysis for the configured environment and experiment.

    Reads the module-level ``env_name`` and ``exp_name``, obtains the reward
    estimates from ``calculate_from_mr`` and hands them to
    ``augment_with_bucket``.
    """
    # Reward-model training is left disabled here:
    # train_mr(env_name, exp_name)
    mr_estimates = calculate_from_mr(env_name, exp_name)

    bucket_settings = dict(k=20, select_ratio=0.01, z=10)
    augment_with_bucket(
        env_name=env_name,
        exp_name=exp_name,
        result=mr_estimates,
        **bucket_settings,
    )

In [7]:
# Run the analysis. augment_with_bucket() loads the "ternary-10000" training
# pair set by default, so that pair file must exist before this cell is executed.
main()

FileNotFoundError: Pair file not found at pair/dial-turn-v2/AESPA-22-test/train/ternary-10000.npz