In [1]:
import os
import sys


# Make the local ``src`` directory importable.
# NOTE: "__file__" is a *string literal* here, so os.path.dirname() returns ""
# and the path resolves to ``<current working directory>/src``.
SRC_DIR = os.path.abspath(os.path.join(os.path.dirname("__file__"), "src"))
# Guard the append so that re-running this cell does not keep growing sys.path.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


In [2]:
import torch
import numpy as np

# Number stored as the trajectory length (not referenced by the visible cells).
TRAJECTORY_LENGTH = 25

# Environments iterated over by the evaluation sweep further down.
env_list = [
    "button-press-topdown-v2",
    "box-close-v2",
    "dial-turn-v2",
    "sweep-v2",
    "button-press-topdown-wall-v2",
    "sweep-into-v2",
    "drawer-open-v2",
    "lever-pull-v2",
]

# Identifiers for the pair set and the reward model.
pair_algo = "ternary-500"
reward_model_algo = "MR-linear"

# Only GPU 0 is made visible to CUDA; the device itself is pinned to the CPU.
# To pick a GPU when one is available, use instead:
#   torch.device("cuda" if torch.cuda.is_available() else "cpu")
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cpu")

In [3]:
from data_generation.data_research import predict_rewards
from data_loading.load_data import load_dataset


def predict_only_rewards(
    env_name,
    exp_name,
    pair_algo,
    reward_model_algo,
):
    """Run ``predict_rewards`` and keep only the second field of every row.

    Args:
        env_name: Forwarded to ``predict_rewards``.
        exp_name: Forwarded to ``predict_rewards``.
        pair_algo: Forwarded to ``predict_rewards``.
        reward_model_algo: Forwarded to ``predict_rewards``.

    Returns:
        np.ndarray built from the second element of each 4-element row
        returned by ``predict_rewards`` (presumably the predicted reward --
        confirm against ``predict_rewards``).
    """
    predictions = predict_rewards(
        env_name=env_name,
        exp_name=exp_name,
        pair_algo=pair_algo,
        reward_model_algo=reward_model_algo,
    )

    # Each row must unpack into exactly four fields; only the second is kept.
    rewards = []
    for _, reward, _, _ in predictions:
        rewards.append(reward)

    return np.array(rewards)


def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula evaluates ``exp(-x)``, which overflows (RuntimeWarning)
    for large negative ``x``. Here the exponential is only taken of
    ``-|x|``, so it always lies in ``[0, 1]``.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an ndarray of the same
        shape as ``x``.
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))  # in [0, 1], can underflow to 0 but never overflow
    # For x >= 0 this is the textbook expression; for x < 0 the algebraically
    # equal form exp(x) / (1 + exp(x)) is used.
    out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # higher-dimensional arrays untouched.
    return out[()]

def get_total_reward(s, e, reward_cumsum):
    """Sum of the per-step values over the half-open index range ``[s, e)``.

    Args:
        s: Start index (inclusive).
        e: End index (exclusive).
        reward_cumsum: Inclusive cumulative sum of the per-step values, i.e.
            ``reward_cumsum[k]`` is the sum of steps ``0..k``.

    Returns:
        ``reward_cumsum[e - 1] - reward_cumsum[s - 1]``, where a term whose
        index would be ``-1`` (or lower) is taken as 0.
    """
    # Without the guard, e == 0 would index position -1, which wraps around to
    # the last element and returns the total of the whole array.
    upper = reward_cumsum[e - 1] if e > 0 else 0
    lower = reward_cumsum[s - 1] if s > 0 else 0
    return upper - lower

In [4]:
import numpy as np
import matplotlib.pyplot as plt
from data_loading.load_data import load_pair

def eval_feedbacks(
    feedbacks,
    env_name,
    unlabel_pair="unlabel-100000",
    exp_name="CUDA-01-00",
    bins=25,
    reward_max=250,
):
    """Score a set of labelled segment pairs against the dataset rewards.

    Two numbers are computed:

    * accuracy -- the fraction of pairs with ``mu != 0.5`` whose label agrees
      with the summed ``dataset["rewards"]`` of the two segments (``mu == 1.0``
      is correct when the second segment has the larger sum, ``mu == 0.0``
      when the first has).
    * KL divergence -- between the 2-D histogram of (return, return) over the
      labelled pairs (each pair is counted in both orders) and the outer
      product of the 1-D return histogram of the segments found in the
      unlabelled pair set.

    Args:
        feedbacks: Iterable of ``((s0, e0), (s1, e1), mu)``.
        env_name: Passed to ``load_pair`` and ``load_dataset``.
        unlabel_pair: ``pair_algo`` of the pair set used as reference.
        exp_name: ``exp_name`` the reference pair set is loaded from.
            Defaults to the value that used to be hard-coded.
        bins: Number of histogram bins per axis.
        reward_max: Upper edge of the histogram range ``[0, reward_max]``.
            Returns outside that range are not counted by the histograms.

    Returns:
        ``(kl_divergence, accuracy)``. ``accuracy`` is 0.0 when no pair has
        ``mu != 0.5``. ``kl_divergence`` is NaN when no reference return falls
        inside the histogram range (the reference histogram is then 0 / 0).
    """
    unlabel_pairs = load_pair(
        env_name=env_name,
        exp_name=exp_name,
        pair_type="train",
        pair_algo=unlabel_pair,
    )

    dataset = load_dataset(env_name=env_name)
    cumsum = np.cumsum(dataset["rewards"], dtype=np.float64)

    # Returns of every segment appearing in the reference (unlabelled) pairs.
    raw_traj_truth_rewards = []
    for (s0, e0), (s1, e1), _ in unlabel_pairs:
        raw_traj_truth_rewards.append(get_total_reward(s0, e0, cumsum))
        raw_traj_truth_rewards.append(get_total_reward(s1, e1, cumsum))

    traj_truth_rewards = []
    total = correct = 0
    for (s0, e0), (s1, e1), mu in feedbacks:
        r0 = get_total_reward(s0, e0, cumsum)
        r1 = get_total_reward(s1, e1, cumsum)
        if mu != 0.5:
            total += 1
            if (mu == 1.0 and r0 < r1) or (mu == 0.0 and r0 > r1):
                correct += 1
        # Both orders are stored, which makes the observed histogram symmetric.
        traj_truth_rewards.extend([(r0, r1), (r1, r0)])

    accuracy = correct / total if total > 0 else 0.0

    bin_edges = np.linspace(0, reward_max, bins + 1)
    r0_list = [r0 for r0, _ in traj_truth_rewards]
    r1_list = [r1 for _, r1 in traj_truth_rewards]
    observed, _, _ = np.histogram2d(r0_list, r1_list, bins=[bin_edges, bin_edges])
    observed = observed.T

    # Reference distribution: two segments drawn independently from the
    # marginal return histogram, scaled to the observed count.
    traj_hist, _ = np.histogram(raw_traj_truth_rewards, bins=bin_edges)
    traj_prob = traj_hist / np.sum(traj_hist)
    expected = np.outer(traj_prob, traj_prob) * np.sum(observed)

    # eps keeps the normalisation and the log ratio finite for empty cells.
    eps = 1e-8
    P = observed / (np.sum(observed) + eps)
    Q = expected / (np.sum(expected) + eps)
    mask = P > 0  # cells with P == 0 contribute nothing to the sum
    kl_divergence = np.sum(P[mask] * np.log(P[mask] / (Q[mask] + eps)))

    return kl_divergence, accuracy

In [5]:
import random
from tqdm import tqdm


def extract_feedbacks_without_buckets(
    env_name,
    exp_name,
    result,
    label_pair_algo="ternary-500",
    unlabel_pair_algo="unlabel-100000",
    new_pair_name="aug-bucket",
    n=10000,
    m=10000,
    z=3.1,
    threshold=0.99,
    use_conf=False,
):
    """Randomly sample segment pairs and keep those with a confident ordering.

    Segments are taken from the unlabelled pair set (both members of every
    pair, truncated to the first ``n``). Each segment gets a summed mean and a
    standard deviation ``sqrt(sum of per-step variances)`` from ``result``.
    Distinct pairs are drawn at random, ordered so that the first segment has
    the lower summed mean, and accepted when:

    * ``use_conf=False``: ``r0 + z * std0 < r1 - z * std1``;
    * ``use_conf=True``: ``sigmoid(r1 - r0) > threshold`` (label 1.0) or
      ``1 - sigmoid(r1 - r0) > threshold`` (label 0.0).

    Args:
        env_name: Passed to ``load_pair``.
        exp_name: Passed to ``load_pair``.
        result: 2-D array-like; column 1 is read as the per-step mean and
            column 2 as the per-step standard deviation.
        label_pair_algo: Accepted for interface compatibility; not used here.
        unlabel_pair_algo: ``pair_algo`` of the unlabelled pair set to load.
        new_pair_name: Accepted for interface compatibility; not used here.
        n: Maximum number of segments to consider.
        m: Number of feedback pairs to collect.
        z: Width multiplier of the interval test (``use_conf=False``).
        threshold: Probability threshold (``use_conf=True``).
        use_conf: Selects the acceptance test, see above.

    Returns:
        List of ``(segment0, segment1, label)``. It holds ``m`` entries unless
        every unique pair was tried first, in which case it is shorter and a
        ``RuntimeWarning`` is issued.
    """
    import warnings

    unlabeled_feedbacks = load_pair(
        env_name=env_name,
        exp_name=exp_name,
        pair_type="train",
        pair_algo=unlabel_pair_algo,
    )

    data = np.array(result)
    mean = data[:, 1]
    std = data[:, 2]
    mean_cum = np.cumsum(mean, dtype=np.float64)
    var_cum = np.cumsum(std**2, dtype=np.float64)

    # Build the segment list: both members of every unlabelled pair.
    trajectories = []
    for p in unlabeled_feedbacks:
        trajectories.append(p[0])
        trajectories.append(p[1])
    trajectories = trajectories[:n]

    # (segment, summed mean, std of the sum). Per-step variances are simply
    # added, i.e. the steps are treated as uncorrelated.
    traj_data = [
        (
            (s, e),
            get_total_reward(s, e, mean_cum),
            np.sqrt(get_total_reward(s, e, var_cum)),
        )
        for s, e in trajectories
    ]

    total = len(traj_data)
    # Number of distinct unordered pairs. Once all of them have been tried no
    # further progress is possible, so the loop must stop (it used to spin
    # forever in that situation). This is 0 for fewer than two segments.
    max_unique_pairs = total * (total - 1) // 2

    seen_pairs = set()
    feedbacks = []

    pbar = tqdm(total=m, desc="Sampling confident feedbacks")
    try:
        while len(feedbacks) < m and len(seen_pairs) < max_unique_pairs:
            # random.sample draws without replacement, so i != j always holds.
            i, j = random.sample(range(total), 2)

            pair_key = (i, j) if i < j else (j, i)
            if pair_key in seen_pairs:
                continue
            seen_pairs.add(pair_key)

            t0, t1 = traj_data[i], traj_data[j]
            # Order the pair so that t0 has the lower summed mean.
            if t0[1] > t1[1]:
                t0, t1 = t1, t0

            seg0, r0, std0 = t0
            seg1, r1, std1 = t1

            if use_conf:
                mu = sigmoid(r1 - r0)
                if mu > threshold:
                    label = 1.0
                elif 1 - mu > threshold:
                    label = 0.0
                else:
                    continue
            else:
                # Accept only when the two z-sigma intervals do not overlap.
                if r0 + z * std0 < r1 - z * std1:
                    label = 1.0
                else:
                    continue

            feedbacks.append((seg0, seg1, label))
            pbar.update(1)
    finally:
        # Close the bar even if sampling is interrupted.
        pbar.close()

    if len(feedbacks) < m:
        warnings.warn(
            f"Only {len(feedbacks)} of the requested {m} feedbacks passed the "
            f"confidence test after trying all {max_unique_pairs} unique pairs.",
            RuntimeWarning,
        )

    return feedbacks

In [None]:
import csv

z_list = [1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5, 2.75, 3.0, 4.0, 4.5, 5.0, 7.0, 8.5, 10.0, 12.5, 15.0, 17.5, 20.0]
mu_list = [0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.975, 0.99, 0.999, 0.9999, 0.99999]

results = []

csv_path = "feedback_eval_results.csv"

# Write the header row only when the CSV file does not exist yet; every result
# row below is appended, so rows from earlier runs are kept.
if not os.path.exists(csv_path):
    with open(csv_path, mode='w', newline='') as f:
        csv.writer(f).writerow(["env", "is_z", "z", "kl", "acc"])

for env in env_list:
    # Reward predictions for the ten experiment runs of this environment.
    # NOTE(review): the reward model is hard-coded to "MR-exp" here, while the
    # config cell sets reward_model_algo = "MR-linear" -- confirm which is meant.
    result_list = []
    for i in range(10):
        exp_name = f"CUDA-01-{i:02d}"
        result_list.append(
            predict_rewards(
                env, exp_name, pair_algo="ternary-500", reward_model_algo="MR-exp"
            )
        )

    # One entry per setting: (is_z, print label, swept value, z, threshold).
    # The z sweep runs first (interval test), then the mu sweep (sigmoid test).
    sweep = [(True, "Z", z_value, z_value, 0.999) for z_value in z_list]
    sweep += [(False, "Mu", mu_value, 3.1, mu_value) for mu_value in mu_list]

    for is_z, label, value, z_arg, threshold_arg in sweep:
        feedbacks = []
        for i in range(10):
            exp_name = f"CUDA-01-{i:02d}"
            feedback = extract_feedbacks_without_buckets(
                env_name=env,
                exp_name=exp_name,
                result=result_list[i],
                label_pair_algo="ternary-500",
                unlabel_pair_algo="unlabel-100000",
                new_pair_name="aug-bucket",
                n=10000,
                m=10000,
                z=z_arg,
                threshold=threshold_arg,
                use_conf=not is_z,
            )
            feedbacks.extend(feedback)

        kl, acc = eval_feedbacks(feedbacks, env_name=env, unlabel_pair="unlabel-100000")

        with open(csv_path, mode='a', newline='') as f:
            csv.writer(f).writerow([env, is_z, value, round(kl, 4), round(acc, 4)])

        print(f"Env: {env}, {label}: {value}, KL: {kl:.4f}, Acc: {acc:.4f}")

obs_dim: 39 act_dim: 4
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_00.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_02.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_01.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_06.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_03.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_05.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-00/reward/ternary-500/MR-exp_04.pth
obs_dim: 39 act_dim: 4
Model loaded from model/button-press-topdown-v2/CUDA-01-01/reward/ternary-500/MR-exp_00.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-01/reward/ternary-500/MR-exp_02.pth
Model loaded from model/button-press-topdown-v2/CUDA-01-01/reward/ternary-500/MR-exp_01.pth
Model loaded from model/button-pre