In [1]:
import sys
import os
import math
import pandas as pd
# Set CUDA_VISIBLE_DEVICES to "0" for this kernel process, so CUDA-based
# libraries loaded afterwards only see the first GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
# Put the directory two levels up at the front of sys.path so the
# `scale_rl` package can be imported from this notebook's location.
sys.path.insert(0, '../../')
# NOTE(review): star import. Names used below that are never defined or
# imported explicitly in this notebook (np, read_and_concatenate_eval_df,
# normalize_score_with_random_and_base_score, generate_metric_matrix_dict)
# presumably come from here -- confirm and consider importing them by name.
from scale_rl.common.wandb_utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from rliable import library as rly
from rliable import metrics as rly_metrics
from rliable import plot_utils as rly_plot_utils

def aggregate_func(x):
    """Return the rliable IQM, median and mean of ``x`` as one NumPy array.

    The element order (IQM, median, mean) is relied on by the table code,
    which labels the aggregate rows in that same order.
    """
    iqm_score = rly_metrics.aggregate_iqm(x)
    median_score = rly_metrics.aggregate_median(x)
    mean_score = rly_metrics.aggregate_mean(x)
    return np.array([iqm_score, median_score, mean_score])

#### Collect Results

#### Set Experiment Name

In [4]:
# Load the evaluation results of the design-study (ablation) runs and of the
# runs stored under the parent `hypersimba` results directory.
abl_df = read_and_concatenate_eval_df(directory_path='../../results/hypersimba/design_study')
base_df = read_and_concatenate_eval_df(directory_path='../../results/hypersimba')

# Stack both frames into one table. No ignore_index is passed, so each frame
# keeps its own row labels and labels may repeat in the combined frame.
eval_df = pd.concat([abl_df, base_df])
# Show which experiment names were loaded before they are relabelled below.
print(eval_df['exp_name'].unique())
eval_df

['k_scaler_scale_1' 'e_mse_loss' 'g_no_rew_bound' 'j_scaler_init_1'
 'f_no_rew_normalize' 'i_no_lr_decay' 'b_no_shift' 'h_hard_target_copy'
 'c_shift_1' 'l_alpha_init_05' 'a_no_input_norm' 'm_alpha_scale_1'
 'd_resize_proj' 'hypersimba']


Unnamed: 0,exp_name,env_name,seed,metric,env_step,value
0,k_scaler_scale_1,h1-pole-v0,9000,avg_return,0.0,35.859411
1,k_scaler_scale_1,h1-pole-v0,9000,avg_return,100000.0,142.686500
2,k_scaler_scale_1,h1-pole-v0,9000,avg_return,200000.0,547.310449
3,k_scaler_scale_1,h1-pole-v0,9000,avg_return,300000.0,533.528960
4,k_scaler_scale_1,h1-pole-v0,9000,avg_return,400000.0,674.806596
...,...,...,...,...,...,...
13535,hypersimba,HalfCheetah-v4,0,avg_success,800000.0,0.000000
13536,hypersimba,HalfCheetah-v4,0,avg_success,850000.0,0.000000
13537,hypersimba,HalfCheetah-v4,0,avg_success,900000.0,0.000000
13538,hypersimba,HalfCheetah-v4,0,avg_success,950000.0,0.000000


In [5]:
# Maps the raw `exp_name` values found in the result files to the display
# labels used as table column headers. Labels wrapped in `$...$` are LaTeX
# math; backslashes are doubled so the string holds a single `\`.
design_choices = {
    'hypersimba': 'HyperSimba',
    'a_no_input_norm': 'No L2 Normalize',
    'b_no_shift': 'No Shifting',
    'c_shift_1': '$c_{shift}: 1 $',
    'd_resize_proj': 'Resize Projection',
    'e_mse_loss': 'MSE Loss',
    'f_no_rew_normalize': 'No Reward Scaling',
    'g_no_rew_bound': 'No Return Bounding',
    'h_hard_target_copy': 'Hard Target',
    'i_no_lr_decay': 'No LR Decay',
    'j_scaler_init_1': '$s_{init}: 1$',
    'k_scaler_scale_1': '$s_{scale}: 1$',
    'l_alpha_init_05': '$\\alpha_{init}: 0.5$',
    'm_alpha_scale_1': '$\\alpha_{scale}: 1$',
}

In [6]:
# Relabel raw experiment names with their display labels.
# Names that are not keys of `design_choices` are kept as they are instead of
# becoming NaN (which is what `.map(design_choices)` does). This makes the
# cell safe to re-run: on a second run the column already holds display
# labels, and a plain dict map would wipe every one of them to NaN.
eval_df['exp_name'] = eval_df['exp_name'].map(
    lambda name: design_choices.get(name, name)
)
eval_df['exp_name'].unique()

array(['$s_{scale}: 1$', 'MSE Loss', 'No Return Bounding',
       '$s_{init}: 1$', 'No Reward Scaling', 'No LR Decay', 'No Shifting',
       'Hard Target', '$c_{shift}: 1 $', '$\\alpha_{init}: 0.5$',
       'No L2 Normalize', '$\\alpha_{scale}: 1$', 'Resize Projection',
       'HyperSimba'], dtype=object)

#### Set Environment Name

In [9]:
from scale_rl.envs.mujoco import MUJOCO_ALL, MUJOCO_RANDOM_SCORE, MUJOCO_TD3_SCORE
from scale_rl.envs.dmc import DMC_EASY_MEDIUM, DMC_HARD
from scale_rl.envs.humanoid_bench import HB_LOCOMOTION_NOHAND, HB_RANDOM_SCORE, HB_SUCCESS_SCORE
from scale_rl.envs.myosuite import MYOSUITE_TASKS

In [10]:
def replace_underbar_to_hypen(env_name_list):
    """Return a new list of names with every '_' replaced by '-'.

    The input is left untouched. The lists passed in by this notebook are
    module-level constants imported from ``scale_rl.envs.*``; rewriting them
    in place would silently change them for every other user of those
    modules in the same kernel.

    Parameters:
    - env_name_list (iterable of str): Names to convert.

    Returns:
    - list of str: Converted names, in the same order as the input.
    """
    return [env_name.replace('_', '-') for env_name in env_name_list]

def replace_underbar_in_dict_keys(old_dict):
    """Build a copy of ``old_dict`` whose keys have '_' replaced by '-'.

    Values are carried over unchanged and the input dict is not modified.
    If two keys collapse to the same hyphenated key, the one that comes
    later in iteration order wins.
    """
    return {key.replace('_', '-'): value for key, value in old_dict.items()}

# Convert the imported environment-name lists to hyphenated names.
# DMC_EM is the notebook's name for the converted DMC_EASY_MEDIUM list.
MUJOCO_ALL = replace_underbar_to_hypen(MUJOCO_ALL)
DMC_EM = replace_underbar_to_hypen(DMC_EASY_MEDIUM)
DMC_HARD = replace_underbar_to_hypen(DMC_HARD)
MYOSUITE_TASKS = replace_underbar_to_hypen(MYOSUITE_TASKS)
HB_LOCOMOTION_NOHAND = replace_underbar_to_hypen(HB_LOCOMOTION_NOHAND)

# Same conversion for the keys of the reference-score dictionaries that are
# later passed to the score normalization.
MUJOCO_RANDOM_SCORE = replace_underbar_in_dict_keys(MUJOCO_RANDOM_SCORE)
MUJOCO_TD3_SCORE = replace_underbar_in_dict_keys(MUJOCO_TD3_SCORE)
HB_RANDOM_SCORE = replace_underbar_in_dict_keys(HB_RANDOM_SCORE)
HB_SUCCESS_SCORE = replace_underbar_in_dict_keys(HB_SUCCESS_SCORE)

# Apply the same '_' -> '-' rule to the results table so its env_name column
# can be matched against the lists and dict keys above.
eval_df['env_name'] = eval_df['env_name'].str.replace('_', '-')
eval_df

Unnamed: 0,exp_name,env_name,seed,metric,env_step,value
0,$s_{scale}: 1$,h1-pole-v0,9000,avg_return,0.0,35.859411
1,$s_{scale}: 1$,h1-pole-v0,9000,avg_return,100000.0,142.686500
2,$s_{scale}: 1$,h1-pole-v0,9000,avg_return,200000.0,547.310449
3,$s_{scale}: 1$,h1-pole-v0,9000,avg_return,300000.0,533.528960
4,$s_{scale}: 1$,h1-pole-v0,9000,avg_return,400000.0,674.806596
...,...,...,...,...,...,...
13535,HyperSimba,HalfCheetah-v4,0,avg_success,800000.0,0.000000
13536,HyperSimba,HalfCheetah-v4,0,avg_success,850000.0,0.000000
13537,HyperSimba,HalfCheetah-v4,0,avg_success,900000.0,0.000000
13538,HyperSimba,HalfCheetah-v4,0,avg_success,950000.0,0.000000


### Generate Table

In [22]:
def _percent_diff(value, base):
    """Return the signed percent change of ``value`` relative to ``base``.

    The change is divided by ``abs(base)`` so that "higher is better" keeps
    its sign even when the baseline is negative. Returns ``None`` when the
    change is undefined: ``base`` is missing, NaN or zero, or ``value`` is NaN.
    """
    if base is None:
        return None
    base = float(base)
    value = float(value)
    if math.isnan(base) or math.isnan(value) or base == 0.0:
        return None
    return 100.0 * (value - base) / abs(base)


def _format_cell(center, low, high, diff, cell_color_func):
    """Render one LaTeX table cell: optional cell color, value, gray CI.

    ``diff`` is the percent difference handed to ``cell_color_func``; when it
    is ``None`` (undefined) the cell is left uncolored.
    """
    color = cell_color_func(diff) if diff is not None else ""
    # Only prepend \cellcolor if a non-empty color name was chosen.
    prefix = f"\\cellcolor{{{color}}}" if color else ""
    return f"{prefix}{center} \\textcolor{{gray}}{{[{low}, {high}]}}\n"


def collect_full_results_per_env(
    eval_df,
    aggregate_func,
    cell_color_func,
    base_exp_name,
    exp_names,
    env_type: str,
    env_steps: float = 1e6,
    metric: str = "avg_return",
):
    """
    Collects and aggregates results per environment for given experiments.

    The returned table has one row per environment (mean with a normal-
    approximation 95% interval, 1.96 * std / sqrt(num_seeds)) followed by
    three aggregate rows ("IQM", "Median", "Mean") computed on scores that
    are normalized according to ``env_type``. Every cell is a LaTeX snippet;
    cells are colored by their percent difference to ``base_exp_name``.

    Parameters:
    - eval_df (pd.DataFrame): Evaluation DataFrame with the columns
      ``exp_name``, ``env_name``, ``seed``, ``metric``, ``env_step``, ``value``.
      It is not modified.
    - aggregate_func (callable): Function to aggregate metrics; passed to
      ``rly.get_interval_estimates``. Its outputs are read in the order
      IQM, median, mean.
    - cell_color_func (callable): Maps a percent difference to a LaTeX color
      name, or to an empty string for "no color".
    - base_exp_name (str): Experiment that all others are compared against.
    - exp_names (list of str): List of experiment names (table columns).
    - env_type (str): One of "MUJOCO_ALL", "DMC_EM", "DMC_HARD",
      "MYOSUITE_TASKS", "HB_LOCOMOTION_NOHAND"; selects the normalization
      used for the aggregate rows.
    - env_steps (float): Environment step to filter on (exact match against
      the ``env_step`` column).
    - metric (str): Metric to evaluate.

    Returns:
    - pd.DataFrame: Aggregated results suitable for LaTeX table generation.

    Raises:
    - ValueError: If ``env_type`` is not one of the supported values.
    """
    # Filter the DataFrame based on metric, env_steps and experiments.
    # .copy() gives this function its own frame, so the in-place rescaling
    # further down never writes into a slice of the caller's DataFrame.
    filtered_df = eval_df[
        (eval_df["metric"] == metric)
        & (eval_df["env_step"] == env_steps)
        & (eval_df["exp_name"].isin(exp_names))
    ].copy()

    # Display experiment names and number of unique seeds
    for exp_name in exp_names:
        exp_data = filtered_df[filtered_df["exp_name"] == exp_name]
        num_seeds = exp_data["seed"].nunique()
        print(f"exp_name: {exp_name} - num_seeds: {num_seeds}")

    is_success_metric = metric == 'avg_success'

    # Rows are collected in a list and turned into a DataFrame once at the
    # end, instead of growing a DataFrame with pd.concat on every iteration.
    rows = []

    ############################################
    # per environment score
    for env in sorted(filtered_df["env_name"].unique()):
        env_data = filtered_df[filtered_df["env_name"] == env]
        base_data = env_data[env_data["exp_name"] == base_exp_name]
        # NaN when the base experiment has no data for this environment.
        base_mean = base_data["value"].mean()
        if is_success_metric:
            base_mean = round(base_mean, 3)
        row = {"Task": f"\\texttt{{{env}}}"}

        for exp_name in exp_names:
            exp_data = env_data[env_data["exp_name"] == exp_name]
            if exp_data.empty:
                row[exp_name] = "N/A"
                continue

            mean = float(exp_data['value'].mean())
            std_dev = float(exp_data['value'].std())
            num_seeds = exp_data["seed"].nunique()

            # Confidence interval. With a single sample the standard
            # deviation is NaN; the interval then collapses onto the mean
            # instead of crashing in int(nan) below.
            if math.isnan(std_dev) or num_seeds == 0:
                half_width = 0.0
            else:
                half_width = 1.960 * std_dev / math.sqrt(num_seeds)
            low_CI = mean - half_width
            high_CI = mean + half_width

            if is_success_metric:
                mean = round(mean, 3)
                low_CI = round(low_CI, 3)
                high_CI = round(high_CI, 3)
            else:
                # Returns are shown as truncated integers.
                mean, low_CI, high_CI = int(mean), int(low_CI), int(high_CI)

            # Percent difference to the base experiment, relative to
            # abs(base_mean) -- the same convention as the aggregate rows.
            diff = _percent_diff(mean, base_mean)
            row[exp_name] = _format_cell(
                mean, low_CI, high_CI, diff, cell_color_func
            )

        rows.append(row)

    ############################################
    # aggregated score
    if env_type == "MUJOCO_ALL":
        # Apply MUJOCO-specific normalization
        # Assuming normalize_score_with_random_and_base_score returns a DataFrame
        filtered_df = normalize_score_with_random_and_base_score(
            filtered_df,
            MUJOCO_RANDOM_SCORE,
            MUJOCO_TD3_SCORE,
        )
    elif env_type in ["DMC_EM", "DMC_HARD"]:
        filtered_df['value'] /= 1000.0
    elif env_type == "MYOSUITE_TASKS":
        pass
    elif env_type == "HB_LOCOMOTION_NOHAND":
        # Apply HB-specific normalization
        filtered_df = normalize_score_with_random_and_base_score(
            filtered_df,
            HB_RANDOM_SCORE,
            HB_SUCCESS_SCORE,
        )
    else:
        raise ValueError(
            f"Unknown env_type {env_type!r}; expected one of 'MUJOCO_ALL', "
            "'DMC_EM', 'DMC_HARD', 'MYOSUITE_TASKS', 'HB_LOCOMOTION_NOHAND'"
        )

    # Generate metric matrix and aggregate scores
    metric_matrix_dict = generate_metric_matrix_dict(
        filtered_df,
        env_step=env_steps,
        metric_type=metric,
    )

    aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(
        metric_matrix_dict, aggregate_func, reps=10000
    )

    for idx, agg in enumerate(["IQM", "Median", "Mean"]):
        row = {"Task": agg}

        # Base experiment's aggregate (None if it is missing, in which case
        # the cells of this row are left uncolored).
        if base_exp_name in aggregate_scores:
            base_val = aggregate_scores[base_exp_name][idx]
        else:
            base_val = None

        for exp_name in exp_names:
            if exp_name not in aggregate_scores:
                row[exp_name] = "N/A"
                continue

            mean = round(aggregate_scores[exp_name][idx], 3)
            # Interval estimates are indexed [0] = lower, [1] = upper.
            low_CI = round(aggregate_score_cis[exp_name][0][idx], 3)
            high_CI = round(aggregate_score_cis[exp_name][1][idx], 3)

            # Percent difference from base for aggregates
            diff = _percent_diff(mean, base_val)
            row[exp_name] = _format_cell(
                mean, low_CI, high_CI, diff, cell_color_func
            )

        rows.append(row)

    return pd.DataFrame(rows)

In [23]:
def get_cell_color(diff):
    """Map a percent difference to the LaTeX color name used for a table cell.

    Buckets (``diff`` in percent, relative to the base experiment):
    - ``diff >= 5``        -> 'ab_better'
    - ``2 <= diff < 5``    -> 'ab_good'
    - ``-2 < diff < 2``    -> '' (no color)
    - ``-5 <= diff <= -2`` -> 'ab_bad'
    - ``-10 <= diff < -5`` -> 'ab_worse'
    - ``diff < -10``       -> 'ab_worst'

    A NaN difference (e.g. the baseline is missing or zero) gets no color.
    Without this guard NaN fails every comparison and would fall through to
    'ab_worst'.
    """
    # NaN is the only value that is not equal to itself.
    if diff != diff:
        return ''
    if diff >= 5:
        return 'ab_better'
    if diff >= 2:
        return 'ab_good'
    if diff > -2:
        return ''
    if diff >= -5:
        return 'ab_bad'
    if diff >= -10:
        return 'ab_worse'
    return 'ab_worst'

In [24]:
# Column groups for the ablation tables. Each list starts with the
# 'HyperSimba' baseline followed by the variants shown next to it. Entries
# must match the display labels in `design_choices` exactly (including the
# trailing space inside '$c_{shift}: 1 $').
input_exp_names = ['HyperSimba', 'No L2 Normalize', 'No Shifting', '$c_{shift}: 1 $', 'Resize Projection']
output_exp_names = ['HyperSimba', 'MSE Loss', 'No Reward Scaling', 'No Return Bounding', 'Hard Target']
training_exp_names = ['HyperSimba', 'No LR Decay', '$s_{init}: 1$', '$s_{scale}: 1$', '$\\alpha_{init}: 0.5$', '$\\alpha_{scale}: 1$']

In [45]:
# Selection for the table generated below. `cur_domain` (the list of
# environment names) and `cur_env_type` (the string that picks the score
# normalization in collect_full_results_per_env) must refer to the same
# benchmark -- change them together.
cur_domain = HB_LOCOMOTION_NOHAND # MUJOCO_ALL, DMC_EM, DMC_HARD, MYOSUITE_TASKS, HB_LOCOMOTION_NOHAND
cur_env_type = 'HB_LOCOMOTION_NOHAND'
cur_exp_names = training_exp_names # input_exp_names, output_exp_names, training_exp_names

In [46]:
# Keep only the rows whose environment belongs to the selected domain.
cur_eval_df = eval_df[eval_df['env_name'].isin(cur_domain)]
# Build the per-environment and aggregate table for the selected experiment
# group, comparing every column against 'HyperSimba' at env_step == 1e6.
cur_full_results_df = collect_full_results_per_env(
    cur_eval_df,
    base_exp_name='HyperSimba',
    cell_color_func=get_cell_color,
    exp_names=cur_exp_names,
    env_type=cur_env_type,
    aggregate_func=aggregate_func,
    env_steps=1e6,
    metric='avg_return',
)

exp_name: HyperSimba - num_seeds: 10
exp_name: No LR Decay - num_seeds: 4
exp_name: $s_{init}: 1$ - num_seeds: 10
exp_name: $s_{scale}: 1$ - num_seeds: 10
exp_name: $\alpha_{init}: 0.5$ - num_seeds: 4
exp_name: $\alpha_{scale}: 1$ - num_seeds: 4


In [47]:
# print() the LaTeX source as plain text (rather than displaying the string
# repr) so it can be copied into a document; the row index is omitted.
print(cur_full_results_df.to_latex(index=False))

\begin{tabular}{lllllll}
\toprule
Task & HyperSimba & No LR Decay & $s_{init}: 1$ & $s_{scale}: 1$ & $\alpha_{init}: 0.5$ & $\alpha_{scale}: 1$ \\
\midrule
\texttt{h1-balance-hard-v0} & 143 \textcolor{gray}{[128, 157]}
 & \cellcolor{ab_worst}118 \textcolor{gray}{[86, 150]}
 & \cellcolor{ab_bad}139 \textcolor{gray}{[119, 159]}
 & \cellcolor{ab_bad}139 \textcolor{gray}{[116, 162]}
 & \cellcolor{ab_better}152 \textcolor{gray}{[140, 164]}
 & \cellcolor{ab_bad}139 \textcolor{gray}{[119, 159]}
 \\
\texttt{h1-balance-simple-v0} & 723 \textcolor{gray}{[651, 795]}
 & \cellcolor{ab_better}812 \textcolor{gray}{[758, 867]}
 & \cellcolor{ab_better}815 \textcolor{gray}{[794, 836]}
 & \cellcolor{ab_better}813 \textcolor{gray}{[793, 833]}
 & \cellcolor{ab_better}763 \textcolor{gray}{[620, 907]}
 & 732 \textcolor{gray}{[645, 820]}
 \\
\texttt{h1-crawl-v0} & 946 \textcolor{gray}{[933, 959]}
 & 955 \textcolor{gray}{[933, 977]}
 & 956 \textcolor{gray}{[947, 965]}
 & 946 \textcolor{gray}{[929, 963]}
 & 933