In [1]:
import pickle
import os

# Location of the pickled IQM results, relative to this notebook.
iqm_pickle_path = os.path.join("..", "data", "runs", "iqm_envs.pickle")

# The pickle holds a 2-tuple: the IQM scores and their confidence intervals.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(iqm_pickle_path, "rb") as pickle_file:
    iqm_scores, iqm_cis = pickle.load(pickle_file)

iqm_scores, iqm_cis

({'SafetyAntVelocity-v1': {'Reward': {'C-TRPO (ours)': array([1820.53198771, 2657.099268  , 2831.05348946, 2958.59102869,
           2993.1904289 , 3004.32905843, 3041.31483854, 3070.26581746,
           3074.18263696, 3098.9735843 ]),
    'TRPO-Lag': array([1557.90376246, 2755.98411702, 2871.85865924, 2969.50008905,
           3017.41170882, 2988.89375141, 3122.70598716, 3060.35607028,
           3122.43067623, 3001.86956915]),
    'CPPO-PID': array([  44.64752112,  850.46453855, 1641.2859461 , 2288.81459177,
           2580.97370037, 2816.48576643, 2968.07666147, 3029.28647766,
           3161.48292977, 3205.25982965]),
    'P3O': array([  15.65464698,  816.07300382, 1374.60893898, 1931.03623362,
           2252.57892194, 2551.22191307, 2740.63596927, 2952.09051328,
           3085.24772277, 3122.52175845]),
    'PCPO': array([  58.16963113,  926.82466339, 1226.87439789, 1439.89969134,
           1566.96016535, 1643.57924792, 1566.59007713, 1691.89260202,
           1710.98551262, 17

In [12]:
from helpers import *

# Build a nested mapping {ALGO: {metric name: {environment label: "mean ± lo/hi"}}}
# from the final entry of every IQM curve.
#
# The previous version wrapped this in an extra `for env in ENVS:` loop whose
# variable was shadowed by the innermost loop; it only rebuilt the same
# dictionary once per environment. A single pass over ALGOS is sufficient.
new_eval_score_dict = {}
for algo in ALGOS:
    algo_key = algo.upper()
    new_eval_score_dict[algo_key] = {}
    for metric in ["Reward", "Final Cost", "Cumulative Cost"]:
        # "Cumulative Cost" is reported under the name "Cost Regret".
        metric_name = "Cost Regret" if metric == "Cumulative Cost" else metric
        per_env = {}
        for env in ENVS:
            # Last entry of the curve, rounded to one decimal.
            mean = round(iqm_scores[env][metric][algo][-1], 1)
            # Offsets of the two interval rows relative to the rounded mean.
            # Row 0 appears to be the lower bound (its offset renders negative
            # in the resulting table) and row 1 the upper bound -- TODO confirm
            # against the code that produced the pickle.
            ci_lower = round(iqm_cis[env][metric][algo][0, -1] - mean, 1)
            ci_upper = round(iqm_cis[env][metric][algo][1, -1] - mean, 1)
            # Column label: drop the "Safety" prefix, the "-v…" suffix and any
            # leading/trailing "1" characters from the environment id.
            env_label = env.replace("Safety", "").split("-")[0].strip("1")
            per_env[env_label] = f"{mean} ± {ci_lower}/{ci_upper}"
        new_eval_score_dict[algo_key][metric_name] = per_env

# LaTeX row labels for each metric name.
latex_lookup = {
    "Reward": r'$V_r$', "Final Cost": r'$V_c$', "Cost Regret": r"Reg$_+$"
}

# One row per (algorithm, metric label) pair, one column per environment label.
df_eval = pd.DataFrame.from_dict(
    {
        (algo_key, latex_lookup[metric_name]): env_cells
        for algo_key, metrics in new_eval_score_dict.items()
        for metric_name, env_cells in metrics.items()
    },
    orient='index',
)

df_eval

Unnamed: 0,Unnamed: 1,AntVelocity,HalfCheetahVelocity,HumanoidVelocity,HopperVelocity,CarButton,PointGoal,RacecarCircle,PointPush
C-TRPO (OURS),$V_r$,3099.0 ± -38.5/46.5,2833.8 ± -123.6/70.9,5513.1 ± -66.5/53.8,1669.2 ± -839.8/55.2,1.0 ± -0.7/0.5,20.9 ± -0.7/0.4,30.3 ± -2.1/1.4,0.9 ± -0.3/0.4
C-TRPO (OURS),$V_c$,18.8 ± -3.2/6.5,20.0 ± -8.1/5.7,12.2 ± -2.1/0.9,20.4 ± -3.2/2.9,36.8 ± -9.9/9.9,29.6 ± -3.2/15.4,21.7 ± -2.3/4.7,27.1 ± -8.3/8.5
C-TRPO (OURS),Reg$_+$,244.0 ± -24.6/60.5,105.3 ± -30.6/25.2,185.9 ± -109.0/72.8,1056.4 ± -804.5/1014.9,6299.9 ± -465.5/686.1,2459.6 ± -442.6/230.9,1465.8 ± -265.7/552.4,2626.2 ± -436.0/591.3
TRPO-LAG,$V_r$,3001.9 ± -86.5/177.8,2879.0 ± -64.6/104.9,5326.9 ± -137.0/173.1,700.8 ± -493.5/868.2,-0.0 ± -0.5/0.6,18.5 ± -8.4/3.8,34.6 ± -3.9/0.6,2.7 ± -2.4/5.0
TRPO-LAG,$V_c$,20.8 ± -10.0/18.4,25.0 ± -9.1/3.9,22.0 ± -7.0/14.3,13.7 ± -4.5/26.5,24.4 ± -7.1/5.7,28.4 ± -13.5/6.8,20.6 ± -5.0/4.4,13.7 ± -3.9/3.2
TRPO-LAG,Reg$_+$,2773.5 ± -738.5/414.3,2546.5 ± -318.1/977.8,1110.4 ± -607.6/719.7,6075.9 ± -1439.3/1447.5,10343.1 ± -734.2/624.4,4094.7 ± -727.7/1029.6,4889.9 ± -202.4/1164.8,2828.3 ± -408.5/653.6
CPPO-PID,$V_r$,3205.3 ± -186.5/76.7,3036.1 ± -36.7/10.7,5877.3 ± -111.4/84.8,1657.5 ± -65.5/61.0,-1.2 ± -0.5/0.6,6.1 ± -3.0/4.8,8.1 ± -5.5/4.3,1.0 ± -0.6/1.1
CPPO-PID,$V_c$,26.2 ± -5.3/4.4,26.5 ± -2.7/7.2,20.3 ± -8.6/6.0,18.6 ± -9.0/8.1,23.8 ± -8.4/6.0,21.8 ± -4.4/6.8,33.3 ± -6.5/5.9,22.8 ± -11.1/9.9
CPPO-PID,Reg$_+$,1416.8 ± -201.6/328.3,2094.1 ± -351.0/417.9,913.9 ± -228.4/304.9,3649.6 ± -1018.2/695.1,3256.0 ± -488.9/211.4,2233.6 ± -274.0/681.0,2573.5 ± -317.7/897.3,1981.3 ± -237.1/293.2
P3O,$V_r$,3122.5 ± -111.4/24.6,3020.3 ± -44.8/12.8,5492.1 ± -45.0/118.7,1633.5 ± -107.7/49.0,0.2 ± -0.2/0.3,5.7 ± -0.5/0.3,0.9 ± -0.1/0.1,0.7 ± -0.4/0.6


In [30]:
# `ids` was previously only assigned in a later cell, so this cell failed on a
# fresh kernel (Restart & Run All). Defining it here is idempotent.
ids = pd.IndexSlice

# Algorithms whose final cost is strictly below this value count as admissible.
# Presumably this is the cost limit used in the experiments -- TODO confirm.
COST_THRESHOLD = 25.0

# For every environment column, list the algorithms whose $V_c$ mean (the
# number before " ± " in the cell text) is below COST_THRESHOLD.
admissible = {}
for env in df_eval.columns:
    final_cost = df_eval[env].loc[ids[:, r"$V_c$"]].apply(lambda x: float(x.split(" ± ")[0]))
    admissible[env] = list(final_cost[final_cost < COST_THRESHOLD].index)
admissible

{'AntVelocity': ['C-TRPO (OURS)', 'TRPO-LAG', 'P3O', 'PCPO'],
 'HalfCheetahVelocity': ['C-TRPO (OURS)', 'PCPO', 'CPO', 'PPO-LAG'],
 'HumanoidVelocity': ['C-TRPO (OURS)',
  'TRPO-LAG',
  'CPPO-PID',
  'P3O',
  'PCPO',
  'CPO',
  'CUP',
  'FOCOPS',
  'PPO-LAG',
  'IPO'],
 'HopperVelocity': ['C-TRPO (OURS)',
  'TRPO-LAG',
  'CPPO-PID',
  'P3O',
  'CUP',
  'IPO'],
 'CarButton': ['TRPO-LAG', 'CPPO-PID'],
 'PointGoal': ['CPPO-PID', 'P3O', 'PPO-LAG'],
 'RacecarCircle': ['C-TRPO (OURS)',
  'TRPO-LAG',
  'P3O',
  'PCPO',
  'CPO',
  'CUP',
  'FOCOPS',
  'IPO'],
 'PointPush': ['TRPO-LAG',
  'CPPO-PID',
  'P3O',
  'PCPO',
  'FOCOPS',
  'PPO-LAG',
  'IPO']}

In [32]:
# Highest $V_r$ mean among the admissible algorithms on AntVelocity.
(
    df_eval["AntVelocity"]
    .loc[ids[admissible["AntVelocity"], r"$V_r$"]]
    .apply(lambda cell: float(cell.split(" ± ")[0]))
    .max()
)

3122.5

In [34]:
from functools import partial

def bold_formatter(x, value):
    """Wrap a table cell in braces, bold-facing it when it matches ``value``.

    Args:
        x: Cell text of the form ``"<number> ± <rest>"``; the number before
            ``" ± "`` is parsed with ``float``.
        value: Number to compare the parsed value against (exact equality).

    Returns:
        ``"{\\bfseries <x>}"`` when the parsed number equals ``value``,
        otherwise ``"{<x>}"``.

    Raises:
        ValueError: If the text before ``" ± "`` is not a valid float.
    """
    xn = float(x.split(" ± ")[0])
    if xn == value:
        # The space terminates the control word, so cell text that starts with
        # a letter (e.g. "inf") cannot fuse into an undefined "\bfseriesinf".
        # TeX swallows that space, so numeric cells render exactly as before.
        return f"{{\\bfseries {x}}}"
    return f"{{{x}}}"

def underline_formatter(x, value):
    """Wrap a table cell in braces, framing it when it matches ``value``.

    Despite the name, the highlight emitted is a LaTeX ``\\fbox`` (a frame),
    not an underline.

    Args:
        x: Cell text of the form ``"<number> ± <rest>"``; the number before
            ``" ± "`` is parsed with ``float``.
        value: Number to compare the parsed value against (exact equality).

    Returns:
        ``"{\\fbox{<x>}}"`` when the parsed number equals ``value``,
        otherwise ``"{<x>}"``.
    """
    leading_number = float(x.split(" ± ")[0])
    inner = f"\\fbox{{{x}}}" if leading_number == value else x
    return f"{{{inner}}}"

ids = pd.IndexSlice


def _leading_number(cell):
    """Parse the number that precedes " ± " in a formatted table cell."""
    return float(cell.split(" ± ")[0])


# Per environment column, bind each formatter to its reference value:
#   fmts1 -> largest $V_r$ among the admissible algorithms (bold_formatter)
#   fmts2 -> smallest Reg$_+$ among the admissible algorithms (underline_formatter)
fmts1 = {}
fmts2 = {}
for column in df_eval.columns:
    allowed = admissible[column]
    top_reward = df_eval[column].loc[ids[allowed, r"$V_r$"]].apply(_leading_number).max()
    low_regret = df_eval[column].loc[ids[allowed, r"Reg$_+$"]].apply(_leading_number).min()
    fmts1[column] = partial(bold_formatter, value=top_reward)
    fmts2[column] = partial(underline_formatter, value=low_regret)

# Apply the reward formatters to the $V_r$ rows and the regret formatters to
# the Reg$_+$ rows; the $V_c$ rows keep their default rendering.
styled = (
    df_eval.style
    .format(subset=ids[:, r"$V_r$", :], formatter=fmts1)
    .format(subset=ids[:, r"Reg$_+$", :], formatter=fmts2)
)

# print (rather than rich display) so the raw LaTeX can be copied verbatim.
latex_table = styled.to_latex(
    position_float="centering",
    hrules=True,
    multirow_align="t",
    multicol_align="r",
    clines="skip-last;data",
)
print(latex_table)

\begin{table}
\centering
\begin{tabular}{llllllllll}
\toprule
 &  & AntVelocity & HalfCheetahVelocity & HumanoidVelocity & HopperVelocity & CarButton & PointGoal & RacecarCircle & PointPush \\
\midrule
\multirow[t]{3}{*}{C-TRPO (OURS)} & $V_r$ & {3099.0 ± -38.5/46.5} & {2833.8 ± -123.6/70.9} & {5513.1 ± -66.5/53.8} & {\bfseries1669.2 ± -839.8/55.2} & {1.0 ± -0.7/0.5} & {20.9 ± -0.7/0.4} & {30.3 ± -2.1/1.4} & {0.9 ± -0.3/0.4} \\
 & $V_c$ & 18.8 ± -3.2/6.5 & 20.0 ± -8.1/5.7 & 12.2 ± -2.1/0.9 & 20.4 ± -3.2/2.9 & 36.8 ± -9.9/9.9 & 29.6 ± -3.2/15.4 & 21.7 ± -2.3/4.7 & 27.1 ± -8.3/8.5 \\
 & Reg$_+$ & {244.0 ± -24.6/60.5} & {105.3 ± -30.6/25.2} & {185.9 ± -109.0/72.8} & {1056.4 ± -804.5/1014.9} & {6299.9 ± -465.5/686.1} & {2459.6 ± -442.6/230.9} & {1465.8 ± -265.7/552.4} & {2626.2 ± -436.0/591.3} \\
\cline{1-10}
\multirow[t]{3}{*}{TRPO-LAG} & $V_r$ & {3001.9 ± -86.5/177.8} & {2879.0 ± -64.6/104.9} & {5326.9 ± -137.0/173.1} & {700.8 ± -493.5/868.2} & {\bfseries-0.0 ± -0.5/0.6} & {18.5 ± -8.4/3