In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
import os
import collections
from tqdm import tqdm
# `%pwd` is the IPython magic that returns the current working directory.
# Its parent directory is appended to sys.path (presumably so that the
# project-local `src` package imported below can be resolved — confirm repo layout).
HERE = %pwd
sys.path.append(os.path.dirname(HERE))

%matplotlib inline
import matplotlib.pyplot as plt
import copy

import pickle
import time

In [2]:
# functions
from src import utils, main, post_process

In [3]:
# Version identifiers handed to main.run (original note: "preprocessed directory";
# presumably they name the prep / input / inference artifact folders — confirm in src.main).
version_prep = "20250403_prep"
version_input = "20250403_input"
version_infer = "20250418_infer"

# User counts handed to main.run as `n_user` and `n_user_exp`.
n_user, n_user_exp = 200, 10

# LLM names: full model identifier -> short name.
# The short names are used as keys of the per-model result dictionaries.
d_model = dict([
    ("gpt-4.1-mini-2025-04-14", "gpt-4.1-mini"),
    ("llama3-3-70b-instruct-v1", "llama3.3-70b"),
    ("gpt-4o-mini-2024-07-18", "gpt-4o-mini"),
    ("phi4", "phi4"),
    ("nova-lite-v1", "amazon-nova-lite"),
])

# Split the mapping into two parallel lists (same order as the mapping).
model_names, model_names_short = (list(column) for column in zip(*d_model.items()))

# Data names: three stand-alone datasets followed by the Amazon categories,
# each of which gets the "Amazon_" prefix.
amazon_categories = ("Movie", "Music", "Grocery", "Clothes", "Book")
data_names = ["Yelp", "MIND", "Food"]
data_names.extend(f"Amazon_{a}" for a in amazon_categories)

# Prompt names.
# L holds the single prompting methods; T holds the combined variants:
# two fixed pairs, then a SelfRefine_* and a SelfConsistency_* entry per method in L.
L = ["Rephrase", "StepBack", "ReAct"]
T = ["Rephrase_ReAct", "Rephrase_StepBack"]
T.extend([f"SelfRefine_{a}" for a in L])
T.extend([f"SelfConsistency_{a}" for a in L])

# Final prompt types: the baseline, every single method, then every combo,
# all sharing the "ItemAll_Method" prefix.
method_suffixes = ["Baseline"] + L + [f"Combo_{t}" for t in T]
types_prompt = [f"ItemAll_Method{b}" for b in method_suffixes]

# User types evaluated (passed to main.run as `exp_names`) and
# the cut-off k used for nDCG@k / Hit@k.
exp_names, k = ["light", "heavy"], 3

# Fix the random seed through the project helper
# (the seed value itself is chosen inside utils.set_seed and is not visible here).
utils.set_seed()

In [4]:
# Run (or, per the original note, load if already computed) the evaluation for
# every model. Results are stored under the short model name:
#   ds[short] -> first value returned by main.run (result dictionary)
#   dc[short] -> second value returned by main.run (cost table)
ds, dc = {}, {}

# Arguments that are identical for every model.
run_kwargs = dict(
    data_names=data_names, types_prompt=types_prompt, exp_names=exp_names, k=k,
    version_infer=version_infer, version_input=version_input, version_prep=version_prep,
    n_user=n_user, n_user_exp=n_user_exp,
)

for model_name, model_name_short in d_model.items():
    print(model_name)
    llm = utils.load_llm(model_name)

    dict_res, df_cost = main.run(llm=llm, model_name=model_name, **run_kwargs)

    ds[model_name_short], dc[model_name_short] = dict_res, df_cost

gpt-4.1-mini-2025-04-14


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.02it/s]


finished
llama3-3-70b-instruct-v1


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.95it/s]


finished
gpt-4o-mini-2024-07-18


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.18it/s]


finished
phi4


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:08<00:00,  1.06s/it]


finished
nova-lite-v1


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.98it/s]

finished





In [5]:
# Output mode used by the table cells below:
#   "latex" -> the table is printed as LaTeX source
#   "view"  -> the table is displayed as a pandas DataFrame
presentation_modes = ["latex", "view"]
presentation = presentation_modes[1]

# Section 5.1
## Table 11

In [6]:
# nDCG@3 for every model (rows: prompts, columns: models)
metric = "nDCG"
k = 3
metric_key = f"{metric}@{k}"

# Per model: concatenate the frames stored under the metric key, summarise them with
# the project helper, and keep the LaTeX-formatted ("tex") and numeric ("score") columns.
d_ = dict()
for model_name in model_names_short:
    dict_res = ds[model_name]
    df_res = pd.concat(dict_res[metric_key].values())
    df_ = post_process.convert_stat_table_with_latex(df_res, l_select=[])[["tex", "score"]]
    d_[model_name] = df_


def _build_model_table(stats_by_model, column, row_order):
    """Return a prompts x models table of `column`, rows ordered by `row_order`.

    Row labels are shortened: "SelfRefine" -> "SR", "SelfConsistency" -> "SC".
    """
    table = pd.DataFrame({name: stats[column] for name, stats in stats_by_model.items()})
    table = table.loc[row_order]
    table.index = [
        label.replace("SelfRefine", "SR").replace("SelfConsistency", "SC")
        for label in table.index
    ]
    return table


# Order the prompts by their mean score over the two reference models only
# (not over all models), best first.
df = pd.DataFrame({name: stats["score"] for name, stats in d_.items()})
df = df[["gpt-4.1-mini", "llama3.3-70b"]]
s_avg = df.T.mean().sort_values(ascending=False)

df_model = _build_model_table(d_, "tex", s_avg.index)

if presentation == "latex":
    print(df_model.to_latex(escape=False))
else:
    # In view mode the numeric scores are shown instead of the LaTeX strings.
    df_model = _build_model_table(d_, "score", s_avg.index)
    display(df_model.astype(float).round(3))

Unnamed: 0,gpt-4.1-mini,llama3.3-70b,gpt-4o-mini,phi4,amazon-nova-lite
Rephrase,0.55,0.548,0.479,0.487,0.364
ReAct,0.547,0.533,0.407,0.407,0.367
SC (ReAct),0.535,0.537,0.389,0.402,0.441
SC (Step-Back),0.543,0.518,0.454,0.427,0.369
Step-Back,0.516,0.543,0.444,0.439,0.359
SC (Rephrase),0.522,0.536,0.468,0.46,0.397
SR (ReAct),0.54,0.516,0.378,0.369,0.407
SR (Rephrase),0.514,0.533,0.462,0.471,0.455
SR (Step-Back),0.52,0.527,0.458,0.392,0.39
Rephrase$\rightarrow$ReAct,0.534,0.506,0.376,0.472,0.402


In [7]:
# [unused] LMEM
# For each user subset, run post_process.lmem on the long-format metric table and
# report the index labels of the two results it returns (positive / negative).
df_long_all = post_process.convert_long_table(ds, k=k, metrics=["nDCG", "Hit"])

# Subset label -> substring that must occur in the part of the user id that
# follows the "__DataName" marker. "_" acts as the catch-all for "ALL": it keeps
# every user whose suffix contains an underscore.
subset_patterns = {"Heavy": "Heavy", "Light": "Light", "ALL": "_"}

d_lmem = dict()
for data_name, pattern in tqdm(list(subset_patterns.items())):
    # Select rows with a boolean mask, which is aligned on index labels.
    # Feeding index labels into the positional `.iloc` would pick the wrong rows
    # (or drop rows with duplicate labels) whenever the index is not a unique 0..n-1 range.
    in_subset = df_long_all["user"].map(
        lambda u: pattern in u.split("__DataName")[1]
    ).astype(bool)  # keep a boolean dtype even when the frame is empty
    df_long = df_long_all.loc[in_subset]

    s_pos, s_neg = post_process.lmem(df_long)

    d_lmem[data_name] = {
        "pos" : s_pos,
        "neg" : s_neg
    }

# One comma-separated string of index labels per subset and sign.
d_pos = {name : ", ".join(res["pos"].index.values) for name, res in d_lmem.items()}
d_neg = {name : ", ".join(res["neg"].index.values) for name, res in d_lmem.items()}
df = pd.concat([pd.Series(d_pos), pd.Series(d_neg)], axis=1)
df.columns = ["Positive", "Negative"]

if presentation == "latex":
    print(df.to_latex(escape=False))
else:
    display(df)

100%|██████████████████████████████████████████████████████████████████████| 3/3 [00:01<00:00,  2.59it/s]


Unnamed: 0,Positive,Negative
Heavy,Rephrase$\rightarrow$ReAct,
Light,"ReAct, SelfConsistency (Step-Back), SelfRefine...",
ALL,"ReAct, Rephrase, SelfConsistency (Step-Back), ...",
