In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
import os
import collections
from tqdm import tqdm
HERE = %pwd
sys.path.append(os.path.dirname(HERE))

%matplotlib inline
import matplotlib.pyplot as plt
import copy

import pickle
import time

In [2]:
# functions
from src import utils, main, post_process

In [3]:
# preprocessed directory
version_infer = "20250404_infer"
version_input = "20250403_input"
version_prep = "20250403_prep"
n_user = 200  
n_user_exp = 100

# LLM names
d_model = {
    "gpt-4.1-mini-2025-04-14" : "gpt-4.1-mini",
    "llama3-3-70b-instruct-v1" : "llama3.3-70b", 
    "gpt-4o-mini-2024-07-18" : "gpt-4o-mini", 
    "phi4" : "phi4",
    "nova-lite-v1" : "amazon-nova-lite"
}
model_names = list(d_model.keys())
model_names_short = list(d_model.values())

# data names
data_names = ["Yelp", "MIND", "Food"] + [f"Amazon_{a}" for a in ["Movie", "Music", "Grocery", "Clothes", "Book"]]

# prompt names
# comments are renamed prompt names for submitted paper
types_prompt = [
    f"ItemAll_Method{b}" 
    for b in [
        "Baseline", 
        "Emotion", 
        "Re-Reading",
        "Bothinst",  # Both-Inst
        "RecencyFocused",  # Recency-Focused
        "Pretend",  # RolePlay-User
        "Baseline_SystemRole",  # RolePlay-Expert
        "Baseline_Naming",  # RolePlay-Frederick
        "Baseline_ItemSummary",  # Summarize-Item
        "StepBack",  # Step-Back
        "ReAct",
        "Rephrase",
        "Echo",
        "UserSummarization",  # Summarize-User
        "ItemGenerate",  # Generate-Item
        "ItemGenerateTrue",  # Reuse-Item
        "Explain",
        "Mock",                
        "ZSCoT",  # Step-by-Step
        "TakeBreath",  # Deep-Breath
        "PlanSolve"  # Plan-Solve
    ]
]

# user types
exp_names = ["light", "heavy"]

# nDCG@k, Hit@k
k = 3

# set random seed
utils.set_seed()

In [4]:
d_prompt = {
    "UserSummarization" : "Summarize-User",
    "ItemGenerateTrue" : "Reuse-Item",
    "ItemGenerate" : "Generate-Item",
    "Bothinst" : "Both-Inst",
    "PlanSolve" : "Plan-Solve",
    "StepBack" : "Step-Back",
    "TakeBreath" : "Deep-Breath",
    "RecencyFocused" : "Recency-Focused",
    "ZSCoT" : "Step-by-Step",
    "Pretend" : "RolePlay-User",
    "Baseline_SystemRole" : "RolePlay-Expert",
    "Baseline_Naming" : "RolePlay-Frederick",
    "Baseline_ItemSummary" : "Summarize-Item"
}

def _load_error(model_name, types_prompt):
    llm = utils.load_llm(model_name)
    
    dict_error = dict()
    for data_name in tqdm(data_names):
        dict_error[data_name] = dict()
        for exp_name in exp_names:
    
            dict_error[data_name][exp_name] = dict()
            for type_prompt in types_prompt:
                d_, path_log = main.run_single(
                    llm=llm,
                    model_name=model_name, 
                    data_name=data_name,
                    version_infer=version_infer, 
                    n_user=n_user, 
                    exp_name=exp_name, 
                    type_prompt=type_prompt,
                    version_input=version_input, 
                    version_prep=version_prep,
                    n_user_exp=n_user_exp,
                    for_error_analysis=True
                )
    
                # load all generated text
                llm.path_log = path_log
                df_log = llm.load_log()
    
                # collect fail information
                dt = dict()
                users_failed = [user for user, d in d_.items() if "F" in d["pred"]]
                if len(users_failed) > 0:
                    s_input = df_log["input text"]
                    s_output = df_log["output text"]
    
                    n_retry = 5  
                    consecutives = (s_input == s_input.shift())
                    
                    for i in range(2, n_retry + 1):
                        consecutives &= (s_input == s_input.shift(i - 1))
                    
                    result_indices = consecutives.loc[consecutives].index - (n_retry - 1)
                    unique_failed_indices = result_indices.unique()[-1*(len(users_failed)):]
    
                    for i, idx in enumerate(unique_failed_indices):
                        dt[i] = {
                            "user" : users_failed[i],
                            "thought" : s_output.iloc[idx-1],
                            "answers" : s_output.iloc[idx:idx+n].values
                            }   
                
                # collect successful ranking results
                tt = "- Do not explain the reason and include any other words."  # words in final_task
                idx = [i for i, t in enumerate(df_log["input text"].drop_duplicates().values) if tt in t]
                s = df_log["output text"].iloc[idx]
                
                d = dict()
                error_count = 0
                for i, t in enumerate(s.values):
                    try:
                        l = [int(a) for a in t.split("[")[1].split("]")[0].replace("\\", "").split(",")]
                        n = len(l)
                        if n <= 10 and n >= 1:
                            d[i] = n
                    except:
                        pass

                # lendth of rankings; 10 means all items are sucessfully ranked
                v = np.array(list(d.values()))  

                if len(v) > 0:
                    de = {
                        3 : np.mean(v >= 3),
                        5 : np.mean(v >= 5), 
                        10: np.mean(v == 10),
                        "F" : len(users_failed) / len(d_)
                    }
                    de = pd.Series(de).fillna(0).to_dict()
                else:
                    de = {i : 0 for i in [3,5,10]}
                    de["F"] = 1
                
                de = {k : v for k,v in de.items()}
                
                type_prompt = type_prompt.split("_Method")[1]
                for k,v in d_prompt.items():
                    try:
                        type_prompt = type_prompt.replace(k,v)
                    except:
                        pass
                
                dict_error[data_name][exp_name][type_prompt] = {
                    "stat" : de, 
                    "text" : dt
                }
    return dict_error

In [5]:
ddict_error = dict()
for model_name, model_name_short in d_model.items():
    print(model_name)
    ddict_error[model_name_short] = _load_error(model_name, types_prompt)

gpt-4.1-mini-2025-04-14


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:41<00:00,  5.17s/it]


llama3-3-70b-instruct-v1


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:39<00:00,  4.90s/it]


gpt-4o-mini-2024-07-18


100%|██████████████████████████████████████████████████████████████████████| 8/8 [01:16<00:00,  9.59s/it]


phi4


100%|██████████████████████████████████████████████████████████████████████| 8/8 [01:09<00:00,  8.74s/it]


nova-lite-v1


100%|██████████████████████████████████████████████████████████████████████| 8/8 [00:43<00:00,  5.40s/it]


In [6]:
# select mode
# - "latex" will give all table info in latex format
# - "view" will give all table info in pandas dataframe html style
presentation = ["latex", "view"][1]

# Section 4.3

## Table 10

In [7]:
# failure and lendth of rankings >= 3
de = dict()
for model_name, dict_error in ddict_error.items():
    da = dict()
    for data_name, d1 in dict_error.items():
        for exp_name, d2 in d1.items():
            db = dict()
            for type_prompt, d3 in d2.items():
                db[type_prompt] = d3["stat"]
    
            df = pd.DataFrame(db)
        da[f"{data_name}_{exp_name}"] = df

    ds = dict()
    for a in [3,5,10,"F"]:
        d = dict()
        for e, df in da.items():
            d[e] = df.loc[a]
    
        ds[a] = pd.DataFrame(d).mean(axis=1)
    df = pd.DataFrame(ds).T
    de[model_name] = df

# add \textbf to emphasize error cases
def _tmp(s):
    s = s * 100
    if s >= 10:
        return "\\textbf{" + f"{s:.1f}" + "}"
    elif s > 0:
        return f"{s:.1f}"
    else:
        return "0"

d2 = dict()
for model_name, df in de.items():
    s3 = 1 - df.loc[3]
    sf = df.loc["F"]
    d2[model_name] = "$" + sf.apply(_tmp) + " / " + s3.apply(_tmp) + "$" 
df = pd.DataFrame(d2)

# Table 10 : failure and lendth of rankings >= 3
if presentation == "latex":
    print(df.to_latex(escape=False))
else:
    display(df)

Unnamed: 0,gpt-4.1-mini,llama3.3-70b,gpt-4o-mini,phi4,amazon-nova-lite
Baseline,$0 / 0$,$0 / 0$,$0.4 / 1.7$,$\textbf{16.6} / \textbf{12.5}$,$0.1 / 0$
Emotion,$0 / 0$,$0 / 0$,$0 / 1.0$,$\textbf{17.0} / \textbf{12.6}$,$0 / 0$
Re-Reading,$0 / 0$,$0 / 0$,$0.4 / 2.1$,$\textbf{46.1} / \textbf{26.1}$,$0 / 0$
Both-Inst,$0 / 0$,$0 / 0$,$0.4 / 1.0$,$\textbf{16.4} / \textbf{12.5}$,$0.1 / 0$
Recency-Focused,$0 / 0$,$0 / 0$,$0.8 / 2.1$,$\textbf{18.0} / \textbf{12.6}$,$0.1 / 0$
RolePlay-User,$0 / 0$,$0 / 0$,$0 / 0.1$,$\textbf{16.2} / \textbf{12.5}$,$0 / 0$
RolePlay-Expert,$0 / 0$,$0 / 0$,$0.4 / 1.3$,$\textbf{16.6} / \textbf{12.5}$,$0 / 0$
RolePlay-Frederick,$0 / 0$,$0 / 0$,$0.5 / 1.6$,$\textbf{16.2} / \textbf{12.8}$,$0.1 / 0$
Summarize-Item,$0 / 0$,$0 / 0$,$0 / 0.3$,$7.9 / 0$,$0 / 0$
Step-Back,$0 / 0$,$0 / 0$,$0 / 1.7$,$\textbf{35.5} / \textbf{13.6}$,$\textbf{39.6} / 0.5$


## Supplementary material (Table 17-19)

In [8]:
# endth of rankings >= 5 and == 10
def _tmp(s):
    s = s * 100
    if s >= 10:
        return "\\textbf{" + f"{s:.1f}" + "}"
    elif s > 0:
        return f"{s:.1f}"
    else:
        return "0"


d2 = dict()
for model_name, df in de.items():
    s5 = 1 - df.loc[5]
    s10 = 1 - df.loc[10]
    d2[model_name] = "$" + s5.apply(_tmp) + " / " + s10.apply(_tmp) + "$" 
df = pd.DataFrame(d2)
if presentation == "latex":
    print(df.to_latex(escape=False))
else:
    display(df)

Unnamed: 0,gpt-4.1-mini,llama3.3-70b,gpt-4o-mini,phi4,amazon-nova-lite
Baseline,$0 / 0.1$,$0 / 0.1$,$4.9 / 5.9$,$\textbf{12.9} / \textbf{13.5}$,$0.6 / 0.8$
Emotion,$0 / 0.1$,$0 / 0.1$,$4.1 / 4.9$,$\textbf{12.8} / \textbf{13.0}$,$0.2 / 0.4$
Re-Reading,$0 / 0.1$,$0 / 0.1$,$\textbf{15.6} / \textbf{18.0}$,$\textbf{27.8} / \textbf{28.3}$,$0 / 0.1$
Both-Inst,$0 / 0.1$,$0 / 0.1$,$2.8 / 3.6$,$\textbf{13.4} / \textbf{13.5}$,$0.6 / 0.8$
Recency-Focused,$0 / 0.1$,$0 / 0.1$,$6.2 / 6.9$,$\textbf{12.9} / \textbf{12.9}$,$0.5 / 0.6$
RolePlay-User,$0 / 0.1$,$0 / 0.1$,$0.4 / 0.8$,$\textbf{12.6} / \textbf{12.9}$,$0.1 / 0.2$
RolePlay-Expert,$0 / 0.1$,$0 / 0.1$,$4.5 / 5.4$,$\textbf{12.5} / \textbf{12.9}$,$1.9 / 2.0$
RolePlay-Frederick,$0 / 0.1$,$0 / 0.1$,$4.6 / 5.4$,$\textbf{13.4} / \textbf{13.6}$,$0.8 / 0.9$
Summarize-Item,$0 / 0.1$,$0 / 0.1$,$1.6 / 2.5$,$0 / 1.1$,$1.0 / 1.1$
Step-Back,$0 / 0.2$,$0 / 0.1$,$3.9 / 7.9$,$\textbf{14.5} / \textbf{15.6}$,$1.8 / 2.8$


In [9]:
# token length statistics Baseline
d_data = {
    "Amazon_Music" : "Music",
    "Amazon_Movie" : "Movie",
    "Amazon_Grocery" : "Groceries",
    "Amazon_Clothes" : "Clothes",
    "Amazon_Book" : "Book",
    "Yelp" : "Yelp",
    "MIND" : "News",
    "Food" : "Food"
}

d_exp = {
    "light" : "Light",
    "heavy" : "Heavy"
}


model_name = "gpt-4o-mini-2024-07-18"
llm = utils.load_llm(model_name)

dict_token= dict()
for data_name in data_names:
    dict_token[data_name] = dict()
    for exp_name in exp_names:
        for type_prompt in ["ItemAll_MethodBaseline"]:
            d_, path_log = main.run_single(
                llm=llm,
                model_name=model_name, 
                data_name=data_name,
                version_infer=version_infer, 
                n_user=n_user, 
                exp_name=exp_name, 
                type_prompt=type_prompt,
                version_input=version_input, 
                version_prep=version_prep,
                n_user_exp=n_user_exp,
                for_error_analysis=True
            )

            # load all generated text
            llm.path_log = path_log
            df_log = llm.load_log()

            tt = "- Do not explain the reason and include any other words."
            idx = [i for i, t in enumerate(df_log["input text"].drop_duplicates().values) if tt in t]
            s = df_log.iloc[idx]["input token"]
            dict_token[data_name][exp_name] = s


dss = dict()
for exp_name in exp_names:
    ds = dict()
    for data_name in data_names:
        s = dict_token[data_name][exp_name]
        ds[d_data[data_name]]= {
            "max" : f"{int(s.max())}", 
            "min" : f"{int(s.min())}",
            "mean" : f"{s.mean():.1f}",
            "std" : f"{s.std():.1f}"
        }
    df = pd.DataFrame(ds).T
    dss[d_exp[exp_name]] = df
df = pd.concat(dss, axis=1)
if presentation == "latex":
    print(df.to_latex(escape=False))
else:
    display(df)

Unnamed: 0_level_0,Light,Light,Light,Light,Heavy,Heavy,Heavy,Heavy
Unnamed: 0_level_1,max,min,mean,std,max,min,mean,std
Yelp,9662,2959,6218.2,1199.7,48660,13363,24749.6,4674.3
News,1966,1227,1537.3,151.3,5516,2932,3890.8,495.5
Food,6218,3991,4859.1,353.2,18552,12373,15463.6,1117.0
Movie,3573,1121,2146.0,487.3,13282,3173,6793.2,1964.9
Music,3884,1038,1943.1,465.5,12315,2767,6579.7,2180.9
Groceries,4427,1668,2921.1,533.8,15994,5453,9414.0,2211.5
Clothes,4050,1340,2571.5,486.4,14379,3905,7168.7,1794.8
Book,4316,1004,1896.2,527.5,12014,3007,6477.8,1764.1


In [10]:
# token length statistics Summarize-Item

model_name = "gpt-4o-mini-2024-07-18"
llm = utils.load_llm(model_name)

dict_token= dict()
for data_name in data_names:
    dict_token[data_name] = dict()
    for exp_name in exp_names:
        for type_prompt in ["ItemAll_MethodBaseline_ItemSummary"]:
            d_, path_log = main.run_single(
                llm=llm,
                model_name=model_name, 
                data_name=data_name,
                version_infer=version_infer, 
                n_user=n_user, 
                exp_name=exp_name, 
                type_prompt=type_prompt,
                version_input=version_input, 
                version_prep=version_prep,
                n_user_exp=n_user_exp,
                for_error_analysis=True
            )

            # load all generated text
            llm.path_log = path_log
            df_log = llm.load_log()

            tt = "- Do not explain the reason and include any other words."
            idx = [i for i, t in enumerate(df_log["input text"].drop_duplicates().values) if tt in t]
            s = df_log.iloc[idx]["input token"]
            dict_token[data_name][exp_name] = s


dss = dict()
for exp_name in exp_names:
    ds = dict()
    for data_name in data_names:
        s = dict_token[data_name][exp_name]
        ds[d_data[data_name]]= {
            "max" : f"{int(s.max())}", 
            "min" : f"{int(s.min())}",
            "mean" : f"{s.mean():.1f}",
            "std" : f"{s.std():.1f}"
        }
    df = pd.DataFrame(ds).T
    dss[d_exp[exp_name]] = df
df = pd.concat(dss, axis=1)
if presentation == "latex":
    print(df.to_latex(escape=False))
else:
    display(df)

Unnamed: 0_level_0,Light,Light,Light,Light,Heavy,Heavy,Heavy,Heavy
Unnamed: 0_level_1,max,min,mean,std,max,min,mean,std
Yelp,7735,3464,4685.1,703.0,41732,12718,17863.0,3784.7
News,2670,1894,2285.6,176.9,7277,4776,5974.3,650.0
Food,5461,3716,4198.0,271.3,16199,10683,13388.6,1029.1
Movie,4807,2299,3050.8,435.3,16693,6427,9431.4,2106.2
Music,4936,2317,3048.5,452.6,16820,6124,9952.4,2374.8
Groceries,4160,2426,3064.5,398.0,16654,6605,9706.0,2059.7
Clothes,4328,2125,2751.8,384.5,14796,5884,8092.4,1588.4
Book,5287,2369,3104.1,517.2,16754,6367,9881.5,1995.0


In [11]:
# why inference failed in each thought process
model_name = "gpt-4o-mini"
dict_error = ddict_error[model_name]

for data_name, d1 in dict_error.items():
    for exp_name, d2 in d1.items():
        for type_prompt, d3 in d2.items():
            idx = 0
            for k,v in d3["text"].items():
                t = v["answers"][-1]
                try:
                    l = [int(a) for a in t.split("[")[1].split("]")[0].replace("\\", "").split(",")]
                except:
                    print(v["thought"])
                    print("- " * 50)
                    print(t)
                    print("--" * 50)
                    idx += 1

            if idx > 0:
                print(model_name, data_name, exp_name, type_prompt)
                print("==" * 50)

"### Observation:
The user has a history of reviewing ballet and dance-related workout DVDs. They express a strong preference for challenging workouts that provide effective results, particularly in toning and defining muscles. The user has rated several products, with the highest rating being 5.0 for a challenging ballet workout, while others received lower ratings due to being too easy or not effective enough.

### Thought:
The user seems to prefer products that are intense and provide a good workout, particularly in ballet and dance fitness. They have explicitly mentioned the need for challenging workouts that yield visible results. Therefore, any candidate products that align with these preferences, especially in the fitness category, would likely be more appealing to the user.

### Action:
I will examine the candidate products to identify any that align with the user's preferences for challenging workouts, particularly in the exercise and fitness category.

1. **Emmet Otter's Jug-