In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
import os
import collections
from tqdm import tqdm
# Capture the notebook's current working directory via the IPython `%pwd` magic.
HERE = %pwd
# Put the parent of the working directory on the import path; presumably the
# `src` package imported in the next cell lives there — TODO confirm.
sys.path.append(os.path.dirname(HERE))

%matplotlib inline
import matplotlib.pyplot as plt
import copy

In [None]:
from src import utils

# Project helper, called with its defaults (presumably fixes the random seeds — TODO confirm).
utils.set_seed()

# LLM names: full model identifier -> shorter name.
d_model = dict([
    ("gpt-4.1-mini-2025-04-14", "gpt-4.1-mini"),
    ("llama3-3-70b-instruct-v1", "llama3.3-70b"),
    ("gpt-4o-mini-2024-07-18", "gpt-4o-mini"),
    ("phi4", "phi4"),
    ("nova-lite-v1", "amazon-nova-lite"),
])

# The first identifier in the mapping is the model loaded for this notebook.
model_names = [*d_model]
model_name = model_names[0]
llm = utils.load_llm(model_name)

# Dataset names: three standalone datasets followed by the Amazon categories.
data_names = ["Yelp", "MIND", "Food"]
data_names.extend(f"Amazon_{a}" for a in ["Movie", "Music", "Grocery", "Clothes", "Book"])

In [None]:
# Version tag of the preprocessed-data directory; the same for every dataset,
# so it is set once instead of on every loop iteration.
version_prep = "20250403_prep"


def _item_schema(data_name):
    """Return ``(columns_item, domain)`` for a dataset name.

    ``columns_item`` is the list of item columns kept from ``items_slim.csv``
    and ``domain`` is the word inserted into the summarizing prompt.

    Raises:
        ValueError: if ``data_name`` matches none of the known datasets.
            (Without this, the loop would silently reuse the values left
            over from the previous dataset.)
    """
    if "Amazon" in data_name:
        return ["title", "categories", "description"], data_name.split("Amazon_")[1].lower()
    if "Yelp" in data_name:
        return ['name', 'attributes', 'categories'], "business"
    if "MIND" in data_name:
        return ['title', 'category', 'subcategory', 'abstract'], "news"
    if "Food" in data_name:
        return ["name", "tags", "description", "ingredients"], "recipe"
    raise ValueError(f"Unknown dataset name: {data_name!r}")


def _summarize_with_retry(llm, prompt, n_retry=5, min_token=10, max_token=500):
    """Call ``llm`` on ``prompt`` up to ``n_retry`` times until the reply is acceptable.

    A reply is accepted when ``log["output token"]`` is strictly between
    ``min_token`` and ``max_token``.

    Returns:
        ``(succeeded, output, log)``. When every attempt failed, ``succeeded``
        is False, ``output`` is None and ``log`` describes the last attempt:
        either the log returned by ``llm`` or ``{"error": ...}`` if the call
        itself raised.
    """
    last_log = None
    for _ in range(n_retry):
        try:
            output, last_log = llm(prompt, log=True)
            accepted = min_token < last_log["output token"] < max_token
        except Exception as err:
            # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
            # propagate so the run can still be stopped by hand.
            last_log = {"error": repr(err)}
            continue
        if accepted:
            return True, output, last_log
    return False, None, last_log


for data_name in data_names:
    print(utils.now(), data_name)

    # preprocessed directory and per-dataset file locations
    dir_prep_data = f"../data/preprocessed_data/{version_prep}/{data_name}"
    llm.path_log = f"{dir_prep_data}/llm_log_item_summary_{model_name}.txt"
    path_summary = f"{dir_prep_data}/items_slim_with_summary_{model_name}.csv"

    columns_item, domain = _item_schema(data_name)

    df_items = pd.read_csv(f"{dir_prep_data}/items_slim.csv", index_col=0).fillna("")
    df_items = df_items[columns_item]

    # generate summarized item text (skipped when a cached result already exists)
    try:
        df_ = pd.read_csv(path_summary, index_col=0)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Only a missing or empty cache file triggers regeneration; any other
        # read error propagates instead of silently starting a full LLM run.
        d_s = dict()
        for id_item, d_ in tqdm(df_items.T.to_dict().items()):

            # summarizing prompt
            # NOTE(review): the prompt contains the typo "wihtin"; it is left
            # untouched because editing it changes the text sent to the LLM.
            prompt = f"""We have the following item details in the {domain} domain:
# Item Information
{d_}

Please summarize this item information, ensuring to include key features and attributes to enhance clarity and understanding wihtin 10-200 words"""

            succeeded, output, log = _summarize_with_retry(llm, prompt)

            # If every attempt failed, report the item and fall back to the
            # first 1000 characters of the raw item dict as its summary.
            if not succeeded:
                print(id_item)
                print(d_)
                print(log)
                print("==")
                output = str(d_)[:1000]

            d_s[id_item] = output

        # Attach the summaries (aligned on the item index) and cache them.
        df_ = df_items.copy()
        df_["summary"] = pd.Series(d_s)
        df_.to_csv(path_summary)

    print(llm.compute_log())