# Session Generator Without Fine-tuning

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

In [2]:
import re
import ast
import pandas as pd
from utils import load_pickle
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


# Part I Load model

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.81s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [4]:
def extract_product_name(raw_output, fallback="Unknown Product"):
    # Split the response into lines
    lines = raw_output.strip().split("\n")

    # Find the last non-empty line
    for line in reversed(lines):
        clean_line = line.strip()
        if clean_line:
            # Remove common prefixes
            for prefix in ["Sure,", "Here is", "Here are", "Of course,", "I'd be happy to help!"]:
                if clean_line.lower().startswith(prefix.lower()):
                    clean_line = clean_line[len(prefix):].strip()
            # Remove category hints if present
            clean_line = re.sub(r'for the ".+?" category', "", clean_line).strip()
            
            # Return the cleaned line if it looks like a product name
            if clean_line and len(clean_line.split()) >= 2:
                return clean_line

    # Fallback if no valid name is found
    print(f"⚠️ No valid product name found in: '{raw_output}'")
    return fallback

# === Extract a Python List from LLM Output ===
def extract_list_from_text(text, fallback=None):
    if not text or not isinstance(text, str):
        print("⚠️ Invalid input. Using fallback.")
        return fallback or []

    # Strip leading instructions and whitespace
    cleaned_text = text.strip()

    # Extract the first valid list
    match = re.search(r'\[\s*(?:[^\[\]]|\[[^\[\]]*\])*\s*\]', cleaned_text, re.DOTALL)
    if match:
        try:
            extracted = match.group(0).replace("\n", "").replace("    ", "").strip()
            # Basic sanity check to avoid explanation leakage
            if extracted.startswith("[") and extracted.endswith("]"):
                return ast.literal_eval(extracted)
            else:
                print("⚠️ Unexpected non-list format. Using fallback.")
        except Exception as e:
            print(f"⚠️ Parsing failed: {e}")
    
    print("⚠️ No valid list found. Using fallback.")
    return fallback or []

# Part II Functions & Prompts



## prompt #1: Get Categories form History Sessions

In [5]:
def get_categories_from_history(history_str, category_list, n=9):
    categories_str = ", ".join(category_list)
    # Define the system and user prompts
    system_prompt = "You are a fashion recommendation assistant."
    user_prompt = f"""
    Given this purchase history:
    {history_str}

    Select exactly {n} categories from the following list:
    {categories_str}

    ⚠️ Respond **only** with a Python list of exactly {n} categories, **without any additional text, punctuation, or explanations.**

    ✅ Format exactly like this:
    ["Category1", "Category2", "Category3", ..., "Category{n}"]

    ❗ Do not add any introductory words, explanations, or context.
    ❗ Do not include line breaks within the list.
    ❗ Do not add extra punctuation or bullet points.
    """

    prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt} [/INST]"
    
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    response = response.split("[/INST]")[-1].strip()
    return response


## Prompt #2: Generate New Session from Categories

In [6]:
from datetime import datetime
def get_user_info(user_id: str, user_dataset: pd.DataFrame):
    user_info = user_dataset[user_dataset["customer_id"] == user_id]
    return user_info

def get_current_season():
    current_date = datetime.now()
    current_month = current_date.month
    if current_month in [12, 1, 2]:
        current_season = "Winter"
    elif current_month in [3, 4, 5]:
        current_season = "Spring"
    elif current_month in [6, 7, 8]:
        current_season = "Summer"
    else:
        current_season = "Autumn"
    
    return current_season


def get_category_examples(category):
    """
    從 CSV 檔案中讀取特定類別的商品名稱範例
    
    Args:
        category (str): 商品類別名稱
        
    Returns:
        str: 該類別的商品名稱範例，以逗號分隔
    """
    try:
        # 讀取 CSV 檔案，指定分隔符為 ':'
        category_examples = pd.read_csv("data/product_name_examples.csv", 
                                      sep=':', 
                                      names=['category', 'examples'])
        
        # 找到對應類別並回傳範例
        examples = category_examples[category_examples["category"] == category]["examples"].values
        
        if len(examples) == 0:
            print(f"⚠️ 找不到類別 '{category}' 的範例")
            return ""
            
        return examples[0]
        
    except Exception as e:
        print(f"⚠️ 讀取範例時發生錯誤: {e}")
        return ""

In [7]:
def get_item_from_category(category, user_info: pd.DataFrame):
    
    current_season = get_current_season()
    
    # Define the system and user prompts
    system_prompt = "You are a fashion product naming assistant."
    user_prompt = f"""
    Customer information:
    - Age: {user_info["age"]}
    - Subcription to fashion magazines: {user_info["fashion_news_frequency"]}
    - Club membership: {user_info["club_member_status"]}
    
    Category: {category}
    Season: {current_season}
    Generate a realistic product name that:
       1. Clearly belongs to the {category} category
       2. Is appropriate for {current_season}
       
    Examples for {category}:
       {get_category_examples(category)}
    
    ⚠️ **Output Requirements:**  
    - Respond with only the product name as a single line of text.  
    - Do not include quotes, bullet points, or explanations.  
    """

    # Construct the full prompt
    full_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt.strip()} [/INST]"

    # Generate the response
    inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=128)

    # Decode and return the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    response = response.split("[/INST]")[-1].strip()
    return response


In [None]:
def get_items_from_categories(category_dict: dict[str, list[str]], user_info: pd.DataFrame) -> dict[str, list[str]]:
    """為每個使用者的每個類別生成對應的商品名稱
    
    Args:
        category_dict: {user_id: [category1, category2, ...]}
        user_info: 使用者資料，包含 age, fashion_news_frequency, club_member_status
        
    Returns:
        dict[str, list[str]]: {user_id: [product_name1, product_name2, ...]}
        每個 product_name 對應 category_dict 中相同位置的 category
    """
    current_season = get_current_season()
    result = {}
    
    for user_id, categories in category_dict.items():
        if user_info.empty:
            print(f"⚠️ No user info found for {user_id}")
            result[user_id] = ["Unknown"] * len(categories)
            continue
        
        product_names = []
        
        for category in categories:
            
            # 為這個使用者的所有類別生成商品名稱
            system_prompt = """You are a fashion product naming assistant. 
                            You MUST:
                            1. Output ONLY ONE product name
                            2. Follow the format strictly
                            3. Never include explanations
                            4. Never include quotes or brackets
                            5. Never include emojis
                            6. Never include any other text like "Sure, here is a product name for a jacket that meets the requirements:"

                            If you fail to follow these rules, the output will be rejected."""
            user_prompt = f"""
            Generate **ONE** product name for {category} that is:
            1. Appropriate for {current_season}, But not too specific and do not include season in the name
            2. 2-4 words long
            3. Unique and creative

            Customer context:
            - Age: {user_info["age"]}
            - Fashion magazine subscription: {user_info["fashion_news_frequency"]}
            - Club status: {user_info["club_member_status"]}

            Example: {get_category_examples(category)}

            ⚠️ CRITICAL: Output ONLY **ONE** product name. No explanations, quotes, or additional text.
            Format: Product Name
            """
        
            # 生成回應
            full_prompt = f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_prompt.strip()} [/INST]"
            inputs = tokenizer(full_prompt, return_tensors="pt").to("cuda")
            outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.3, do_sample=True)  # 增加 token 數以容納多個名稱
            response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            response = response.split("[/INST]")[-1].strip()
            print(f"Response for {user_id} - {category}:")
            print(response)
            print("===" * 20)
        
        # 解析回應
            product_name = extract_product_name(response, fallback=f"Unknown {category}")
            product_names.append(product_name)
        result[user_id] = product_names
        
    return result

### Utils Function

- **prefilter_categories(items_list, category_list, top_k=20)**

    此函式用於在進入 LLM 前，根據使用者歷史購買商品所對應的 product type（商品類別），從所有可選類別中預先篩選出最相關的前 top_k 個類別，避免因為類別數量過多導致超出 LLM 的上下文長度限制（context window overflow）。

- **article_to_product_type(article_id)**

    此函式將 articleId 轉換成相對應的 product type (category)

In [8]:
# === Pre-Filter Categories with Embeddings ===
def prefilter_categories(items_list, category_list, top_k=20):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    
    # Encode the history as a single vector
    history_embedding = model.encode(items_list, convert_to_tensor=True)
    
    # Encode all categories
    category_embeddings = model.encode(category_list, convert_to_tensor=True)
    
    # Compute similarity
    cos_scores = util.cos_sim(history_embedding, category_embeddings)[0]
    top_results = cos_scores.topk(k=top_k)
    
    # Extract top-K categories
    top_categories = [category_list[i] for i in top_results.indices]
    top_scores = [cos_scores[i].item() for i in top_results.indices]
    
    # Return as a dictionary for better interpretability
    return top_categories


def article_to_product_type(article_id):
    mapping = load_pickle("data/article_to_product_mapping.pkl")
    return mapping.get(article_id, "Unknown Product Type")

## Main Pipeline

**enrich_user_session(user_sessions, category_list, n=9, max_history_length=10)**

這個函式是整個推薦系統流程的核心，它的目的在於將原本的使用者購買紀錄 user session 擴充為更完整、更擬真的購物序列，藉由語意相似度 + LLM 輔助推理生成更多合理的商品。

In [9]:
# === Generate a New Enriched User Session ===
def enrich_user_session(user_sessions, customer_df, category_list, n=9, max_history_length=10):
    session = {uid: items.copy() for uid, items in user_sessions.items()}

    for uid, items in session.items():
        # Convert item IDs to product names
        history_str = " ".join([article_to_product_type(item) for item in items])
        print(f"User: {uid}, History: {history_str}\n")
        top_categories = prefilter_categories(history_str, category_list, top_k=20)
        print(f"Top categories: {top_categories}\n")
        # Generate Categories
        raw_output = get_categories_from_history(history_str, top_categories)
        print("LLM raw output:", raw_output, "\n")
        print("End of the output")
        categories = extract_list_from_text(raw_output, fallback=["tops", "accessories", "shoes"])
        print(f"Selected Categories: {categories}\n")

        user_info = get_user_info(uid, customer_df)
        
        # Generate Items for Each Category
        while len(items) < max_history_length:
            for category in categories:
                raw_item = get_item_from_category(category, user_info)
                print("Raw item LLM output: \n", raw_item)
                item = extract_product_name(raw_item, fallback="Unkown")
                print("Extracted: ", item)
                if len(items) > 0:
                    items.insert(-1, item)
                else:
                    # If the list is empty, just append
                    items.append(item)
                if len(items) >= max_history_length:
                    break

    return session

# Part III Implementation

## Load pickle file

##  Load Category List

In [10]:
import pandas as pd

categories_df = pd.read_csv("product_types.csv")
category_list = categories_df["category_name"].tolist()
print("Total categories loaded:", len(category_list))
print("Sample categories:", category_list[:10])

# 讀取 customers 檔案
customer_df = pd.read_parquet('data/customers.parquet')
print(customer_df['club_member_status'])


Total categories loaded: 131
Sample categories: ['Vest top', 'Bra', 'Underwear Tights', 'Socks', 'Leggings/Tights', 'Sweater', 'Top', 'Trousers', 'Hair clip', 'Umbrella']
0              ACTIVE
1              ACTIVE
2              ACTIVE
3              ACTIVE
4              ACTIVE
              ...    
1371975        ACTIVE
1371976        ACTIVE
1371977        ACTIVE
1371978        ACTIVE
1371979    PRE-CREATE
Name: club_member_status, Length: 1371980, dtype: object


In [15]:
## Test New LLM #2
user_categories = {
    "00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657": [
        "T-shirt", "Sweater", "Blazer", "Vest top", "Jacket", "Shirt"
    ],
    # "0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa": [
    #     "Dress", "Cardigan", "Bikini top", "Swimsuit", "Vest top", "Dress"
    # ]
}

print(get_items_from_categories(user_categories, customer_df))

OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB. GPU 0 has a total capacity of 23.65 GiB of which 230.25 MiB is free. Process 834212 has 13.76 GiB memory in use. Including non-PyTorch memory, this process has 9.66 GiB memory in use. Of the allocated memory 8.54 GiB is allocated by PyTorch, and 686.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

##  Test Category Prefiltering

## Test Session Enrichment

This block is only for demonstration, not real data.

In [31]:
import time 

start_time = time.time()
user_session = {
    "00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657": [841260003, 887593002, 890498002, 795440001, 859416011, 568601043],
    "0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa": [759191008, 800436010, 814686001, 590928022, 698276006, 749699013],
}

new_session = enrich_user_session(user_session, customer_df, category_list, n=3, max_history_length=9)
print("Enriched Session:", new_session)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"執行時間：{elapsed_time:.4f} 秒")

User: 00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657, History: T-shirt Top Jacket Vest top Sweater Blazer

Top categories: ['Vest top', 'T-shirt', 'Sweater', 'Blazer', 'Jacket', 'Tailored Waistcoat', 'Swimwear top', 'Hoodie', 'Shirt', 'Garment Set', 'Outdoor Waistcoat', 'Polo shirt', 'Bodysuit', 'Bikini top', 'Swimsuit', 'Swimwear bottom', 'Swimwear set', 'Clothing mist', 'Zipper head', 'Robe']

LLM raw output: ["T-shirt", "Sweater", "Blazer", "Jacket", "Vest top", "Hoodie", "Shirt", "Garment Set", "Outdoor Waistcoat"] 

End of the output
Selected Categories: ['T-shirt', 'Sweater', 'Blazer', 'Jacket', 'Vest top', 'Hoodie', 'Shirt', 'Garment Set', 'Outdoor Waistcoat']

Raw item LLM output: 
 Sure, here's a realistic product name for a T-shirt that belongs to the T-shirt category, is appropriate for Spring, and has a realistic brand name and product description:

"Blooming Blossom Tee by FreshFit"
Extracted:  "Blooming Blossom Tee by FreshFit"
Raw item LLM output: 
 Her

In [34]:
user_info = get_user_info("00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657", customer_df)

raw_item = get_item_from_category('Top', user_info)
print("Raw item LLM output: \n", raw_item)

Raw item LLM output: 
 Sure, I'd be happy to help! Based on the information provided, here is a realistic product name that belongs to the Top category and is appropriate for Spring:

Aerin Ascend Tank
