In [4]:
# import torch
# import dill  # pip install dill
# import os

# # Define file paths (adjust as needed)
# processed_dir = "./data/processed"
# original_data_path = os.path.join(processed_dir, "lightgcn_data.pt")
# safe_data_path = os.path.join(processed_dir, "lightgcn_data_safe.pt")

# # Attempt to load the original file using dill as the pickle_module
# print("Attempting to load original data using dill...")
# data = torch.load(original_data_path, map_location="cpu", pickle_module=dill, weights_only=False)
# print("Original data loaded successfully.")

# # Check and convert edge attributes to standard float tensors if needed
# if hasattr(data, "edge_attr") and data.edge_attr is not None:
#     try:
#         # Convert custom edge_attr to a standard float tensor by calling .tolist() and re-wrapping as a tensor
#         safe_edge_attr = torch.tensor(data.edge_attr.tolist(), dtype=torch.float)
#         data.edge_attr = safe_edge_attr
#         print("Converted edge attributes to a standard float tensor.")
#     except Exception as e:
#         print("Error converting edge_attr to float tensor:", e)
# else:
#     print("No edge attributes found or already in safe format.")

# # Save the safe data file
# torch.save(data, safe_data_path)
# print(f"Safe data saved to: {safe_data_path}")


##### Graph prediction head (inference)

In [25]:
import os
import re
import torch
import pickle
import pandas as pd
import dill
from torch.serialization import safe_globals
import torch_geometric.data.data as tg_data
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# ========= REUSED FUNCTIONS =========
# Compatibility shim: if the imported `torch_geometric.data.data` module does
# not expose a `DataEdgeAttr` attribute, attach an empty placeholder class
# under that name so `tg_data.DataEdgeAttr` (used by load_safe_data below)
# can always be resolved. When the attribute already exists nothing is changed.
if not hasattr(tg_data, "DataEdgeAttr"):
    class DummyDataEdgeAttr:
        pass
    tg_data.DataEdgeAttr = DummyDataEdgeAttr
    print("Defined dummy DataEdgeAttr.")

def load_safe_data(data_path, device):
    """Load the serialized graph object stored at ``data_path``.

    Args:
        data_path: Path of the file to read; it is echoed to stdout.
        device: Forwarded to ``torch.load`` as ``map_location``.

    Returns:
        The object that ``torch.load`` deserializes from the file.

    NOTE(review): the file is read with ``weights_only=False`` and ``dill``
    as the pickle module, i.e. without the restricted unpickler. Only point
    this at files you trust.
    """
    print(f"Loading safe data from: {data_path}")
    allow_listed = [tg_data.DataEdgeAttr]
    with safe_globals(allow_listed):
        return torch.load(
            data_path,
            map_location=device,
            pickle_module=dill,
            weights_only=False,
        )

def load_meta(meta_path):
    """Return the object pickled in the file at ``meta_path``.

    Args:
        meta_path: Path of a pickle file, opened in binary mode.

    Returns:
        The unpickled object.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only use this on trusted files.
    """
    with open(meta_path, "rb") as handle:
        return pickle.load(handle)

def load_trained_model(model_path, num_users, num_items, embed_dim, num_layers, model_type, device):
    """Instantiate a recommender, load its saved weights and switch it to eval mode.

    Args:
        model_path: Path of the saved state dict passed to ``torch.load``.
        num_users: Forwarded positionally to the model constructor.
        num_items: Forwarded positionally to the model constructor.
        embed_dim: Forwarded as the ``embed_dim`` keyword.
        num_layers: Forwarded as the ``num_layers`` keyword.
        model_type: ``"enhanced"`` selects ``EnhancedLightGCN``; any other
            value selects ``LightGCN``.
        device: Device the model is moved to and the weights are mapped to.

    Returns:
        The model, on ``device``, with weights loaded and ``eval()`` applied.
    """
    # Both classes come from the project-local `model` module.
    from model import LightGCN, EnhancedLightGCN

    model_cls = EnhancedLightGCN if model_type == "enhanced" else LightGCN
    network = model_cls(num_users, num_items, embed_dim=embed_dim, num_layers=num_layers)
    network = network.to(device)

    print(f"Loading model from {model_path}")
    network.load_state_dict(torch.load(model_path, map_location=device))
    network.eval()
    return network

def get_recommendations(model, data, meta, user_id, top_k=10):
    """Return the ids of the highest-scoring articles for one customer.

    The model is called with ``data.edge_index`` and ``data.edge_attr`` and
    its output is treated as one embedding row per node: the customer's row
    is ``meta["customer_id_map"][user_id]`` and every row from index
    ``meta["num_customers"]`` onwards is treated as an item. Items are scored
    by dot product with the customer's embedding.

    Args:
        model: Callable module; its first parameter determines the device.
        data: Object exposing ``edge_index`` and ``edge_attr`` tensors.
        meta: Dict with ``customer_id_map``, ``num_customers`` and
            ``article_id_map`` entries.
        user_id: Customer id to look up in ``meta["customer_id_map"]``.
        top_k: Maximum number of recommendations. Values larger than the
            number of item rows are clamped instead of raising.

    Returns:
        A list of article ids, best first. Item indices missing from
        ``article_id_map`` are rendered as ``"Unknown(<index>)"``. The list is
        empty when the customer is unknown or there is nothing to rank.
    """
    device = next(model.parameters()).device
    if user_id not in meta["customer_id_map"]:
        print(f"User ID {user_id} not found in meta.")
        return []
    u_idx = meta["customer_id_map"][user_id]
    num_users = meta["num_customers"]

    with torch.no_grad():
        embeddings = model(data.edge_index.to(device), data.edge_attr.to(device))

    user_embedding = embeddings[u_idx].unsqueeze(0)
    item_embeddings = embeddings[num_users:]
    scores = torch.matmul(user_embedding, item_embeddings.t()).squeeze(0)

    # torch.topk raises if k exceeds the number of scores, so never ask for
    # more items than exist.
    k = min(top_k, scores.numel())
    if k <= 0:
        return []
    _, topk_indices = torch.topk(scores, k)
    topk_indices = topk_indices.cpu().tolist()

    # article_id_map goes id -> index; invert it to translate indices back.
    reverse_article_map = {v: k_id for k_id, v in meta["article_id_map"].items()}
    return [reverse_article_map.get(i, f"Unknown({i})") for i in topk_indices]

def enrich_product_description(article_id, articles_df):
    """Build a one-line, human-readable description of an article.

    Args:
        article_id: Value compared against the ``article_id`` column.
        articles_df: DataFrame with ``article_id``, ``prod_name``,
            ``product_type_name``, ``product_group_name`` and
            ``colour_group_name`` columns; ``detail_desc`` is optional.

    Returns:
        ``"<name> – a <type> from <group> in <colour>. <detail>"`` for the
        first matching row, or a "details not found" sentence when no row
        matches. A missing or NaN ``detail_desc`` contributes an empty string.
    """
    matches = articles_df[articles_df["article_id"] == article_id]
    if matches.empty:
        return f"Product with ID {article_id} (details not found)."
    record = matches.iloc[0]

    # Empty CSV cells are read as NaN; without this check the literal text
    # "nan" would be appended to the description.
    detail = record.get("detail_desc", "")
    if pd.isna(detail):
        detail = ""

    return (
        f"{record['prod_name']} – a {record['product_type_name']} from "
        f"{record['product_group_name']} in {record['colour_group_name']}. {detail}"
    )

def get_customer_profile(customer_id, customers_df):
    """Summarize a customer's age and membership status as one line of text.

    Args:
        customer_id: Value compared against the ``customer_id`` column.
        customers_df: DataFrame with a ``customer_id`` column; ``age`` and
            ``club_member_status`` are optional.

    Returns:
        ``"Age: <age>, Membership: <status>"`` for the first matching row, or
        ``"Customer details not found."`` when no row matches. A field whose
        column is absent or whose value is NaN/None is shown as ``N/A``.
    """
    matches = customers_df[customers_df["customer_id"] == customer_id]
    if matches.empty:
        return "Customer details not found."
    record = matches.iloc[0]

    def _field(column):
        # .get only covers a missing column; empty CSV cells arrive as NaN
        # and would otherwise be printed as "nan".
        value = record.get(column, "N/A")
        return "N/A" if pd.isna(value) else value

    return f"Age: {_field('age')}, Membership: {_field('club_member_status')}"

def parse_customer_id(query):
    """Extract a hexadecimal customer id from free text.

    Looks (case-insensitively) for "customer id", optionally followed by
    ``:`` or ``-`` and whitespace, and captures the run of hex digits after it.

    Args:
        query: Text to search.

    Returns:
        The captured hex string, or ``None`` when the pattern does not occur.
    """
    found = re.search(r'customer\s*id[:\-]?\s*([0-9a-fA-F]+)', query, re.IGNORECASE)
    return found.group(1) if found else None

# ========= NEW CODE: Constructing the Final Informational Prompt =========

# ----- Configuration: directories, file names and model hyper-parameters -----
data_dir = "./data/processed"           # Adjust as needed
model_dir = "./output"                  # Adjust as needed
model_file = "standard_lightgcN_best.pth"  # Adjust as needed
model_type = "standard"  # or "enhanced"
embed_dim = 64
num_layers = 3

# ----- Derived paths -----
safe_data_path = os.path.join(data_dir, "lightgcn_data_safe.pt")
meta_path = os.path.join(data_dir, "lightgcn_meta.pkl")
model_path = os.path.join(model_dir, model_file)
customers_csv_path = "./data/filtered_customers.csv"  # Customer profiles
articles_csv_path = "./data/filtered_articles.csv"     # Product metadata
transactions_csv_path = "./data/filtered_transactions_train.csv"  # Transactions data

# ----- Device: use the GPU when one is available -----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ----- Graph data and id/size metadata -----
data = load_safe_data(safe_data_path, device)
meta = load_meta(meta_path)
num_users = meta["num_customers"]
num_items = meta["num_articles"]

# ----- Trained graph model -----
model = load_trained_model(
    model_path=model_path,
    num_users=num_users,
    num_items=num_items,
    embed_dim=embed_dim,
    num_layers=num_layers,
    model_type=model_type,
    device=device,
)

# ----- Tabular metadata used to enrich the prompt -----
customers_df = pd.read_csv(customers_csv_path)
articles_df = pd.read_csv(articles_csv_path)
transactions_df = pd.read_csv(transactions_csv_path)

# ----- Resolve which customer to serve -----
# The id is parsed from a natural-language query; if nothing usable is found,
# the first key of the customer id map is used instead.
user_query = "Please give me product recommendations for customer id: 071ba51649f345894a944da3e9a0e3658299780f46a7fe89e03b221ac4a604e9."
parsed_id = parse_customer_id(user_query)
if parsed_id is not None and parsed_id in meta["customer_id_map"]:
    customer_id = parsed_id
else:
    print("Could not parse a valid customer id from input. Using default customer.")
    customer_id = list(meta["customer_id_map"].keys())[0]
print(f"\nUsing customer ID: {customer_id}")

# ----- Customer Profile section -----
profile_text = get_customer_profile(customer_id, customers_df)

# ----- Purchase History: one enriched description per distinct article bought -----
customer_transactions = transactions_df[transactions_df["customer_id"] == customer_id]
if not customer_transactions.empty:
    purchased_article_ids = customer_transactions["article_id"].unique().tolist()
    purchase_history_list = [
        enrich_product_description(article, articles_df)
        for article in purchased_article_ids
    ]
else:
    purchase_history_list = ["No purchase history found."]

# ----- Candidate recommendations from the graph model, enriched the same way -----
raw_recs = get_recommendations(model, data, meta, customer_id, top_k=10)
recommended_products_list = [
    enrich_product_description(article, articles_df) for article in raw_recs
]

# ----- Assemble the informational prompt for the paraphrasing LLM -----
# Sections, in order: customer profile, purchase history, candidate
# recommendations, a golden example of the desired output, the instruction,
# and finally the "### Response:" delimiter.

# Golden example: a text file showing the ideal output format.
golden_example_path = "golden_example_copy.txt"  # Ensure this file exists with your ideal output format
with open(golden_example_path, "r") as f:
    golden_example = f.read()

history_block = "\n".join(f"- {item}" for item in purchase_history_list)
recommendations_block = "\n".join(f"- {item}" for item in recommended_products_list)
instruction_text = (
    "Based on the above information, please rewrite and summarize the data into a clean, human-readable format. "
    "The output should include a concise summary of the customer's purchase history and a friendly, numbered list of product recommendations."
)

final_prompt = (
    f"Customer Profile:\n{profile_text}\n\n"
    f"Purchase History:\n{history_block}\n\n"
    f"Product Recommendations:\n{recommendations_block}\n\n"
    f"Below is an example of the ideal output format:\n{golden_example}\n\n"
    f"{instruction_text}"
    "\n\n### Response:"
)

print("\nConstructed Informational Prompt:\n")
print(final_prompt)


Using device: cuda
Loading safe data from: ./data/processed\lightgcn_data_safe.pt
Loading model from ./output\standard_lightgcN_best.pth

Using customer ID: 071ba51649f345894a944da3e9a0e3658299780f46a7fe89e03b221ac4a604e9

Constructed Informational Prompt:

Customer Profile:
Age: 49.0, Membership: ACTIVE

Purchase History:
- &DENIM+ Curvy jegging HW – a Trousers from Garment Lower body in Dark Blue. Ankle-length jeggings in stretch denim with worn details. Extra-high waist, zip fly and button, fake front pockets, real back pockets and slim legs with raw-edge hems.
- RONNY R-NECK – a T-shirt from Garment Upper body in Dark Blue. Round-necked T-shirt in soft cotton jersey.
- Woody hoodie – a Hoodie from Garment Upper body in Grey. Wide top in sweatshirt fabric with a lined drawstring hood, kangaroo pocket and ribbing at the cuffs and hem.
- Bama1 – a Sweater from Garment Upper body in Greyish Beige. Boxy-style jumper in a soft, fine knit containing some wool with dropped shoulders, long 

##### Paraphrasing

In [26]:
# Re-display the prompt assembled in the previous cell so its full text can be
# checked before it is passed to the LLM pipeline.
print(final_prompt)

Customer Profile:
Age: 49.0, Membership: ACTIVE

Purchase History:
- &DENIM+ Curvy jegging HW – a Trousers from Garment Lower body in Dark Blue. Ankle-length jeggings in stretch denim with worn details. Extra-high waist, zip fly and button, fake front pockets, real back pockets and slim legs with raw-edge hems.
- RONNY R-NECK – a T-shirt from Garment Upper body in Dark Blue. Round-necked T-shirt in soft cotton jersey.
- Woody hoodie – a Hoodie from Garment Upper body in Grey. Wide top in sweatshirt fabric with a lined drawstring hood, kangaroo pocket and ribbing at the cuffs and hem.
- Bama1 – a Sweater from Garment Upper body in Greyish Beige. Boxy-style jumper in a soft, fine knit containing some wool with dropped shoulders, long sleeves and ribbing around the neckline, cuffs and hem. The polyester content of the jumper is recycled.
- Woody(1) – a Hoodie from Garment Upper body in Dark Grey. Wide top in sweatshirt fabric with a lined drawstring hood, kangaroo pocket and ribbing at th

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

# Instruct-tuned checkpoint used for the paraphrasing step
llm_model_name = "unsloth/Llama-3.2-1B-Instruct"

# Tokenizer and causal-LM weights for that checkpoint.
# NOTE(review): trust_remote_code=True allows code shipped in the Hub
# repository to run locally; confirm this checkpoint actually needs it.
tokenizer_llm = AutoTokenizer.from_pretrained(llm_model_name, trust_remote_code=True)
model_llm = AutoModelForCausalLM.from_pretrained(llm_model_name, trust_remote_code=True)

# Pipeline device index: 0 selects the first CUDA device, -1 selects the CPU
device_llm = -1 if not torch.cuda.is_available() else 0

# Text-generation pipeline wrapping the model and tokenizer loaded above
llm_pipe = pipeline(
    "text-generation",
    model=model_llm,
    tokenizer=tokenizer_llm,
    device=device_llm,
)

# Sampling settings for the generation call.
generation_kwargs = dict(max_new_tokens=1500, do_sample=True, top_p=0.95, temperature=0.7)

# final_prompt must already exist (it is built by the prompt-construction cell).
llm_out = llm_pipe(final_prompt, **generation_kwargs)

# Keep only what follows the last "### Response:" delimiter; when the delimiter
# is absent, rpartition leaves the whole text in the last slot.
generated_text = llm_out[0]["generated_text"]
final_output = generated_text.rpartition("### Response:")[2].strip()

print("\nFinal Paraphrased Output from LLM:\n", final_output)


Device set to use cuda:0



Final Paraphrased Output from LLM:
 **Customer Summary:**
The customer is 49 years old, an Active member. Their purchase history includes a variety of clothing items such as tops, sweaters, and jeans. They have also purchased home goods like bras, sweatshirts, and hoodies.

**Product Recommendations:**

1. **NT Alva 2-Pack Nursing Tops**: Soft, organic cotton tops with adjustable shoulder straps and convenient nursing access.
2. **Cat Tee**: Versatile soft jersey T-shirt in White, Grey, or Pink.
3. **Cassia Crew Sweater**: Relaxed cotton-blend sweatshirt with dropped shoulders and ribbed details.
4. **Becka Hoodie**: Cozy, casual hoodie made from soft fabric featuring a jersey-lined hood and a convenient kangaroo pocket.
5. **Penny Wide Culotte**: Stylish wide-leg trousers with a flattering high waist and hidden side pockets.
6. **Seamless Cheeky Brief**: Comfortable microfiber briefs with minimal seams, high waist, and ideal for daily wear.
7. **Babe LS Top**: Fashionable ribbed jers

In [28]:
# --- Post-Processing the LLM Output ---

# Remove markdown emphasis by deleting every asterisk.
final_output_clean = final_output.replace("*", "")

# Trim surrounding whitespace on each line and drop lines that end up empty.
final_output_clean = "\n".join(
    stripped
    for stripped in (line.strip() for line in final_output_clean.splitlines())
    if stripped != ""
)

# Save the clean output to a text file named "SFT_data_2.txt".
# The encoding is set explicitly: without it, open() uses the platform's
# locale encoding, which can raise UnicodeEncodeError on characters the LLM
# produces (for example on Windows code pages).
output_file = "SFT_data_2.txt"
with open(output_file, "w", encoding="utf-8") as f:
    f.write(final_output_clean)

print("\nCleaned Final Output saved to", output_file)
print("\nCleaned Final Output:\n", final_output_clean)



Cleaned Final Output saved to SFT_data_2.txt

Cleaned Final Output:
 Customer Summary:
The customer is 49 years old, an Active member. Their purchase history includes a variety of clothing items such as tops, sweaters, and jeans. They have also purchased home goods like bras, sweatshirts, and hoodies.
Product Recommendations:
1. NT Alva 2-Pack Nursing Tops: Soft, organic cotton tops with adjustable shoulder straps and convenient nursing access.
2. Cat Tee: Versatile soft jersey T-shirt in White, Grey, or Pink.
3. Cassia Crew Sweater: Relaxed cotton-blend sweatshirt with dropped shoulders and ribbed details.
4. Becka Hoodie: Cozy, casual hoodie made from soft fabric featuring a jersey-lined hood and a convenient kangaroo pocket.
5. Penny Wide Culotte: Stylish wide-leg trousers with a flattering high waist and hidden side pockets.
6. Seamless Cheeky Brief: Comfortable microfiber briefs with minimal seams, high waist, and ideal for daily wear.
7. Babe LS Top: Fashionable ribbed jersey to