# **Amazon Sales Dataset - Recommendation System**

**Objective**: Build and evaluate various recommendation models to identify the top-performing system.

**Deliverables:**
- Multiple recommendation models (Content-Based, Collaborative, Hybrid).
- Model performance report (NDCG@10, HitRate@10, Recall@10).
- A saved, reusable model artifact .
- Prediction function for user recommendations.

### 1. Import Libraries

In [1]:
import os
import re
import json
import math
import warnings
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Any

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

warnings.filterwarnings("ignore")

### 2. Configure the Pipeline

In [14]:
# ---- Notebook Configuration ----
@dataclass
class NBConfig:
    data_path: str = "../data/processed/amazon.csv"  
    model_dir: str = "../models/recommendation" 
    results_dir: str = "../results" # Directory to save evaluation results
    seed: int = 42
    min_interactions: int = 3 
    n_factors: int = 64 # Number of latent factors for SVD
    alpha: float = 0.2 # Weight for the Collaborative model in the Hybrid recommender
    tfidf_max_features: int = 20000
    tfidf_ngram_min: int = 1
    tfidf_ngram_max: int = 2
    top_k_eval: int = 10 # Evaluate the top 10 recommendations
    eval_sample_users: int = 100 # Number of sample users for evaluation
    center_by_user: bool = True  # Normalize ratings by subtracting the user's mean

CFG = NBConfig()

def set_seed(seed: int = 42):
    np.random.seed(seed)

def safe_str(x) -> str:
    if pd.isna(x): return ""
    x = re.sub(r"\\s+", " ", str(x))
    return x.strip()


### 3. DATA LOADING AND PREPARATION

In [15]:
def load_and_prepare(cfg: NBConfig) -> pd.DataFrame:
    df = pd.read_csv(cfg.data_path)
    # "Explode" the user_id column to create distinct user-item interaction rows
    if "user_id" in df.columns and df["user_id"].dtype == object and df["user_id"].astype(str).str.contains(",").any():
        tmp = df.copy()
        tmp["user_id_list"] = tmp["user_id"].astype(str).str.split(",")
        tmp = tmp.explode(["user_id_list"])
        tmp["user_id"] = tmp["user_id_list"].astype(str).str.strip()
        df = tmp.drop(columns=["user_id_list"], errors="ignore")

    need = ["user_id", "product_id", "rating", "category_main", "about_product", "product_name"]
    df = df.dropna(subset=need)
    df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
    df = df.dropna(subset=["rating"]).drop_duplicates(subset=["user_id", "product_id"])

    # Filter out users with few interactions to reduce noise
    uc = df.groupby("user_id").size()
    valid_users = uc[uc >= cfg.min_interactions].index
    df = df[df["user_id"].isin(valid_users)].copy()
    
    # Reset index to ensure consistency
    df.reset_index(drop=True, inplace=True)
    return df

interactions = load_and_prepare(CFG)

### 4. EVALUATION STRATEGY: LEAVE-ONE-OUT SPLIT

In [16]:
def leave_one_out_split(interactions: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    # Add a small amount of jitter to ensure a unique sort order
    jitter = np.random.uniform(0, 1e-6, size=len(interactions))
    tmp = interactions.copy()
    tmp["__j"] = jitter
    train, test = [], []
    for uid, g in tmp.groupby("user_id", sort=False):
        # Sort by rating and jitter, taking the last item for the test set
        g = g.sort_values(["rating", "__j"], ascending=[False, True])
        if len(g) >= 2:
            train.append(g.iloc[:-1])
            test.append(g.iloc[-1:])
        else:
            train.append(g)
    return pd.concat(train, ignore_index=True), (pd.concat(test, ignore_index=True) if len(test) else pd.DataFrame())

train_df, test_df = leave_one_out_split(interactions)
print(f"Training set: {len(train_df)} - Test set: {len(test_df)}.")

Training set: 1000 - Test set: 357.


### 5. MODEL COMPONENT CONSTRUCTION

#### 5.1. Encoders and User-Product Matrix (UPM)

In [17]:
def build_encoders(train_df: pd.DataFrame):
    ue, pe = LabelEncoder(), LabelEncoder()
    ue.fit(train_df["user_id"].unique())
    pe.fit(train_df["product_id"].unique())
    return ue, pe

def build_upm(train_df: pd.DataFrame, ue: LabelEncoder, pe: LabelEncoder, center_by_user=True):
    td = train_df.copy()
    td["u"] = ue.transform(td["user_id"])
    td["i"] = pe.transform(td["product_id"])
    vals = td["rating"].astype(float).values.copy()
    # Normalize ratings by subtracting the user's mean rating
    if center_by_user:
        vals = vals - td.groupby("u")["rating"].transform("mean").values
    upm = csr_matrix((vals, (td["u"].values, td["i"].values)),
                     shape=(len(ue.classes_), len(pe.classes_)))
    return upm

ue, pe = build_encoders(train_df)
upm = build_upm(train_df, ue, pe, center_by_user=CFG.center_by_user)
print(f"UPM shape: {upm.shape}")

UPM shape: (357, 239)


#### 5.2. Baseline Model: Popularity

In [18]:
def build_popularity(train_df: pd.DataFrame) -> List[str]:
    # Calculate a popularity score combining mean rating and rating count
    pop = (train_df.groupby("product_id")
           .agg(mean_rating=("rating", "mean"), count=("rating", "count"))
           .reset_index())
    gmean, m = train_df["rating"].mean(), 5
    pop["bayes_mean"] = (pop["count"] * pop["mean_rating"] + m * gmean) / (pop["count"] + m)
    pop["score"] = pop["bayes_mean"] * np.log1p(pop["count"])
    pop = pop.sort_values("score", ascending=False)
    return pop["product_id"].tolist()

pop_rank = build_popularity(train_df)

#### 5.3. Content-Based Model

In [19]:
def build_content(train_df: pd.DataFrame, cfg: NBConfig):
    # Create metadata for content-based model
    meta = train_df[["product_id", "category_main", "about_product", "product_name"]].drop_duplicates("product_id").copy()
    meta["combined"] = (meta["category_main"].map(safe_str) + " " + 
                        meta["about_product"].map(safe_str) + " " +
                        meta["product_name"].map(safe_str)).str.lower()

    tfidf = TfidfVectorizer(max_features=cfg.tfidf_max_features, ngram_range=(cfg.tfidf_ngram_min, cfg.tfidf_ngram_max), stop_words="english")
    X = tfidf.fit_transform(meta["combined"])
    sim = cosine_similarity(X, X)
    pid2idx = {pid: i for i, pid in enumerate(meta["product_id"].tolist())}
    idx2pid = {i: pid for pid, i in pid2idx.items()}
    return {"sim": sim, "pid2idx": pid2idx, "idx2pid": idx2pid}

content_artifacts = build_content(train_df, CFG)

#### 5.4. Collaborative Filtering Model (SVD)

In [20]:
def build_svd(upm: csr_matrix, n_factors: int, seed: int):
    svd = TruncatedSVD(n_components=n_factors, random_state=seed)
    U = svd.fit_transform(upm)  # User-feature matrix
    V = svd.components_.T       # Item-feature matrix
    return svd, U, V

svd, U, V = build_svd(upm, CFG.n_factors, CFG.seed)
print(f" U shape: {U.shape}, V shape: {V.shape}")

 U shape: (357, 64), V shape: (239, 64)


### 6. EVALUATING MODEL PERFORMANCE

#### 6.1. Recommendation and Metric Functions

In [21]:
# Recommendation functions for each model
def recommend_content(uid: str, train_df: pd.DataFrame, content_art: Dict[str,Any], k: int, pop_rank: List[str]) -> List[str]:
    # Use the last item the user interacted with as a base
    last_pid = train_df[train_df["user_id"] == uid].tail(1)["product_id"].values
    if not last_pid: return pop_rank[:k]
    pid = last_pid[0]
    
    if pid not in content_art["pid2idx"]: return pop_rank[:k]
    idx = content_art["pid2idx"][pid]
    scores = list(enumerate(content_art["sim"][idx]))
    scores.sort(key=lambda x: x[1], reverse=True)
    interacted = set(train_df[train_df["user_id"] == uid]['product_id'].tolist())
    recs = [content_art["idx2pid"][i] for i, _ in scores if content_art["idx2pid"][i] not in interacted]
    return recs[:k]

def recommend_collab(uid: str, enc: Dict[str,Any], U: np.ndarray, V: np.ndarray, upm: csr_matrix, k: int, pop_rank: List[str]) -> List[str]:
    ue, pe = enc["user"], enc["product"]
    if uid not in ue.classes_: return pop_rank[:k]
    uidx = ue.transform([uid])[0]
    interacted = upm[uidx].nonzero()[1]
    scores = U[uidx] @ V.T
    if len(interacted) > 0:
        scores = scores.copy()
        scores[interacted] = -np.inf
    best = np.argsort(scores)[::-1][:k]
    return pe.inverse_transform(best).tolist()

def recommend_hybrid(uid: str, train_df: pd.DataFrame, enc: Dict[str,Any], U: np.ndarray, V: np.ndarray, upm: csr_matrix, content_art: Dict[str,Any], alpha: float, k: int, pop_rank: List[str]) -> List[str]:
    collab_list = recommend_collab(uid, enc, U, V, upm, k*5, pop_rank) # Get more candidates for re-ranking
    
    content_list = recommend_content(uid, train_df, content_art, k*5, pop_rank)
    
    # Calculate hybrid scores
    hybrid_scores = {}
    for i, p in enumerate(collab_list):
        hybrid_scores[p] = hybrid_scores.get(p, 0.0) + alpha * (1.0 / (i + 1))
    for i, p in enumerate(content_list):
        hybrid_scores[p] = hybrid_scores.get(p, 0.0) + (1.0 - alpha) * (1.0 / (i + 1))
        
    return [p for p, _ in sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:k]]


# Metric calculation functions
def ndcg_at_k(rec: List[str], rel: List[str], k: int) -> float:
    rec = rec[:k]
    dcg = sum(1.0 / math.log2(i + 2) for i, it in enumerate(rec) if it in rel)
    idcg = sum(1.0 / math.log2(i + 2) for i in range(min(len(rel), k)))
    return float(dcg / idcg) if idcg > 0 else 0.0

def hitrate_at_k(rec: List[str], rel: List[str], k: int) -> float:
    return float(any(it in rel for it in rec[:k]))

def recall_at_k(rec: List[str], rel: List[str], k: int) -> float:
    if not rel: return 0.0
    return float(len(set(rec[:k]) & set(rel)) / len(rel))

# Main evaluation function
def evaluate_all(train_df, test_df, enc, U, V, upm, content_art, pop_rank, cfg: NBConfig) -> pd.DataFrame:
    users = list(set(train_df["user_id"]) & set(test_df["user_id"]))
    if len(users) > cfg.eval_sample_users:
        users = list(np.random.choice(users, size=cfg.eval_sample_users, replace=False))
    
    all_pids = train_df["product_id"].unique().tolist()
    rng = np.random.default_rng(cfg.seed)

    def do_eval(name, fn):
        metrics = {'NDCG@10': [], 'HitRate@10': [], 'Recall@10': []}
        for uid in users:
            rel = test_df.loc[test_df["user_id"] == uid, "product_id"].tolist()
            if not rel: continue
            recs = fn(uid)
            metrics['NDCG@10'].append(ndcg_at_k(recs, rel, cfg.top_k_eval))
            metrics['HitRate@10'].append(hitrate_at_k(recs, rel, cfg.top_k_eval))
            metrics['Recall@10'].append(recall_at_k(recs, rel, cfg.top_k_eval))
        return {"Model": name, **{k: np.mean(v) for k, v in metrics.items()}, "NumUsers": len(users)}

    rows = [
        do_eval("Popularity", lambda uid: pop_rank[:cfg.top_k_eval]),
        do_eval("Random", lambda uid: rng.choice(all_pids, size=cfg.top_k_eval, replace=False).tolist()),
        do_eval("Content-Based", lambda uid: recommend_content(uid, train_df, content_art, cfg.top_k_eval, pop_rank)),
        do_eval("Collaborative", lambda uid: recommend_collab(uid, enc, U, V, upm, cfg.top_k_eval, pop_rank)),
        do_eval("Hybrid", lambda uid: recommend_hybrid(uid, train_df, enc, U, V, upm, content_art, cfg.alpha, cfg.top_k_eval, pop_rank))
    ]
    return pd.DataFrame(rows).sort_values("NDCG@10", ascending=False)


#### 6.2. Run and Analyze Evaluation Results

In [22]:
enc = {"user": ue, "product": pe}
results = evaluate_all(train_df, test_df, enc, U, V, upm, content_artifacts, pop_rank, CFG)

print("\n----- MODEL PERFORMANCE EVALUATION RESULTS -----")
display(results)

# Save results
os.makedirs(CFG.results_dir, exist_ok=True)
results_path = os.path.join(CFG.results_dir, "recommendation_results.csv")
results.to_csv(results_path, index=False)
print(f"\nEvaluation results saved to: {results_path}")


----- MODEL PERFORMANCE EVALUATION RESULTS -----


Unnamed: 0,Model,NDCG@10,HitRate@10,Recall@10,NumUsers
2,Content-Based,0.816818,0.86,0.86,100
4,Hybrid,0.815973,0.86,0.86,100
3,Collaborative,0.026572,0.04,0.04,100
1,Random,0.014906,0.04,0.04,100
0,Popularity,0.0,0.0,0.0,100



Evaluation results saved to: ../results\recommendation_results.csv


### 7. SAVING AND USING THE MODEL FOR INFERENCE

In [23]:
def save_artifacts(model_dir: str, cfg: NBConfig, ue, pe, svd, U, V, content_art, pop_rank):
    os.makedirs(model_dir, exist_ok=True)
    artifacts = {
        "config": asdict(cfg), "user_encoder": ue, "product_encoder": pe,
        "svd": svd, "U": U, "V": V,
        "content_pid2idx": content_art["pid2idx"], "content_idx2pid": content_art["idx2pid"],
        "content_similarity": content_art["sim"],
        "pop_rank": pop_rank, "train_df_cols": train_df.columns.tolist()
    }
    dump(artifacts, os.path.join(model_dir, "hybrid_model.joblib"))
    print(f"Model artifacts saved to the '{model_dir}' directory.")

save_artifacts(CFG.model_dir, CFG, ue, pe, svd, U, V, content_artifacts, pop_rank)

# Example of how to use the saved model for inference
def predict_for_user(user_id: str, model_path: str = "../models/recommendation/hybrid_model.joblib", top_k: int = 10) -> List[str]:
    artifacts = load(model_path)
    # Reload components
    enc = {"user": artifacts["user_encoder"], "product": artifacts["product_encoder"]}
    U, V = artifacts["U"], artifacts["V"]
    pop_rank = artifacts["pop_rank"]
    
    # Generate recommendations using the collaborative model
    recommendations = recommend_collab(user_id, enc, U, V, csr_matrix(U.shape), top_k, pop_rank)
    return recommendations

# Get a sample user and make a prediction
some_user = train_df["user_id"].iloc[10] 
print(f"\n----- Recommendations for User ID: {some_user} -----")
user_recommendations = predict_for_user(some_user, top_k=20)
print(user_recommendations)

Model artifacts saved to the '../models/recommendation' directory.

----- Recommendations for User ID: AHCTC6ULH4XB6YHDY6PCH2R772LQ -----
['B0BMGG6NKT', 'B07WJV6P1R', 'B088ZFJY82', 'B086Q3QMFS', 'B0859M539M', 'B084PJSSQ1', 'B083T5G5PM', 'B082LZGK39', 'B082LSVT4B', 'B081FJWN52', 'B081FG1QYX', 'B0811VCGL5', 'B07XLCFSSN', 'B07WJWRNVK', 'B07WHSJXLF', 'B08C7TYHPB', 'B07WHQWXL7', 'B07WHQBZLS', 'B07WGPKTS4', 'B07WGPKMP5']
