In [6]:
# ==== Imports & Config (Notebook) ====
import os, re, json, math, warnings
from dataclasses import dataclass, asdict
from typing import Dict, List, Tuple, Any

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

warnings.filterwarnings("ignore")

# ---- Config cho Notebook ----
@dataclass
class NBConfig:
    data_path: str = "../data/processed/amazon.csv"
    model_dir: str = "../models/recommendation"
    results_dir: str = "../results"
    seed: int = 42
    min_interactions: int = 3
    n_factors: int = 64
    alpha: float = 0.55
    tfidf_max_features: int = 20000
    tfidf_ngram_min: int = 1
    tfidf_ngram_max: int = 2
    top_k_eval: int = 10
    eval_sample_users: int = 100
    center_by_user: bool = True  # nên bật

CFG = NBConfig()  # chỉnh tại đây nếu cần

def set_seed(seed: int = 42):
    np.random.seed(seed)

def safe_str(x) -> str:
    if pd.isna(x): return ""
    x = re.sub(r"\s+", " ", str(x))
    return x.strip()


In [7]:
# ==== Data load & prep ====
def load_and_prepare(cfg: NBConfig) -> pd.DataFrame:
    df = pd.read_csv(cfg.data_path)
    if "user_id" in df.columns and df["user_id"].dtype == object and df["user_id"].astype(str).str.contains(",").any():
        tmp = df.copy()
        tmp["user_id_list"] = tmp["user_id"].astype(str).str.split(",")
        if "review_id" in tmp.columns:
            tmp["review_id_list"] = tmp["review_id"].astype(str).str.split(",")
            tmp = tmp.explode(["user_id_list", "review_id_list"])
            tmp["review_id"] = tmp["review_id_list"].astype(str).str.strip()
            tmp.drop(columns=["review_id_list"], inplace=True)
        else:
            tmp = tmp.explode(["user_id_list"])
        tmp["user_id"] = tmp["user_id_list"].astype(str).str.strip()
        df = tmp.drop(columns=["user_id_list"], errors="ignore")

    need = ["user_id","product_id","rating","category","about_product"]
    missing = [c for c in need if c not in df.columns]
    if missing: raise ValueError(f"Missing cols: {missing}")

    inter = df[need].copy()
    inter = inter.dropna(subset=["user_id","product_id","rating"])
    inter["user_id"] = inter["user_id"].astype(str).str.strip()
    inter["product_id"] = inter["product_id"].astype(str).str.strip()
    inter["rating"] = pd.to_numeric(inter["rating"], errors="coerce")
    inter = inter.dropna(subset=["rating"]).drop_duplicates(subset=["user_id","product_id"])

    # Filter users with >= min_interactions
    uc = inter.groupby("user_id").size()
    valid_users = uc[uc >= cfg.min_interactions].index
    inter = inter[inter["user_id"].isin(valid_users)].copy()
    return inter

# ==== Leave-One-Out split ====
def leave_one_out_split(interactions: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    jitter = np.random.uniform(0, 1e-6, size=len(interactions))
    tmp = interactions.copy()
    tmp["__j"] = jitter
    train, test = [], []
    for uid, g in tmp.groupby("user_id", sort=False):
        g = g.sort_values(["rating","__j"], ascending=[False, True])
        if len(g) >= 2:
            train.append(g.iloc[:-1])
            test.append(g.iloc[-1:])
        else:
            train.append(g)
    return pd.concat(train, ignore_index=True), (pd.concat(test, ignore_index=True) if len(test) else pd.DataFrame())

# ==== Encoders & Matrix ====
def build_encoders(train_df: pd.DataFrame):
    ue, pe = LabelEncoder(), LabelEncoder()
    ue.fit(train_df["user_id"].unique())
    pe.fit(train_df["product_id"].unique())
    return ue, pe

def build_upm(train_df: pd.DataFrame, ue: LabelEncoder, pe: LabelEncoder, center_by_user=True):
    td = train_df.copy()
    td["u"] = ue.transform(td["user_id"])
    td["i"] = pe.transform(td["product_id"])
    vals = td["rating"].astype(float).values.copy()
    if center_by_user:
        vals = vals - td.groupby("u")["rating"].transform("mean").values
    upm = csr_matrix((vals, (td["u"].values, td["i"].values)),
                     shape=(len(ue.classes_), len(pe.classes_)))
    return upm

# ==== Popularity (Bayesian smoothing) ====
def build_popularity(train_df: pd.DataFrame) -> List[str]:
    pop = (train_df.groupby("product_id")
           .agg(mean_rating=("rating","mean"), count=("rating","count"))
           .reset_index())
    gmean, m = train_df["rating"].mean(), 5
    pop["bayes_mean"] = (pop["count"]*pop["mean_rating"] + m*gmean)/(pop["count"]+m)
    pop["score"] = pop["bayes_mean"] * np.log1p(pop["count"])
    pop = pop.sort_values("score", ascending=False)
    return pop["product_id"].tolist()

# ==== Content model ====
def build_content(train_df: pd.DataFrame, cfg: NBConfig):
    meta = train_df[["product_id","category","about_product"]].drop_duplicates("product_id").copy()
    meta["category"] = meta["category"].map(safe_str)
    meta["about_product"] = meta["about_product"].map(safe_str)
    meta["combined"] = (meta["category"] + " " + meta["about_product"]).str.lower()

    tfidf = TfidfVectorizer(max_features=cfg.tfidf_max_features,
                            ngram_range=(cfg.tfidf_ngram_min, cfg.tfidf_ngram_max),
                            stop_words="english")
    X = tfidf.fit_transform(meta["combined"])
    sim = cosine_similarity(X, X)
    pid2idx = {pid:i for i, pid in enumerate(meta["product_id"].tolist())}
    idx2pid = {i:pid for pid, i in pid2idx.items()}
    return {"meta":meta, "tfidf":tfidf, "X":X, "sim":sim, "pid2idx":pid2idx, "idx2pid":idx2pid}

def content_recommend(pid: str, art: Dict[str,Any], k: int, pop_rank: List[str]) -> List[str]:
    if pid not in art["pid2idx"]: return pop_rank[:k]
    idx = art["pid2idx"][pid]
    scores = list(enumerate(art["sim"][idx]))
    scores.sort(key=lambda x: x[1], reverse=True)
    return [art["idx2pid"][i] for i,_ in scores[1:k+1]]

# ==== Collaborative (SVD) ====
def build_svd(upm: csr_matrix, n_factors: int, seed: int):
    svd = TruncatedSVD(n_components=n_factors, random_state=seed)
    U = svd.fit_transform(upm)
    V = svd.components_.T
    return svd, U, V

def collab_recommend(uid: str, enc: Dict[str,Any], U: np.ndarray, V: np.ndarray,
                     upm: csr_matrix, k: int, pop_rank: List[str]) -> List[str]:
    ue, pe = enc["user"], enc["product"]
    if uid not in ue.classes_: return pop_rank[:k]
    uidx = ue.transform([uid])[0]
    interacted = upm[uidx].nonzero()[1]
    scores = U[uidx] @ V.T
    if len(interacted) > 0:
        scores = scores.copy()
        scores[interacted] = -np.inf
    best = np.argsort(scores)[::-1][:k]
    return pe.inverse_transform(best).tolist()

# ==== Hybrid ====
def hybrid_recommend(uid: str, train_df: pd.DataFrame, enc: Dict[str,Any],
                     U: np.ndarray, V: np.ndarray, upm: csr_matrix,
                     content_art: Dict[str,Any], alpha: float, k: int,
                     pop_rank: List[str]) -> List[str]:
    collab_list = collab_recommend(uid, enc, U, V, upm, max(50,k), pop_rank)
    hist = train_df[train_df["user_id"] == uid]
    liked = hist[hist["rating"] >= 4]["product_id"].tolist() or hist["product_id"].tolist()
    content_scores = {}
    for pid in liked[:10]:
        for p in content_recommend(pid, content_art, 50, pop_rank):
            content_scores[p] = content_scores.get(p, 0.0) + 1.0
    if content_scores:
        mx = max(content_scores.values())
        for p in list(content_scores.keys()):
            content_scores[p] /= mx
    hybrid = {}
    for i, p in enumerate(collab_list):
        hybrid[p] = hybrid.get(p, 0.0) + alpha * (1.0/(i+1))
    for p, s in content_scores.items():
        hybrid[p] = hybrid.get(p, 0.0) + (1.0 - alpha) * s
    interacted = set(hist["product_id"].tolist())
    hybrid = {p:s for p,s in hybrid.items() if p not in interacted}
    return [p for p,_ in sorted(hybrid.items(), key=lambda x:x[1], reverse=True)[:k]]

# ==== Metrics & Eval ====
def ndcg_at_k(rec: List[str], rel: List[str], k: int) -> float:
    rec = rec[:k]
    dcg = 0.0
    for i, it in enumerate(rec):
        if it in rel: dcg += 1.0 / math.log2(i+2)
    idcg = sum(1.0 / math.log2(i+2) for i in range(min(len(rel), k)))
    return float(dcg/idcg) if idcg > 0 else 0.0

def hitrate_at_k(rec: List[str], rel: List[str], k: int) -> float:
    return float(any(it in rel for it in rec[:k]))

def recall_at_k(rec: List[str], rel: List[str], k: int) -> float:
    if len(rel)==0: return 0.0
    return float(len(set(rec[:k]) & set(rel)) / len(rel))

def evaluate_all(train_df, test_df, enc, U, V, upm, content_art, pop_rank, cfg: NBConfig) -> pd.DataFrame:
    users = list(set(train_df["user_id"]) & set(test_df["user_id"]))
    if len(users) > cfg.eval_sample_users:
        users = list(np.random.choice(users, size=cfg.eval_sample_users, replace=False))
    all_pids = train_df["product_id"].unique().tolist()
    rng = np.random.default_rng(cfg.seed)

    def do_eval(name, fn):
        s_ndcg, s_hit, s_rec = [], [], []
        for uid in users:
            rel = test_df.loc[test_df["user_id"] == uid, "product_id"].tolist()
            if not rel: continue
            recs = fn(uid)
            s_ndcg.append(ndcg_at_k(recs, rel, cfg.top_k_eval))
            s_hit.append(hitrate_at_k(recs, rel, cfg.top_k_eval))
            s_rec.append(recall_at_k(recs, rel, cfg.top_k_eval))
        return {"Model":name,
                "NDCG@10": np.mean(s_ndcg) if s_ndcg else 0.0,
                "HitRate@10": np.mean(s_hit) if s_hit else 0.0,
                "Recall@10": np.mean(s_rec) if s_rec else 0.0,
                "NumUsers": len(users)}

    top10_pop = pop_rank[:cfg.top_k_eval]
    rows = []
    rows.append(do_eval("Popularity", lambda uid: top10_pop))
    rows.append(do_eval("Random", lambda uid: rng.choice(all_pids, size=min(cfg.top_k_eval, len(all_pids)), replace=False).tolist()))
    rows.append(do_eval("Content-Based", lambda uid: content_recommend(
        train_df[train_df["user_id"]==uid].tail(1)["product_id"].values[0]
        if len(train_df[train_df["user_id"]==uid]) else top10_pop[0],
        content_art, cfg.top_k_eval, pop_rank)))
    rows.append(do_eval("Collaborative", lambda uid: collab_recommend(uid, enc, U, V, upm, cfg.top_k_eval, pop_rank)))
    rows.append(do_eval("Hybrid", lambda uid: hybrid_recommend(uid, train_df, enc, U, V, upm, content_art, cfg.alpha, cfg.top_k_eval, pop_rank)))
    res = pd.DataFrame(rows).sort_values("NDCG@10", ascending=False)
    return res

# ==== Save / Load artifacts ====
def save_artifacts(model_dir: str, cfg: NBConfig, ue, pe, svd, U, V, upm_shape, content_art, pop_rank):
    os.makedirs(model_dir, exist_ok=True)
    dump({
        "config": asdict(cfg),
        "user_encoder": ue, "product_encoder": pe,
        "svd": svd, "U": U, "V": V, "upm_shape": upm_shape,
        "content_pid2idx": content_art["pid2idx"],
        "content_idx2pid": content_art["idx2pid"],
        "content_tfidf": content_art["tfidf"],
        "content_X": content_art["X"],
        "content_similarity": content_art["sim"],
        "pop_rank": pop_rank,
    }, os.path.join(model_dir, "hybrid_model.joblib"))
    with open(os.path.join(model_dir, "manifest.json"), "w", encoding="utf-8") as f:
        json.dump({
            "n_users": len(ue.classes_),
            "n_products": len(pe.classes_),
            "n_factors": cfg.n_factors,
            "alpha": cfg.alpha,
            "tfidf_max_features": cfg.tfidf_max_features
        }, f, ensure_ascii=False, indent=2)

def load_artifacts(model_dir: str) -> Dict[str,Any]:
    return load(os.path.join(model_dir, "hybrid_model.joblib"))

# ==== Run training in Notebook ====
set_seed(CFG.seed)

interactions = load_and_prepare(CFG)
train_df, test_df = leave_one_out_split(interactions)

ue, pe = build_encoders(train_df)
upm = build_upm(train_df, ue, pe, center_by_user=CFG.center_by_user)

pop_rank = build_popularity(train_df)
content_art = build_content(train_df, CFG)
svd, U, V = build_svd(upm, CFG.n_factors, CFG.seed)

enc = {"user": ue, "product": pe}
results = evaluate_all(train_df, test_df, enc, U, V, upm, content_art, pop_rank, CFG)
display(results)

# Lưu kết quả & artifacts
os.makedirs(CFG.results_dir, exist_ok=True)
results_path = os.path.join(CFG.results_dir, "recommendation_results.csv")
results.to_csv(results_path, index=False)

save_artifacts(CFG.model_dir, CFG, ue, pe, svd, U, V, upm.shape, content_art, pop_rank)
print("Saved:", results_path, "and model to", CFG.model_dir)


Unnamed: 0,Model,NDCG@10,HitRate@10,Recall@10,NumUsers
2,Content-Based,0.544271,0.78,0.78,100
4,Hybrid,0.153654,0.39,0.39,100
3,Collaborative,0.028256,0.06,0.06,100
1,Random,0.007317,0.02,0.02,100
0,Popularity,0.0,0.0,0.0,100


Saved: ../results\recommendation_results.csv and model to ../models/recommendation


In [8]:
# ==== Inference helpers (Notebook) ====
def nb_recommend_for_user(user_id: str, model_dir: str = "../model", top_k: int = 10) -> List[str]:
    art = load_artifacts(model_dir)
    ue, pe = art["user_encoder"], art["product_encoder"]
    U, V = art["U"], art["V"]
    pop_rank = art["pop_rank"]
    if user_id not in ue.classes_:
        return pop_rank[:top_k]
    uidx = ue.transform([user_id])[0]
    scores = U[uidx] @ V.T
    best = np.argsort(scores)[::-1][:top_k]
    return pe.inverse_transform(best).tolist()

# ví dụ nhanh với 1 user trong train:
some_user = train_df["user_id"].iloc[0]
nb_recommend_for_user(some_user, model_dir=CFG.model_dir, top_k=10)


['B0BMGG6NKT',
 'B07WJWRNVK',
 'B08B42LWKN',
 'B088ZFJY82',
 'B086Q3QMFS',
 'B0859M539M',
 'B084PJSSQ1',
 'B083T5G5PM',
 'B082LZGK39',
 'B082LSVT4B']