In [1]:
import os, json, random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ====== Config ======
DATA_DIR = "Dataset"    # chứa users.dat, movies.dat, ratings.dat
ART_DIR  = "artifacts"       # nơi lưu model & embeddings
os.makedirs(ART_DIR, exist_ok=True)

# Train config
SEED            = 42
BATCH_SIZE      = 1024
EPOCHS          = 4
LR              = 1e-3
EMBED_DIM       = 16
MLP_DIMS        = [128, 64]
DROPOUT         = 0.2
NEG_PER_POS     = 4      # số negative mỗi positive
VAL_SIZE        = 0.1    # 10% validation
TEST_SIZE       = 0.1    # 10% test (từ phần còn lại)
MAX_SAMPLES_PER_USER = 20   # đặt số nguyên để giới hạn (demo nhanh), None = không giới hạn

# ====== Reproducibility ======
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

In [2]:
# MovieLens 1M delimiter là '::', encoding latin-1
users_path   = os.path.join(DATA_DIR, "users.dat")
movies_path  = os.path.join(DATA_DIR, "movies.dat")
ratings_path = os.path.join(DATA_DIR, "ratings.dat")

# Đọc USERS
users = pd.read_csv(
    users_path, sep="::", engine="python", encoding="latin-1",
    names=["user_id", "gender", "age", "occupation", "zip"]
)

# Đọc MOVIES
movies = pd.read_csv(
    movies_path, sep="::", engine="python", encoding="latin-1",
    names=["item_id", "title", "genres"]
)

# Đọc RATINGS
ratings = pd.read_csv(
    ratings_path, sep="::", engine="python", encoding="latin-1",
    names=["user_id", "item_id", "rating", "timestamp"]
)

print(users.head(2))
print(movies.head(2))
print(ratings.head(2))
print("Counts:", len(users), len(movies), len(ratings))


   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
   item_id             title                        genres
0        1  Toy Story (1995)   Animation|Children's|Comedy
1        2    Jumanji (1995)  Adventure|Children's|Fantasy
   user_id  item_id  rating  timestamp
0        1     1193       5  978300760
1        1      661       3  978302109
Counts: 6040 3883 1000209


In [3]:
# Bản đồ giống preprocess.py trong backend của bạn
gender_map = {'F': 0, 'M': 1}
age_map    = {1:0, 18:1, 25:2, 35:3, 45:4, 50:5, 56:6}   # giữ đúng bucket
# occupation đã là số 0..20 -> giữ nguyên

users["gender_enc"]     = users["gender"].map(gender_map).astype(int)
users["age_enc"]        = users["age"].map(age_map).astype(int)
users["occupation_enc"] = users["occupation"].astype(int)

# Genre: lấy genre đầu tiên (trùng với predict.py của bạn)
def first_genre(s):
    if isinstance(s, str) and '|' in s:
        return s.split('|')[0]
    return s

movies["genre_first"] = movies["genres"].apply(first_genre)
genre_to_index = {g:i for i, g in enumerate(sorted(movies["genre_first"].unique()))}
movies["genre_enc"] = movies["genre_first"].map(genre_to_index).astype(int)

# Map item_id -> index liên tục (embedding)
unique_item_ids = movies["item_id"].unique()
item_id_to_index = {int(i): idx for idx, i in enumerate(sorted(unique_item_ids))}
index_to_item_id = {idx: int(i) for i, idx in item_id_to_index.items()}

# Lưu mapping CSV như bạn đang dùng (để predict.py vẫn dùng được)
map_df = pd.DataFrame({"item_id": list(item_id_to_index.keys()),
                       "index":   list(item_id_to_index.values())})
map_df.to_csv(os.path.join("item_id_mapping.csv"), index=False)

print("Num genres:", len(genre_to_index))
print("Num items:", len(item_id_to_index))
print("Sample mappings:", list(genre_to_index.items())[:5], list(item_id_to_index.items())[:5])

Num genres: 18
Num items: 3883
Sample mappings: [('Action', 0), ('Adventure', 1), ('Animation', 2), ("Children's", 3), ('Comedy', 4)] [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)]


In [4]:
# Merge ratings với users & movies
df = ratings.merge(users[["user_id","gender_enc","age_enc","occupation_enc"]], on="user_id", how="left")
df = df.merge(movies[["item_id","title","genre_enc"]], on="item_id", how="left")

# Label implicit: rating >= 4 -> 1, else -> 0
df["label"] = (df["rating"] >= 4).astype(int)

# Map item_id sang index liên tục (embedding)
df["item_idx"] = df["item_id"].map(item_id_to_index).astype(int)

# Gọn cột cần dùng
df = df[["user_id","gender_enc","age_enc","occupation_enc","item_idx","genre_enc","label"]]

df.head(3), df["label"].value_counts(normalize=True)

(   user_id  gender_enc  age_enc  occupation_enc  item_idx  genre_enc  label
 0        1           0        0              10      1176          7      1
 1        1           0        0              10       655          2      0
 2        1           0        0              10       902         11      0,
 label
 1    0.575161
 0    0.424839
 Name: proportion, dtype: float64)

In [5]:
# Tạo danh sách items user đã tương tác (positive)
user_pos_items = df[df["label"]==1].groupby("user_id")["item_idx"].apply(set).to_dict()
all_item_indices = set(item_id_to_index.values())

def build_training_rows(df_pos, neg_per_pos=4, max_samples_per_user=None):
    rows = []
    grouped = df_pos.groupby("user_id")
    for uid, g in tqdm(grouped, desc="Sampling negatives"):
        pos_items = list(g["item_idx"].values)
        if max_samples_per_user:
            pos_items = pos_items[:max_samples_per_user]
        pos_set = set(pos_items)
        # negatives = items user chưa like
        neg_pool = list(all_item_indices - pos_set)
        # đảm bảo có đủ neg
        if len(neg_pool) == 0:
            continue

        # Lấy user features (giả sử ổn định trong g)
        gender = int(g["gender_enc"].iloc[0])
        age    = int(g["age_enc"].iloc[0])
        occ    = int(g["occupation_enc"].iloc[0])

        for pos in pos_items:
            # Positive row
            genre = int(df_pos[(df_pos["user_id"]==uid)&(df_pos["item_idx"]==pos)]["genre_enc"].iloc[0])
            rows.append([uid, gender, age, occ, pos, genre, 1])

            # Negatives
            neg_sample = np.random.choice(neg_pool, size=neg_per_pos, replace=False)
            for neg in neg_sample:
                # genre của neg item (lấy nhanh từ movies)
                # để tránh join đắt, chuẩn bị mảng genre_by_item trước
                rows.append([uid, gender, age, occ, int(neg), int(genre_by_item[neg]), 0])
    return rows

# Chuẩn bị mảng genre_by_item để tra cứu nhanh
# index -> genre_enc
genre_by_item = np.zeros(len(item_id_to_index), dtype=np.int32)
for _, r in movies.iterrows():
    idx = item_id_to_index[int(r["item_id"])]
    genre_by_item[idx] = int(r["genre_enc"])

df_pos = df[df["label"]==1]
rows = build_training_rows(df_pos, NEG_PER_POS, MAX_SAMPLES_PER_USER)
train_df = pd.DataFrame(rows, columns=["user_id","gender","age","occupation","item_idx","genre","label"])
print("Train rows:", len(train_df))
train_df.head(5)


Sampling negatives: 100%|██████████| 6038/6038 [03:14<00:00, 31.03it/s]


Train rows: 580405


Unnamed: 0,user_id,gender,age,occupation,item_idx,genre,label
0,1,0,0,10,1176,7,1
1,1,0,0,10,1485,13,0
2,1,0,0,10,3191,7,0
3,1,0,0,10,2690,4,0
4,1,0,0,10,2095,7,0


In [6]:
# Split theo hàng (không theo user), đơn giản cho demo
train_part, test_part = train_test_split(train_df, test_size=TEST_SIZE, random_state=SEED, stratify=train_df["label"])
train_part, val_part  = train_test_split(train_part, test_size=VAL_SIZE, random_state=SEED, stratify=train_part["label"])

print("Split sizes:", len(train_part), len(val_part), len(test_part))

Split sizes: 470127 52237 58041


In [7]:
class FMTrainDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.gender = df["gender"].values.astype(np.int64)
        self.age = df["age"].values.astype(np.int64)
        self.occ = df["occupation"].values.astype(np.int64)
        self.item = df["item_idx"].values.astype(np.int64)
        self.genre = df["genre"].values.astype(np.int64)
        self.label = df["label"].values.astype(np.float32)

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        return {
            "gender": self.gender[idx],
            "age": self.age[idx],
            "occupation": self.occ[idx],
            "item_id": self.item[idx],
            "genre": self.genre[idx],
            "label": self.label[idx]
        }

def collate_fn(batch):
    out = {k: [] for k in ["gender","age","occupation","item_id","genre","label"]}
    for b in batch:
        for k in out: out[k].append(b[k])
    for k in out: out[k] = torch.tensor(out[k])
    labels = out.pop("label").float()
    return out, labels

train_ds = FMTrainDataset(train_part)
val_ds   = FMTrainDataset(val_part)
test_ds  = FMTrainDataset(test_part)

train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=0,                 # <— quan trọng: 0 để tránh treo
    collate_fn=collate_fn,
    pin_memory=False,              # <— tắt để an toàn khi không dùng CUDA
    persistent_workers=False
)

b, y = next(iter(train_loader))
print({k: b[k].shape for k in b}, y.shape)

val_loader = DataLoader(
    val_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, collate_fn=collate_fn, pin_memory=False, persistent_workers=False
)
test_loader = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, collate_fn=collate_fn, pin_memory=False, persistent_workers=False
)

len(train_ds), len(val_ds), len(test_ds)


{'gender': torch.Size([1024]), 'age': torch.Size([1024]), 'occupation': torch.Size([1024]), 'item_id': torch.Size([1024]), 'genre': torch.Size([1024])} torch.Size([1024])


(470127, 52237, 58041)

In [8]:
class DeepFM(nn.Module):
    def __init__(self, field_dims: dict, embed_dim=16, mlp_dims=[128,64], dropout=0.2):
        """
        field_dims: dict tên_field -> cardinality (số lượng giá trị rời rạc)
            ví dụ:
            {
                "gender": 2,
                "age": 7,
                "occupation": 21,
                "item_id": num_items,
                "genre": num_genres
            }
        """
        super().__init__()
        self.fields = list(field_dims.keys())

        # Embedding cho FM/DNN (kích thước d)
        self.emb = nn.ModuleDict({
            k: nn.Embedding(field_dims[k], embed_dim) for k in self.fields
        })
        # Linear term: mỗi field một Embedding dim=1
        self.lin = nn.ModuleDict({
            k: nn.Embedding(field_dims[k], 1) for k in self.fields
        })

        # DNN
        in_dim = embed_dim * len(self.fields)
        layers = []
        d = in_dim
        for h in mlp_dims:
            layers += [nn.Linear(d, h), nn.ReLU(), nn.Dropout(dropout)]
            d = h
        self.dnn = nn.Sequential(*layers)
        self.dnn_out = nn.Linear(d, 1)

        # init
        self._init_weights()

    def _init_weights(self):
        for emb in self.emb.values():
            nn.init.xavier_uniform_(emb.weight.data)
        for l in self.lin.values():
            nn.init.zeros_(l.weight.data)

    def forward(self, x: dict):
        """
        x: dict tensor Long: ["gender","age","occupation","item_id","genre"] shape [B]
        Output: logits [B, 1]
        """
        # Linear
        lin_terms = [self.lin[k](x[k]) for k in self.fields]  # list [B,1]
        lin = torch.stack(lin_terms, dim=1).sum(dim=1)        # [B,1]

        # Embeddings
        embs = [self.emb[k](x[k]) for k in self.fields]       # list [B,d]
        E = torch.stack(embs, dim=1)                          # [B,F,d]

        # FM 2nd order: 0.5 * (sum^2 - sum of squares)
        sum_of_emb = E.sum(dim=1)                             # [B,d]
        sum_of_emb_square = sum_of_emb * sum_of_emb           # [B,d]
        square_of_emb = E * E                                 # [B,F,d]
        square_of_emb_sum = square_of_emb.sum(dim=1)          # [B,d]
        fm = 0.5 * (sum_of_emb_square - square_of_emb_sum)    # [B,d]
        fm_logit = fm.sum(dim=1, keepdim=True)                # [B,1]

        # DNN
        dnn_in = torch.cat(embs, dim=1)                       # [B, F*d]
        dnn_hidden = self.dnn(dnn_in)                         # [B, H]
        dnn_logit = self.dnn_out(dnn_hidden)                  # [B,1]

        logits = lin + fm_logit + dnn_logit                   # [B,1]
        return logits


In [9]:
num_items  = len(item_id_to_index)
num_genres = len(genre_to_index)

field_dims = {
    "gender": 2,
    "age": 7,
    "occupation": 21,
    "item_id": num_items,
    "genre": num_genres
}

model = DeepFM(field_dims, embed_dim=EMBED_DIM, mlp_dims=MLP_DIMS, dropout=DROPOUT).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

sum(p.numel() for p in model.parameters())/1e6, "M params"


(0.085516, 'M params')

In [10]:
@torch.no_grad()
def evaluate(model, loader):
    model.eval()
    all_logits = []
    all_labels = []
    for batch, labels in loader:
        for k in batch:
            batch[k] = batch[k].to(DEVICE)
        labels = labels.to(DEVICE)
        logits = model(batch).squeeze(1)  # [B]
        all_logits.append(logits.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())
    logits = np.concatenate(all_logits)
    labels = np.concatenate(all_labels)
    probs = 1/(1+np.exp(-logits))
    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float("nan")
    ll = log_loss(labels, probs, labels=[0,1])
    return auc, ll

def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    for bi, (batch, labels) in enumerate(tqdm(loader, total=len(loader), desc="Train")):
        for k in batch:
            batch[k] = batch[k].to(DEVICE)
        labels = labels.to(DEVICE)

        optimizer.zero_grad()
        logits = model(batch).squeeze(1)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * labels.size(0)

        if (bi + 1) % 50 == 0:
            print(f"  step {bi+1}/{len(loader)}  loss={loss.item():.4f}")
    return running_loss / len(loader.dataset)


In [11]:
best_val_auc = -1
best_state = None

for epoch in range(1, EPOCHS+1):
    tr_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_auc, val_ll = evaluate(model, val_loader)
    print(f"[Epoch {epoch:02d}] TrainLoss={tr_loss:.4f} | ValAUC={val_auc:.4f} | ValLogLoss={val_ll:.4f}")

    if val_auc > best_val_auc:
        best_val_auc = val_auc
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

# Load best
if best_state is not None:
    model.load_state_dict(best_state)
test_auc, test_ll = evaluate(model, test_loader)
print(f"[TEST] AUC={test_auc:.4f} | LogLoss={test_ll:.4f}")

Train:  14%|█▍        | 65/460 [00:01<00:04, 87.85it/s]

  step 50/460  loss=0.5261


Train:  24%|██▎       | 109/460 [00:01<00:03, 99.30it/s]

  step 100/460  loss=0.3614


Train:  36%|███▌      | 164/460 [00:02<00:02, 99.00it/s] 

  step 150/460  loss=0.3313


Train:  47%|████▋     | 214/460 [00:02<00:02, 98.26it/s]

  step 200/460  loss=0.2943


Train:  57%|█████▋    | 264/460 [00:03<00:02, 96.43it/s]

  step 250/460  loss=0.3359


Train:  68%|██████▊   | 314/460 [00:03<00:01, 95.77it/s]

  step 300/460  loss=0.2512


Train:  78%|███████▊  | 358/460 [00:04<00:01, 99.79it/s] 

  step 350/460  loss=0.2611


Train:  90%|████████▉ | 412/460 [00:04<00:00, 100.01it/s]

  step 400/460  loss=0.2851


Train: 100%|██████████| 460/460 [00:05<00:00, 89.71it/s] 


  step 450/460  loss=0.2751
[Epoch 01] TrainLoss=0.3364 | ValAUC=0.9216 | ValLogLoss=0.2732


Train:  13%|█▎        | 61/460 [00:00<00:03, 100.84it/s]

  step 50/460  loss=0.2822


Train:  25%|██▌       | 115/460 [00:01<00:03, 101.44it/s]

  step 100/460  loss=0.3053


Train:  35%|███▍      | 159/460 [00:01<00:02, 101.05it/s]

  step 150/460  loss=0.2538


Train:  47%|████▋     | 214/460 [00:02<00:02, 102.32it/s]

  step 200/460  loss=0.2532


Train:  58%|█████▊    | 268/460 [00:02<00:01, 100.04it/s]

  step 250/460  loss=0.2880


Train:  68%|██████▊   | 312/460 [00:03<00:01, 99.33it/s] 

  step 300/460  loss=0.2664


Train:  79%|███████▊  | 362/460 [00:03<00:01, 97.61it/s]

  step 350/460  loss=0.2551


Train:  91%|█████████ | 417/460 [00:04<00:00, 99.23it/s] 

  step 400/460  loss=0.2564


Train: 100%|██████████| 460/460 [00:04<00:00, 99.39it/s] 


  step 450/460  loss=0.2720
[Epoch 02] TrainLoss=0.2719 | ValAUC=0.9233 | ValLogLoss=0.2705


Train:  14%|█▎        | 63/460 [00:00<00:03, 100.65it/s]

  step 50/460  loss=0.2777


Train:  26%|██▌       | 118/460 [00:01<00:03, 101.59it/s]

  step 100/460  loss=0.2607


Train:  35%|███▌      | 162/460 [00:01<00:02, 102.19it/s]

  step 150/460  loss=0.2529


Train:  47%|████▋     | 217/460 [00:02<00:02, 100.22it/s]

  step 200/460  loss=0.2532


Train:  56%|█████▌    | 258/460 [00:02<00:02, 93.57it/s] 

  step 250/460  loss=0.2639


Train:  67%|██████▋   | 309/460 [00:03<00:01, 97.02it/s]

  step 300/460  loss=0.2879


Train:  78%|███████▊  | 360/460 [00:03<00:01, 95.48it/s]

  step 350/460  loss=0.2650


Train:  89%|████████▉ | 410/460 [00:04<00:00, 89.96it/s]

  step 400/460  loss=0.2599


Train: 100%|██████████| 460/460 [00:04<00:00, 95.75it/s]

  step 450/460  loss=0.2393





[Epoch 03] TrainLoss=0.2680 | ValAUC=0.9244 | ValLogLoss=0.2691


Train:  15%|█▌        | 69/460 [00:00<00:04, 94.15it/s]

  step 50/460  loss=0.2556


Train:  24%|██▍       | 111/460 [00:01<00:03, 94.68it/s]

  step 100/460  loss=0.2943


Train:  37%|███▋      | 169/460 [00:01<00:03, 90.50it/s]

  step 150/460  loss=0.2880


Train:  45%|████▌     | 209/460 [00:02<00:02, 93.06it/s]

  step 200/460  loss=0.2653


Train:  57%|█████▋    | 260/460 [00:02<00:02, 96.40it/s]

  step 250/460  loss=0.2781


Train:  67%|██████▋   | 310/460 [00:03<00:01, 94.21it/s]

  step 300/460  loss=0.2933


Train:  78%|███████▊  | 361/460 [00:03<00:01, 96.91it/s]

  step 350/460  loss=0.2589


Train:  90%|█████████ | 414/460 [00:04<00:00, 99.89it/s]

  step 400/460  loss=0.2657


Train: 100%|██████████| 460/460 [00:04<00:00, 93.92it/s] 


  step 450/460  loss=0.2559
[Epoch 04] TrainLoss=0.2642 | ValAUC=0.9256 | ValLogLoss=0.2674
[TEST] AUC=0.9240 | LogLoss=0.2701


In [12]:
# 1) Lưu model (state_dict)
pth_path = os.path.join(ART_DIR, "deepfm_pytorch.pth")
torch.save(model.state_dict(), pth_path)
print("[✓] Saved:", pth_path)

# 2) Xuất item embedding để dùng FAISS & /similar_items
item_emb_weight = model.emb["item_id"].weight.detach().cpu().numpy().astype("float32")
np.save(os.path.join(ART_DIR, "item_emb.npy"), item_emb_weight)
print("[✓] Saved:", os.path.join(ART_DIR, "item_emb.npy"), item_emb_weight.shape)

# 3) Lưu mapping
with open(os.path.join(ART_DIR, "item_id_to_index.json"), "w") as f:
    json.dump({int(k): int(v) for k, v in item_id_to_index.items()}, f)
with open(os.path.join(ART_DIR, "index_to_item_id.json"), "w") as f:
    json.dump({int(k): int(v) for k, v in index_to_item_id.items()}, f)
print("[✓] Saved mappings to artifacts/")


[✓] Saved: artifacts\deepfm_pytorch.pth
[✓] Saved: artifacts\item_emb.npy (3883, 16)
[✓] Saved mappings to artifacts/


In [13]:
# Ví dụ: tạo vector u giả (mean của một vài item hành động) rồi chấm điểm
# Trong thực tế bạn sẽ tối ưu u từ lịch sử user hoặc cập nhật online theo event.

# Lấy 10 item ngẫu nhiên
sample_idx = np.random.choice(len(item_id_to_index), size=10, replace=False)
E = item_emb_weight[sample_idx]  # [10, d]
u = E.mean(axis=0)               # [d]
u = u / (np.linalg.norm(u) + 1e-6)

scores = item_emb_weight @ u     # [num_items]
topk = 10
top_idx = np.argpartition(-scores, topk)[:topk]
top_idx = top_idx[np.argsort(-scores[top_idx])]
print("Top-10 item_idx:", top_idx)

# map sang item_id & title
inv_map = {v:k for k, v in item_id_to_index.items()}
top_item_ids = [inv_map[i] for i in top_idx]
top_titles = movies[movies["item_id"].isin(top_item_ids)][["item_id","title"]]
top_titles

Top-10 item_idx: [1542  370 3523 1402  870  876  800 1657 3817 1508]


Unnamed: 0,item_id,title
370,374,Richie Rich (1994)
800,810,Kazaam (1996)
870,881,First Kid (1996)
876,888,Land Before Time III: The Time of the Great Gi...
1402,1426,Zeus and Roxanne (1997)
1508,1547,Shiloh (1997)
1542,1583,"Simple Wish, A (1997)"
1657,1705,Guy (1996)
3523,3592,Time Masters (Les Maîtres du Temps) (1982)
3817,3887,Went to Coney Island on a Mission From God... ...


In [14]:
# Chỉ cần nếu bạn muốn test Similar ngay trong notebook
# !pip install faiss-cpu

import faiss
emb = item_emb_weight.copy()
faiss.normalize_L2(emb)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

def similar_items_by_item_id(raw_item_id, k=10):
    idx = item_id_to_index[int(raw_item_id)]
    q = emb[idx:idx+1]
    D, I = index.search(q, k+1)
    I = I[0].tolist()
    D = D[0].tolist()
    out = []
    for d, i in zip(D, I):
        if i == idx: 
            continue
        iid = index_to_item_id[i]
        title = movies.loc[movies["item_id"]==iid, "title"].values
        title = title[0] if len(title) else str(iid)
        out.append({"item_id": int(iid), "title": title, "score": float(d)})
        if len(out) == k: break
    return out

# Demo với 1 item bất kỳ:
some_item = int(movies["item_id"].sample(1, random_state=SEED).iloc[0])
similar_items_by_item_id(some_item, k=10)


[{'item_id': 3886,
  'title': 'Steal This Movie! (2000)',
  'score': 0.9638875722885132},
 {'item_id': 3636,
  'title': "Those Who Love Me Can Take the Train (Ceux qui m'aiment prendront le train) (1998)",
  'score': 0.9467151165008545},
 {'item_id': 2165,
  'title': 'Your Friends and Neighbors (1998)',
  'score': 0.9466490745544434},
 {'item_id': 2131,
  'title': 'Autumn Sonata (Höstsonaten ) (1978)',
  'score': 0.9396710395812988},
 {'item_id': 882,
  'title': 'Trigger Effect, The (1996)',
  'score': 0.9308792352676392},
 {'item_id': 3619,
  'title': 'Hollywood Knights, The (1980)',
  'score': 0.9290317296981812},
 {'item_id': 872,
  'title': 'Aiqing wansui (1994)',
  'score': 0.9279145002365112},
 {'item_id': 2904, 'title': 'Rain (1932)', 'score': 0.9272639751434326},
 {'item_id': 3437, 'title': 'Cool as Ice (1991)', 'score': 0.9268709421157837},
 {'item_id': 713,
  'title': 'Of Love and Shadows (1994)',
  'score': 0.9261972308158875}]

In [29]:
# === Helpers + Patch (FINAL) — dùng đúng model.fields ===
import numpy as np, torch

# Pool item và map user->set item đã like (rating >= 4)
ALL_ITEM_IDS = np.array(sorted(movies["item_id"].unique()))
watched_by_user = ratings[ratings["rating"] >= 4].groupby("user_id")["item_id"].apply(set).to_dict()

def sample_negatives_for_user(user_id: int, num_neg: int = 99):
    pos_set = watched_by_user.get(user_id, set())
    pool = (ALL_ITEM_IDS if len(pos_set) == 0
            else np.setdiff1d(ALL_ITEM_IDS, np.fromiter(pos_set, dtype=int), assume_unique=True))
    if len(pool) == 0: pool = ALL_ITEM_IDS
    replace = len(pool) < num_neg
    return np.random.choice(pool, size=num_neg, replace=replace).tolist()

def _tensors_for_user_items(user_id: int, item_ids: list[int]):
    """Chuẩn bị (u,i,g,a,o,ge) tensors theo đúng thứ tự item_ids."""
    # user feats
    urow = users.loc[users["user_id"] == user_id, ["gender_enc","age_enc","occupation_enc"]]
    if urow.empty:
        raise ValueError(f"User {user_id} không tồn tại trong users df")
    g = int(urow["gender_enc"].values[0])
    a = int(urow["age_enc"].values[0])
    o = int(urow["occupation_enc"].values[0])

    # item feats (theo đúng thứ tự item_ids)
    msub = movies.loc[movies["item_id"].isin(item_ids), ["item_id","genre_enc"]].copy()
    msub = msub.set_index("item_id").reindex(item_ids).reset_index()
    item_idx = [item_id_to_index[int(i)] for i in msub["item_id"].values]
    genre_enc = msub["genre_enc"].astype(int).values

    n = len(item_ids)
    uT = torch.full((n,), int(user_id), dtype=torch.long, device=DEVICE)
    iT = torch.tensor(item_idx, dtype=torch.long, device=DEVICE)
    gT = torch.full((n,), g, dtype=torch.long, device=DEVICE)
    aT = torch.full((n,), a, dtype=torch.long, device=DEVICE)
    oT = torch.full((n,), o, dtype=torch.long, device=DEVICE)
    geT= torch.tensor(genre_enc, dtype=torch.long, device=DEVICE)
    return uT, iT, gT, aT, oT, geT

@torch.no_grad()
def score_candidates(user_id: int, item_ids: list[int]) -> np.ndarray:
    """
    Gọi model theo đúng chữ ký dict dựa vào model.fields.
    Ví dụ nếu model.fields = ["gender","age","occupation","item_id","genre"],
    ta sẽ cung cấp một dict với đúng các khóa đó.
    """
    model.eval()
    uT, iT, gT, aT, oT, geT = _tensors_for_user_items(user_id, item_ids)

    # Lấy danh sách khóa model yêu cầu
    fields = getattr(model, "fields", None)
    if fields is None:
        # Fallback phổ biến: dùng tên khóa tiêu chuẩn
        fields = ["user_id","item_id","gender","age","occupation","genre"]

    batch = {}
    for k in fields:
        lk = k.lower()
        if lk in ("user", "user_id", "uid"):
            batch[k] = uT
        elif lk in ("item", "item_id", "item_idx", "iid"):
            # DÙ tên là item_id nhưng thực chất model thường expect chỉ số embedding (item_idx)
            batch[k] = iT
        elif lk in ("gender", "sex"):
            batch[k] = gT
        elif lk == "age":
            batch[k] = aT
        elif lk in ("occupation", "occ", "job"):
            batch[k] = oT
        elif lk in ("genre", "genre_enc"):
            batch[k] = geT
        else:
            raise KeyError(f"Không biết map khóa '{k}' trong model.fields -> cung cấp tensor nào")

    out = model(batch)          # forward(self, x: dict)
    # Chuẩn về numpy 1D
    out = out.detach().float().view(-1).cpu().numpy()
    return out


In [30]:
# === Evaluate Hit@K & NDCG@K (REPLACED; place after Cell 15) ===
# Nhớ chạy Cell 11 (load best model) trước cell này.

K = TOPK if "TOPK" in globals() else 10
NEG_PER_USER_EVAL = 99

# Dùng test_df nếu đã alias ở Cell 15a; nếu không có thì dùng test_part
_eval_source = test_part if "test_df" in globals() else test_part

# Xác định cột item: ưu tiên item_id; nếu không có thì dùng item_idx và map lại
if "item_id" in _eval_source.columns:
    ITEM_COL = "item_id"
elif "item_idx" in _eval_source.columns:
    ITEM_COL = "item_idx"
else:
    raise RuntimeError("Không thấy cột item_id hoặc item_idx trong test set.")

# Lọc positive và lấy ngẫu nhiên 1 positive / user
test_pos = _eval_source[_eval_source["label"] == 1].copy()
if len(test_pos) == 0:
    raise RuntimeError("Không tìm thấy positive nào trong test set để đánh giá Hit@K/NDCG@K.")

test_pos = test_pos.groupby("user_id", group_keys=False).apply(
    lambda x: x.sample(n=1, random_state=SEED)
).reset_index(drop=True)

hits, ndcgs = [], []

# Dùng iterrows để truy cập theo tên cột ổn định
for _, row in tqdm(test_pos.iterrows(), total=len(test_pos), desc=f"Eval Hit@{K}/NDCG@{K}"):
    u = int(row["user_id"])
    if ITEM_COL == "item_id":
        pos_item = int(row["item_id"])
    else:
        # map item_idx -> item_id gốc
        pos_item = int(index_to_item_id[int(row["item_idx"])])

    neg_items = sample_negatives_for_user(u, num_neg=NEG_PER_USER_EVAL)

    # 1 positive + negatives (loại trùng)
    candidates = [pos_item] + [it for it in neg_items if it != pos_item]
    scores = score_candidates(u, candidates)

    # Xếp hạng giảm dần
    order = np.argsort(-scores)
    ranked_items = np.array(candidates, dtype=int)[order]

    # Hit@K
    topk_items = ranked_items[:K]
    hit = 1.0 if pos_item in topk_items else 0.0

    # NDCG@K (IDCG=1). Nếu pos ngoài top-K => 0
    r_idx = np.where(ranked_items == pos_item)[0]
    if len(r_idx) == 0:
        dcg_at_k = 0.0
    else:
        r = int(r_idx[0]) + 1  # 1-based
        dcg_at_k = (1.0 / np.log2(r + 1)) if r <= K else 0.0
    ndcg = dcg_at_k

    hits.append(hit)
    ndcgs.append(ndcg)

print(f"[TEST] Users evaluated: {len(test_pos)}")
print(f"[TEST] Hit@{K} = {np.mean(hits):.4f} | NDCG@{K} = {np.mean(ndcgs):.4f}")


  test_pos = test_pos.groupby("user_id", group_keys=False).apply(
Eval Hit@10/NDCG@10: 100%|██████████| 5213/5213 [00:27<00:00, 187.83it/s]

[TEST] Users evaluated: 5213
[TEST] Hit@10 = 0.7506 | NDCG@10 = 0.4916



