In [1]:
# Cell 1 (ĐÃ SỬA TOÀN BỘ): Imports, Config & Robust Loader cho MovieLens 1M (ratings/users/movies)

import os, math, random
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

# ==== Seed & Device ====
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

# ==== Đường dẫn dữ liệu (đi theo bản bạn đang dùng) ====
DATA_DIR = Path("./Dataset")         # đổi nếu khác (vd: ./Data/Dataset/ml-1m)
RATINGS_PATH = DATA_DIR / "ratings.dat"
USERS_PATH   = DATA_DIR / "users.dat"
MOVIES_PATH  = DATA_DIR / "movies.dat"

# (fallback nếu bạn chỉ có .csv)
RATINGS_CSV = DATA_DIR / "ratings.csv"  # userId,movieId,rating,timestamp (ML-latest)

# ==== Hyperparams (giữ nguyên để các cell sau dùng) ====
EMBED_DIM   = 32
HIDDEN_DIMS = [128, 64]
DROPOUT     = 0.2
LR          = 1e-3
BATCH_SIZE  = 4096
EPOCHS      = 5
NUM_NEG_TRAIN = 3
NUM_NEG_EVAL  = 99
TOPK = 10

# ==== Hàm đọc .dat chống UnicodeDecodeError ====
def read_ml1m_dat(path: Path, names):
    """
    Đọc file .dat ML-1M với sep='::' và fallback encoding để tránh UnicodeDecodeError.
    Ưu tiên latin-1/cp1252 cho movies.dat (tiêu đề phim có ký tự đặc biệt).
    """
    if not path.exists():
        raise FileNotFoundError(f"Không tìm thấy file: {path}")

    encodings = ["utf-8", "ISO-8859-1", "latin-1", "cp1252"]
    last_err = None
    for enc in encodings:
        try:
            return pd.read_csv(
                path,
                sep="::",
                engine="python",
                names=names,
                encoding=enc,
                encoding_errors="strict",  # đổi "replace" nếu muốn tránh lỗi bằng ký tự �
            )
        except UnicodeDecodeError as e:
            last_err = e
            continue

    # Phương án cuối: thay ký tự lỗi để không crash
    print(f"[WARN] All strict decodes failed for {path}. Using cp1252 with replacement.")
    return pd.read_csv(
        path,
        sep="::",
        engine="python",
        names=names,
        encoding="cp1252",
        encoding_errors="replace",
    )

# ==== Đọc dữ liệu ====
if RATINGS_PATH.exists() and USERS_PATH.exists() and MOVIES_PATH.exists():
    ratings = read_ml1m_dat(RATINGS_PATH, ["user_id","movie_id","rating","timestamp"])
    users   = read_ml1m_dat(USERS_PATH,   ["user_id","gender","age","occupation","zip"])
    movies  = read_ml1m_dat(MOVIES_PATH,  ["movie_id","title","genres"])

    # Merge (giữ sẵn metadata nếu sau này cần mở rộng mô hình)
    df_full = ratings.merge(users, on="user_id").merge(movies, on="movie_id")

    # Baseline Wide&Deep implicit chỉ cần các cột lõi:
    df = df_full[["user_id","movie_id","rating","timestamp"]].copy()

elif RATINGS_CSV.exists():  # Fallback: chỉ có ratings.csv (phiên bản MovieLens mới)
    df_csv = pd.read_csv(RATINGS_CSV)
    df_csv.columns = [c.lower() for c in df_csv.columns]
    # yêu cầu: userId, movieId, rating, timestamp
    req = {"userid","movieid","rating","timestamp"}
    assert req.issubset(set(df_csv.columns)), f"ratings.csv thiếu cột {req - set(df_csv.columns)}"
    df = df_csv.rename(columns={"userid":"user_id","movieid":"movie_id"})[["user_id","movie_id","rating","timestamp"]].copy()
else:
    raise FileNotFoundError(
        f"Không thấy dữ liệu. Đặt ML-1M vào {DATA_DIR} gồm ratings.dat/users.dat/movies.dat "
        f"hoặc ít nhất ratings.csv (userId,movieId,rating,timestamp)."
    )

# ==== Chuẩn hóa kiểu dữ liệu & nhị phân hoá nhãn ====
# rating >= 4 => 1 (positive), ngược lại 0
df["label"] = (df["rating"] >= 4).astype(np.int64)

# ép kiểu timestamp về int64 (nếu là object/float)
df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce").fillna(0).astype(np.int64)

# ==== Map ID rời rạc sang index liên tục ====
# (giữ nguyên thứ tự xuất hiện để so sánh công bằng với các mô hình trước)
user_ids = df["user_id"].unique()
item_ids = df["movie_id"].unique()
user2idx = {u:i for i,u in enumerate(user_ids)}
item2idx = {m:i for i,m in enumerate(item_ids)}

df["u"] = df["user_id"].map(user2idx).astype(np.int64)
df["i"] = df["movie_id"].map(item2idx).astype(np.int64)

N_USERS = len(user2idx)
N_ITEMS = len(item2idx)

print(f"Loaded MovieLens: {len(df):,} interactions | Users={N_USERS:,} | Items={N_ITEMS:,}")
df.head()


Device: cpu
Loaded MovieLens: 1,000,209 interactions | Users=6,040 | Items=3,706


Unnamed: 0,user_id,movie_id,rating,timestamp,label,u,i
0,1,1193,5,978300760,1,0,0
1,1,661,3,978302109,0,0,1
2,1,914,3,978301968,0,0,2
3,1,3408,4,978300275,1,0,3
4,1,2355,5,978824291,1,0,4


In [2]:
# Cell 2: Temporal split per-user (last->test, second last->valid, rest->train)
from collections import defaultdict

def temporal_split_userwise(df: pd.DataFrame):
    df = df.sort_values(["u","timestamp"])
    grp = df.groupby("u")

    test_idx, valid_idx, train_idx = [], [], []
    for u, g in grp:
        idxs = g.index.to_list()
        if len(idxs) == 1:
            test_idx.append(idxs[-1])
        elif len(idxs) == 2:
            valid_idx.append(idxs[-2])
            test_idx.append(idxs[-1])
        else:
            train_idx.extend(idxs[:-2])
            valid_idx.append(idxs[-2])
            test_idx.append(idxs[-1])

    df_train = df.loc[train_idx].reset_index(drop=True)
    df_valid = df.loc[valid_idx].reset_index(drop=True)
    df_test  = df.loc[test_idx ].reset_index(drop=True)
    print(f"Train={len(df_train):,}  Valid={len(df_valid):,}  Test={len(df_test):,}")
    return df_train, df_valid, df_test

df_train, df_valid, df_test = temporal_split_userwise(df)

# Tạo tập item positive cho mỗi user (phục vụ negative sampling)
user_pos_items = defaultdict(set)
for r in df_train.itertuples(index=False):
    if r.label == 1:
        user_pos_items[r.u].add(r.i)
for r in df_valid.itertuples(index=False):
    if r.label == 1:
        user_pos_items[r.u].add(r.i)

len(user_pos_items), list(user_pos_items.items())[:1]

Train=988,129  Valid=6,040  Test=6,040


(6038,
 [(0,
   {0,
    3,
    4,
    6,
    7,
    8,
    9,
    10,
    11,
    12,
    13,
    14,
    15,
    17,
    18,
    19,
    22,
    23,
    26,
    27,
    28,
    30,
    31,
    32,
    33,
    34,
    35,
    36,
    37,
    38,
    39,
    40,
    41,
    42,
    43,
    44,
    45,
    46,
    47,
    48,
    49,
    50,
    51,
    52})])

In [3]:
# Cell 3: Dataset & DataLoader

class TrainPairDataset(Dataset):
    """
    Mỗi __getitem__ trả về 1 positive (u,i,1) + NUM_NEG_TRAIN negatives (u,j,0).
    Chỉ dùng positives của df_train để học implicit.
    """
    def __init__(self, df_pos: pd.DataFrame, n_items: int, user_pos: dict, num_neg=3):
        self.pos = df_pos[df_pos["label"] == 1][["u","i"]].values
        self.n_items = n_items
        self.user_pos = user_pos
        self.num_neg = num_neg

    def __len__(self):
        return len(self.pos)

    def __getitem__(self, idx):
        u, i = self.pos[idx]
        us = [u]; is_ = [i]; ys = [1.0]
        cnt = 0
        banned = self.user_pos[u]
        while cnt < self.num_neg:
            j = np.random.randint(0, self.n_items)
            if j not in banned:
                us.append(u); is_.append(j); ys.append(0.0)
                cnt += 1
        return np.array(us, dtype=np.int64), np.array(is_, dtype=np.int64), np.array(ys, dtype=np.float32)

def train_collate(batch):
    u = np.concatenate([b[0] for b in batch], axis=0)
    i = np.concatenate([b[1] for b in batch], axis=0)
    y = np.concatenate([b[2] for b in batch], axis=0)
    return torch.from_numpy(u), torch.from_numpy(i), torch.from_numpy(y)

class PointwiseDataset(Dataset):
    """
    Dùng cho valid/test binary (không sample negative).
    """
    def __init__(self, df_xy: pd.DataFrame):
        self.u = df_xy["u"].astype(np.int64).values
        self.i = df_xy["i"].astype(np.int64).values
        self.y = df_xy["label"].astype(np.float32).values

    def __len__(self):
        return len(self.u)

    def __getitem__(self, idx):
        return (self.u[idx], self.i[idx], self.y[idx])

def pointwise_collate(batch):
    u = torch.tensor([b[0] for b in batch], dtype=torch.long)
    i = torch.tensor([b[1] for b in batch], dtype=torch.long)
    y = torch.tensor([b[2] for b in batch], dtype=torch.float32)
    return u, i, y

train_ds = TrainPairDataset(df_train, N_ITEMS, user_pos_items, NUM_NEG_TRAIN)
valid_ds = PointwiseDataset(df_valid)
test_ds  = PointwiseDataset(df_test)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,  num_workers=0, collate_fn=train_collate)
valid_loader = DataLoader(valid_ds, batch_size=8192,     shuffle=False, num_workers=0, collate_fn=pointwise_collate)
test_loader  = DataLoader(test_ds,  batch_size=8192,     shuffle=False, num_workers=0, collate_fn=pointwise_collate)

len(train_ds), len(valid_ds), len(test_ds)


(568232, 6040, 6040)

In [4]:
# Cell 4: Wide&Deep model

class WideAndDeep(nn.Module):
    """
    Wide: user_bias + item_bias + global_bias (tuyến tính)
    Deep: concat(Emb(u), Emb(i)) -> MLP -> logit
    Output: deep_logit + wide_logit (dùng BCEWithLogitsLoss)
    """
    def __init__(self, n_users, n_items, embed_dim=32, hidden_dims=(128,64), dropout=0.2):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, embed_dim)
        self.item_emb = nn.Embedding(n_items, embed_dim)

        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)
        self.global_bias = nn.Parameter(torch.zeros(1))

        layers = []
        in_dim = embed_dim * 2
        for h in hidden_dims:
            layers += [nn.Linear(in_dim, h), nn.ReLU(), nn.Dropout(dropout)]
            in_dim = h
        layers += [nn.Linear(in_dim, 1)]
        self.mlp = nn.Sequential(*layers)

        self._init_weights()

    def _init_weights(self):
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, u, i):
        ue = self.user_emb(u)
        ie = self.item_emb(i)
        deep_in = torch.cat([ue, ie], dim=-1)
        deep_logit = self.mlp(deep_in).squeeze(-1)
        wide_logit = (self.user_bias(u).squeeze(-1) +
                      self.item_bias(i).squeeze(-1) +
                      self.global_bias)
        return deep_logit + wide_logit

    @torch.no_grad()
    def predict_proba(self, u, i):
        return torch.sigmoid(self.forward(u, i))

model = WideAndDeep(N_USERS, N_ITEMS, EMBED_DIM, HIDDEN_DIMS, DROPOUT).to(DEVICE)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

model


WideAndDeep(
  (user_emb): Embedding(6040, 32)
  (item_emb): Embedding(3706, 32)
  (user_bias): Embedding(6040, 1)
  (item_bias): Embedding(3706, 1)
  (mlp): Sequential(
    (0): Linear(in_features=64, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [5]:
# Cell 5: Evaluate helpers (AUC, LogLoss, Accuracy)

@torch.no_grad()
def evaluate_binary(model, loader, device=DEVICE):
    model.eval()
    y_true_all, y_prob_all = [], []
    for u, i, y in loader:
        u = u.to(device); i = i.to(device); y = y.to(device)
        logits = model(u, i)
        probs = torch.sigmoid(logits)
        y_true_all.append(y.detach().cpu().numpy())
        y_prob_all.append(probs.detach().cpu().numpy())

    y_true = np.concatenate(y_true_all)
    y_prob = np.concatenate(y_prob_all).clip(1e-7, 1-1e-7)

    try:
        auc = roc_auc_score(y_true, y_prob)
    except ValueError:
        auc = float("nan")
    ll  = log_loss(y_true, y_prob, labels=[0,1])
    acc = accuracy_score(y_true, (y_prob >= 0.5).astype(int))
    return {"auc": auc, "logloss": ll, "acc": acc}


In [6]:
# Cell 6: Training loop + best-by-valid-AUC

best_valid_auc = -1.0
best_state = None

for epoch in range(1, EPOCHS+1):
    model.train()
    running_loss, n_samples = 0.0, 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{EPOCHS}", total=len(train_loader))
    for u, i, y in pbar:
        u = u.to(DEVICE); i = i.to(DEVICE); y = y.to(DEVICE)
        optimizer.zero_grad()
        logits = model(u, i)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        bs = y.size(0)
        running_loss += loss.item() * bs
        n_samples += bs
        pbar.set_postfix(loss=running_loss/max(1, n_samples))

    valid_metrics = evaluate_binary(model, valid_loader)
    print(f"[Epoch {epoch}] TrainLoss={running_loss/n_samples:.4f} | "
          f"Valid AUC={valid_metrics['auc']:.4f}  "
          f"Acc={valid_metrics['acc']:.4f}  "
          f"LogLoss={valid_metrics['logloss']:.4f}")

    if valid_metrics["auc"] > best_valid_auc:
        best_valid_auc = valid_metrics["auc"]
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

if best_state is not None:
    model.load_state_dict(best_state)
    model.to(DEVICE)
    print(f"Loaded best model (Valid AUC={best_valid_auc:.4f}).")

test_metrics = evaluate_binary(model, test_loader)
print("Test (binary) — AUC: {auc:.4f} | Acc: {acc:.4f} | LogLoss: {logloss:.4f}".format(**test_metrics))


Epoch 1/5: 100%|██████████| 139/139 [00:13<00:00, 10.25it/s, loss=0.436]


[Epoch 1] TrainLoss=0.4357 | Valid AUC=0.6923  Acc=0.6147  LogLoss=0.7545


Epoch 2/5: 100%|██████████| 139/139 [00:17<00:00,  7.93it/s, loss=0.365]


[Epoch 2] TrainLoss=0.3650 | Valid AUC=0.6942  Acc=0.6106  LogLoss=0.7575


Epoch 3/5: 100%|██████████| 139/139 [00:17<00:00,  8.00it/s, loss=0.363]


[Epoch 3] TrainLoss=0.3630 | Valid AUC=0.6919  Acc=0.6036  LogLoss=0.7621


Epoch 4/5: 100%|██████████| 139/139 [00:16<00:00,  8.35it/s, loss=0.362]


[Epoch 4] TrainLoss=0.3618 | Valid AUC=0.6903  Acc=0.6015  LogLoss=0.7665


Epoch 5/5: 100%|██████████| 139/139 [00:17<00:00,  8.07it/s, loss=0.359]

[Epoch 5] TrainLoss=0.3587 | Valid AUC=0.6947  Acc=0.6070  LogLoss=0.7568
Loaded best model (Valid AUC=0.6947).
Test (binary) — AUC: 0.6912 | Acc: 0.5967 | LogLoss: 0.7859





In [7]:
# Cell 7: Ranking helpers (LOO candidates, Hit@K, NDCG@K)

@torch.no_grad()
def generate_loo_candidates(df_test, n_items, user_pos_items, num_neg=99):
    """
    Với mỗi user có positive ở test: lấy (u, i_pos) làm positive và sample num_neg items chưa từng positive.
    Trả về dict: u -> (pos_i, [neg_items...])
    """
    df_pos_test = df_test[df_test["label"] == 1]
    user_pos_test = {int(r.u): int(r.i) for r in df_pos_test.itertuples(index=False)}

    loo = {}
    for u, pos_i in user_pos_test.items():
        negs, banned = [], set(user_pos_items[u]) | {pos_i}
        while len(negs) < num_neg:
            j = np.random.randint(0, n_items)
            if j not in banned:
                negs.append(j)
        loo[u] = (pos_i, negs)
    return loo

def hit_ndcg_at_k(scores, pos_index, k=10):
    """
    scores: np.array score cho danh sách ứng viên; pos_index: vị trí positive trong mảng scores.
    """
    rank = (-scores).argsort()
    topk = rank[:k]
    hit = 1.0 if pos_index in topk else 0.0
    # NDCG: 1/log2(rank_of_positive+2)
    pos_rank = np.where(rank == pos_index)[0]
    ndcg = 1.0 / math.log2(pos_rank[0] + 2.0) if len(pos_rank) > 0 else 0.0
    return hit, ndcg

@torch.no_grad()
def evaluate_ranking(model, loo_dict, k=10, device=DEVICE):
    model.eval()
    hits, ndcgs = [], []
    for u, (pos_i, negs) in tqdm(loo_dict.items(), desc="Ranking eval"):
        cand_items = [pos_i] + negs  # positive ở index 0
        u_arr = torch.full((len(cand_items),), u, dtype=torch.long, device=device)
        i_arr = torch.tensor(cand_items, dtype=torch.long, device=device)
        scores = model.predict_proba(u_arr, i_arr).detach().cpu().numpy()
        hit, ndcg = hit_ndcg_at_k(scores, pos_index=0, k=k)
        hits.append(hit); ndcgs.append(ndcg)
    return {"hit@k": float(np.mean(hits)), "ndcg@k": float(np.mean(ndcgs))}


In [8]:
# Cell 8: Compute Hit@10 & NDCG@10

loo = generate_loo_candidates(df_test, N_ITEMS, user_pos_items, NUM_NEG_EVAL)
print("Users in LOO:", len(loo))

rank_metrics = evaluate_ranking(model, loo, k=TOPK)
print(f"Ranking (leave-one-out, K={TOPK}) — Hit@{TOPK}: {rank_metrics['hit@k']:.4f} | "
      f"NDCG@{TOPK}: {rank_metrics['ndcg@k']:.4f}")


Users in LOO: 3563


Ranking eval: 100%|██████████| 3563/3563 [00:01<00:00, 3505.98it/s]

Ranking (leave-one-out, K=10) — Hit@10: 0.5535 | NDCG@10: 0.4064





In [None]:
# Cell 10: Summary print (optional)

summary = {
    "valid_auc_best": float(best_valid_auc),
    "test_auc": float(test_metrics["auc"]),
    "test_acc": float(test_metrics["acc"]),
    "test_logloss": float(test_metrics["logloss"]),
    f"hit@{TOPK}": float(rank_metrics["hit@k"]),
    f"ndcg@{TOPK}": float(rank_metrics["ndcg@k"]),
}
print("=== SUMMARY ===")
for k, v in summary.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")
