In [23]:
import os, json, random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ====== Config ======
DATA_DIR = "Dataset"    # chứa users.dat, movies.dat, ratings.dat
ART_DIR  = "artifacts"       # nơi lưu model & embeddings
os.makedirs(ART_DIR, exist_ok=True)

# Train config
SEED            = 42
BATCH_SIZE      = 1024
EPOCHS          = 4
LR              = 1e-3
EMBED_DIM       = 16
MLP_DIMS        = [128, 64]
DROPOUT         = 0.2
NEG_PER_POS     = 4      # số negative mỗi positive
VAL_SIZE        = 0.1    # 10% validation
TEST_SIZE       = 0.1    # 10% test (từ phần còn lại)
MAX_SAMPLES_PER_USER = 20   # đặt số nguyên để giới hạn (demo nhanh), None = không giới hạn
# Popularity debias training
POP_BETA = 0.6          # for reweighting (Fix 4), recommended 0.3–0.8
NEG_POP_ALPHA = 0.75    # for popularity-based negative sampling distribution (Fix 5)
NEG_POP_MIX = 0.5       # 50% popularity negatives + 50% random negatives

# ====== Reproducibility ======
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
set_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [24]:
# MovieLens 1M delimiter là '::', encoding latin-1
users_path   = os.path.join(DATA_DIR, "users.dat")
movies_path  = os.path.join(DATA_DIR, "movies.dat")
ratings_path = os.path.join(DATA_DIR, "ratings.dat")

# Đọc USERS
users = pd.read_csv(
    users_path, sep="::", engine="python", encoding="latin-1",
    names=["user_id", "gender", "age", "occupation", "zip"]
)

# Đọc MOVIES
movies = pd.read_csv(
    movies_path, sep="::", engine="python", encoding="latin-1",
    names=["item_id", "title", "genres"]
)

# Đọc RATINGS
ratings = pd.read_csv(
    ratings_path, sep="::", engine="python", encoding="latin-1",
    names=["user_id", "item_id", "rating", "timestamp"]
)

print(users.head(2))
print(movies.head(2))
print(ratings.head(2))
print("Counts:", len(users), len(movies), len(ratings))


   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
   item_id             title                        genres
0        1  Toy Story (1995)   Animation|Children's|Comedy
1        2    Jumanji (1995)  Adventure|Children's|Fantasy
   user_id  item_id  rating  timestamp
0        1     1193       5  978300760
1        1      661       3  978302109
Counts: 6040 3883 1000209


In [25]:
# Bản đồ giống preprocess.py trong backend của bạn
gender_map = {'F': 0, 'M': 1}
age_map    = {1:0, 18:1, 25:2, 35:3, 45:4, 50:5, 56:6}   # giữ đúng bucket
# occupation đã là số 0..20 -> giữ nguyên

users["gender_enc"]     = users["gender"].map(gender_map).astype(int)
users["age_enc"]        = users["age"].map(age_map).astype(int)
users["occupation_enc"] = users["occupation"].astype(int)

# Genre: lấy genre đầu tiên (trùng với predict.py của bạn)
def first_genre(s):
    if isinstance(s, str) and '|' in s:
        return s.split('|')[0]
    return s

movies["genre_first"] = movies["genres"].apply(first_genre)
genre_to_index = {g:i for i, g in enumerate(sorted(movies["genre_first"].unique()))}
movies["genre_enc"] = movies["genre_first"].map(genre_to_index).astype(int)

# Map item_id -> index liên tục (embedding)
unique_item_ids = movies["item_id"].unique()
item_id_to_index = {int(i): idx for idx, i in enumerate(sorted(unique_item_ids))}
index_to_item_id = {idx: int(i) for i, idx in item_id_to_index.items()}

# Lưu mapping CSV như bạn đang dùng (để predict.py vẫn dùng được)
map_df = pd.DataFrame({"item_id": list(item_id_to_index.keys()),
                       "index":   list(item_id_to_index.values())})
map_df.to_csv(os.path.join("item_id_mapping.csv"), index=False)

print("Num genres:", len(genre_to_index))
print("Num items:", len(item_id_to_index))
print("Sample mappings:", list(genre_to_index.items())[:5], list(item_id_to_index.items())[:5])

Num genres: 18
Num items: 3883
Sample mappings: [('Action', 0), ('Adventure', 1), ('Animation', 2), ("Children's", 3), ('Comedy', 4)] [(1, 0), (2, 1), (3, 2), (4, 3), (5, 4)]


In [26]:
# Merge ratings với users & movies
df = ratings.merge(users[["user_id","gender_enc","age_enc","occupation_enc"]], on="user_id", how="left")
df = df.merge(movies[["item_id","title","genre_enc"]], on="item_id", how="left")

# Label implicit: rating >= 4 -> 1, else -> 0
df["label"] = (df["rating"] >= 4).astype(int)

# Map item_id sang index liên tục (embedding)
df["item_idx"] = df["item_id"].map(item_id_to_index).astype(int)

# Gọn cột cần dùng
df = df[["user_id","gender_enc","age_enc","occupation_enc","item_idx","genre_enc","label"]]

df.head(3), df["label"].value_counts(normalize=True)

(   user_id  gender_enc  age_enc  occupation_enc  item_idx  genre_enc  label
 0        1           0        0              10      1176          7      1
 1        1           0        0              10       655          2      0
 2        1           0        0              10       902         11      0,
 label
 1    0.575161
 0    0.424839
 Name: proportion, dtype: float64)

In [27]:
# === Check 1: Popularity of positives ===
pos = df[df["label"] == 1]
pop = pos["item_idx"].value_counts()
print("Pos rate:", pos.shape[0], "/", df.shape[0], "=", pos.shape[0]/df.shape[0])

top = pop.head(20).reset_index()
top.columns = ["item_idx", "pos_count"]

# map to title
top["item_id"] = top["item_idx"].map(index_to_item_id)
top = top.merge(movies[["item_id","title"]], on="item_id", how="left")
top["share"] = top["pos_count"] / pop.sum()
top[["title","pos_count","share"]].head(20)


Pos rate: 575281 / 1000209 = 0.5751607913945985


Unnamed: 0,title,pos_count,share
0,American Beauty (1999),2853,0.004959
1,Star Wars: Episode IV - A New Hope (1977),2622,0.004558
2,Star Wars: Episode V - The Empire Strikes Back...,2510,0.004363
3,Saving Private Ryan (1998),2260,0.003929
4,Raiders of the Lost Ark (1981),2260,0.003929
5,"Silence of the Lambs, The (1991)",2252,0.003915
6,"Matrix, The (1999)",2171,0.003774
7,"Sixth Sense, The (1999)",2163,0.00376
8,Star Wars: Episode VI - Return of the Jedi (1983),2127,0.003697
9,Fargo (1996),2074,0.003605


In [28]:
# === Cell 5 (REPLACED) — Sampling sạch: random pos + no false negatives + pop-aware negatives + sample_weight ===

# FULL positives per user (từ df gốc, chưa bị truncation)
user_pos_items = df[df["label"]==1].groupby("user_id")["item_idx"].apply(set).to_dict()

all_item_indices = np.array(sorted(item_id_to_index.values()), dtype=np.int64)
all_item_set = set(all_item_indices.tolist())

# ---- Popularity stats on FULL positives ----
pop_by_item = np.zeros(len(item_id_to_index), dtype=np.int64)
pos_counts = df[df["label"]==1]["item_idx"].value_counts()
for idx, cnt in pos_counts.items():
    pop_by_item[int(idx)] = int(cnt)

# Weight per item: w_i = 1 / (1 + pop(i))^beta
POP_BETA = globals().get("POP_BETA", 0.6)
item_weight = 1.0 / np.power(1.0 + pop_by_item.astype(np.float32), POP_BETA)

# ---- genre_by_item lookup (index -> genre_enc) ----
genre_by_item = np.zeros(len(item_id_to_index), dtype=np.int32)
for _, r in movies.iterrows():
    idx = item_id_to_index[int(r["item_id"])]
    genre_by_item[idx] = int(r["genre_enc"])

# ---- pop-aware neg distribution: p(i) ∝ (pop(i)+1)^alpha ----
NEG_POP_ALPHA = globals().get("NEG_POP_ALPHA", 0.75)
neg_base = np.power(pop_by_item.astype(np.float32) + 1.0, NEG_POP_ALPHA)

NEG_POP_MIX = globals().get("NEG_POP_MIX", 0.5)

def sample_negatives_for_user(uid: int, num_neg: int, full_pos_set: set[int]):
    """Sample negatives excluding FULL positives of this user."""
    pool = np.array(sorted(all_item_set - full_pos_set), dtype=np.int64)
    if len(pool) == 0:
        pool = all_item_indices

    n_pop = int(np.ceil(num_neg * NEG_POP_MIX))
    n_uni = num_neg - n_pop

    # pop-based
    w = neg_base[pool]
    w_sum = float(w.sum())
    if w_sum <= 0:
        pop_samples = np.random.choice(pool, size=n_pop, replace=(len(pool) < n_pop))
    else:
        p = w / w_sum
        pop_samples = np.random.choice(pool, size=n_pop, replace=(len(pool) < n_pop), p=p)

    # uniform
    uni_samples = np.random.choice(pool, size=n_uni, replace=(len(pool) < n_uni))

    out = np.concatenate([pop_samples, uni_samples]).astype(np.int64)

    # Nếu pool đủ lớn, ép unique để tránh trùng
    if len(pool) >= num_neg:
        out = np.unique(out)
        while len(out) < num_neg:
            extra = np.random.choice(pool, size=(num_neg - len(out)), replace=False)
            out = np.unique(np.concatenate([out, extra]))
        out = out[:num_neg]
    else:
        # pool nhỏ, cho phép replace
        if len(out) < num_neg:
            extra = np.random.choice(pool, size=(num_neg - len(out)), replace=True)
            out = np.concatenate([out, extra])[:num_neg]

    return out.tolist()

def build_training_rows(df_pos, neg_per_pos=4, max_samples_per_user=None):
    rows = []
    grouped = df_pos.groupby("user_id")

    for uid, g in tqdm(grouped, desc="Sampling negatives (CLEAN)"):
        # FULL positives set for this user (chống false negative)
        full_pos_set = user_pos_items.get(uid, set())

        # pos_items candidates từ df_pos (các row label==1), nhưng nếu giới hạn thì random
        pos_items = g["item_idx"].values.astype(np.int64)

        if max_samples_per_user and len(pos_items) > max_samples_per_user:
            pos_items = np.random.choice(pos_items, size=max_samples_per_user, replace=False)

        pos_items = pos_items.tolist()

        # user features
        gender = int(g["gender_enc"].iloc[0])
        age    = int(g["age_enc"].iloc[0])
        occ    = int(g["occupation_enc"].iloc[0])

        for pos in pos_items:
            # Positive row
            rows.append([uid, gender, age, occ,
                         int(pos), int(genre_by_item[int(pos)]),
                         1, float(item_weight[int(pos)])])

            # Negatives excluding FULL positives
            negs = sample_negatives_for_user(uid, neg_per_pos, full_pos_set)
            for neg in negs:
                rows.append([uid, gender, age, occ,
                             int(neg), int(genre_by_item[int(neg)]),
                             0, float(item_weight[int(neg)])])

    return rows

df_pos = df[df["label"]==1]
rows = build_training_rows(df_pos, NEG_PER_POS, MAX_SAMPLES_PER_USER)

train_df = pd.DataFrame(
    rows,
    columns=["user_id","gender","age","occupation","item_idx","genre","label","sample_weight"]
)

print("Train rows:", len(train_df))
print("Weight stats:", train_df["sample_weight"].describe())
train_df.head(5)


Sampling negatives (CLEAN): 100%|██████████| 6038/6038 [00:56<00:00, 106.11it/s]


Train rows: 580405
Weight stats: count    580405.000000
mean          0.123200
std           0.211401
min           0.008447
25%           0.022938
50%           0.042527
75%           0.109336
max           1.000000
Name: sample_weight, dtype: float64


Unnamed: 0,user_id,gender,age,occupation,item_idx,genre,label,sample_weight
0,1,0,0,10,1189,7,1,0.017814
1,1,0,0,10,1926,10,0,0.141585
2,1,0,0,10,2841,7,0,1.0
3,1,0,0,10,3616,4,0,0.032127
4,1,0,0,10,3682,2,0,0.016396


In [29]:
# === Check 2: Are we truncating positives in a biased way? ===
tmp = df[df["label"]==1].copy()
# xem 20 item đầu tiên theo thứ tự xuất hiện của mỗi user
first20 = tmp.groupby("user_id").head(20)
first20_pop = first20["item_idx"].value_counts().head(20)

full_pop = tmp["item_idx"].value_counts().head(20)

print("Top items in FULL positives:")
print(full_pop.head(10).index.tolist())

print("\nTop items in FIRST-20 positives per user:")
print(first20_pop.head(10).index.tolist())


Top items in FULL positives:
[2789, 257, 1178, 1959, 1180, 589, 2502, 2693, 1192, 604]

Top items in FIRST-20 positives per user:
[589, 585, 0, 2928, 2789, 1250, 1245, 1239, 3724, 257]


In [43]:
# === Correct false-negative check for NEW pipeline ===
uid = list(user_pos_items.keys())[0]
full_pos_set = user_pos_items[uid]

# pool negatives theo pipeline mới
pool = set(all_item_indices.tolist()) - full_pos_set
print("False negatives (should be 0):", len(pool.intersection(full_pos_set)))


False negatives (should be 0): 0


In [31]:
# Split theo hàng (không theo user), đơn giản cho demo
train_part, test_part = train_test_split(train_df, test_size=TEST_SIZE, random_state=SEED, stratify=train_df["label"])
train_part, val_part  = train_test_split(train_part, test_size=VAL_SIZE, random_state=SEED, stratify=train_part["label"])

print("Split sizes:", len(train_part), len(val_part), len(test_part))

Split sizes: 470127 52237 58041


In [32]:
class FMTrainDataset(Dataset):
    def __init__(self, df: pd.DataFrame):
        self.gender = df["gender"].values.astype(np.int64)
        self.age = df["age"].values.astype(np.int64)
        self.occ = df["occupation"].values.astype(np.int64)
        self.item = df["item_idx"].values.astype(np.int64)
        self.genre = df["genre"].values.astype(np.int64)
        self.label = df["label"].values.astype(np.float32)
        self.w = df["sample_weight"].values.astype(np.float32)  # <--- NEW

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        return {
            "gender": self.gender[idx],
            "age": self.age[idx],
            "occupation": self.occ[idx],
            "item_id": self.item[idx],
            "genre": self.genre[idx],
            "label": self.label[idx],
            "weight": self.w[idx],  # <--- NEW
        }

def collate_fn(batch):
    out = {k: [] for k in ["gender","age","occupation","item_id","genre","label","weight"]}
    for b in batch:
        for k in out:
            out[k].append(b[k])

    for k in ["gender","age","occupation","item_id","genre"]:
        out[k] = torch.tensor(out[k], dtype=torch.long)

    labels = torch.tensor(out.pop("label"), dtype=torch.float32)
    weights = torch.tensor(out.pop("weight"), dtype=torch.float32)  # <--- NEW

    return out, labels, weights

train_ds = FMTrainDataset(train_part)
val_ds   = FMTrainDataset(val_part)
test_ds  = FMTrainDataset(test_part)

train_loader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=0,
    collate_fn=collate_fn,
    pin_memory=False,
    persistent_workers=False
)

b, y, w = next(iter(train_loader))
print({k: b[k].shape for k in b}, y.shape, w.shape, "w[0:5]=", w[:5])

val_loader = DataLoader(
    val_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, collate_fn=collate_fn, pin_memory=False, persistent_workers=False
)
test_loader = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, collate_fn=collate_fn, pin_memory=False, persistent_workers=False
)

len(train_ds), len(val_ds), len(test_ds)


{'gender': torch.Size([1024]), 'age': torch.Size([1024]), 'occupation': torch.Size([1024]), 'item_id': torch.Size([1024]), 'genre': torch.Size([1024])} torch.Size([1024]) torch.Size([1024]) w[0:5]= tensor([0.0884, 1.0000, 0.0849, 0.0503, 0.0980])


(470127, 52237, 58041)

In [33]:
class DeepFM(nn.Module):
    def __init__(self, field_dims: dict, embed_dim=16, mlp_dims=[128,64], dropout=0.2):
        super().__init__()
        self.fields = list(field_dims.keys())

        # Embedding cho FM/DNN (kích thước d)
        self.emb = nn.ModuleDict({
            k: nn.Embedding(field_dims[k], embed_dim) for k in self.fields
        })
        # Linear term: mỗi field một Embedding dim=1
        self.lin = nn.ModuleDict({
            k: nn.Embedding(field_dims[k], 1) for k in self.fields
        })

        # DNN
        in_dim = embed_dim * len(self.fields)
        layers = []
        d = in_dim
        for h in mlp_dims:
            layers += [nn.Linear(d, h), nn.ReLU(), nn.Dropout(dropout)]
            d = h
        self.dnn = nn.Sequential(*layers)
        self.dnn_out = nn.Linear(d, 1)

        # init
        self._init_weights()

    def _init_weights(self):
        for emb in self.emb.values():
            nn.init.xavier_uniform_(emb.weight.data)
        for l in self.lin.values():
            nn.init.zeros_(l.weight.data)

    def forward(self, x: dict):
        """
        x: dict tensor Long: ["gender","age","occupation","item_id","genre"] shape [B]
        Output: logits [B, 1]
        """
        # Linear
        lin_terms = [self.lin[k](x[k]) for k in self.fields]  # list [B,1]
        lin = torch.stack(lin_terms, dim=1).sum(dim=1)        # [B,1]

        # Embeddings
        embs = [self.emb[k](x[k]) for k in self.fields]       # list [B,d]
        E = torch.stack(embs, dim=1)                          # [B,F,d]

        # FM 2nd order: 0.5 * (sum^2 - sum of squares)
        sum_of_emb = E.sum(dim=1)                             # [B,d]
        sum_of_emb_square = sum_of_emb * sum_of_emb           # [B,d]
        square_of_emb = E * E                                 # [B,F,d]
        square_of_emb_sum = square_of_emb.sum(dim=1)          # [B,d]
        fm = 0.5 * (sum_of_emb_square - square_of_emb_sum)    # [B,d]
        fm_logit = fm.sum(dim=1, keepdim=True)                # [B,1]

        # DNN
        dnn_in = torch.cat(embs, dim=1)                       # [B, F*d]
        dnn_hidden = self.dnn(dnn_in)                         # [B, H]
        dnn_logit = self.dnn_out(dnn_hidden)                  # [B,1]

        logits = lin + fm_logit + dnn_logit                   # [B,1]
        return logits


In [34]:
num_items  = len(item_id_to_index)
num_genres = len(genre_to_index)

field_dims = {
    "gender": 2,
    "age": 7,
    "occupation": 21,
    "item_id": num_items,
    "genre": num_genres
}

model = DeepFM(field_dims, embed_dim=EMBED_DIM, mlp_dims=MLP_DIMS, dropout=DROPOUT).to(DEVICE)

# IMPORTANT: reduction='none' để nhân sample_weight
criterion = nn.BCEWithLogitsLoss(reduction="none")

optimizer = torch.optim.Adam(model.parameters(), lr=LR)

sum(p.numel() for p in model.parameters())/1e6, "M params"

(0.085516, 'M params')

In [35]:
@torch.no_grad()
def evaluate(model, loader):
    model.eval()
    all_logits = []
    all_labels = []
    for batch, labels, weights in loader:
        for k in batch:
            batch[k] = batch[k].to(DEVICE)
        labels = labels.to(DEVICE)

        logits = model(batch).squeeze(1)  # [B]
        all_logits.append(logits.detach().cpu().numpy())
        all_labels.append(labels.detach().cpu().numpy())

    logits = np.concatenate(all_logits)
    labels = np.concatenate(all_labels)
    probs = 1/(1+np.exp(-logits))

    try:
        auc = roc_auc_score(labels, probs)
    except ValueError:
        auc = float("nan")
    ll = log_loss(labels, probs, labels=[0,1])

    preds = (probs >= 0.5).astype(int)
    acc = (preds == labels).mean()

    return auc, ll, acc

def train_one_epoch(model, loader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    for bi, (batch, labels, weights) in enumerate(tqdm(loader, total=len(loader), desc="Train")):
        for k in batch:
            batch[k] = batch[k].to(DEVICE)
        labels = labels.to(DEVICE)
        weights = weights.to(DEVICE)

        optimizer.zero_grad()
        logits = model(batch).squeeze(1)                     # [B]
        per_sample = criterion(logits, labels)               # [B] because reduction='none'
        loss = (per_sample * weights).mean()                 # weighted mean
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * labels.size(0)

        if (bi + 1) % 50 == 0:
            print(f"  step {bi+1}/{len(loader)}  loss={loss.item():.4f}")

    return running_loss / len(loader.dataset)


In [36]:
best_val_auc = -1
best_state = None

for epoch in range(1, EPOCHS+1):
    tr_loss = train_one_epoch(model, train_loader, optimizer, criterion)
    val_auc, val_ll, val_acc = evaluate(model, val_loader)
    print(f"[Epoch {epoch:02d}] TrainLoss={tr_loss:.4f} | ValAUC={val_auc:.4f} | ValLogLoss={val_ll:.4f} | ValAcc={val_acc:.4f}")

    if val_auc > best_val_auc:
        best_val_auc = val_auc
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

# Load best
if best_state is not None:
    model.load_state_dict(best_state)
test_auc, test_ll, test_acc = evaluate(model, test_loader)
print(f"[TEST] AUC={test_auc:.4f} | LogLoss={test_ll:.4f} | Acc={test_acc:.4f}")


Train:  12%|█▏        | 55/460 [00:01<00:07, 51.60it/s]

  step 50/460  loss=0.0312


Train:  23%|██▎       | 106/460 [00:01<00:05, 67.38it/s]

  step 100/460  loss=0.0239


Train:  35%|███▌      | 162/460 [00:02<00:04, 74.30it/s]

  step 150/460  loss=0.0221


Train:  46%|████▌     | 211/460 [00:03<00:03, 76.41it/s]

  step 200/460  loss=0.0221


Train:  57%|█████▋    | 260/460 [00:03<00:02, 76.89it/s]

  step 250/460  loss=0.0187


Train:  67%|██████▋   | 309/460 [00:04<00:01, 77.87it/s]

  step 300/460  loss=0.0187


Train:  78%|███████▊  | 358/460 [00:05<00:01, 72.98it/s]

  step 350/460  loss=0.0165


Train:  90%|█████████ | 414/460 [00:05<00:00, 74.02it/s]

  step 400/460  loss=0.0176


Train: 100%|██████████| 460/460 [00:06<00:00, 69.34it/s]

  step 450/460  loss=0.0197





[Epoch 01] TrainLoss=0.0232 | ValAUC=0.7651 | ValLogLoss=0.4265 | ValAcc=0.8053


Train:  13%|█▎        | 59/460 [00:00<00:06, 62.88it/s]

  step 50/460  loss=0.0200


Train:  23%|██▎       | 107/460 [00:01<00:04, 71.69it/s]

  step 100/460  loss=0.0182


Train:  35%|███▌      | 163/460 [00:02<00:04, 73.52it/s]

  step 150/460  loss=0.0183


Train:  46%|████▌     | 211/460 [00:03<00:03, 72.49it/s]

  step 200/460  loss=0.0179


Train:  56%|█████▋    | 259/460 [00:03<00:02, 73.75it/s]

  step 250/460  loss=0.0150


Train:  67%|██████▋   | 308/460 [00:04<00:02, 75.50it/s]

  step 300/460  loss=0.0179


Train:  77%|███████▋  | 356/460 [00:04<00:01, 74.99it/s]

  step 350/460  loss=0.0160


Train:  90%|████████▉ | 413/460 [00:05<00:00, 74.61it/s]

  step 400/460  loss=0.0215


Train: 100%|██████████| 460/460 [00:06<00:00, 72.23it/s]

  step 450/460  loss=0.0165





[Epoch 02] TrainLoss=0.0178 | ValAUC=0.7738 | ValLogLoss=0.4204 | ValAcc=0.8074


Train:  13%|█▎        | 62/460 [00:00<00:05, 74.56it/s]

  step 50/460  loss=0.0161


Train:  24%|██▍       | 110/460 [00:01<00:04, 73.09it/s]

  step 100/460  loss=0.0159


Train:  35%|███▍      | 159/460 [00:02<00:03, 76.32it/s]

  step 150/460  loss=0.0198


Train:  45%|████▌     | 207/460 [00:02<00:03, 72.49it/s]

  step 200/460  loss=0.0148


Train:  56%|█████▌    | 256/460 [00:03<00:02, 75.74it/s]

  step 250/460  loss=0.0144


Train:  68%|██████▊   | 312/460 [00:04<00:01, 74.08it/s]

  step 300/460  loss=0.0169


Train:  78%|███████▊  | 358/460 [00:04<00:01, 67.46it/s]

  step 350/460  loss=0.0137


Train:  90%|████████▉ | 413/460 [00:05<00:00, 72.10it/s]

  step 400/460  loss=0.0188


Train: 100%|██████████| 460/460 [00:06<00:00, 71.96it/s]

  step 450/460  loss=0.0197





[Epoch 03] TrainLoss=0.0173 | ValAUC=0.7773 | ValLogLoss=0.4182 | ValAcc=0.8092


Train:  13%|█▎        | 59/460 [00:00<00:05, 67.32it/s]

  step 50/460  loss=0.0152


Train:  23%|██▎       | 105/460 [00:01<00:05, 69.35it/s]

  step 100/460  loss=0.0181


Train:  34%|███▎      | 155/460 [00:02<00:04, 66.05it/s]

  step 150/460  loss=0.0159


Train:  45%|████▌     | 207/460 [00:03<00:03, 63.68it/s]

  step 200/460  loss=0.0163


Train:  56%|█████▋    | 259/460 [00:04<00:03, 64.83it/s]

  step 250/460  loss=0.0179


Train:  67%|██████▋   | 309/460 [00:04<00:02, 61.24it/s]

  step 300/460  loss=0.0186


Train:  78%|███████▊  | 360/460 [00:05<00:01, 64.99it/s]

  step 350/460  loss=0.0154


Train:  90%|████████▉ | 412/460 [00:06<00:00, 69.88it/s]

  step 400/460  loss=0.0157


Train: 100%|█████████▉| 459/460 [00:07<00:00, 66.95it/s]

  step 450/460  loss=0.0137


Train: 100%|██████████| 460/460 [00:07<00:00, 64.79it/s]


[Epoch 04] TrainLoss=0.0170 | ValAUC=0.7813 | ValLogLoss=0.4175 | ValAcc=0.8096
[TEST] AUC=0.7841 | LogLoss=0.4156 | Acc=0.8094


In [37]:
# === Check 4: Global winners (average predicted prob over random users) ===
@torch.no_grad()
def avg_item_score_over_users(model, item_ids, sample_users=2000):
    model.eval()
    uids = np.random.choice(users["user_id"].values, size=sample_users, replace=False)
    out = []
    for iid in item_ids:
        # build batch for many users with fixed item
        g = users.set_index("user_id").loc[uids, "gender_enc"].values.astype(np.int64)
        a = users.set_index("user_id").loc[uids, "age_enc"].values.astype(np.int64)
        o = users.set_index("user_id").loc[uids, "occupation_enc"].values.astype(np.int64)

        item_idx = item_id_to_index[int(iid)]
        ge = int(movies.loc[movies["item_id"]==iid, "genre_enc"].iloc[0])

        batch = {
            "gender": torch.tensor(g, device=DEVICE),
            "age": torch.tensor(a, device=DEVICE),
            "occupation": torch.tensor(o, device=DEVICE),
            "item_id": torch.full((sample_users,), item_idx, dtype=torch.long, device=DEVICE),
            "genre": torch.full((sample_users,), ge, dtype=torch.long, device=DEVICE),
        }
        logits = model(batch).view(-1).detach().cpu().numpy()
        prob = 1/(1+np.exp(-logits))
        out.append((iid, prob.mean()))
    return out

# test top-pop items
top_item_ids = (df[df["label"]==1]["item_idx"].value_counts().head(20).index
                .to_series().map(index_to_item_id).tolist())
print("Top popular items:", df[df["label"]==1]["item_idx"]
                .value_counts()
                .head(20).index.tolist())

avg_scores = avg_item_score_over_users(model, top_item_ids, sample_users=2000)
avg_scores = sorted(avg_scores, key=lambda x: -x[1])

res = pd.DataFrame(avg_scores, columns=["item_id","avg_prob"])
res = res.merge(movies[["item_id","title"]], on="item_id", how="left")
res.head(10)


Top popular items: [2789, 257, 1178, 1959, 1180, 589, 2502, 2693, 1192, 604, 523, 315, 585, 847, 108, 1179, 1250, 2327, 1575, 293]


Unnamed: 0,item_id,avg_prob,title
0,2858,0.671588,American Beauty (1999)
1,260,0.618282,Star Wars: Episode IV - A New Hope (1977)
2,2028,0.613051,Saving Private Ryan (1998)
3,593,0.588661,"Silence of the Lambs, The (1991)"
4,1210,0.577288,Star Wars: Episode VI - Return of the Jedi (1983)
5,1196,0.575563,Star Wars: Episode V - The Empire Strikes Back...
6,2571,0.573567,"Matrix, The (1999)"
7,110,0.571739,Braveheart (1995)
8,2762,0.564832,"Sixth Sense, The (1999)"
9,1198,0.552478,Raiders of the Lost Ark (1981)


In [38]:
# 1) Lưu model (state_dict)
pth_path = os.path.join(ART_DIR, "deepfm_pytorch.pth")
torch.save(model.state_dict(), pth_path)
print("[✓] Saved:", pth_path)

# 2) Xuất item embedding để dùng FAISS & /similar_items
item_emb_weight = model.emb["item_id"].weight.detach().cpu().numpy().astype("float32")
np.save(os.path.join(ART_DIR, "item_emb.npy"), item_emb_weight)
print("[✓] Saved:", os.path.join(ART_DIR, "item_emb.npy"), item_emb_weight.shape)

# 3) Lưu mapping
with open(os.path.join(ART_DIR, "item_id_to_index.json"), "w") as f:
    json.dump({int(k): int(v) for k, v in item_id_to_index.items()}, f)
with open(os.path.join(ART_DIR, "index_to_item_id.json"), "w") as f:
    json.dump({int(k): int(v) for k, v in index_to_item_id.items()}, f)
print("[✓] Saved mappings to artifacts/")


[✓] Saved: artifacts\deepfm_pytorch.pth
[✓] Saved: artifacts\item_emb.npy (3883, 16)
[✓] Saved mappings to artifacts/


In [39]:
# Ví dụ: tạo vector u giả (mean của một vài item hành động) rồi chấm điểm
# Trong thực tế bạn sẽ tối ưu u từ lịch sử user hoặc cập nhật online theo event.

# Lấy 10 item ngẫu nhiên
sample_idx = np.random.choice(len(item_id_to_index), size=10, replace=False)
E = item_emb_weight[sample_idx]  # [10, d]
u = E.mean(axis=0)               # [d]
u = u / (np.linalg.norm(u) + 1e-6)

scores = item_emb_weight @ u     # [num_items]
topk = 10
top_idx = np.argpartition(-scores, topk)[:topk]
top_idx = top_idx[np.argsort(-scores[top_idx])]
print("Top-10 item_idx:", top_idx)

# map sang item_id & title
inv_map = {v:k for k, v in item_id_to_index.items()}
top_item_ids = [inv_map[i] for i in top_idx]
top_titles = movies[movies["item_id"].isin(top_item_ids)][["item_id","title"]]
top_titles

Top-10 item_idx: [2890 2502  293   49 3091 1195  585 1192  257   31]


Unnamed: 0,item_id,title
31,32,Twelve Monkeys (1995)
49,50,"Usual Suspects, The (1995)"
257,260,Star Wars: Episode IV - A New Hope (1977)
293,296,Pulp Fiction (1994)
585,589,Terminator 2: Judgment Day (1991)
1192,1210,Star Wars: Episode VI - Return of the Jedi (1983)
1195,1213,GoodFellas (1990)
2502,2571,"Matrix, The (1999)"
2890,2959,Fight Club (1999)
3091,3160,Magnolia (1999)


In [40]:
!pip install numpy==1.24.4

Collecting numpy==1.24.4
  Using cached numpy-1.24.4-cp311-cp311-win_amd64.whl.metadata (5.6 kB)
Using cached numpy-1.24.4-cp311-cp311-win_amd64.whl (14.8 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.4.0
    Uninstalling numpy-2.4.0:


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'c:\\python311\\scripts\\f2py.exe'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [41]:
!pip install --upgrade faiss-cpu

Collecting faiss-cpu
  Using cached faiss_cpu-1.13.2-cp311-cp311-win_amd64.whl.metadata (7.6 kB)
Collecting numpy<3.0,>=1.25.0 (from faiss-cpu)
  Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl.metadata (6.6 kB)
Using cached faiss_cpu-1.13.2-cp311-cp311-win_amd64.whl (18.9 MB)
Using cached numpy-2.4.0-cp311-cp311-win_amd64.whl (12.6 MB)
Installing collected packages: numpy, faiss-cpu


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Python311\\Scripts\\f2py.exe' -> 'C:\\Python311\\Scripts\\f2py.exe.deleteme'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
# Chỉ cần nếu bạn muốn test Similar ngay trong notebook
# !pip install faiss-cpu

import faiss
emb = item_emb_weight.copy()
faiss.normalize_L2(emb)
index = faiss.IndexFlatIP(emb.shape[1])
index.add(emb)

def similar_items_by_item_id(raw_item_id, k=10):
    idx = item_id_to_index[int(raw_item_id)]
    q = emb[idx:idx+1]
    D, I = index.search(q, k+1)
    I = I[0].tolist()
    D = D[0].tolist()
    out = []
    for d, i in zip(D, I):
        if i == idx: 
            continue
        iid = index_to_item_id[i]
        title = movies.loc[movies["item_id"]==iid, "title"].values
        title = title[0] if len(title) else str(iid)
        out.append({"item_id": int(iid), "title": title, "score": float(d)})
        if len(out) == k: break
    return out

# Demo với 1 item bất kỳ:
some_item = int(movies["item_id"].sample(1, random_state=SEED).iloc[0])
similar_items_by_item_id(some_item, k=10)


ImportError: cannot load module more than once per process

In [None]:
# === Helpers + Patch (FINAL) — dùng đúng model.fields ===
import numpy as np, torch

# Pool item và map user->set item đã like (rating >= 4)
ALL_ITEM_IDS = np.array(sorted(movies["item_id"].unique()))
watched_by_user = ratings[ratings["rating"] >= 4].groupby("user_id")["item_id"].apply(set).to_dict()

def sample_negatives_for_user(user_id: int, num_neg: int = 99):
    pos_set = watched_by_user.get(user_id, set())
    pool = (ALL_ITEM_IDS if len(pos_set) == 0
            else np.setdiff1d(ALL_ITEM_IDS, np.fromiter(pos_set, dtype=int), assume_unique=True))
    if len(pool) == 0: pool = ALL_ITEM_IDS
    replace = len(pool) < num_neg
    return np.random.choice(pool, size=num_neg, replace=replace).tolist()

def _tensors_for_user_items(user_id: int, item_ids: list[int]):
    """Chuẩn bị (u,i,g,a,o,ge) tensors theo đúng thứ tự item_ids."""
    # user feats
    urow = users.loc[users["user_id"] == user_id, ["gender_enc","age_enc","occupation_enc"]]
    if urow.empty:
        raise ValueError(f"User {user_id} không tồn tại trong users df")
    g = int(urow["gender_enc"].values[0])
    a = int(urow["age_enc"].values[0])
    o = int(urow["occupation_enc"].values[0])

    # item feats (theo đúng thứ tự item_ids)
    msub = movies.loc[movies["item_id"].isin(item_ids), ["item_id","genre_enc"]].copy()
    msub = msub.set_index("item_id").reindex(item_ids).reset_index()
    item_idx = [item_id_to_index[int(i)] for i in msub["item_id"].values]
    genre_enc = msub["genre_enc"].astype(int).values

    n = len(item_ids)
    uT = torch.full((n,), int(user_id), dtype=torch.long, device=DEVICE)
    iT = torch.tensor(item_idx, dtype=torch.long, device=DEVICE)
    gT = torch.full((n,), g, dtype=torch.long, device=DEVICE)
    aT = torch.full((n,), a, dtype=torch.long, device=DEVICE)
    oT = torch.full((n,), o, dtype=torch.long, device=DEVICE)
    geT= torch.tensor(genre_enc, dtype=torch.long, device=DEVICE)
    return uT, iT, gT, aT, oT, geT

@torch.no_grad()
def score_candidates(user_id: int, item_ids: list[int]) -> np.ndarray:
    """
    Gọi model theo đúng chữ ký dict dựa vào model.fields.
    Ví dụ nếu model.fields = ["gender","age","occupation","item_id","genre"],
    ta sẽ cung cấp một dict với đúng các khóa đó.
    """
    model.eval()
    uT, iT, gT, aT, oT, geT = _tensors_for_user_items(user_id, item_ids)

    # Lấy danh sách khóa model yêu cầu
    fields = getattr(model, "fields", None)
    if fields is None:
        # Fallback phổ biến: dùng tên khóa tiêu chuẩn
        fields = ["user_id","item_id","gender","age","occupation","genre"]

    batch = {}
    for k in fields:
        lk = k.lower()
        if lk in ("user", "user_id", "uid"):
            batch[k] = uT
        elif lk in ("item", "item_id", "item_idx", "iid"):
            # DÙ tên là item_id nhưng thực chất model thường expect chỉ số embedding (item_idx)
            batch[k] = iT
        elif lk in ("gender", "sex"):
            batch[k] = gT
        elif lk == "age":
            batch[k] = aT
        elif lk in ("occupation", "occ", "job"):
            batch[k] = oT
        elif lk in ("genre", "genre_enc"):
            batch[k] = geT
        else:
            raise KeyError(f"Không biết map khóa '{k}' trong model.fields -> cung cấp tensor nào")

    out = model(batch)          # forward(self, x: dict)
    # Chuẩn về numpy 1D
    out = out.detach().float().view(-1).cpu().numpy()
    return out


In [None]:
# === Evaluate Hit@K & NDCG@K (REPLACED; place after Cell 15) ===
# Nhớ chạy Cell 11 (load best model) trước cell này.

K = TOPK if "TOPK" in globals() else 10
NEG_PER_USER_EVAL = 99

# Dùng test_df nếu đã alias ở Cell 15a; nếu không có thì dùng test_part
_eval_source = test_part if "test_df" in globals() else test_part

# Xác định cột item: ưu tiên item_id; nếu không có thì dùng item_idx và map lại
if "item_id" in _eval_source.columns:
    ITEM_COL = "item_id"
elif "item_idx" in _eval_source.columns:
    ITEM_COL = "item_idx"
else:
    raise RuntimeError("Không thấy cột item_id hoặc item_idx trong test set.")

# Lọc positive và lấy ngẫu nhiên 1 positive / user
test_pos = _eval_source[_eval_source["label"] == 1].copy()
if len(test_pos) == 0:
    raise RuntimeError("Không tìm thấy positive nào trong test set để đánh giá Hit@K/NDCG@K.")

test_pos = test_pos.groupby("user_id", group_keys=False).apply(
    lambda x: x.sample(n=1, random_state=SEED)
).reset_index(drop=True)

hits, ndcgs = [], []

# Dùng iterrows để truy cập theo tên cột ổn định
for _, row in tqdm(test_pos.iterrows(), total=len(test_pos), desc=f"Eval Hit@{K}/NDCG@{K}"):
    u = int(row["user_id"])
    if ITEM_COL == "item_id":
        pos_item = int(row["item_id"])
    else:
        # map item_idx -> item_id gốc
        pos_item = int(index_to_item_id[int(row["item_idx"])])

    neg_items = sample_negatives_for_user(u, num_neg=NEG_PER_USER_EVAL)

    # 1 positive + negatives (loại trùng)
    candidates = [pos_item] + [it for it in neg_items if it != pos_item]
    scores = score_candidates(u, candidates)

    # Xếp hạng giảm dần
    order = np.argsort(-scores)
    ranked_items = np.array(candidates, dtype=int)[order]

    # Hit@K
    topk_items = ranked_items[:K]
    hit = 1.0 if pos_item in topk_items else 0.0

    # NDCG@K (IDCG=1). Nếu pos ngoài top-K => 0
    r_idx = np.where(ranked_items == pos_item)[0]
    if len(r_idx) == 0:
        dcg_at_k = 0.0
    else:
        r = int(r_idx[0]) + 1  # 1-based
        dcg_at_k = (1.0 / np.log2(r + 1)) if r <= K else 0.0
    ndcg = dcg_at_k

    hits.append(hit)
    ndcgs.append(ndcg)

print(f"[TEST] Users evaluated: {len(test_pos)}")
print(f"[TEST] Hit@{K} = {np.mean(hits):.4f} | NDCG@{K} = {np.mean(ndcgs):.4f}")


  test_pos = test_pos.groupby("user_id", group_keys=False).apply(
Eval Hit@10/NDCG@10: 100%|██████████| 5213/5213 [00:31<00:00, 165.98it/s]

[TEST] Users evaluated: 5213
[TEST] Hit@10 = 0.7527 | NDCG@10 = 0.4955



