In [6]:
import os
import math
import random
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
from joblib import dump

# ======= CẤU HÌNH =======
SEED = 42
random.seed(SEED); np.random.seed(SEED)

# Đặt thư mục dataset MovieLens 1M (chứa users.dat, movies.dat, ratings.dat)
DATA_DIR = Path("Dataset")  # chỉnh path cho phù hợp máy bạn

# Nhãn nhị phân: rating >= 4 là dương
POS_THRESHOLD = 4.0

# Hashing dimension (giảm/ tăng tùy RAM & collision)
N_FEATURES = 2**18
USE_GENRES = True

# Tham số Logistic Regression
LR_C = 1.0
LR_PENALTY = "l2"
LR_MAX_ITER = 200
LR_CLASS_WEIGHT = None  # hoặc "balanced" nếu lệch lớp

# Đánh giá Top-K
NEG_PER_USER = 99
TOPK = 10

print(f"Config | POS_THRESHOLD={POS_THRESHOLD}, N_FEATURES={N_FEATURES}, USE_GENRES={USE_GENRES}")


Config | POS_THRESHOLD=4.0, N_FEATURES=262144, USE_GENRES=True


In [7]:
# MovieLens 1M delimiter là '::', encoding latin-1
users_path   = os.path.join(DATA_DIR, "users.dat")
movies_path  = os.path.join(DATA_DIR, "movies.dat")
ratings_path = os.path.join(DATA_DIR, "ratings.dat")

# Đọc USERS
users = pd.read_csv(
    users_path, sep="::", engine="python", encoding="latin-1",
    names=["user_id", "gender", "age", "occupation", "zip"]
)

# Đọc MOVIES
movies = pd.read_csv(
    movies_path, sep="::", engine="python", encoding="latin-1",
    names=["item_id", "title", "genres"]
)

# Đọc RATINGS
ratings = pd.read_csv(
    ratings_path, sep="::", engine="python", encoding="latin-1",
    names=["user_id", "item_id", "rating", "timestamp"]
)

# Ép kiểu an toàn
ratings["user_id"] = ratings["user_id"].astype(int)
ratings["item_id"] = ratings["item_id"].astype(int)
ratings["rating"]  = ratings["rating"].astype(float)
ratings["timestamp"] = ratings["timestamp"].astype(int)

movies["item_id"] = movies["item_id"].astype(int)
movies["title"] = movies["title"].astype(str)
movies["genres"] = movies["genres"].fillna("").astype(str)

print(users.head(2))
print(movies.head(2))
print(ratings.head(2))
print("Counts:", len(users), len(movies), len(ratings))

# Đồng bộ tên biến với pipeline tiếp theo
df_ratings = ratings.copy()
df_movies  = movies.copy()


   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
   item_id             title                        genres
0        1  Toy Story (1995)   Animation|Children's|Comedy
1        2    Jumanji (1995)  Adventure|Children's|Fantasy
   user_id  item_id  rating  timestamp
0        1     1193     5.0  978300760
1        1      661     3.0  978302109
Counts: 6040 3883 1000209


In [8]:
# Nếu muốn thêm user features, có thể merge users vào đây (xem khối comment bên dưới)
df = df_ratings.merge(df_movies[["item_id", "title", "genres"]], on="item_id", how="left")

# (Tùy chọn) Thêm user features: gender/age/occupation
# df = df.merge(users[["user_id","gender","age","occupation"]], on="user_id", how="left")

# Nhãn nhị phân
df["label"] = (df["rating"] >= POS_THRESHOLD).astype(int)

# Chuẩn hóa kiểu dữ liệu
df["user_id"] = df["user_id"].astype(int)
df["item_id"] = df["item_id"].astype(int)
df["timestamp"] = df["timestamp"].astype(int)

print("Tổng tương tác:", len(df), "| Số user:", df["user_id"].nunique(), "| Số item:", df["item_id"].nunique())
print("Pos rate:", df["label"].mean())
df.head()


Tổng tương tác: 1000209 | Số user: 6040 | Số item: 3706
Pos rate: 0.5751607913945985


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,label
0,1,1193,5.0,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,1
1,1,661,3.0,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,0
2,1,914,3.0,978301968,My Fair Lady (1964),Musical|Romance,0
3,1,3408,4.0,978300275,Erin Brockovich (2000),Drama,1
4,1,2355,5.0,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,1


In [9]:
# Mỗi user: tương tác cuối -> TEST; tương tác liền trước -> VAL; còn lại -> TRAIN
df = df.sort_values(["user_id", "timestamp"])

last_idx = df.groupby("user_id").tail(1).index
test_df = df.loc[last_idx]

tmp = df.drop(last_idx)
val_idx = tmp.groupby("user_id").tail(1).index
val_df = df.loc[val_idx]

train_df = df.drop(index=last_idx.union(val_idx))

print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))
print("Users train:", train_df["user_id"].nunique(), "| val:", val_df["user_id"].nunique(), "| test:", test_df["user_id"].nunique())


Train/Val/Test sizes: 988129 6040 6040
Users train: 6040 | val: 6040 | test: 6040


In [10]:
def row_to_features(row, use_genres=True):
    feats = {
        f"user_id={row.user_id}": 1.0,
        f"item_id={row.item_id}": 1.0,
    }
    # (Tùy chọn) nếu đã merge users ở Cell 4:
    # if hasattr(row, "gender") and pd.notna(row.gender): feats[f"gender={row.gender}"] = 1.0
    # if hasattr(row, "age") and pd.notna(row.age): feats[f"age={int(row.age)}"] = 1.0
    # if hasattr(row, "occupation") and pd.notna(row.occupation): feats[f"occ={int(row.occupation)}"] = 1.0

    if use_genres and isinstance(row.genres, str) and row.genres:
        for g in row.genres.split("|"):
            feats[f"genre={g}"] = 1.0
    return feats

def df_to_hashed_matrix(df_in, hasher: FeatureHasher, use_genres=True, desc="hashing"):
    dict_gen = (row_to_features(r, use_genres) for r in tqdm(df_in.itertuples(index=False), total=len(df_in), desc=desc))
    X = hasher.transform(dict_gen)
    y = df_in["label"].values.astype(int)
    return X, y

hasher = FeatureHasher(n_features=N_FEATURES, input_type="dict", alternate_sign=False)
print("Hasher ready.")


Hasher ready.


In [11]:
X_train, y_train = df_to_hashed_matrix(train_df, hasher, use_genres=USE_GENRES, desc="hash train")
X_val,   y_val   = df_to_hashed_matrix(val_df,   hasher, use_genres=USE_GENRES, desc="hash val")

lr = LogisticRegression(
    penalty=LR_PENALTY,
    C=LR_C,
    max_iter=LR_MAX_ITER,
    solver="saga",
    class_weight=LR_CLASS_WEIGHT,
    n_jobs=-1,
    random_state=SEED,
)

lr.fit(X_train, y_train)

val_proba = lr.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_proba)
val_ll  = log_loss(y_val, val_proba)
val_acc = accuracy_score(y_val, (val_proba >= 0.5).astype(int))
print(f"[VAL] AUC={val_auc:.4f} | LogLoss={val_ll:.4f} | Acc={val_acc:.4f}")


hash train: 100%|██████████| 988129/988129 [00:04<00:00, 223192.10it/s]
hash val: 100%|██████████| 6040/6040 [00:00<00:00, 202240.02it/s]


[VAL] AUC=0.7939 | LogLoss=0.5390 | Acc=0.7250


In [12]:
ALL_ITEMS = np.array(sorted(df["item_id"].unique()))
watched_by_user = df.groupby("user_id")["item_id"].apply(set).to_dict()

def sample_negatives_for_user(user, num_neg=NEG_PER_USER):
    pos_set = watched_by_user.get(user, set())
    candidates = []
    while len(candidates) < num_neg:
        cands = np.random.choice(ALL_ITEMS, size=num_neg*2, replace=True)
        for it in cands:
            if it not in pos_set:
                candidates.append(it)
            if len(candidates) >= num_neg:
                break
    return candidates[:num_neg]

def make_batch_user_item(user, item_ids, use_genres=True):
    sub = df_movies.loc[df_movies["item_id"].isin(item_ids), ["item_id", "genres"]].copy()
    sub["user_id"] = user
    sub["rating"] = 0.0
    sub["timestamp"] = 0
    sub["label"] = 0
    return sub[["user_id", "item_id", "genres", "rating", "timestamp", "label"]]


In [13]:
def hit_ndcg_at_k(test_df, model, hasher, k=10, neg_per_user=99, use_genres=True):
    hits, ndcgs = [], []
    users = test_df["user_id"].values
    items_pos = test_df["item_id"].values

    for u, pos_item in tqdm(zip(users, items_pos), total=len(test_df), desc=f"Eval Hit@{k}/NDCG@{k}"):
        neg_items = sample_negatives_for_user(u, num_neg=neg_per_user)
        cand_items = [pos_item] + neg_items

        batch = make_batch_user_item(u, cand_items, use_genres=use_genres)
        X_cand, _ = df_to_hashed_matrix(batch, hasher, use_genres=use_genres, desc="hash candidates")
        scores = model.predict_proba(X_cand)[:, 1]

        rank_idx = np.argsort(-scores)
        ranked_items = np.array(cand_items)[rank_idx]
        topk = ranked_items[:k]

        hit = 1.0 if pos_item in topk else 0.0
        # Với 1 positive duy nhất: IDCG = 1
        if pos_item in ranked_items:
            r = np.where(ranked_items == pos_item)[0][0] + 1
            dcg = 1.0 / math.log2(r + 1)
        else:
            dcg = 0.0
        ndcg = dcg

        hits.append(hit); ndcgs.append(ndcg)

    return float(np.mean(hits)), float(np.mean(ndcgs))

# Classification metrics on TEST
X_test, y_test = df_to_hashed_matrix(test_df, hasher, use_genres=USE_GENRES, desc="hash test")
test_proba = lr.predict_proba(X_test)[:, 1]
print(f"[TEST] AUC={roc_auc_score(y_test, test_proba):.4f} | LogLoss={log_loss(y_test, test_proba):.4f} | Acc={accuracy_score(y_test, (test_proba>=0.5).astype(int)):.4f}")

# Ranking metrics
hit10, ndcg10 = hit_ndcg_at_k(test_df, lr, hasher, k=TOPK, neg_per_user=NEG_PER_USER, use_genres=USE_GENRES)
print(f"[TEST] Hit@{TOPK}={hit10:.4f} | NDCG@{TOPK}={ndcg10:.4f}")


hash test: 100%|██████████| 6040/6040 [00:00<00:00, 231833.41it/s]


[TEST] AUC=0.7873 | LogLoss=0.5428 | Acc=0.7328


hash candidates: 100%|██████████| 99/99 [00:00<00:00, 99149.02it/s]
hash candidates: 100%|██████████| 98/98 [00:00<00:00, 97890.40it/s]
hash candidates: 100%|██████████| 100/100 [00:00<?, ?it/s]
hash candidates: 100%|██████████| 100/100 [00:00<?, ?it/s]
hash candidates: 100%|██████████| 100/100 [00:00<00:00, 100848.86it/s]
hash candidates: 100%|██████████| 96/96 [00:00<00:00, 62787.02it/s]s]
hash candidates: 100%|██████████| 100/100 [00:00<00:00, 100150.53it/s]
hash candidates: 100%|██████████| 96/96 [00:00<?, ?it/s]
hash candidates: 100%|██████████| 98/98 [00:00<?, ?it/s]
hash candidates: 100%|██████████| 97/97 [00:00<00:00, 96615.41it/s]
hash candidates: 100%|██████████| 99/99 [00:00<00:00, 97840.74it/s]
hash candidates: 100%|██████████| 99/99 [00:00<00:00, 65743.52it/s]
hash candidates: 100%|██████████| 99/99 [00:00<00:00, 98094.99it/s]
hash candidates: 100%|██████████| 98/98 [00:00<00:00, 97820.51it/s]
hash candidates: 100%|██████████| 99/99 [00:00<?, ?it/s]
hash candidates: 100%|█

[TEST] Hit@10=0.0952 | NDCG@10=0.2027





In [14]:
def recommend_for_user(user_id: int, topk=10, use_genres=USE_GENRES):
    # Lấy negative + một ít positive của chính user làm candidate đa dạng
    neg_items = sample_negatives_for_user(user_id, num_neg=min(500, len(ALL_ITEMS)-1))
    pos_set = list(watched_by_user.get(user_id, set()))
    seed_pos = random.sample(pos_set, k=min(20, len(pos_set))) if pos_set else []

    cand = list(set(neg_items + seed_pos))
    batch = make_batch_user_item(user_id, cand, use_genres=use_genres)
    X_cand, _ = df_to_hashed_matrix(batch, hasher, use_genres=use_genres, desc="hash recommend")
    scores = lr.predict_proba(X_cand)[:, 1]
    order = np.argsort(-scores)[:topk]
    rec_items = np.array(cand)[order]
    rec_scores = scores[order]

    title_map = df_movies.set_index("item_id")["title"].to_dict()
    return [(int(i), float(s), title_map.get(int(i), "")) for i, s in zip(rec_items, rec_scores)]

# Ví dụ chạy nhanh:
sample_user = int(test_df["user_id"].iloc[0])
for iid, s, t in recommend_for_user(sample_user, topk=10):
    print(f"Item {iid:<6} | score={s:.4f} | {t}")


hash recommend: 100%|██████████| 497/497 [00:00<00:00, 166618.90it/s]

Item 3557   | score=0.9782 | Jennifer 8 (1992)
Item 368    | score=0.9700 | Maverick (1994)
Item 268    | score=0.9552 | Little Odessa (1994)
Item 358    | score=0.9531 | Higher Learning (1995)
Item 1984   | score=0.9422 | Halloween III: Season of the Witch (1983)
Item 2355   | score=0.9412 | Bug's Life, A (1998)
Item 2201   | score=0.9407 | Paradine Case, The (1947)
Item 2573   | score=0.9376 | Tango (1998)
Item 2576   | score=0.9346 | Love, etc. (1996)
Item 515    | score=0.9314 | Remains of the Day, The (1993)





In [15]:
ART_DIR = Path("./artifacts_lr")
ART_DIR.mkdir(parents=True, exist_ok=True)

dump(lr, ART_DIR / "logreg_model.joblib")
dump({"n_features": N_FEATURES, "use_genres": USE_GENRES, "seed": SEED}, ART_DIR / "feature_cfg.joblib")
df_movies[["item_id", "title", "genres"]].to_parquet(ART_DIR / "movies.parquet", index=False)

print("Saved to:", ART_DIR.resolve())

Saved to: D:\IU\Thesis\movie-recommendation-app\Data\artifacts_lr
