In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
import os

# =============================
# 🧩 Generate or load matrices
# =============================
def generate_synthetic_matrix(shape, density=0.12, min_rating=1, max_rating=5, seed=42):
    np.random.seed(seed)
    m, n = shape
    R = np.zeros((m, n))
    num_ratings = int(m * n * density)
    indices = np.random.choice(m * n, num_ratings, replace=False)
    for idx in indices:
        i, j = divmod(idx, n)
        R[i, j] = np.random.uniform(min_rating, max_rating)
    return pd.DataFrame(R)

small_path = 'matrix_182x182.csv'
large_path = 'full_matrix_1393x182.csv'

if os.path.exists(small_path):
    matrix_small = pd.read_csv(small_path, index_col=0)
    print("Loaded real small matrix.")
else:
    matrix_small = generate_synthetic_matrix((110, 110), density=0.12)
    print("Generated synthetic small matrix (110x110, density 12%).")

if os.path.exists(large_path):
    matrix_large = pd.read_csv(large_path, index_col=0)
    print("Loaded real large matrix.")
else:
    matrix_large = generate_synthetic_matrix((875, 110), density=0.05)
    print("Generated synthetic large matrix (875x110, density 5%).")

# =============================
# 🔧 Prepare data (robust cleaning + fallback)
# =============================
def prepare_data(matrix):
    """
    Melt matrix into (user, item, rating). Try to extract digits from user/item (e.g. 'user077')->77.
    If some entries fail, fall back to pd.factorize() to assign integer ids to each unique label.
    Returns DataFrame with integer 'user','item' and 'rating'.
    """
    df = matrix.reset_index().rename(columns={'index': 'user'}).melt(
        id_vars='user', var_name='item', value_name='rating'
    )
    df = df[df['rating'] > 0].copy()
    # Keep originals (for debugging / mapping)
    df['user_raw'] = df['user'].astype(str)
    df['item_raw'] = df['item'].astype(str)

    # Try to extract digits
    df['user_digits'] = df['user_raw'].str.extract(r'(\d+)', expand=False)
    df['item_digits'] = df['item_raw'].str.extract(r'(\d+)', expand=False)

    # If any NaN in digits, fallback to factorize
    if df['user_digits'].isna().any() or df['item_digits'].isna().any():
        # Show examples of problematic labels (first 10 unique)
        bad_users = df.loc[df['user_digits'].isna(), 'user_raw'].unique()[:10]
        bad_items = df.loc[df['item_digits'].isna(), 'item_raw'].unique()[:10]
        if len(bad_users) > 0:
            print("[prepare_data] Warning: some user ids couldn't be parsed as digits. Examples:", bad_users)
        if len(bad_items) > 0:
            print("[prepare_data] Warning: some item ids couldn't be parsed as digits. Examples:", bad_items)

        # Use factorize to produce contiguous integer ids for user and item
        df['user'], user_uniques = pd.factorize(df['user_raw'])
        df['item'], item_uniques = pd.factorize(df['item_raw'])
        # user/item are now integers 0..N-1
        df = df[['user', 'item', 'rating', 'user_raw', 'item_raw']]
        return df

    # Otherwise safe: use extracted digits
    df['user'] = df['user_digits'].astype(int)
    df['item'] = df['item_digits'].astype(int)
    df = df[['user', 'item', 'rating', 'user_raw', 'item_raw']]
    return df

small_df = prepare_data(matrix_small)
large_df = prepare_data(matrix_large)

print("Small dataset shape:", small_df.shape)
print("Large dataset shape:", large_df.shape)

# =============================
# ⚙️ ALS with Bias
# =============================
def als_with_bias(R, k=20, lambda_reg=0.01, max_iter=10):
    m, n = R.shape
    mu = np.mean(R[R > 0]) if np.any(R > 0) else 0.0
    b_u = np.zeros(m)
    b_i = np.zeros(n)
    U = np.random.normal(0, 0.1, (m, k))
    V = np.random.normal(0, 0.1, (n, k))
    
    for it in range(max_iter):
        # Update item biases
        for j in range(n):
            ratings_j = [R[i, j] - mu - b_u[i] - np.dot(U[i], V[j]) for i in range(m) if R[i, j] > 0]
            b_i[j] = np.mean(ratings_j) if ratings_j else 0.0
        # Update user biases
        for i in range(m):
            ratings_i = [R[i, j] - mu - b_i[j] - np.dot(U[i], V[j]) for j in range(n) if R[i, j] > 0]
            b_u[i] = np.mean(ratings_i) if ratings_i else 0.0
        # Update V
        for j in range(n):
            A = np.dot(U.T, U) + lambda_reg * np.eye(k)
            b = np.zeros(k)
            for i in range(m):
                if R[i, j] > 0:
                    pred_no_v = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
                    err = R[i, j] - pred_no_v
                    b += err * U[i]
            V[j] = np.linalg.solve(A, b)
        # Update U
        for i in range(m):
            A = np.dot(V.T, V) + lambda_reg * np.eye(k)
            b = np.zeros(k)
            for j in range(n):
                if R[i, j] > 0:
                    pred_no_u = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
                    err = R[i, j] - pred_no_u
                    b += err * V[j]
            U[i] = np.linalg.solve(A, b)
    
    return U, V, mu, b_u, b_i

# =============================
# ⚙️ SGD with Bias + Learning Decay
# =============================
def sgd_with_bias(R, k=20, eta0=0.01, lambda_reg=0.01, max_iter=100):
    m, n = R.shape
    mu = np.mean(R[R > 0]) if np.any(R > 0) else 0.0
    b_u = np.zeros(m)
    b_i = np.zeros(n)
    U = np.random.normal(0, 0.1, (m, k))
    V = np.random.normal(0, 0.1, (n, k))
    
    for it in range(max_iter):
        eta = eta0 / np.sqrt(it + 1)
        for i in range(m):
            for j in range(n):
                if R[i, j] > 0:
                    pred = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
                    err = R[i, j] - pred
                    b_u[i] += eta * (err - lambda_reg * b_u[i])
                    b_i[j] += eta * (err - lambda_reg * b_i[j])
                    U[i] += eta * (err * V[j] - lambda_reg * U[i])
                    V[j] += eta * (err * U[i] - lambda_reg * V[j])
    
    return U, V, mu, b_u, b_i

# =============================
# 🧮 Evaluation with Debug + Gợi ý + Xuất CSV (robust)
# =============================
def evaluate_model_bias(df, train_func, model_name="Model", k=20, max_iter=100, eta0=0.01):
    """
    df: DataFrame with integer 'user','item','rating' columns (prepare_data ensures this or factorizes)
    train_func: als_with_bias or sgd_with_bias
    """
    # use a copy
    dfc = df.copy()
    # If the columns are object, try extract digits or factorize as a safety (shouldn't be needed)
    if dfc['user'].dtype == object:
        dfc['user'] = pd.factorize(dfc['user'])[0]
    if dfc['item'].dtype == object:
        dfc['item'] = pd.factorize(dfc['item'])[0]

    n_users = int(dfc['user'].max()) + 1
    n_items = int(dfc['item'].max()) + 1
    R_train = np.zeros((n_users, n_items), dtype=float)
    for u, i, r in dfc[['user', 'item', 'rating']].itertuples(index=False):
        R_train[int(u), int(i)] = float(r)

    # Debug: print first user's row (optional)
    user_example = 0
    if user_example < R_train.shape[0]:
        print(f"\n[DEBUG] Training user {user_example} non-zero ratings:", np.count_nonzero(R_train[user_example]))
    else:
        print("\n[DEBUG] user_example >= n_users, skipping debug print.")

    # Call appropriate train function signature
    if train_func == als_with_bias:
        U, V, mu, b_u, b_i = train_func(R_train, k=k, max_iter=max_iter)
    else:
        U, V, mu, b_u, b_i = train_func(R_train, k=k, max_iter=max_iter, eta0=eta0)

    # Build full prediction matrix
    R_pred = mu + b_u[:, np.newaxis] + b_i[np.newaxis, :] + np.dot(U, V.T)

    # Compute RMSE on observed entries
    mask = R_train > 0
    if mask.sum() > 0:
        rmse = sqrt(mean_squared_error(R_train[mask], R_pred[mask]))
    else:
        rmse = float('nan')
    print(f"{model_name} RMSE: {rmse:.4f} (n_obs={mask.sum()})")

    # Save before/after CSVs
    before_fname = f"{model_name.lower().replace(' ', '_')}_before.csv"
    after_fname = f"{model_name.lower().replace(' ', '_')}_after.csv"
    np.savetxt(before_fname, R_train, delimiter=",", fmt="%.6f")
    np.savetxt(after_fname, R_pred, delimiter=",", fmt="%.6f")
    print(f"[💾] Saved: {before_fname}, {after_fname}")

    # Recommendation for a specific user (user_id = 0)
    user_id = 0
    if user_id < R_train.shape[0]:
        user_real = R_train[user_id, :]
        user_pred = R_pred[user_id, :]
        mask_unseen = user_real == 0
        if mask_unseen.any():
            recommendation_scores = user_pred[mask_unseen]
            recommend_idx = np.argsort(-recommendation_scores)[:10]
            recommended_items = np.arange(len(user_real))[mask_unseen][recommend_idx]
            print(f"\n=== TOP-10 GỢI Ý CHO USER {user_id} ===")
            for rank, (item, score) in enumerate(zip(recommended_items, recommendation_scores[recommend_idx]), start=1):
                print(f"{rank:2d}. Item {item:3d} → Dự đoán: {score:.3f}")
        else:
            print(f"[!] User {user_id} đã đánh giá tất cả item (không có item chưa xem).")
    else:
        print(f"[!] User {user_id} không tồn tại trong R_train (n_users={R_train.shape[0]}).")

    return {"rmse": rmse, "before_file": before_fname, "after_file": after_fname}

# =============================
# 🆕 Recommend from CSV (choose top-k, then sort ascending before printing)
# =============================
def recommend_from_csv(after_csv,
                       before_csv=None,
                       user_id=0,
                       top_n=5,
                       sort_ascending=True,
                       item_names=None):
    """
    Read after_csv (prediction matrix). If before_csv provided, use it to find unseen items (==0).
    Pick top_n highest predicted unseen items, then sort the picked items by score ascending
    (if sort_ascending=True) before printing.
    Returns list of (item_id, score).
    """
    after = pd.read_csv(after_csv, header=None).values
    if before_csv is not None:
        before = pd.read_csv(before_csv, header=None).values
        if before.shape != after.shape:
            raise ValueError(f"Shape mismatch: before {before.shape} vs after {after.shape}")
    else:
        before = np.zeros_like(after)

    n_users, n_items = after.shape
    if user_id < 0 or user_id >= n_users:
        raise IndexError(f"user_id {user_id} out of range (n_users={n_users})")

    user_real = before[user_id, :].astype(float)
    user_pred = after[user_id, :].astype(float)

    unseen_mask = (user_real == 0)
    unseen_items = np.arange(n_items)[unseen_mask]
    unseen_scores = user_pred[unseen_mask]

    if len(unseen_items) == 0:
        print(f"[!] User {user_id} đã đánh giá hết item (không có item chưa xem).")
        return []

    # pick top_n by highest score
    count = min(top_n, len(unseen_items))
    top_desc_idx = np.argsort(-unseen_scores)[:count]
    selected_items = unseen_items[top_desc_idx]
    selected_scores = unseen_scores[top_desc_idx]

    # sort the selected pairs by score ascending (or descending if requested)
    order = np.argsort(selected_scores) if sort_ascending else np.argsort(-selected_scores)
    final_items = selected_items[order]
    final_scores = selected_scores[order]

    print(f"\nRecommendations for user {user_id} (picked top {count}, printed {'ascending' if sort_ascending else 'descending'} by score):")
    results = []
    for rank, (it, sc) in enumerate(zip(final_items, final_scores), start=1):
        name_str = ""
        if item_names is not None:
            if isinstance(item_names, dict):
                name_str = f" - {item_names.get(int(it), '')}"
            elif isinstance(item_names, (list, np.ndarray, pd.Series)):
                if int(it) < len(item_names):
                    name_str = f" - {item_names[int(it)]}"
        print(f"{rank:2d}. Item {int(it):3d}{name_str} -> score: {sc:.6f}")
        results.append((int(it), float(sc)))
    return results

# =============================
# 🚀 Run experiments + example recommend_from_csv usage
# =============================
print("=== IMPROVED SMALL MATRIX (110x110) ===")
res1 = evaluate_model_bias(small_df, als_with_bias, "Improved ALS", k=20, max_iter=10)
res2 = evaluate_model_bias(small_df, sgd_with_bias, "Improved SGD", k=20, max_iter=100, eta0=0.01)

print("\n=== IMPROVED LARGE MATRIX (875x110) ===")
res3 = evaluate_model_bias(large_df, als_with_bias, "Improved ALS", k=20, max_iter=10)
res4 = evaluate_model_bias(large_df, sgd_with_bias, "Improved SGD", k=20, max_iter=100, eta0=0.01)

print("\nSummary of results:")
for label, r in [("ALS small", res1), ("SGD small", res2), ("ALS large", res3), ("SGD large", res4)]:
    print(f"{label}: rmse={r['rmse']:.4f}, before={r['before_file']}, after={r['after_file']}")

# Example: print 5 recommendations for user 0 from the SGD result, sorted ascending before printing
after_csv = res2['after_file']   # "improved_sgd_after.csv"
before_csv = res2['before_file'] # "improved_sgd_before.csv"
recommend_from_csv(after_csv, before_csv=before_csv, user_id=0, top_n=5, sort_ascending=True)


Loaded real small matrix.
Loaded real large matrix.
 'Buzza Pizza - Nowzone' 'Bà Hai - Bánh Xèo & Bánh Khọt'
 'Bánh Cuốn Ba Miền - Trường Chinh'
 'Bún Riêu & Canh Bún - Phan Đình Phùng' 'Bún Thịt Nướng Anh Ba'
 'Cheese Coffee - Hồng Bàng' 'Cháo Sườn Bé Hiền - Súp Cua & Bún Bò'
 'Cháo Sườn Chú Chen - Nguyễn Trãi' 'Chè HongKong - 港式糖水 - Phan Phú Tiên']
 'Bonchon Chicken - Aeon Tân Phú' 'Buzza Pizza - Nowzone'
 'Bà Hai - Bánh Xèo & Bánh Khọt' 'Bánh Cuốn Ba Miền - Trường Chinh'
 'Bún Nước Tương Mr Nhoi - Nguyễn Thượng Hiền'
 'Bún Riêu & Canh Bún - Phan Đình Phùng' 'Bún Thịt Nướng Anh Ba'
 'Cheese Coffee - Hồng Bàng' 'Cháo Sườn Bé Hiền - Súp Cua & Bún Bò']
Small dataset shape: (1363, 5)
Large dataset shape: (1218, 5)
=== IMPROVED SMALL MATRIX (110x110) ===

[DEBUG] Training user 0 non-zero ratings: 14
Improved ALS RMSE: 0.6940 (n_obs=1363)
[💾] Saved: improved_als_before.csv, improved_als_after.csv

=== TOP-10 GỢI Ý CHO USER 0 ===
 1. Item  27 → Dự đoán: 10.391
 2. Item  80 → Dự đoán: 10.370

[(59, 8.200219), (70, 8.376667), (64, 8.44525), (73, 8.470523), (19, 8.7812)]

NameError: name 'real_matrix' is not defined