In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import os

# Function to generate synthetic matrix nếu files không tồn tại
def generate_synthetic_matrix(shape, density=0.12, min_rating=1, max_rating=5, seed=42):
    np.random.seed(seed)
    m, n = shape
    R = np.zeros((m, n))
    num_ratings = int(m * n * density)
    indices = np.random.choice(m * n, num_ratings, replace=False)
    for idx in indices:
        i, j = divmod(idx, n)
        R[i, j] = np.random.uniform(min_rating, max_rating)
    return pd.DataFrame(R)

# Load hoặc generate ma trận for both small and large
small_path = 'matrix_110x110.csv'
large_path = 'full_matrix_875x110.csv'

if os.path.exists(small_path):
    matrix_small = pd.read_csv(small_path, index_col=0)
    print("Loaded real small matrix.")
else:
    matrix_small = generate_synthetic_matrix((110, 110), density=0.12)
    print("Generated synthetic small matrix (110x110, density 12%).")

if os.path.exists(large_path):
    matrix_large = pd.read_csv(large_path, index_col=0)
    print("Loaded real large matrix.")
else:
    matrix_large = generate_synthetic_matrix((875, 110), density=0.05)
    print("Generated synthetic large matrix (875x110, density 5%).")

# FIXED: prepare_data function
def prepare_data(matrix):
    df = matrix.reset_index().rename(columns={'index': 'user'}).melt(id_vars='user', var_name='item', value_name='rating')
    df = df[df['rating'] > 0].copy()  # Bỏ implicit 0
    return df

small_df = prepare_data(matrix_small)
large_df = prepare_data(matrix_large)

print("Small dataset shape:", small_df.shape)
print("Large dataset shape:", large_df.shape)

# IMPROVED ALS with Bias
def als_with_bias(R, k=20, lambda_reg=0.01, max_iter=10):
    m, n = R.shape
    # Global mean
    mu = np.mean(R[R > 0])
    # Initialize biases
    b_u = np.zeros(m)
    b_i = np.zeros(n)
    # Initialize factors
    U = np.random.normal(0, 0.1, (m, k))
    V = np.random.normal(0, 0.1, (n, k))
    
    for it in range(max_iter):
        # Update item biases (fix others)
        for j in range(n):
            ratings_j = [R[i,j] - mu - b_u[i] - np.dot(U[i], V[j]) for i in range(m) if R[i,j] > 0]
            b_i[j] = np.mean(ratings_j) if ratings_j else 0
        # Update user biases
        for i in range(m):
            ratings_i = [R[i,j] - mu - b_i[j] - np.dot(U[i], V[j]) for j in range(n) if R[i,j] > 0]
            b_u[i] = np.mean(ratings_i) if ratings_i else 0
        # Update V (fix U, biases)
        for j in range(n):
            A = np.dot(U.T, U) + lambda_reg * np.eye(k)
            b = np.zeros(k)
            for i in range(m):
                if R[i, j] > 0:
                    pred_no_v = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
                    err = R[i, j] - pred_no_v
                    b += err * U[i]
            V[j] = np.linalg.solve(A, b)
        # Update U
        for i in range(m):
            A = np.dot(V.T, V) + lambda_reg * np.eye(k)
            b = np.zeros(k)
            for j in range(n):
                if R[i, j] > 0:
                    pred_no_u = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
                    err = R[i, j] - pred_no_u
                    b += err * V[j]
            U[i] = np.linalg.solve(A, b)
    
    return U, V, mu, b_u, b_i

# IMPROVED SGD with Bias and Eta Decay
def sgd_with_bias(R, k=20, eta0=0.01, lambda_reg=0.01, max_iter=100):
    m, n = R.shape
    # Global mean
    mu = np.mean(R[R > 0])
    # Initialize biases
    b_u = np.zeros(m)
    b_i = np.zeros(n)
    # Initialize factors
    U = np.random.normal(0, 0.1, (m, k))
    V = np.random.normal(0, 0.1, (n, k))
    
    for it in range(max_iter):
        eta = eta0 / np.sqrt(it + 1)  # Eta decay
        for i in range(m):
            for j in range(n):
                if R[i, j] > 0:
                    pred = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
                    err = R[i, j] - pred
                    # Update biases
                    b_u[i] += eta * (err - lambda_reg * b_u[i])
                    b_i[j] += eta * (err - lambda_reg * b_i[j])
                    # Update U
                    U[i] += eta * (err * V[j] - lambda_reg * U[i])
                    # Update V
                    V[j] += eta * (err * U[i] - lambda_reg * V[j])
    
    return U, V, mu, b_u, b_i

# Updated evaluate_model for bias models
def evaluate_model_bias(df, model_func, model_name, **kwargs):
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    user_to_idx = {u: idx for idx, u in enumerate(train_df['user'].unique())}
    item_to_idx = {it: idx for idx, it in enumerate(train_df['item'].unique())}
    
    m_train = len(user_to_idx)
    n_train = len(item_to_idx)
    R_train = np.zeros((m_train, n_train))
    for _, row in train_df.iterrows():
        i = user_to_idx[row['user']]
        j = item_to_idx[row['item']]
        R_train[i, j] = row['rating']
    
    # Fit model
    U, V, mu, b_u, b_i = model_func(R_train, **kwargs)
    
    # Predict & RMSE
    preds, actuals = [], []
    for _, row in test_df.iterrows():
        if row['user'] in user_to_idx and row['item'] in item_to_idx:
            i = user_to_idx[row['user']]
            j = item_to_idx[row['item']]
            pred = mu + b_u[i] + b_i[j] + np.dot(U[i], V[j])
            actual = row['rating']
            preds.append(max(1, min(5, pred)))
            actuals.append(actual)
    
    if len(preds) == 0:
        return f"{model_name}: No overlapping test data"
    rmse = sqrt(mean_squared_error(actuals, preds))
    return f"{model_name} RMSE: {rmse:.4f} (n_test={len(actuals)})"

# Chạy thử với improved models on both
print("=== IMPROVED SMALL MATRIX (110x110) ===")
print(evaluate_model_bias(small_df, als_with_bias, "Improved ALS", k=20, max_iter=10))
print(evaluate_model_bias(small_df, sgd_with_bias, "Improved SGD", k=20, max_iter=100, eta0=0.01))

print("\n=== IMPROVED LARGE MATRIX (875x110) ===")
print(evaluate_model_bias(large_df, als_with_bias, "Improved ALS", k=20, max_iter=10))
print(evaluate_model_bias(large_df, sgd_with_bias, "Improved SGD", k=20, max_iter=100, eta0=0.01))

Loaded real small matrix.
Loaded real large matrix.
Small dataset shape: (1363, 3)
Large dataset shape: (1218, 3)
=== IMPROVED SMALL MATRIX (110x110) ===
Improved ALS RMSE: 1.6302 (n_test=264)
Improved SGD RMSE: 1.6265 (n_test=264)

=== IMPROVED LARGE MATRIX (875x110) ===
Improved ALS RMSE: 2.6318 (n_test=95)
Improved SGD RMSE: 2.6397 (n_test=95)
