Cargamos los datos procesados y los preparamos para Surprise usando train_df

In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-split rating tables.
processed_dir = 'data/processed'
train = pd.read_csv(os.path.join(processed_dir, 'train.csv'))
test  = pd.read_csv(os.path.join(processed_dir, 'test.csv'))

# Dense users x books rating matrix built from the training split only.
# Ids are used as 1-based positions (row = user_id - 1, column = book_id - 1)
# and a 0 entry means "not rated".
num_users = int(train['user_id'].max())
num_items = int(train['book_id'].max())
R = np.zeros((num_users, num_items), dtype=np.float32)

# Fill the matrix with a single vectorised assignment instead of a Python-level
# iterrows() loop. Repeated (user, book) pairs are reduced to their last
# occurrence first, so a later row still overrides an earlier one.
train_unique = train.drop_duplicates(subset=['user_id', 'book_id'], keep='last')
user_idx = train_unique['user_id'].to_numpy().astype(np.int64) - 1
item_idx = train_unique['book_id'].to_numpy().astype(np.int64) - 1
R[user_idx, item_idx] = train_unique['rating'].to_numpy()

# Per-user mean over rated entries only; rows without any rating keep mean 0.
rated = R != 0
counts = rated.sum(axis=1)
sums   = R.sum(axis=1)
user_means = np.divide(sums, counts, out=np.zeros_like(sums), where=counts != 0)

# Mean-centred ratings: subtract each user's mean where a rating exists and
# keep 0 everywhere else.
D = R - user_means[:, None]     # broadcasting
D[~rated] = 0

# User-user cosine similarity on the mean-centred matrix.
# No epsilon is added to D: an all-zero row already yields similarity 0, whereas
# adding a constant to every cell turns those rows into identical constant
# vectors that look perfectly similar to each other (and copies the matrix).
sim = cosine_similarity(D)

# predicción knn
def predict_knn(u, i, k=20):
    """Predict the rating of user ``u`` for item ``i`` with mean-centred user KNN.

    Reads the module-level arrays ``R`` (ratings, 0 = not rated), ``D``
    (mean-centred ratings), ``sim`` (user-user similarity) and ``user_means``.

    Parameters
    ----------
    u : int
        Zero-based row index of the target user.
    i : int
        Zero-based column index of the target item.
    k : int, default 20
        Maximum number of most-similar raters of ``i`` used as neighbours.

    Returns
    -------
    scalar
        ``user_means[u]`` when no neighbour can contribute (nobody rated the
        item, ``k`` is not positive, or every selected similarity is zero);
        otherwise the similarity-weighted prediction clipped to ``[1, 5]``.
    """
    # Users that rated item i.
    voters = np.flatnonzero(R[:, i])
    # k <= 0 must mean "no neighbours": a slice of [-0:] would select them all.
    if voters.size == 0 or k <= 0:
        return user_means[u]

    sims = sim[u, voters]
    nearest = np.argsort(sims)[-k:]     # positions of the k largest similarities
    top_sims = sims[nearest]
    top_devs = D[voters[nearest], i]

    # Guard on the same quantity used as the denominator. Testing the signed
    # sum would discard valid neighbourhoods whose similarities cancel out
    # (e.g. +0.5 and -0.5) even though the weighted average is well defined.
    weight = np.abs(top_sims).sum()
    if weight == 0:
        return user_means[u]

    pred = user_means[u] + np.dot(top_sims, top_devs) / weight
    return min(5, max(1, pred))

# Score every held-out (user, book) pair with the manual KNN predictor.
# Ids are shifted by one to obtain the zero-based positions passed to predict_knn.
y_true = test['rating'].tolist()
y_pred = [
    predict_knn(int(user_id) - 1, int(book_id) - 1, k=20)
    for user_id, book_id in zip(test['user_id'], test['book_id'])
]

# Error metrics on the test split.
mae  = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))

print(f"KNN colaborativo manual → RMSE: {rmse:.4f}, MAE: {mae:.4f}")


KNN colaborativo manual → RMSE: 0.8268, MAE: 0.6338
