In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, Model, optimizers, losses
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score, f1_score

# ------------------ 1. DATA LOADING ------------------
# Train/test splits are read from CSV files under data/processed.
# The code below reads the columns user_id, book_id and rating.
processed_dir = 'data/processed'
train_df = pd.read_csv(os.path.join(processed_dir, 'train.csv'))
test_df  = pd.read_csv(os.path.join(processed_dir, 'test.csv'))

# ------------------ 2. PARAMETERS ------------------
EMBED_FACTORS = 5    # size of each embedding vector
MLP_UNITS_1   = 20   # units in the first dense layer
MLP_UNITS_2   = 10   # units in the second dense layer
EPOCHS        = 10
BATCH_SIZE    = 1024
LR            = 0.001
THRESHOLD     = 4.0  # ratings >= THRESHOLD count as relevant
K_NDCG        = 5    # cutoff K for nDCG@K
MIN_R, MAX_R  = 1.0, 5.0  # range the predictions are clipped to

# Binary relevance flag per test row. It is derived from THRESHOLD (instead of
# a second hard-coded 4) so it cannot drift from the cutoff used by the
# classification metrics and nDCG.
# NOTE(review): no code visible in this file reads the 'rel' column.
test_df['rel'] = (test_df['rating'] >= THRESHOLD).astype(int)

# Number of users and items, taken as the largest ID seen in either split.
# Assumes IDs are 1-based (they are shifted by -1 below) -- TODO confirm.
NUM_USERS = int(max(train_df.user_id.max(), test_df.user_id.max()))
NUM_ITEMS = int(max(train_df.book_id.max(), test_df.book_id.max()))

# ------------------ 3. INPUT PREPARATION ------------------
# Shift IDs to 0-based indices so they can index the embedding tables.
u_train = train_df.user_id.values.astype('int32') - 1
i_train = train_df.book_id.values.astype('int32') - 1
y_train = train_df.rating.values.astype('float32')

u_test  = test_df.user_id.values.astype('int32') - 1
i_test  = test_df.book_id.values.astype('int32') - 1
y_test  = test_df.rating.values.astype('float32')

# Keys match the names of the Keras Input layers.
X_train = {'user_input': u_train, 'item_input': i_train}
X_test  = {'user_input': u_test,  'item_input': i_test}

# ------------------ 4. MLP DEFINITION ------------------
# Each sample carries one integer user index and one integer item index.
user_input = layers.Input(name='user_input', shape=(1,), dtype='int32')
item_input = layers.Input(name='item_input', shape=(1,), dtype='int32')

# Embedding lookups: one EMBED_FACTORS-sized vector per user / per item.
user_emb = layers.Embedding(
    NUM_USERS, EMBED_FACTORS, name='user_emb'
)(user_input)
item_emb = layers.Embedding(
    NUM_ITEMS, EMBED_FACTORS, name='item_emb'
)(item_input)

# Flatten both embeddings (user first, then item).
user_vec, item_vec = (
    layers.Flatten()(embedded) for embedded in (user_emb, item_emb)
)

# Join user and item vectors into a single feature vector.
concat = layers.Concatenate()([user_vec, item_vec])

# Hidden dense layers of the MLP, both with ReLU activation.
x = concat
for hidden_units in (MLP_UNITS_1, MLP_UNITS_2):
    x = layers.Dense(hidden_units, activation='relu')(x)

# Single linear output unit: the predicted rating.
output = layers.Dense(1, activation='linear')(x)

MLP = Model(inputs=[user_input, item_input], outputs=output)

# Train on mean squared error with Adam; MAE is tracked as an extra metric.
adam_optimizer = optimizers.Adam(learning_rate=LR)
mse_loss = losses.MeanSquaredError()
MLP.compile(optimizer=adam_optimizer, loss=mse_loss, metrics=['mae'])

MLP.summary()

# ------------------ 5. TRAINING ------------------
# NOTE(review): per the Keras docs, validation_split holds out the *last* 10%
# of the samples (taken before shuffling) -- confirm train.csv is not sorted
# in a way that makes that slice unrepresentative.
MLP.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.1,
    verbose=1
)

# ------------------ 6. PREDICTION ------------------
# Predict a rating for every test pair and clip it to [MIN_R, MAX_R].
y_pred = MLP.predict(X_test, batch_size=BATCH_SIZE).flatten()
y_pred = np.clip(y_pred, MIN_R, MAX_R)

# Dense (user x item) score matrix used for ranking. Only the test pairs are
# filled in; every other cell stays at 0.
# A single advanced-index assignment replaces the per-row Python loop; if a
# (user, item) pair repeats, the last prediction wins, as it did in the loop.
pred_matrix = np.zeros((NUM_USERS, NUM_ITEMS), dtype='float32')
pred_matrix[u_test, i_test] = y_pred

# ------------------ 7. EVALUATION ------------------
# Regression error between the true ratings and the clipped predictions.
squared_error = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(squared_error)
mae  = mean_absolute_error(y_test, y_pred)

# Binary classification: a rating (true or predicted) at or above THRESHOLD
# is labelled 1, anything below is labelled 0.
y_true_bin, y_pred_bin = (
    (ratings >= THRESHOLD).astype(int) for ratings in (y_test, y_pred)
)

# zero_division=0 makes each score 0 instead of warning when it is undefined.
prec, rec, f1 = (
    score_fn(y_true_bin, y_pred_bin, zero_division=0)
    for score_fn in (precision_score, recall_score, f1_score)
)

# nDCG@K
def get_user_ndcg(u, matrix, df, K=5, threshold=None):
    """Compute nDCG@K for a single user.

    Args:
        u: Zero-based user index. The user's ratings are the rows of ``df``
            whose ``user_id`` equals ``u + 1``.
        matrix: 2-D score array indexed as ``matrix[user, item]``; row ``u``
            is ranked in descending order to pick the top-K items.
        df: DataFrame with ``user_id``, ``book_id`` (1-based) and ``rating``
            columns holding the ground-truth ratings.
        K: Number of top-ranked items to evaluate.
        threshold: Minimum rating counted as relevant. ``None`` (the default)
            uses the module-level ``THRESHOLD``.

    Returns:
        ``None`` when the user has no rows in ``df``; ``0.0`` when none of
        the user's rated items is relevant; otherwise DCG@K / IDCG@K.
    """
    if threshold is None:
        threshold = THRESHOLD

    user_df = df[df.user_id == (u + 1)]
    if user_df.empty:
        return None

    # Map 0-based item index -> binary relevance. Built column-wise rather
    # than with iterrows(); a repeated book keeps its last rating, as before.
    item_idx = user_df.book_id.to_numpy().astype(int) - 1
    relevant = (user_df.rating.to_numpy() >= threshold).astype(int)
    rels = dict(zip(item_idx.tolist(), relevant.tolist()))

    # Indices of the K highest scores in the user's row. Items the user has
    # no rating for contribute relevance 0.
    topk = np.argsort(matrix[u])[::-1][:K]
    dcg = sum((2 ** rels.get(int(item), 0) - 1) / np.log2(rank + 1)
              for rank, item in enumerate(topk, start=1))

    # Ideal ordering: the user's relevance labels sorted best-first.
    ideal = sorted(rels.values(), reverse=True)[:K]
    idcg = sum((2 ** rel - 1) / np.log2(rank + 1)
               for rank, rel in enumerate(ideal, start=1))

    return dcg / idcg if idcg > 0 else 0.0

def get_ndcg(matrix, df, K=5):
    """Return the mean nDCG@K over all users that have rows in ``df``.

    Args:
        matrix: 2-D score array indexed as ``matrix[user, item]``; one row
            per zero-based user index.
        df: DataFrame with ``user_id`` (1-based), ``book_id`` and ``rating``
            columns.
        K: Ranking cutoff forwarded to ``get_user_ndcg``.

    Returns:
        The mean of the per-user nDCG@K values, or ``0.0`` when no user of
        ``matrix`` has any row in ``df``.
    """
    # Partition df once instead of rescanning the whole frame for every
    # user; each user's slice is then handed to get_user_ndcg.
    rows_by_user = {uid: rows for uid, rows in df.groupby('user_id')}

    vals = []
    for u in range(matrix.shape[0]):
        user_rows = rows_by_user.get(u + 1)
        if user_rows is None:
            # User has no ratings in df: excluded from the average.
            continue
        score = get_user_ndcg(u, matrix, user_rows, K)
        if score is not None:
            vals.append(score)
    return np.mean(vals) if vals else 0.0

# Mean nDCG@K_NDCG over the users that have ratings in the test split.
ndcg5 = get_ndcg(pred_matrix, test_df, K=K_NDCG)

# Final report: one line per metric, four decimals each.
report = (
    f"\nMLP CF results:",
    f"  RMSE      = {rmse:.4f}",
    f"  MAE       = {mae:.4f}",
    f"  Precision = {prec:.4f}",
    f"  Recall    = {rec:.4f}",
    f"  F1        = {f1:.4f}",
    f"  nDCG@{K_NDCG}= {ndcg5:.4f}",
)
for report_line in report:
    print(report_line)
