BeMF, Bernoulli Matrix Factorization

In [None]:
import os, numpy as np, pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the train and test tables from CSV files; the paths are relative
# to the current working directory.
train_df = pd.read_csv(filepath_or_buffer='./data/processed/train.csv')
test_df = pd.read_csv(filepath_or_buffer='./data/processed/test.csv')

### Inicialización del modelo

In [None]:
import random

# Hyperparameters
NUM_FACTORS = 7         # number of columns in every latent-factor matrix
LEARNING_RATE = 0.001   # step size applied to each gradient update
REGULARIZATION = 0.1    # weight of the penalty term added to each gradient

# Matrix dimensions: the largest id found in either split.
# NOTE(review): presumably ids are 1-based and contiguous, so that the
# largest id equals the number of rows needed — confirm against the data.
NUM_USERS = int(max(df.user_id.max() for df in (train_df, test_df)))
NUM_ITEMS = int(max(df.book_id.max() for df in (train_df, test_df)))

# Dense NUM_USERS x NUM_ITEMS grid; None marks a missing rating.
# NOTE(review): nothing in the visible cells copies the ratings from
# train_df into this grid — verify it is populated before training.
ratings = [[None] * NUM_ITEMS for _ in range(NUM_USERS)]

#### Generamos U y V como arrays uniformes en [0,1)

In [None]:
# The discrete rating values; one pair of factor matrices is kept per value.
SCORES = list(range(1, 6))

In [None]:
# One latent-factor matrix per score value, filled with uniform samples
# from [0, 1). All user matrices are drawn first, then all item matrices,
# so the sequence of random draws is: U[1..5], V[1..5].
U = {score: np.random.rand(NUM_USERS, NUM_FACTORS) for score in SCORES}

V = {score: np.random.rand(NUM_ITEMS, NUM_FACTORS) for score in SCORES}

#### Cálculo de predicciones

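La función sigmoidea (logística) convierte el producto escalar en una probabilidad entre 0 y 1. Nota: en el código esta función se llama `logit`, aunque en rigor el logit es la inversa de la sigmoide.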

In [None]:
import math

def logit(x):
    """Return the logistic sigmoid of *x*, i.e. 1 / (1 + e^-x).

    Despite its name, this is the sigmoid (the inverse of the logit).
    The name is kept because other cells call it.

    The computation is split by sign so that ``math.exp`` is only ever
    called with a non-positive argument: a large positive argument would
    raise ``OverflowError``, whereas a large negative one simply
    underflows to 0.0.

    Args:
        x: Real-valued input (Python float or NumPy scalar).

    Returns:
        A float in the closed interval [0, 1].
    """
    if x >= 0:
        return 1.0 / (1.0 + math.exp(-x))
    # For negative x use the algebraically equivalent e^x / (1 + e^x).
    exp_x = math.exp(x)
    return exp_x / (1.0 + exp_x)

def compute_prediction(u, i):
    """Pick the most probable score for a (user, item) pair.

    For every value in SCORES, the dot product of the user's and the
    item's factor rows is passed through ``logit`` to obtain a
    probability; the score with the strictly largest probability wins
    (the earliest score wins ties).

    Args:
        u: Zero-based user row index.
        i: Zero-based item row index.

    Returns:
        A ``(predicted_score, probability)`` tuple. If no probability is
        greater than 0.0 the result is ``(None, 0.0)``.
    """
    top_score = None
    top_prob = 0.0
    for score in SCORES:
        # Probability that the rating equals `score`.
        prob = logit(np.dot(U[score][u], V[score][i]))
        if prob > top_prob:
            top_score, top_prob = score, prob
    return top_score, top_prob

#### Aprendizaje factores latentes

In [None]:
NUM_ITERATIONS = 10

# `ratings` is not modified during training, so gather the observed
# (user, item, rating) triples once instead of rescanning the whole
# NUM_USERS x NUM_ITEMS grid for every score in every iteration.
# Entries equal to None are treated as "not rated" and skipped.
observed_by_user = [
    (u, i, ratings[u][i])
    for u in range(NUM_USERS)
    for i in range(NUM_ITEMS)
    if ratings[u][i] is not None
]
# The same triples in item-major order (item first, then user), which is
# the visiting order used for the item-factor sweep.
observed_by_item = sorted(observed_by_user, key=lambda t: (t[1], t[0]))

for it in range(NUM_ITERATIONS):
    print(f"Iteración {it+1} de {NUM_ITERATIONS}")
    # Each score value has its own independent pair of factor matrices.
    for s in SCORES:
        P = U[s]
        Q = V[s]

        # User-factor sweep (user-major order); Q is read but not changed.
        for u, i, r_ui in observed_by_user:
            # Error = predicted probability that rating == s, minus the
            # 0/1 indicator of whether the observed rating really is s.
            e = logit(np.dot(P[u], Q[i])) - (1.0 if r_ui == s else 0.0)
            # Whole-row update; element-wise identical to updating each
            # factor separately because factors do not interact here.
            P[u] -= LEARNING_RATE * (e * Q[i] + REGULARIZATION * P[u])

        # Item-factor sweep (item-major order), using the updated P.
        for u, i, r_ui in observed_by_item:
            e = logit(np.dot(P[u], Q[i])) - (1.0 if r_ui == s else 0.0)
            Q[i] -= LEARNING_RATE * (e * P[u] + REGULARIZATION * Q[i])

        # P and Q were updated in place; store them back explicitly.
        U[s], V[s] = P, Q

#### Cálculo de recomendaciones

In [None]:
def get_recommendations(user_id, pred_matrix, train_df, N=5):
    """Return up to N not-yet-rated book ids for a user, best score first.

    Args:
        user_id: User id as stored in ``train_df.user_id``; row
            ``user_id - 1`` of ``pred_matrix`` is used for this user.
        pred_matrix: Indexable of per-user score arrays; column ``k``
            is treated as the score of book id ``k + 1``.
        train_df: DataFrame with ``user_id`` and ``book_id`` columns;
            books this user already has there are excluded.
        N: Maximum number of recommendations. Values <= 0 give ``[]``.

    Returns:
        A list of at most N book ids ordered by descending score.
    """
    row = user_id - 1
    # Books the user has already rated in the training data.
    seen = set(train_df.loc[train_df.user_id == user_id, 'book_id'])
    # Column indices sorted from highest to lowest score.
    ranked = np.argsort(pred_matrix[row])[::-1]

    recs = []
    for idx in ranked:
        # Check the limit before appending so that N <= 0 yields an
        # empty list instead of a single item.
        if len(recs) >= N:
            break
        book_id = idx + 1
        if book_id not in seen:
            recs.append(book_id)
    return recs