In [1]:
import pandas as pd
import numpy as np

# Load the rated (user, item, rating) rows used for training and the
# (ID, user, item) rows to be predicted. Paths are relative to the
# notebook's working directory.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

In [2]:
# Preview the first rows of the training ratings.
train.head()

Unnamed: 0,user,item,rating
0,1,25715,7.0
1,1,25716,10.0
2,5,25851,9.0
3,6,25923,5.0
4,7,25924,6.0


In [3]:
# Cold-start check: (number of test users that never appear in train,
# number of distinct test users).
len(set(test.user.unique()) - set(train.user.unique())), test.user.nunique()

(4349, 11426)

In [4]:
def generateSubmision(test_preds, name):
    """Write a submission CSV pairing the test IDs with ``test_preds``.

    Args:
        test_preds: One predicted rating per row of the global ``test`` frame,
            in the same row order.
        name: Suffix of the output file, written to
            ``./data/submission_<name>.csv`` without the index column.
    """
    # Start from the ID column of the global test frame, then attach predictions.
    frame = test[['ID']].copy()
    frame['rating'] = test_preds
    out_path = f'./data/submission_{name}.csv'
    frame.to_csv(out_path, index=False)

In [5]:
def custom_round(number: float, min_round = 0.3, max_ronund = 0.7) -> float:
    """Snap ``number`` to a neighbouring integer when it is close enough to one.

    Args:
        number: Value to round.
        min_round: Fractional parts strictly below this are rounded down.
        max_ronund: Fractional parts strictly above this are rounded up.
            (The misspelled name is kept so existing keyword calls still work.)

    Returns:
        The lower integer, the upper integer, or ``number`` unchanged when its
        fractional part lies within ``[min_round, max_ronund]``.
    """
    import math

    # floor (not int-truncation) keeps the fractional part in [0, 1) for
    # negative inputs as well.
    integer_part = math.floor(number)
    decimal_part = number - integer_part

    # Use the caller-supplied thresholds; they used to be ignored in favour of
    # hard-coded 0.3 / 0.7.
    if decimal_part < min_round:
        return float(integer_part)  # Round down
    elif decimal_part > max_ronund:
        return float(integer_part + 1)  # Round up
    else:
        return number

In [6]:
# Summary statistics of the training columns (id ranges, rating distribution).
train.describe()

Unnamed: 0,user,item,rating
count,390351.0,390351.0,390351.0
mean,35910.676834,68610.585878,7.604666
std,23425.777567,49826.877193,1.842793
min,1.0,1.0,1.0
25%,13944.0,30035.0,7.0
50%,35080.0,52295.0,8.0
75%,55912.5,104051.0,9.0
max,77804.0,185972.0,10.0


In [7]:
# Summary statistics of the test columns, for comparison with the train ranges.
test.describe()

Unnamed: 0,ID,user,item
count,43320.0,43320.0,43320.0
mean,21659.5,5335.888989,9626.943721
std,12505.551167,3376.304807,7673.878995
min,0.0,0.0,0.0
25%,10829.75,2283.75,2470.0
50%,21659.5,4948.5,8049.5
75%,32489.25,8198.0,15830.0
max,43319.0,11425.0,25696.0


In [8]:
# Baseline: predict the global mean training rating for every test row.
baseline_rating = train['rating'].mean()
submition = pd.DataFrame()
submition['rating'] = np.full(test.shape[0], baseline_rating)
submition['ID'] = test['ID']
submition.to_csv('./data/submission.csv', index=False)

In [9]:
# Start a fresh submission frame holding only the test row IDs; the 'rating'
# column is filled in by the following cell.
submition = pd.DataFrame()
submition['ID'] = test['ID']

In [None]:
mean_train = train.rating.mean()

# Per-user mean rating, computed once. The previous version filtered the whole
# training frame twice for every test row.
user_mean_rating = train.groupby('user')['rating'].mean()


def predictByMeanUser(x):
    """Return the mean training rating of user ``x``.

    Falls back to the global training mean (``mean_train``) when the user has
    no rows in ``train``.
    """
    return user_mean_rating.get(x, mean_train)


submition['rating'] = test.user.apply(predictByMeanUser)

In [11]:
# NOTE: this writes to the same path as the global-mean baseline cell above,
# so that earlier file is overwritten.
submition.to_csv('./data/submission.csv', index=False)

In [None]:
submition = pd.DataFrame()
submition['ID'] = test['ID']

mean_train = train.rating.mean()

# Per-item mean rating, computed once. The previous version filtered the whole
# training frame twice for every test row.
item_mean_rating = train.groupby('item')['rating'].mean()


def predictByMeanBook(x):
    """Return the mean training rating of item ``x``.

    Falls back to the global training mean (``mean_train``) when the item has
    no rows in ``train``.
    """
    return item_mean_rating.get(x, mean_train)


submition['rating'] = test['item'].apply(predictByMeanBook)

submition.to_csv('./data/submission_meanbook.csv', index=False)

In [33]:
submition = pd.DataFrame()
submition['ID'] = test['ID']

mean_train = train.rating.mean()

# Lookup tables computed once instead of re-filtering `train` for every row.
pondered_item_means = train.groupby('item')['rating'].mean()
pondered_user_means = train.groupby('user')['rating'].mean()


def predictPonderedMean(x, user=None):
    """Blend the item mean, the user mean and the global mean (0.4 / 0.4 / 0.2).

    Args:
        x: Item id.
        user: User id. When omitted, ``x`` is also used as the user id, which
            is what the original one-argument version did; pass it explicitly
            to get a meaningful blend.

    Returns:
        The weighted mean. Any component missing from ``train`` is replaced by
        the global training mean.
    """
    if user is None:
        user = x
    book_mean = pondered_item_means.get(x, mean_train)
    user_mean = pondered_user_means.get(user, mean_train)
    return 0.4 * book_mean + 0.4 * user_mean + 0.2 * mean_train


# Use the blended predictor with the row's own item AND user ids. The previous
# code applied predictByMeanBook here, so the blend was never written.
submition['rating'] = [
    predictPonderedMean(item_id, user_id)
    for item_id, user_id in zip(test['item'], test['user'])
]

submition.to_csv('./data/submission_pondered_mean.csv', index=False)

## Matrix Factorization (1.509)

In [5]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
class MatrixFactorization:
    """Matrix factorization trained with per-rating SGD.

    A rating is modelled as ``global_mean + user_factors[u] . item_factors[i]``.
    Users or items that were not present in the frame passed to ``fit`` are
    predicted with the global mean rating.

    Args:
        n_factors: Number of latent dimensions per user and per item.
        learning_rate: SGD step size.
        regularization: L2 penalty applied to both factor vectors.
        n_epochs: Number of full passes over the training ratings.
        verbose: When truthy, print the training RMSE every 10 epochs.
    """

    def __init__(self, n_factors=20, learning_rate=0.01, regularization=0.1, n_epochs=100, verbose = 1):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.n_epochs = n_epochs
        # Previously accepted but silently dropped.
        self.verbose = verbose

    def fit(self, ratings_df):
        """Learn the latent factors from a frame with 'user', 'item', 'rating' columns."""
        # Unique users and items, and their positions in the factor matrices.
        self.users = ratings_df['user'].unique()
        self.items = ratings_df['item'].unique()
        self.user_to_idx = {user: i for i, user in enumerate(self.users)}
        self.item_to_idx = {item: i for i, item in enumerate(self.items)}

        # Small random initialisation of the latent factor matrices.
        self.user_factors = np.random.normal(0, 0.1, (len(self.users), self.n_factors))
        self.item_factors = np.random.normal(0, 0.1, (len(self.items), self.n_factors))

        # Global mean rating, used as the baseline prediction.
        self.global_mean = ratings_df['rating'].mean()

        # Plain arrays are far cheaper to iterate than DataFrame.iterrows().
        users = ratings_df['user'].values
        items = ratings_df['item'].values
        ratings = ratings_df['rating'].values
        lr = self.learning_rate
        reg = self.regularization

        for epoch in range(self.n_epochs):
            for user, item, rating in zip(users, items, ratings):
                user_idx = self.user_to_idx.get(user)
                item_idx = self.item_to_idx.get(item)

                # Defensive: ids that cannot be looked up (e.g. NaN) are skipped.
                if user_idx is None or item_idx is None:
                    continue

                pred = self.global_mean + np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
                error = rating - pred

                # Both gradients must be taken at the pre-update values, so keep
                # a copy of the user vector before it is modified.
                old_user = self.user_factors[user_idx].copy()
                old_item = self.item_factors[item_idx]
                self.user_factors[user_idx] += lr * (error * old_item - reg * old_user)
                self.item_factors[item_idx] += lr * (error * old_user - reg * old_item)

            # Optional progress report on the training set.
            if self.verbose and (epoch + 1) % 10 == 0:
                train_preds = self.predict(ratings_df)
                rmse = np.sqrt(mean_squared_error(ratings_df['rating'], train_preds))
                print(f"Epoch {epoch+1}/{self.n_epochs} - RMSE: {rmse:.4f}")

    def predict(self, ratings_df):
        """Return a list with one predicted rating per row of ``ratings_df``.

        Only the 'user' and 'item' columns are read. Pairs with an unseen user
        or item get the global mean; all other predictions are clipped to
        [1, 10] (assuming ratings between 1 and 10).
        """
        predictions = []

        for user, item in zip(ratings_df['user'].values, ratings_df['item'].values):
            user_idx = self.user_to_idx.get(user)
            item_idx = self.item_to_idx.get(item)

            if user_idx is None or item_idx is None:
                # New user or new item: fall back to the global mean.
                predictions.append(self.global_mean)
            else:
                pred = self.global_mean + np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
                # Keep the prediction inside the rating range.
                pred = max(min(pred, 10.0), 1.0)
                predictions.append(pred)

        return predictions

In [14]:
# Crear y entrenar el modelo
# Train the basic factorization for 10 epochs only (the class default is 100).
model = MatrixFactorization(n_factors=20, learning_rate=0.01, regularization=0.1, n_epochs=10)
model.fit(train)

Epoch 10/10 - RMSE: 1.6952


In [None]:
# Realizar predicciones en el conjunto de prueba
# Predict the test pairs and write ./data/submission_matrix_factorization.csv.
test_predictions = model.predict(test)
generateSubmision(test_predictions, 'matrix_factorization')

## Matrix Factorization 2 (1.292)

In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle
import time

class MatrixFactorization:
    """Biased matrix factorization trained with SGD.

    A rating is modelled as
    ``global_mean + user_bias[u] + item_bias[i] + user_factors[u] . item_factors[i]``.

    Args:
        n_factors: Number of latent dimensions per user and per item.
        learning_rate: SGD step size.
        regularization: L2 penalty applied to biases and factor vectors.
        n_epochs: Maximum number of passes over the training ratings.
    """

    def __init__(self, n_factors=20, learning_rate=0.01, regularization=0.1, n_epochs=100):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.n_epochs = n_epochs
        # Everything below is populated by fit().
        self.user_factors = None
        self.item_factors = None
        self.global_mean = None
        self.user_to_idx = None
        self.item_to_idx = None
        self.users = None
        self.items = None
        self.user_biases = None
        self.item_biases = None

    def create_mappings(self, ratings_df):
        """Build the user-id -> row and item-id -> row lookup tables."""
        self.users = ratings_df['user'].unique()
        self.items = ratings_df['item'].unique()

        self.user_to_idx = {user: i for i, user in enumerate(self.users)}
        self.item_to_idx = {item: i for i, item in enumerate(self.items)}

    def initialize_factors(self):
        """Randomly initialise the latent factors and zero the biases."""
        self.user_factors = np.random.normal(0, 0.1, (len(self.users), self.n_factors))
        self.item_factors = np.random.normal(0, 0.1, (len(self.items), self.n_factors))
        self.user_biases = np.zeros(len(self.users))
        self.item_biases = np.zeros(len(self.items))

    def train_epoch(self, ratings_df):
        """Run one SGD pass over the ratings, in the frame's row order."""
        # Plain arrays for cheap element access inside the loop.
        users = ratings_df['user'].values
        items = ratings_df['item'].values
        ratings = ratings_df['rating'].values

        for i in range(len(ratings)):
            user, item, rating = users[i], items[i], ratings[i]

            # Skip ids that are not part of the mappings.
            if user not in self.user_to_idx or item not in self.item_to_idx:
                continue

            user_idx = self.user_to_idx[user]
            item_idx = self.item_to_idx[item]

            # Current prediction and its error.
            pred = self.global_mean + self.user_biases[user_idx] + self.item_biases[item_idx] + \
                   np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
            error = rating - pred

            # Bias updates.
            self.user_biases[user_idx] += self.learning_rate * (error - self.regularization * self.user_biases[user_idx])
            self.item_biases[item_idx] += self.learning_rate * (error - self.regularization * self.item_biases[item_idx])

            # Factor updates use copies so both gradients see pre-update values.
            user_factor = self.user_factors[user_idx].copy()
            item_factor = self.item_factors[item_idx].copy()

            self.user_factors[user_idx] += self.learning_rate * (error * item_factor - self.regularization * user_factor)
            self.item_factors[item_idx] += self.learning_rate * (error * user_factor - self.regularization * item_factor)

    def _snapshot(self):
        """Return copies of all learned parameters."""
        return (
            self.user_factors.copy(),
            self.item_factors.copy(),
            self.user_biases.copy(),
            self.item_biases.copy(),
        )

    def _restore(self, snapshot):
        """Replace the learned parameters with a previously taken snapshot."""
        (self.user_factors, self.item_factors,
         self.user_biases, self.item_biases) = snapshot

    def fit(self, ratings_df, val_df=None, patience=20, verbose=True):
        """Train the model, optionally with early stopping.

        Args:
            ratings_df: Training frame with 'user', 'item', 'rating' columns.
            val_df: Optional validation frame with the same columns. When
                given, validation RMSE is computed after every epoch and the
                parameters of the best epoch are kept.
            patience: Number of consecutive non-improving epochs after which
                training stops. Only used when ``val_df`` is given.
            verbose: Print progress every 5 epochs and the total time.

        Returns:
            self
        """
        start_time = time.time()

        # Mappings, global mean rating and initial parameters.
        self.create_mappings(ratings_df)
        self.global_mean = ratings_df['rating'].mean()
        self.initialize_factors()

        if val_df is not None:
            best_val_rmse = float('inf')
            best_snapshot = None
            patience_counter = 0

            for epoch in range(self.n_epochs):
                self.train_epoch(ratings_df)

                # Evaluate on the validation set.
                val_preds = self.predict(val_df)
                val_rmse = np.sqrt(mean_squared_error(val_df['rating'], val_preds))

                if verbose and (epoch + 1) % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f"Época {epoch+1}/{self.n_epochs} - Val RMSE: {val_rmse:.4f} - Tiempo: {elapsed:.2f}s")

                if val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    patience_counter = 0
                    best_snapshot = self._snapshot()
                else:
                    patience_counter += 1

                if patience_counter >= patience:
                    if verbose:
                        print(f"Early stopping en la época {epoch+1} con {patience}")
                    break

            # Keep the best epoch whether the loop stopped early or simply ran
            # out of epochs. Previously the best weights were only restored on
            # early stop, so a run that used all epochs kept the last ones.
            if best_snapshot is not None:
                self._restore(best_snapshot)
        else:
            # No validation data: train for a fixed number of epochs.
            for epoch in range(self.n_epochs):
                self.train_epoch(ratings_df)

                if verbose and (epoch + 1) % 5 == 0:
                    elapsed = time.time() - start_time
                    train_preds = self.predict(ratings_df)
                    train_rmse = np.sqrt(mean_squared_error(ratings_df['rating'], train_preds))
                    print(f"Época {epoch+1}/{self.n_epochs} - Train RMSE: {train_rmse:.4f} - Tiempo: {elapsed:.2f}s")

        total_time = time.time() - start_time
        if verbose:
            print(f"Entrenamiento completado en {total_time:.2f} segundos")

        return self

    def predict_one(self, user, item):
        """Predict the rating of one (user, item) pair, handling cold start.

        Known user and item: full model. Only one of them known: global mean
        plus that side's bias. Neither known: global mean. All but the last
        case are clipped to [1, 10].
        """
        # Case 1: both user and item are known.
        if user in self.user_to_idx and item in self.item_to_idx:
            user_idx = self.user_to_idx[user]
            item_idx = self.item_to_idx[item]
            pred = self.global_mean + self.user_biases[user_idx] + self.item_biases[item_idx] + \
                   np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
            return max(min(pred, 10.0), 1.0)

        # Case 2: only the user is known (new item).
        elif user in self.user_to_idx:
            user_idx = self.user_to_idx[user]
            return max(min(self.global_mean + self.user_biases[user_idx], 10.0), 1.0)

        # Case 3: only the item is known (new user).
        elif item in self.item_to_idx:
            item_idx = self.item_to_idx[item]
            return max(min(self.global_mean + self.item_biases[item_idx], 10.0), 1.0)

        # Case 4: neither is known.
        else:
            return self.global_mean

    def predict(self, ratings_df):
        """Return a list with one prediction per row of ``ratings_df``.

        Only the 'user' and 'item' columns are read.
        """
        # Column arrays instead of DataFrame.iterrows(): same values, much cheaper.
        return [
            self.predict_one(user, item)
            for user, item in zip(ratings_df['user'].values, ratings_df['item'].values)
        ]

def guardar_modelo(modelo, filename):
    """Pickle ``modelo`` into the file at ``filename`` (opened in binary write mode)."""
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(modelo)

def cargar_modelo(filename):
    """Unpickle and return the object stored in the file at ``filename``.

    NOTE: unpickling can execute arbitrary code; only load files you created.
    """
    with open(filename, 'rb') as source:
        return pickle.Unpickler(source).load()


In [22]:
# Crear y entrenar el modelo
# Train the biased factorization for the full 100 epochs. With val_df=None the
# early-stopping branch is not taken, so `patience` has no effect here and the
# RMSE printed below is measured on the training data itself.
modelo = MatrixFactorization(n_factors=20, learning_rate=0.01, regularization=0.1, n_epochs=100)
modelo.fit(train, val_df=None, patience=20)

Época 5/100 - Train RMSE: 1.5049 - Tiempo: 22.47s
Época 10/100 - Train RMSE: 1.3694 - Tiempo: 77.91s
Época 15/100 - Train RMSE: 1.2197 - Tiempo: 203.93s
Época 20/100 - Train RMSE: 1.0639 - Tiempo: 363.62s
Época 25/100 - Train RMSE: 0.9207 - Tiempo: 522.07s
Época 30/100 - Train RMSE: 0.7989 - Tiempo: 586.39s
Época 35/100 - Train RMSE: 0.6992 - Tiempo: 616.81s
Época 40/100 - Train RMSE: 0.6190 - Tiempo: 645.52s
Época 45/100 - Train RMSE: 0.5553 - Tiempo: 674.20s
Época 50/100 - Train RMSE: 0.5048 - Tiempo: 704.37s
Época 55/100 - Train RMSE: 0.4648 - Tiempo: 734.62s
Época 60/100 - Train RMSE: 0.4330 - Tiempo: 762.96s
Época 65/100 - Train RMSE: 0.4077 - Tiempo: 803.15s
Época 70/100 - Train RMSE: 0.3875 - Tiempo: 872.24s
Época 75/100 - Train RMSE: 0.3713 - Tiempo: 903.73s
Época 80/100 - Train RMSE: 0.3582 - Tiempo: 935.40s
Época 85/100 - Train RMSE: 0.3475 - Tiempo: 967.58s
Época 90/100 - Train RMSE: 0.3387 - Tiempo: 999.83s
Época 95/100 - Train RMSE: 0.3314 - Tiempo: 1031.54s
Época 100/100 

<__main__.MatrixFactorization at 0x2eb2e7fd1d0>

In [24]:
# Predict the test pairs and write ./data/submission_mf2.csv.
test_preds = modelo.predict(test)
generateSubmision(test_preds, 'mf2')

In [25]:
# Persist the trained model with pickle so it can be reloaded via cargar_modelo.
guardar_modelo(modelo, 'matrixFact.pkl')
print("Modelo guardado correctamente.")

Modelo guardado correctamente.


In [None]:
# Snap predictions that are close to an integer onto it (custom_round defaults)
# and write them as a separate submission, ./data/submission_mf2_rounded.csv.
test_preds2 = [custom_round(x) for x in test_preds]
generateSubmision(test_preds2, 'mf2_rounded')

In [62]:
# Side-by-side view of the predictions and the simple mean baselines.
global_mean = train.rating.mean()
test_study = test.copy()
test_study['rating'] = test_preds2          # rounded predictions
test_study['decimal_rating'] = test_preds   # raw model predictions
test_study['prediction_mean'] = test_study.rating.mean()
test_study['real_mean'] = global_mean
test_study['item_mean'] = test_study.item.apply(predictByMeanBook)
# The user mean must be looked up by user id; it was previously fed the item column.
test_study['user_mean'] = test_study.user.apply(predictByMeanUser)



## MF con matriz dispersa (1.292)

In [26]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle
import time

class MatrixFactorization:
    """Biased matrix factorization trained with SGD over a sparse rating matrix.

    A rating is modelled as
    ``global_mean + user_bias[u] + item_bias[i] + user_factors[u] . item_factors[i]``.

    Args:
        n_factors: Number of latent dimensions per user and per item.
        learning_rate: SGD step size.
        regularization: L2 penalty applied to biases and factor vectors.
        n_epochs: Maximum number of passes over the training ratings.
    """

    def __init__(self, n_factors=20, learning_rate=0.01, regularization=0.1, n_epochs=100):
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.n_epochs = n_epochs
        # Everything below is populated by fit().
        self.user_factors = None
        self.item_factors = None
        self.global_mean = None
        self.user_to_idx = None
        self.item_to_idx = None
        self.idx_to_user = None
        self.idx_to_item = None
        self.users = None
        self.items = None
        self.user_biases = None
        self.item_biases = None

    def create_mappings(self, ratings_df):
        """Creates mappings from users and items to indices (and back)."""
        self.users = ratings_df['user'].unique()
        self.items = ratings_df['item'].unique()

        self.user_to_idx = {user: i for i, user in enumerate(self.users)}
        self.item_to_idx = {item: i for i, item in enumerate(self.items)}
        self.idx_to_user = {i: user for i, user in enumerate(self.users)}
        self.idx_to_item = {i: item for i, item in enumerate(self.items)}

    def create_matrix(self, df):
        """Creates a (n_users x n_items) CSR matrix of ratings from the DataFrame.

        Rows whose user or item is not in the mappings are dropped.
        NOTE: csr_matrix sums the values of repeated (user, item) pairs, and a
        stored rating of 0 is indistinguishable from "no rating".
        """
        valid_df = df[df['user'].isin(self.users) & df['item'].isin(self.items)]

        rows = [self.user_to_idx[user] for user in valid_df['user']]
        cols = [self.item_to_idx[item] for item in valid_df['item']]
        ratings = valid_df['rating'].values

        return csr_matrix((ratings, (rows, cols)), shape=(len(self.users), len(self.items)))

    def initialize_factors(self):
        """Initializes latent factor matrices and biases."""
        self.user_factors = np.random.normal(0, 0.1, (len(self.users), self.n_factors))
        self.item_factors = np.random.normal(0, 0.1, (len(self.items), self.n_factors))
        self.user_biases = np.zeros(len(self.users))
        self.item_biases = np.zeros(len(self.items))

    def train_epoch(self, ratings_matrix):
        """Runs one SGD pass over the non-zero entries, in random order."""
        # Pull (row, col, value) triplets out once. Indexing the sparse matrix
        # element by element inside the loop is very slow.
        coo = ratings_matrix.tocoo()
        stored = coo.data != 0  # same entries that .nonzero() reports
        users_idx = coo.row[stored]
        items_idx = coo.col[stored]
        ratings = coo.data[stored]

        # Shuffle so the updates are visited in a different order each epoch.
        permutation = np.random.permutation(len(users_idx))
        users_idx = users_idx[permutation]
        items_idx = items_idx[permutation]
        ratings = ratings[permutation]

        for user_idx, item_idx, rating in zip(users_idx, items_idx, ratings):
            # Current prediction and its error.
            pred = self.global_mean + self.user_biases[user_idx] + self.item_biases[item_idx] + \
                   np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
            error = rating - pred

            # Update biases.
            self.user_biases[user_idx] += self.learning_rate * (error - self.regularization * self.user_biases[user_idx])
            self.item_biases[item_idx] += self.learning_rate * (error - self.regularization * self.item_biases[item_idx])

            # Update factors from copies so both gradients see pre-update values.
            user_factor = self.user_factors[user_idx].copy()
            item_factor = self.item_factors[item_idx].copy()

            self.user_factors[user_idx] += self.learning_rate * (error * item_factor - self.regularization * user_factor)
            self.item_factors[item_idx] += self.learning_rate * (error * user_factor - self.regularization * item_factor)

    def _snapshot(self):
        """Returns copies of all learned parameters."""
        return (
            self.user_factors.copy(),
            self.item_factors.copy(),
            self.user_biases.copy(),
            self.item_biases.copy(),
        )

    def _restore(self, snapshot):
        """Replaces the learned parameters with a previously taken snapshot."""
        (self.user_factors, self.item_factors,
         self.user_biases, self.item_biases) = snapshot

    def fit(self, ratings_df, val_df=None, patience=20, verbose=True):
        """Trains the model, optionally with early stopping.

        Args:
            ratings_df: Training frame with 'user', 'item', 'rating' columns.
            val_df: Optional validation frame with the same columns. When
                given, validation RMSE is computed after every epoch and the
                parameters of the best epoch are kept.
            patience: Number of consecutive non-improving epochs after which
                training stops. Only used when ``val_df`` is given.
            verbose: Print progress every 5 epochs and the total time.

        Returns:
            self
        """
        start_time = time.time()

        # Create mappings and calculate global mean rating.
        self.create_mappings(ratings_df)
        self.global_mean = ratings_df['rating'].mean()

        # Sparse training matrix, built once and reused by every epoch.
        train_matrix = self.create_matrix(ratings_df)

        # Initialize latent factors.
        self.initialize_factors()

        if val_df is not None:
            # Validation is scored through predict(val_df); no sparse
            # validation matrix is needed.
            best_val_rmse = float('inf')
            best_snapshot = None
            patience_counter = 0

            for epoch in range(self.n_epochs):
                self.train_epoch(train_matrix)

                # Evaluate on validation set.
                val_preds = self.predict(val_df)
                val_rmse = np.sqrt(mean_squared_error(val_df['rating'], val_preds))

                if verbose and (epoch + 1) % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f"Epoch {epoch+1}/{self.n_epochs} - Val RMSE: {val_rmse:.4f} - Time: {elapsed:.2f}s")

                if val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    patience_counter = 0
                    best_snapshot = self._snapshot()
                else:
                    patience_counter += 1

                if patience_counter >= patience:
                    if verbose:
                        print(f"Early stopping at epoch {epoch+1} with patience {patience}")
                    break

            # Keep the best epoch whether the loop stopped early or simply ran
            # out of epochs. Previously the best weights were only restored on
            # early stop.
            if best_snapshot is not None:
                self._restore(best_snapshot)
        else:
            # No early stopping, train for a fixed number of epochs.
            for epoch in range(self.n_epochs):
                self.train_epoch(train_matrix)

                if verbose and (epoch + 1) % 5 == 0:
                    elapsed = time.time() - start_time
                    train_preds = self.predict(ratings_df)
                    train_rmse = np.sqrt(mean_squared_error(ratings_df['rating'], train_preds))
                    print(f"Epoch {epoch+1}/{self.n_epochs} - Train RMSE: {train_rmse:.4f} - Time: {elapsed:.2f}s")

        total_time = time.time() - start_time
        if verbose:
            print(f"Training completed in {total_time:.2f} seconds")

        return self

    def predict_one(self, user, item):
        """Predicts the rating for one (user, item) pair, handling cold start.

        Known user and item: full model. Only one of them known: global mean
        plus that side's bias. Neither known: global mean. All but the last
        case are clipped to [1, 10].
        """
        # Case 1: both user and item exist in our mappings.
        if user in self.user_to_idx and item in self.item_to_idx:
            user_idx = self.user_to_idx[user]
            item_idx = self.item_to_idx[item]
            pred = self.global_mean + self.user_biases[user_idx] + self.item_biases[item_idx] + \
                   np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
            return max(min(pred, 10.0), 1.0)

        # Case 2: only the user exists (new item).
        elif user in self.user_to_idx:
            user_idx = self.user_to_idx[user]
            return max(min(self.global_mean + self.user_biases[user_idx], 10.0), 1.0)

        # Case 3: only the item exists (new user).
        elif item in self.item_to_idx:
            item_idx = self.item_to_idx[item]
            return max(min(self.global_mean + self.item_biases[item_idx], 10.0), 1.0)

        # Case 4: neither user nor item exists.
        else:
            return self.global_mean

    def predict(self, ratings_df):
        """Predicts ratings for a DataFrame of user-item pairs; returns a list.

        Only the 'user' and 'item' columns are read.
        """
        # Column arrays instead of DataFrame.iterrows(): same values, much cheaper.
        return [
            self.predict_one(user, item)
            for user, item in zip(ratings_df['user'].values, ratings_df['item'].values)
        ]
    

In [27]:
# Train the sparse-matrix variant with a smaller step size and weaker
# regularization. No validation frame is passed (val_df=None), so the
# early-stopping branch is skipped and `patience=7` has no effect.
mf = MatrixFactorization(n_factors=20, learning_rate=0.005, regularization=0.02, n_epochs=100)
mf.fit(train, None, patience=7)

Epoch 5/100 - Train RMSE: 1.5833 - Time: 54.60s
Epoch 10/100 - Train RMSE: 1.4771 - Time: 265.11s
Epoch 15/100 - Train RMSE: 1.3819 - Time: 465.32s
Epoch 20/100 - Train RMSE: 1.2842 - Time: 526.58s
Epoch 25/100 - Train RMSE: 1.1823 - Time: 587.76s
Epoch 30/100 - Train RMSE: 1.0814 - Time: 650.20s
Epoch 35/100 - Train RMSE: 0.9850 - Time: 789.41s
Epoch 40/100 - Train RMSE: 0.8957 - Time: 850.93s
Epoch 45/100 - Train RMSE: 0.8143 - Time: 914.80s
Epoch 50/100 - Train RMSE: 0.7408 - Time: 977.81s
Epoch 55/100 - Train RMSE: 0.6748 - Time: 1040.55s
Epoch 60/100 - Train RMSE: 0.6157 - Time: 1101.84s
Epoch 65/100 - Train RMSE: 0.5628 - Time: 1163.19s
Epoch 70/100 - Train RMSE: 0.5155 - Time: 1224.93s
Epoch 75/100 - Train RMSE: 0.4732 - Time: 1286.81s
Epoch 80/100 - Train RMSE: 0.4355 - Time: 1348.52s
Epoch 85/100 - Train RMSE: 0.4017 - Time: 1409.62s
Epoch 90/100 - Train RMSE: 0.3714 - Time: 1472.69s
Epoch 95/100 - Train RMSE: 0.3442 - Time: 1535.80s
Epoch 100/100 - Train RMSE: 0.3200 - Time: 

<__main__.MatrixFactorization at 0x2eb2e0d8950>

In [30]:
# Evaluate the model
predictions = mf.predict(test)
generateSubmision(predictions, 'mf_sparse')

In [15]:
# guardar_modelo(modelo, 'sparseMAtrixFact.pkl')
# print("Modelo guardado correctamente.")

## PMF (1.2562 - rounded -1.246)

PMF (Probabilistic Matrix Factorization) descompone la matriz de calificaciones usuario-artículo en dos matrices de menor dimensión, representando los factores latentes de usuarios y artículos. A diferencia de la factorización matricial estándar, PMF utiliza un enfoque probabilístico, donde los factores latentes se asumen provenientes de distribuciones gaussianas. Las calificaciones observadas se modelan como generadas a partir de estos factores latentes más ruido gaussiano. El algoritmo usa estimación de Máxima A Posteriori (MAP), incorporando creencias previas sobre la distribución de factores. Esto permite al modelo manejar la incertidumbre en los datos, evitar sobreajustes y tratar los valores faltantes de manera natural, siendo especialmente útil para sistemas de recomendación con datos dispersos.

In [6]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pickle
import time

class ProbabilisticMatrixFactorization:
    """Probabilistic Matrix Factorization trained by SGD on a MAP objective.

    A rating is modelled as
    ``global_mean + user_bias + item_bias + user_factors[u] . item_factors[i]``
    with Gaussian observation noise (``rating_var``) and zero-mean Gaussian
    priors on the latent factors (``user_prior_var`` / ``item_prior_var``).
    Predictions that involve a known user or item are clipped to [1, 10].
    """

    def __init__(self, n_factors=20, learning_rate=0.005, user_regularization=0.1, item_regularization=0.1, n_epochs=100):
        """Store hyper-parameters; all learned state is created by ``fit``.

        Args:
            n_factors: Dimension of the latent user/item vectors.
            learning_rate: SGD step size.
            user_regularization: Regularization weight for user parameters.
            item_regularization: Regularization weight for item parameters.
            n_epochs: Maximum number of passes over the training ratings.
        """
        self.n_factors = n_factors
        self.learning_rate = learning_rate
        self.user_regularization = user_regularization
        self.item_regularization = item_regularization
        self.n_epochs = n_epochs
        self.user_factors = None
        self.item_factors = None
        self.global_mean = None
        self.user_to_idx = None
        self.item_to_idx = None
        self.idx_to_user = None
        self.idx_to_item = None
        self.users = None
        self.items = None
        # PMF typically doesn't use biases, but we keep them for flexibility.
        # Setting both to None after initialization disables the bias terms.
        self.user_biases = None
        self.item_biases = None
        # PMF parameters
        self.rating_var = 0.1  # observation noise variance (1/precision)
        self.user_prior_var = 1.0  # prior variance for user factors
        self.item_prior_var = 1.0  # prior variance for item factors

    def create_mappings(self, ratings_df):
        """Build id <-> row/column index lookups from the 'user' and 'item' columns."""
        self.users = ratings_df['user'].unique()
        self.items = ratings_df['item'].unique()

        self.user_to_idx = {user: i for i, user in enumerate(self.users)}
        self.item_to_idx = {item: i for i, item in enumerate(self.items)}
        self.idx_to_user = {i: user for i, user in enumerate(self.users)}
        self.idx_to_item = {i: item for i, item in enumerate(self.items)}

    def create_matrix(self, df):
        """Build a (n_users x n_items) CSR rating matrix from ``df``.

        Rows whose user or item is not in the mappings are dropped. Repeated
        (user, item) pairs are averaged: ``csr_matrix`` would otherwise SUM
        duplicate coordinates and produce ratings outside the valid scale.
        """
        known = df[df['user'].isin(self.users) & df['item'].isin(self.items)]
        # One row per (user, item); a pair that appears once keeps its exact value.
        deduped = known.groupby(['user', 'item'], sort=False)['rating'].mean().reset_index()

        rows = [self.user_to_idx[user] for user in deduped['user']]
        cols = [self.item_to_idx[item] for item in deduped['item']]
        ratings = deduped['rating'].values

        return csr_matrix((ratings, (rows, cols)), shape=(len(self.users), len(self.items)))

    def initialize_factors(self):
        """Draw latent factors from N(0, prior_var / n_factors) and zero the biases."""
        self.user_factors = np.random.normal(0, np.sqrt(self.user_prior_var / self.n_factors),
                                             (len(self.users), self.n_factors))
        self.item_factors = np.random.normal(0, np.sqrt(self.item_prior_var / self.n_factors),
                                             (len(self.items), self.n_factors))
        # Biases start at zero (PMF traditionally doesn't use biases)
        self.user_biases = np.zeros(len(self.users))
        self.item_biases = np.zeros(len(self.items))

    @staticmethod
    def _observed_entries(ratings_matrix):
        """Return aligned (row, col, value) arrays for the non-zero stored entries.

        Reading the COO arrays once avoids a sparse scalar lookup per rating,
        while yielding entries in the same order as ``ratings_matrix.nonzero()``.
        """
        coo = ratings_matrix.tocoo()
        keep = coo.data != 0
        return coo.row[keep], coo.col[keep], coo.data[keep]

    @staticmethod
    def _rmse(actual, predicted):
        """Root mean squared error between two equally long 1-D sequences."""
        diff = np.asarray(actual, dtype=float) - np.asarray(predicted, dtype=float)
        return float(np.sqrt(np.mean(diff ** 2)))

    def _snapshot(self):
        """Return copies of the learned factors and biases."""
        return (
            self.user_factors.copy(),
            self.item_factors.copy(),
            self.user_biases.copy() if self.user_biases is not None else None,
            self.item_biases.copy() if self.item_biases is not None else None,
        )

    def _restore(self, state):
        """Replace the learned factors and biases with a ``_snapshot`` result."""
        self.user_factors, self.item_factors, self.user_biases, self.item_biases = state

    def train_epoch(self, ratings_matrix):
        """Run one shuffled SGD pass over the observed ratings (MAP estimation)."""
        users_idx, items_idx, ratings = self._observed_entries(ratings_matrix)

        # Shuffle so every epoch visits the ratings in a different order
        permutation = np.random.permutation(len(users_idx))
        users_idx = users_idx[permutation]
        items_idx = items_idx[permutation]
        ratings = ratings[permutation]

        # Scale regularization by precision (inverse prior variance)
        user_reg_scaled = self.user_regularization / self.user_prior_var
        item_reg_scaled = self.item_regularization / self.item_prior_var
        use_biases = self.user_biases is not None and self.item_biases is not None

        for user_idx, item_idx, rating in zip(users_idx, items_idx, ratings):
            pred = np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
            if use_biases:
                pred += self.global_mean + self.user_biases[user_idx] + self.item_biases[item_idx]

            error = rating - pred

            if use_biases:
                self.user_biases[user_idx] += self.learning_rate * (error - user_reg_scaled * self.user_biases[user_idx])
                self.item_biases[item_idx] += self.learning_rate * (error - item_reg_scaled * self.item_biases[item_idx])

            # Copy both vectors first so each update uses the other's pre-step value
            user_factor = self.user_factors[user_idx].copy()
            item_factor = self.item_factors[item_idx].copy()

            # Gradient of the log posterior: likelihood term scaled by 1/rating_var,
            # prior term scaled by the precision-weighted regularization.
            self.user_factors[user_idx] += self.learning_rate * ((error * item_factor) / self.rating_var - user_reg_scaled * user_factor)
            self.item_factors[item_idx] += self.learning_rate * ((error * user_factor) / self.rating_var - item_reg_scaled * item_factor)

    def fit(self, ratings_df, val_df=None, patience=20, verbose=True):
        """Train the model, optionally with early stopping on a validation frame.

        Args:
            ratings_df: Training frame with 'user', 'item' and 'rating' columns.
            val_df: Optional validation frame with the same columns. When given,
                validation RMSE is tracked every epoch and the weights with the
                lowest RMSE are the ones kept when training ends, whether it
                ends by early stopping or by exhausting ``n_epochs``.
            patience: Number of consecutive non-improving epochs tolerated
                before stopping. Ignored when ``val_df`` is None.
            verbose: Print progress every 5 epochs and a final timing line.

        Returns:
            self
        """
        start_time = time.time()

        self.create_mappings(ratings_df)
        self.global_mean = ratings_df['rating'].mean()
        train_matrix = self.create_matrix(ratings_df)
        self.initialize_factors()

        # Set the observation noise from the data variance. Keep the current
        # value when the variance is undefined or zero (single row, constant
        # ratings) because the updates divide by rating_var.
        data_var = ratings_df['rating'].std() ** 2
        if np.isfinite(data_var) and data_var > 0:
            self.rating_var = data_var

        if val_df is not None:
            best_val_rmse = float('inf')
            best_state = None
            patience_counter = 0

            for epoch in range(self.n_epochs):
                self.train_epoch(train_matrix)

                val_rmse = self._rmse(val_df['rating'], self.predict(val_df))

                if verbose and (epoch + 1) % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f"Epoch {epoch+1}/{self.n_epochs} - Val RMSE: {val_rmse:.4f} - Time: {elapsed:.2f}s")

                if val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    patience_counter = 0
                    best_state = self._snapshot()
                else:
                    patience_counter += 1

                if patience_counter >= patience:
                    if verbose:
                        print(f"Early stopping at epoch {epoch+1} with patience {patience}")
                    break

            # Always finish on the best validated weights. best_state stays None
            # only if no epoch produced a comparable RMSE (e.g. NaN predictions).
            if best_state is not None:
                self._restore(best_state)
        else:
            # No validation data: train for a fixed number of epochs
            for epoch in range(self.n_epochs):
                self.train_epoch(train_matrix)

                if verbose and (epoch + 1) % 5 == 0:
                    elapsed = time.time() - start_time
                    train_rmse = self._rmse(ratings_df['rating'], self.predict(ratings_df))
                    print(f"Epoch {epoch+1}/{self.n_epochs} - Train RMSE: {train_rmse:.4f} - Time: {elapsed:.2f}s")

        total_time = time.time() - start_time
        if verbose:
            print(f"Training completed in {total_time:.2f} seconds")

        return self

    def predict_one(self, user, item):
        """Predict the rating for one (user, item) pair.

        Falls back to ``global_mean + bias`` when only one side is known and to
        the plain ``global_mean`` when neither is (or biases are disabled).
        """
        # Case 1: both user and item were seen during training
        if user in self.user_to_idx and item in self.item_to_idx:
            user_idx = self.user_to_idx[user]
            item_idx = self.item_to_idx[item]
            pred = np.dot(self.user_factors[user_idx], self.item_factors[item_idx])
            if self.user_biases is not None and self.item_biases is not None:
                pred += self.global_mean + self.user_biases[user_idx] + self.item_biases[item_idx]
            return max(min(pred, 10.0), 1.0)  # Limit to range [1, 10]

        # Case 2: only the user is known (new item)
        elif user in self.user_to_idx and self.user_biases is not None:
            user_idx = self.user_to_idx[user]
            return max(min(self.global_mean + self.user_biases[user_idx], 10.0), 1.0)

        # Case 3: only the item is known (new user)
        elif item in self.item_to_idx and self.item_biases is not None:
            item_idx = self.item_to_idx[item]
            return max(min(self.global_mean + self.item_biases[item_idx], 10.0), 1.0)

        # Case 4: nothing usable is known about the pair
        else:
            return self.global_mean

    def predict(self, ratings_df):
        """Return a list of predictions, one per row of ``ratings_df``.

        Only the 'user' and 'item' columns are read. Iterating the two columns
        directly avoids building a Series per row and keeps the ids in their
        original dtype.
        """
        return [
            self.predict_one(user, item)
            for user, item in zip(ratings_df['user'], ratings_df['item'])
        ]

    def calculate_log_likelihood(self, ratings_matrix):
        """Return the Gaussian log-likelihood of the observed ratings plus the
        (unnormalized) Gaussian log-priors of the factors, for monitoring."""
        users_idx, items_idx, ratings = self._observed_entries(ratings_matrix)

        # Row-wise dot products between the matching user and item vectors
        preds = np.einsum('ij,ij->i', self.user_factors[users_idx], self.item_factors[items_idx])
        if self.user_biases is not None and self.item_biases is not None:
            preds = preds + self.global_mean + self.user_biases[users_idx] + self.item_biases[items_idx]

        residuals = ratings - preds
        log_likelihood = (
            -0.5 * np.log(2 * np.pi * self.rating_var) * len(ratings)
            - (0.5 / self.rating_var) * np.sum(residuals ** 2)
        )

        # Log priors for user and item factors
        log_likelihood += -0.5 * np.sum(self.user_factors ** 2) / self.user_prior_var
        log_likelihood += -0.5 * np.sum(self.item_factors ** 2) / self.item_prior_var

        return float(log_likelihood)

In [11]:
# Train PMF on the full training frame with 20 latent factors.
# NOTE: no validation frame is passed (val_df=None), so fit() takes its
# fixed-epoch branch and runs all 200 epochs; `patience=5` has no effect here.
pmf = ProbabilisticMatrixFactorization(n_factors=20, learning_rate=0.05, n_epochs=200)
pmf.fit(train, None, patience=5)

Epoch 5/200 - Train RMSE: 1.2108 - Time: 48.44s
Epoch 10/200 - Train RMSE: 1.0520 - Time: 100.95s
Epoch 15/200 - Train RMSE: 0.9628 - Time: 164.38s
Epoch 20/200 - Train RMSE: 0.9011 - Time: 225.54s
Epoch 25/200 - Train RMSE: 0.8601 - Time: 280.31s
Epoch 30/200 - Train RMSE: 0.8261 - Time: 334.11s
Epoch 35/200 - Train RMSE: 0.8010 - Time: 387.81s
Epoch 40/200 - Train RMSE: 0.7793 - Time: 440.90s
Epoch 45/200 - Train RMSE: 0.7630 - Time: 488.80s
Epoch 50/200 - Train RMSE: 0.7494 - Time: 545.09s
Epoch 55/200 - Train RMSE: 0.7376 - Time: 599.40s
Epoch 60/200 - Train RMSE: 0.7275 - Time: 660.34s
Epoch 65/200 - Train RMSE: 0.7190 - Time: 803.43s
Epoch 70/200 - Train RMSE: 0.7102 - Time: 850.83s
Epoch 75/200 - Train RMSE: 0.7034 - Time: 895.47s
Epoch 80/200 - Train RMSE: 0.6962 - Time: 940.19s
Epoch 85/200 - Train RMSE: 0.6915 - Time: 984.28s
Epoch 90/200 - Train RMSE: 0.6857 - Time: 1028.22s
Epoch 95/200 - Train RMSE: 0.6823 - Time: 1070.95s
Epoch 100/200 - Train RMSE: 0.6794 - Time: 1120.18

<__main__.ProbabilisticMatrixFactorization at 0x1a4490fcb10>

In [None]:
# Predict a rating for every (user, item) row of `test` with the PMF model and
# write the raw predictions to ./data/submission_pmf.csv.
test_preds = pmf.predict(test)
generateSubmision(test_preds, 'pmf')

In [14]:
# Snap each PMF prediction with custom_round (down when the decimal part is
# below 0.3, up when above 0.7, unchanged otherwise) and write the result to
# ./data/submission_pmf_rounded.csv.
test_preds_rounded = list(map(custom_round, test_preds))
generateSubmision(test_preds_rounded, 'pmf_rounded')

| Name  | Public score  |
|---|---|
| MF v1  | 1.509  |
| MF sparse matrix  | 1.292  |
| MF sparse matrix custom_round | 1.273  |
| PMF  |  1.262 |
| PMF custom_round | 1.246  |