Implementación del algoritmo Singular Value Decomposition (SVD) para realizar las predicciones de un sistema de recomendación.

Realizamos imports

In [1]:
import surprise

Generamos train y test

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix,lil_matrix

# NOTE(review): RANDOM_STATE is not referenced by any of the visible cells
# (the training code hard-codes 42) — confirm whether it is still needed.
RANDOM_STATE = 46
# Labelled ratings; later cells read its 'user', 'item' and 'rating' columns.
train = pd.read_csv('./data/train.csv')
# Rows to predict; later cells read its 'ID', 'user' and 'item' columns.
test = pd.read_csv('./data/test.csv')

Como modelo usaremos SVDpp (SVD++) de la librería Surprise. Este modelo se basa en el SVD clásico, pero añade dos mejoras significativas:

Considera los ítems con los que el usuario ha interactuado, incluso si no los ha calificado.

Permite mejores predicciones al capturar datos indirectos sobre las preferencias de los usuarios.

Sin embargo, esta implementación aumenta el coste computacional.


Entrenamos el modelo

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from surprise.accuracy import rmse

# Step 1: Load and prepare the dataset
# Assuming your data is in a CSV file with columns 'user', 'item', 'rating'
def load_data(file_path):
    """Read a ratings CSV and wrap it in a Surprise ``Dataset``.

    The CSV must provide ``user``, ``item`` and ``rating`` columns. The rating
    scale handed to Surprise is the smallest and largest rating found in the
    file.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The Surprise dataset built from the (user, item, rating) triples.
    """
    ratings = pd.read_csv(file_path)

    # The scale is derived from the data instead of being hard-coded.
    lowest = ratings['rating'].min()
    highest = ratings['rating'].max()
    scale_reader = Reader(rating_scale=(lowest, highest))

    # Surprise expects exactly these three columns, in this order.
    triples = ratings[['user', 'item', 'rating']]
    return Dataset.load_from_df(triples, scale_reader)

# Step 2: Train SVD++ model with hyperparameter tuning
def train_svdpp_model(data):
    """Tune, train and evaluate an SVD++ model on ``data``.

    A grid search (2-fold CV, RMSE and MAE) picks the hyperparameters, then a
    fresh ``SVDpp`` with the best-RMSE parameters is fitted on an 80% split and
    scored on the remaining 20%.

    NOTE(review): the grid search is fitted on the whole ``data``, which
    includes the 20% held-out split, so the reported test RMSE is not fully
    independent of the hyperparameter choice — confirm this is acceptable.

    Args:
        data: Surprise ``Dataset`` holding the ratings.

    Returns:
        Tuple ``(algo, predictions)``: the fitted model and its predictions on
        the held-out split.
    """
    # Hold out 20% of the ratings for the final evaluation
    trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

    print("Performing hyperparameter tuning...")
    param_grid = {
        'n_factors': [20, 30, 40],
        'n_epochs': [20, 25],
        'lr_all': [0.007, 0.01],
        'reg_all': [0.02, 0.1],
    }

    # Exhaustive search over the grid above
    gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=2)
    gs.fit(data)

    # Parameter combination with the lowest cross-validated RMSE
    best_params = gs.best_params['rmse']
    print(f"Best parameters: {best_params}")

    # Unpack the winning combination directly so the constructor call stays
    # in sync with whatever keys the grid defines.
    algo = SVDpp(**best_params, random_state=42, verbose=True)
    algo.fit(trainset)

    predictions = algo.test(testset)

    # verbose=False: rmse() would otherwise print its own "RMSE: ..." line,
    # duplicating the message printed just below.
    test_rmse = rmse(predictions, verbose=False)
    print(f"Test RMSE: {test_rmse:.4f}")

    return algo, predictions


# Main function to run the whole process
def main(file_path):
    """Run the whole pipeline: load the ratings file and train SVD++ on it.

    Args:
        file_path: Path of the ratings CSV passed on to ``load_data``.

    Returns:
        The fitted model returned by ``train_svdpp_model``.
    """
    print("Loading data...")
    dataset = load_data(file_path)

    print("Training SVD++ model...")
    # Only the model is handed back; the held-out predictions are discarded.
    return train_svdpp_model(dataset)[0]

# Usage example

# Path of the ratings CSV used for training
file_path = "./data/train.csv"

# Runs the full pipeline (load, grid search, final fit). main() takes no flag
# to skip hyperparameter tuning, so the grid search is always performed.
algo = main(file_path)

Loading data...
Training SVD++ model...
Performing hyperparameter tuning...
Best parameters: {'n_factors': 20, 'n_epochs': 25, 'lr_all': 0.007, 'reg_all': 0.1}
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 20
 processing epoch 21
 processing epoch 22
 processing epoch 23
 processing epoch 24
RMSE: 1.6449
Test RMSE: 1.6449


Se ha realizado un grid search con este algoritmo, obteniendo como mejores hiperparámetros k = 20 factores, 25 iteraciones, learning rate de 0.007 y regularización de 0.1.

Prediccion normal

In [15]:
# Score every test row with the trained model alone (no special handling of
# users or items missing from train), one {'ID', 'rating'} record per row.
predictions = [
    {
        'ID': row['ID'],
        'rating': algo.predict(uid=row['user'], iid=row['item']).est,
    }
    for _, row in test.iterrows()
]

# Build the submission table and write it to disk
result_df = pd.DataFrame(predictions)

result_df.to_csv('./SVDpp_20k_25iter_lr007.csv', index=False)

Se ha realizado el test con kaggle, obteniendo un resultado de 1.270.

Sin embargo, esto se ha realizado únicamente con la predicción del modelo, sin tener en cuenta los usuarios o ítems de test que no aparecen en train.

Prediccion sustitucion media

In [16]:
# Fallback values for cold-start rows, computed once up front instead of
# re-scanning `train` with a boolean mask for every test row.
global_mean = train['rating'].mean()
item_means = train.groupby('item')['rating'].mean().to_dict()
user_means = train.groupby('user')['rating'].mean().to_dict()

# NOTE(review): "known" is decided against the full train CSV, while the model
# was fitted on an 80% split of it — ids that only occur in the held-out 20%
# still take the model branch. Confirm this is intended.
user_train_set = set(train['user'].unique())
item_train_set = set(train['item'].unique())

predictions = []

# Iterate through each row in the test data
for _, row in test.iterrows():
    user_id = row['user']
    item_id = row['item']
    id_value = row['ID']

    known_user = user_id in user_train_set
    known_item = item_id in item_train_set

    if known_user and known_item:
        # Both ids seen in train: use the model
        predicted_value = algo.predict(uid=user_id, iid=item_id).est
    elif known_item:
        # New user: fall back to the item's mean rating
        predicted_value = item_means.get(item_id, global_mean)
    elif known_user:
        # New item: fall back to the user's mean rating
        predicted_value = user_means.get(user_id, global_mean)
    else:
        # Nothing known about either id: global mean rating
        predicted_value = global_mean

    # Store the prediction
    predictions.append({
        'ID': id_value,
        'rating': predicted_value
    })

# Create DataFrame from predictions
result_df = pd.DataFrame(predictions)

result_df.to_csv('./SVDpp_20k_25iter_lr007mean_unk.csv', index=False)

Sustituyendo las predicciones de usuarios e items faltantes por la media, se obtienen resultados de 1.262.

Dado que las notas reales son valores enteros, probamos a redondear las predicciones al entero más cercano.

In [17]:
# Fallback values for cold-start rows, computed once up front instead of
# re-scanning `train` with a boolean mask for every test row.
global_mean = train['rating'].mean()
item_means = train.groupby('item')['rating'].mean().to_dict()
user_means = train.groupby('user')['rating'].mean().to_dict()

# NOTE(review): "known" is decided against the full train CSV, while the model
# was fitted on an 80% split of it — ids that only occur in the held-out 20%
# still take the model branch. Confirm this is intended.
user_train_set = set(train['user'].unique())
item_train_set = set(train['item'].unique())

predictions = []

# Iterate through each row in the test data
for _, row in test.iterrows():
    user_id = row['user']
    item_id = row['item']
    id_value = row['ID']

    known_user = user_id in user_train_set
    known_item = item_id in item_train_set

    if known_user and known_item:
        # Both ids seen in train: use the model
        predicted_value = algo.predict(uid=user_id, iid=item_id).est
    elif known_item:
        # New user: fall back to the item's mean rating
        predicted_value = item_means.get(item_id, global_mean)
    elif known_user:
        # New item: fall back to the user's mean rating
        predicted_value = user_means.get(user_id, global_mean)
    else:
        # Nothing known about either id: global mean rating
        predicted_value = global_mean

    # Ratings are integers, so snap every prediction to the nearest integer
    predicted_value = round(predicted_value)

    # Store the prediction
    predictions.append({
        'ID': id_value,
        'rating': predicted_value
    })

# Create DataFrame from predictions
result_df = pd.DataFrame(predictions)

result_df.to_csv('./SVDpp_20k_25iter_lr007_round_mean_unk.csv', index=False)

Este nuevo sistema de predicciones en base al redondeo mejora los resultados anteriores, obteniendo unos resultados en test de 1.237.

Realizamos las predicciones con un redondeo menos agresivo: en vez de redondear siempre, solo se redondea cuando la predicción está cerca de un entero (parte decimal menor que 0.3 o mayor que 0.7).

In [4]:
def custom_round(number: float, min_round=0.3, max_ronund=0.7) -> float:
    """Round ``number`` only when it already lies close to an integer.

    Args:
        number: Value to round.
        min_round: Fractional parts strictly below this are rounded down.
        max_ronund: Fractional parts strictly above this are rounded up.
            (The misspelled name is kept so existing keyword calls still work.)

    Returns:
        The neighbouring integer as a float when the fractional part is
        outside ``[min_round, max_ronund]``; otherwise ``number`` unchanged.
    """
    import math

    # Floor rather than truncate, so the fractional part is in [0, 1) for
    # negative inputs as well.
    integer_part = math.floor(number)
    decimal_part = number - integer_part

    # Use the thresholds passed by the caller instead of hard-coded 0.3 / 0.7.
    if decimal_part < min_round:
        return float(integer_part)  # Round down
    if decimal_part > max_ronund:
        return float(integer_part + 1)  # Round up
    return number




# Fallback values for cold-start rows, computed once up front instead of
# re-scanning `train` with a boolean mask for every test row.
global_mean = train['rating'].mean()
item_means = train.groupby('item')['rating'].mean().to_dict()
user_means = train.groupby('user')['rating'].mean().to_dict()

# NOTE(review): "known" is decided against the full train CSV, while the model
# was fitted on an 80% split of it — ids that only occur in the held-out 20%
# still take the model branch. Confirm this is intended.
user_train_set = set(train['user'].unique())
item_train_set = set(train['item'].unique())

predictions = []

# Iterate through each row in the test data
for _, row in test.iterrows():
    user_id = row['user']
    item_id = row['item']
    id_value = row['ID']

    known_user = user_id in user_train_set
    known_item = item_id in item_train_set

    if known_user and known_item:
        # Both ids seen in train: use the model
        predicted_value = algo.predict(uid=user_id, iid=item_id).est
    elif known_item:
        # New user: fall back to the item's mean rating
        predicted_value = item_means.get(item_id, global_mean)
    elif known_user:
        # New item: fall back to the user's mean rating
        predicted_value = user_means.get(user_id, global_mean)
    else:
        # Nothing known about either id: global mean rating
        predicted_value = global_mean

    # Snap to an integer only when the prediction is already close to one
    predicted_value = custom_round(predicted_value)

    # Store the prediction
    predictions.append({
        'ID': id_value,
        'rating': predicted_value
    })

# Create DataFrame from predictions
result_df = pd.DataFrame(predictions)

result_df.to_csv('./SVDpp_20k_25iter_lr007_custom_round_mean_unk.csv', index=False)

Este nuevo redondeo, aunque parece más seguro, empeora los resultados. Esto se debe a que, aunque el modelo no esté seguro entre un valor u otro, si se decanta por uno de ellos lo más probable es que el entero más cercano sea la nota verdadera.