LightGCN para sistemas de recomendación

In [1]:
import surprise

In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix,lil_matrix

# Presumably meant as the notebook-wide random seed; none of the visible cells
# reference it (they hard-code 42) -- TODO confirm whether it is still needed.
RANDOM_STATE = 46

# Read the training and test tables, in that order, from the local data folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from surprise.accuracy import rmse

# Step 1: Load and prepare the dataset
# Assuming your data is in a CSV file with columns 'user', 'item', 'rating'
def load_data(file_path):
    """Read a ratings CSV and wrap it as a Surprise dataset.

    Args:
        file_path: Path to a CSV file that contains at least the columns
            'user', 'item' and 'rating'.

    Returns:
        The object produced by ``Dataset.load_from_df`` for the
        (user, item, rating) columns, built with a ``Reader`` whose rating
        scale spans the minimum and maximum rating found in the file.
    """
    ratings = pd.read_csv(file_path)

    # The rating scale is taken from the data instead of being hard-coded.
    observed = ratings['rating']
    scale = (observed.min(), observed.max())

    # Surprise expects the columns in (user, item, rating) order.
    triples = ratings[['user', 'item', 'rating']]
    return Dataset.load_from_df(triples, Reader(rating_scale=scale))

# Step 2: Train SVD++ model with hyperparameter tuning
def train_svdpp_model(data):
    """Tune, train and evaluate an SVD++ model.

    The function holds out 20% of ``data`` as a test set, runs a 3-fold grid
    search over a small SVD++ parameter grid, refits a model with the
    best-RMSE parameters on the remaining 80% and reports the RMSE on the
    held-out part.

    Every stochastic step (hold-out split, cross-validation folds and factor
    initialisation of each candidate model) uses the same fixed seed, so that
    re-running the cell selects the same hyperparameters.

    Args:
        data: Surprise dataset, as returned by ``load_data``.

    Returns:
        A tuple ``(algo, predictions)`` where ``algo`` is the fitted ``SVDpp``
        instance and ``predictions`` is the result of ``algo.test`` on the
        held-out test set.
    """
    # Local import: only needed here, to build seeded cross-validation folds.
    from surprise.model_selection import KFold

    seed = 42

    # Split data into train and test sets
    trainset, testset = train_test_split(data, test_size=0.2, random_state=seed)

    print("Performing hyperparameter tuning...")
    # Define parameter grid
    param_grid = {
        'n_factors': [20, 30, 40],
        'n_epochs': [20],
        'lr_all': [0.005, 0.01],
        'reg_all': [0.02],
        # Seed every candidate so that differences between grid points come
        # from the hyperparameters and not from random initialisation.
        'random_state': [seed],
    }

    # Perform grid search on seeded folds (a plain `cv=3` would shuffle the
    # data differently on every run).
    gs = GridSearchCV(
        SVDpp,
        param_grid,
        measures=['rmse', 'mae'],
        cv=KFold(n_splits=3, random_state=seed, shuffle=True),
    )
    # NOTE(review): the search is fitted on the full `data`, which also
    # contains the rows of `testset`; the RMSE printed below is therefore not
    # fully independent of the hyperparameter selection.
    gs.fit(data)

    # Get the best parameters according to RMSE
    best_params = gs.best_params['rmse']
    print(f"Best parameters: {best_params}")

    # Train with best parameters
    algo = SVDpp(
        n_factors=best_params['n_factors'],
        n_epochs=best_params['n_epochs'],
        lr_all=best_params['lr_all'],
        reg_all=best_params['reg_all'],
        random_state=seed,
        verbose=True
    )

    # Train the model on the 80% training split only
    algo.fit(trainset)

    # Test the model on the held-out split
    predictions = algo.test(testset)

    # Calculate and print RMSE
    test_rmse = rmse(predictions)
    print(f"Test RMSE: {test_rmse:.4f}")

    return algo, predictions

# Step 3: Analyze the model's latent features
def analyze_latent_features(algo, n_features=5):
    """Plot histograms of the first latent factors and save them to disk.

    Prints the shapes of the user and item factor matrices, then draws a
    two-row grid of histograms (users on top, items below) for the first
    ``n_features`` factors, or fewer if the user factor matrix has fewer
    columns. The figure is written to 'latent_features.png' and closed.

    Args:
        algo: Fitted model exposing the factor matrices as ``pu`` (users)
            and ``qi`` (items).
        n_features: Number of grid columns, and upper bound on the number of
            factors plotted.
    """
    user_factors, item_factors = algo.pu, algo.qi

    print(f"User factors shape: {user_factors.shape}")
    print(f"Item factors shape: {item_factors.shape}")

    fig = plt.figure(figsize=(15, 10))
    n_shown = min(n_features, user_factors.shape[1])

    for idx in range(n_shown):
        position = idx + 1

        # Top row: distribution of this factor across users.
        user_ax = fig.add_subplot(2, n_features, position)
        user_ax.hist(user_factors[:, idx], bins=50)
        user_ax.set_title(f'User Feature {position}')

        # Bottom row: distribution of the same factor across items.
        item_ax = fig.add_subplot(2, n_features, position + n_features)
        item_ax.hist(item_factors[:, idx], bins=50)
        item_ax.set_title(f'Item Feature {position}')

    fig.tight_layout()
    fig.savefig('latent_features.png')
    plt.close(fig)

# Main function to run the whole process
def main(file_path):
    """Run the whole pipeline: load the data, train SVD++ and plot factors.

    Args:
        file_path: Path to the ratings CSV passed on to ``load_data``.

    Returns:
        The fitted model returned by ``train_svdpp_model``.
    """
    print("Loading data...")
    dataset = load_data(file_path)

    print("Training SVD++ model...")
    # The held-out predictions are not needed beyond the RMSE already printed.
    model, _held_out_predictions = train_svdpp_model(dataset)

    print("Analyzing latent features...")
    analyze_latent_features(model)

    return model

# Usage example

# Ratings CSV used for tuning and training; replace with your actual file path
file_path = "./data/train.csv"

# Run the full pipeline (load, tune, train, analyze) and keep the fitted model.
# NOTE: main() always performs hyperparameter tuning; it has no flag to skip it.
algo = main(file_path)

Loading data...
Training SVD++ model...
Performing hyperparameter tuning...


In [8]:
# Display the test DataFrame to inspect its columns.
test

Unnamed: 0,ID,user,item
0,0,8117,268
1,1,10512,24393
2,2,534,1334
3,3,10984,6550
4,4,9093,22128
...,...,...,...
43315,43315,534,1751
43316,43316,1150,5467
43317,43317,10184,8805
43318,43318,7531,11566


Predicción normal

In [None]:
# Predict a rating for every (user, item) pair in the test set, keeping the
# row's ID so that the output can be matched back to the test rows.
predictions = [
    {
        'ID': row['ID'],
        'rating': algo.predict(uid=row['user'], iid=row['item']).est,
    }
    for _, row in test.iterrows()
]

# Collect the predictions in a DataFrame and write them out without the index.
result_df = pd.DataFrame(predictions)
result_df.to_csv('./SVDpp.csv', index=False)

Predicción con sustitución por la media

In [None]:
# Users and items seen in train.csv; anything else is treated as cold-start.
# NOTE(review): `algo` was fitted on an 80% split of this file, so a user or
# item counted as "known" here may still be unseen by the model itself.
user_train_set = set(train['user'].unique())
item_train_set = set(train['item'].unique())

# Fallback values computed once up front, instead of re-filtering the whole
# training frame for every cold-start row inside the loop.
global_mean = train['rating'].mean()
item_means = train.groupby('item')['rating'].mean().to_dict()
user_means = train.groupby('user')['rating'].mean().to_dict()

predictions = []

# Iterate through each row in the test data
for id_value, user_id, item_id in zip(test['ID'], test['user'], test['item']):
    user_known = user_id in user_train_set
    item_known = item_id in item_train_set

    if not user_known and not item_known:
        # Nothing is known about either side: use the global mean rating.
        predicted_value = global_mean
    elif not user_known:
        # Unknown user, known item: use the item's mean rating.
        predicted_value = item_means[item_id]
    elif not item_known:
        # Known user, unknown item: use the user's mean rating.
        predicted_value = user_means[user_id]
    else:
        predicted_value = algo.predict(uid=user_id, iid=item_id).est

    # Store the prediction
    predictions.append({
        'ID': id_value,
        'rating': predicted_value
    })

# Create DataFrame from predictions
result_df = pd.DataFrame(predictions)

result_df.to_csv('./SVDpp_mean_unk.csv', index=False)

Dado que las notas reales son valores enteros, probamos a redondear las predicciones al entero más cercano.

In [15]:
# Users and items seen in train.csv; anything else is treated as cold-start.
# NOTE(review): `algo` was fitted on an 80% split of this file, so a user or
# item counted as "known" here may still be unseen by the model itself.
user_train_set = set(train['user'].unique())
item_train_set = set(train['item'].unique())

# Fallback values computed once up front, instead of re-filtering the whole
# training frame for every cold-start row inside the loop.
global_mean = train['rating'].mean()
item_means = train.groupby('item')['rating'].mean().to_dict()
user_means = train.groupby('user')['rating'].mean().to_dict()

predictions = []

# Iterate through each row in the test data
for id_value, user_id, item_id in zip(test['ID'], test['user'], test['item']):
    user_known = user_id in user_train_set
    item_known = item_id in item_train_set

    if not user_known and not item_known:
        # Nothing is known about either side: use the global mean rating.
        predicted_value = global_mean
    elif not user_known:
        # Unknown user, known item: use the item's mean rating.
        predicted_value = item_means[item_id]
    elif not item_known:
        # Known user, unknown item: use the user's mean rating.
        predicted_value = user_means[user_id]
    else:
        predicted_value = algo.predict(uid=user_id, iid=item_id).est

    # Ratings are integers, so snap the estimate to the nearest integer.
    # Python's round() sends exact .5 ties to the nearest even integer.
    predicted_value = round(predicted_value)

    # Store the prediction
    predictions.append({
        'ID': id_value,
        'rating': predicted_value
    })

# Create DataFrame from predictions
result_df = pd.DataFrame(predictions)

result_df.to_csv('./SVDpp_round_mean_unk.csv', index=False)