In [10]:
from surprise import Dataset, Reader, SVD, KNNWithMeans
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
from surprise.model_selection.search import RandomizedSearchCV
import pandas as pd
import numpy as np

In [28]:
#Leemos los datos de las reseñas / Ejemplo con Reviews_9.csv
df = pd.read_csv('D:/Marcos/HENRY/Proyecto-Final/reviews_finales/reviews_1.csv', sep=';', escapechar='\\')

In [29]:
df.head()

Unnamed: 0,Id_Usuario,Id_Restaurant,Rating,Reseña,Timestamp
0,1.179759e+20,0x8889221157fb3455:0x5c125c40c3eccc2a,4,"On the higher end of price for pizza, but they...",1463443013514
1,1.143165e+20,0x8889221157fb3455:0x5c125c40c3eccc2a,3,"Food was ok, felt like the atmosphere as well ...",1447623939865
2,1.136326e+20,0x8889221157fb3455:0x5c125c40c3eccc2a,4,"Good food, service so so",1469293549247
3,1.07684e+20,0x8889221157fb3455:0x5c125c40c3eccc2a,5,Love it,1442279219480
4,1.126036e+20,0x8889221157fb3455:0x5c125c40c3eccc2a,5,Yum,1382634896130


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Id_Usuario     float64
 1   Id_Restaurant  object 
 2   Rating         int64  
 3   Reseña         object 
 4   Timestamp      int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 76.3+ MB


In [31]:
#Crea un objeto Reader y Dataset de Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["Id_Usuario", "Id_Restaurant", "Rating"]], reader)

In [32]:
#Divide los datos en conjuntos de entrenamiento y prueba
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [None]:
#Optimizamos hiperparámetros. Primero vamos a hacerlo con el SVD.
parametros = {'n_factors':np.arange(10,1000),
             'n_epochs':np.arange(5,50),
             'lr_all':np.arange(0.001,0.05,0.001),
             'reg_all':np.arange(0.005,0.1,0.005)}

rs = RandomizedSearchCV(SVD,param_distributions=parametros,cv=3,n_iter=30,n_jobs=-1,random_state =42)
rs.fit(data)

In [22]:
results_df = pd.DataFrame(rs.cv_results)
results_df.sort_values(by='mean_test_rmse').head()

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_lr_all,param_reg_all
1,1.339292,1.333803,1.37705,1.350048,0.019224,1,1.081632,1.092803,1.10866,1.094365,...,1,0.411235,0.052966,0.037565,0.008034,"{'n_factors': 69, 'n_epochs': 15, 'lr_all': 0....",69,15,0.027,0.06
28,1.341265,1.336985,1.376386,1.351545,0.017651,2,1.083849,1.097378,1.110418,1.097215,...,4,0.429183,0.00611,0.035571,0.009153,"{'n_factors': 147, 'n_epochs': 9, 'lr_all': 0....",147,9,0.04,0.02
2,1.341528,1.337397,1.377913,1.35228,0.018204,3,1.085866,1.097236,1.113241,1.098781,...,5,1.312825,0.026667,0.044549,0.025706,"{'n_factors': 132, 'n_epochs': 29, 'lr_all': 0...",132,29,0.039,0.08
20,1.341936,1.336382,1.380507,1.352941,0.019623,4,1.081458,1.093533,1.112039,1.095677,...,2,0.600392,0.012325,0.031585,0.002486,"{'n_factors': 77, 'n_epochs': 21, 'lr_all': 0....",77,21,0.041,0.055
26,1.341536,1.338122,1.381796,1.353818,0.019832,5,1.082213,1.095776,1.113501,1.097163,...,3,1.159233,0.028354,0.032912,0.010586,"{'n_factors': 76, 'n_epochs': 42, 'lr_all': 0....",76,42,0.037,0.08


In [23]:
#Tomamos solo las columnas que nos interesan.
resultados = results_df[['param_n_factors','param_n_epochs','param_lr_all',
                         'param_reg_all','mean_test_rmse']].sort_values(by='mean_test_rmse')
resultados

Unnamed: 0,param_n_factors,param_n_epochs,param_lr_all,param_reg_all,mean_test_rmse
1,69,15,0.027,0.06,1.350048
28,147,9,0.04,0.02,1.351545
2,132,29,0.039,0.08,1.35228
20,77,21,0.041,0.055,1.352941
26,76,42,0.037,0.08,1.353818
29,222,36,0.036,0.025,1.354762
22,230,17,0.012,0.055,1.356235
21,246,48,0.008,0.035,1.35643
25,264,25,0.046,0.045,1.357068
3,97,41,0.003,0.03,1.358354


In [24]:
#Definimos función para limpiar los nombres de los campos.
def shorten_param(param_name):
    if "param_" in param_name:
        return param_name.rsplit("param_", 1)[1]
    return param_name

In [25]:
#Limpiamos los nombres de los campos
resultados = resultados.rename(shorten_param, axis=1)
resultados

Unnamed: 0,n_factors,n_epochs,lr_all,reg_all,mean_test_rmse
1,69,15,0.027,0.06,1.350048
28,147,9,0.04,0.02,1.351545
2,132,29,0.039,0.08,1.35228
20,77,21,0.041,0.055,1.352941
26,76,42,0.037,0.08,1.353818
29,222,36,0.036,0.025,1.354762
22,230,17,0.012,0.055,1.356235
21,246,48,0.008,0.035,1.35643
25,264,25,0.046,0.045,1.357068
3,97,41,0.003,0.03,1.358354


In [26]:
#Ploteamos las dependencias de cada hiperparámetro
import plotly.express as px

fig = px.parallel_coordinates(
    resultados,
    color="mean_test_rmse",
    color_continuous_scale=px.colors.sequential.Viridis,
)
fig.show()

In [115]:
#Obtenemos las reseñas del usuario
user_reviews = {}
for user_id, restaurant_id, rating in df[["Id_Usuario", "Id_Restaurant", "Rating"]].values:
  if user_id not in user_reviews:
    user_reviews[user_id] = []
  user_reviews[user_id].append((restaurant_id, rating))

In [116]:
#Pide al usuario que ingrese su ID de usuario
user_id = input("Ingrese el Id de su usuario: ")

Ingrese el Id de su usuario: z0Yh6ntpynyTP6x0YLCnLA


In [117]:
#Hacemos la recomendacion de los restaurantes que el usuario aún no ha visitado
restaurant_recs = []
for restaurant_id in df["Id_Restaurant"].unique():
  if restaurant_id not in [r_iid for r_iid, _ in user_reviews.get(user_id, [])]:
    predicted_rating = algo.predict(user_id, restaurant_id).est
    restaurant_recs.append((restaurant_id, predicted_rating))

In [118]:
#Ordena las recomendaciones por calificación
restaurant_recs = sorted(restaurant_recs, key=lambda x: x[1], reverse=True)

In [119]:
#Muestra las 5 mejores recomendaciones al usuario
print("Recomendaciones para el usuario: {}".format(user_id))
for restaurant_id, predicted_rating in restaurant_recs[:5]:
  print("- Restaurante {}: Prediccion de puntaje {:.2f}".format(restaurant_id, predicted_rating))

Recomendaciones para el usuario: z0Yh6ntpynyTP6x0YLCnLA
- Restaurante QoezRbYQncpRqyrLH6Iqjg: Prediccion de puntaje 4.57
- Restaurante 9xdXS7jtWjCVzL4_oPGv9A: Prediccion de puntaje 4.56
- Restaurante lqF0Q5vVLxY-_Cth6LtEdg: Prediccion de puntaje 4.49
- Restaurante ytynqOUb3hjKeJfRj5Tshw: Prediccion de puntaje 4.47
- Restaurante K7KHmHzxNwzqiijSJeKe_A: Prediccion de puntaje 4.46


#Evaluación del modelo

In [120]:
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [123]:
#Evaluamos con RMSE (error cuadrático medio)
predictions = algo.test(testset)
accuracy.rmse(predictions)

RMSE: 1.3644


1.3644194321204535