In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import numpy as np

In [2]:
# Load the cleaned reviews dataset from its parquet file.
df_reviews = pd.read_parquet(path='src/cleaned/reviews.parquet')

In [3]:
# Split the reviews by sentiment and collect, for every user, the distinct
# items they reviewed: sentiment_analysis > 0 goes to the "good" frame and
# sentiment_analysis == 0 goes to the "bad" frame.
# Each result has one row per user with columns ['user_id', 'item_id'],
# where 'item_id' holds an array of that user's unique item ids.
df_good_games_by_user = (
    df_reviews[df_reviews['sentiment_analysis'] > 0]
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
)
df_bad_games_by_user = (
    df_reviews[df_reviews['sentiment_analysis'] == 0]
    .groupby('user_id')['item_id']
    .unique()
    .reset_index()
)

In [4]:
# One-hot encode the reviewed items, separately for good and bad reviews.
# explode() yields one row per (user row, item) while keeping the original
# row label, so summing per index level 0 collapses the indicators back to a
# single row per user row of the source frame.
good_dummies = (
    df_good_games_by_user['item_id']
    .explode()
    .pipe(pd.get_dummies, prefix='good')
    .groupby(level=0)
    .sum()
)
bad_dummies = (
    df_bad_games_by_user['item_id']
    .explode()
    .pipe(pd.get_dummies, prefix='bad')
    .groupby(level=0)
    .sum()
)

In [5]:
# Build the per-user feature table: user_id + good_* indicators + bad_* indicators.
#
# good_dummies shares its row labels with df_good_games_by_user, so an index
# merge is correct for the positive-review features.
#
# bad_dummies, however, is labelled by row position inside
# df_bad_games_by_user, which has nothing to do with row positions in
# df_good_games_by_user. Joining the two on the index would attach one user's
# negative reviews to a different user, so the key is recovered first and the
# join is done on 'user_id' instead.
bad_dummies_by_user = bad_dummies.assign(
    # assign() aligns on the index, mapping every dummy row to its own user.
    user_id=df_bad_games_by_user['user_id']
)

df_games_by_user = (
    pd.merge(df_good_games_by_user, good_dummies, left_index=True, right_index=True)
    .drop(columns=['item_id'])
    # Left join: only users with at least one positive review are kept; users
    # without negative reviews get NaN in the bad_* columns.
    .merge(bad_dummies_by_user, on='user_id', how='left', validate='one_to_one')
    # Missing indicators mean "no such review", i.e. 0.
    .fillna(0)
)

In [6]:
# Compute the user-to-user cosine distance matrix from the indicator columns
# (everything except the 'user_id' key). Row/column i of the result
# corresponds to row i of df_games_by_user.
distances = pairwise_distances(
    df_games_by_user.drop('user_id', axis=1),
    metric='cosine',
)

In [19]:
# Wrap the distance matrix in a DataFrame labelled with the user ids on both
# axes, so that df_distances.loc[a, b] is the distance between users a and b.
user_ids = df_games_by_user['user_id'].tolist()
df_distances = pd.DataFrame(distances, index=user_ids, columns=user_ids)

In [21]:
# Persist the labelled distance matrix as a parquet file.
df_distances.to_parquet(path='src/cleaned/user_distance_matrix.parquet')