In [87]:
#import csv
import pandas as pd
#import numpy as np
import pyarrow as pa
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

import my_functions
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Carga de datasets a utilizar para generar las funciones

In [88]:
# Cargar el archivo CSV en un DataFrame
df_games = pd.read_csv('Datasets/Datasets ETL/steam_games_v2.csv')
# Cargar el archivo CSV en un DataFrame
df_reviews = pd.read_csv('Datasets/Datasets ETL/user_reviews_v1.csv')
# Cargar el archivo CSV en un DataFrame
df_items = pd.read_csv('Datasets/Datasets ETL/user_items_v2.csv')

## df_games_names

In [89]:
# Seleccionar las columnas 'game_id' y 'app_name' de df_games_data
df_games_names = df_games[['game_id', 'app_name']].copy()

# Convertir el campo 'game_id' a tipo string
df_games_names['game_id'] = df_games_names['game_id'].astype(str)
#df_games_names.set_index('game_id', inplace=True)

In [90]:
df_games_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31332 entries, 0 to 31331
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   game_id   31332 non-null  object
 1   app_name  31332 non-null  object
dtypes: object(2)
memory usage: 489.7+ KB


In [91]:
# Exportar a formato CSV
df_games_names.to_csv('Datasets/Datasets final/df_games_names.csv', index=False)

# Exportar 
df_games_names.to_csv('FastAPI/Datasets/df_games_names.csv', index=False)

In [92]:
pd.reset_option('display.max_rows')
df_games_names

Unnamed: 0,game_id,app_name
0,761140,Lost Summoner Kitty
1,643980,Ironbound
2,670290,Real Pool 3D - Poolians
3,767400,弹炸人2222
4,773570,Log Challenge
...,...,...
31327,773640,Colony On Mars
31328,733530,LOGistICAL: South Africa
31329,610660,Russian Roads
31330,658870,EXIT 2 - Directions


## Machine Learning

En esta etapa se trabajará sobre los datos necesarios para responder a 2 funciones de recomendación:
- 'def recomendacion_juego': ingresando el id de un producto, deberíamos recibir una lista con 5 juegos recomendados similares al ingresado.

- 'def recomendacion_usuario': ingresando el id de un usuario, deberíamos recibir una lista con 5 juegos recomendados para dicho usuario.

Para ello se aplicará un algoritmo de similitud del coseno, generando matrices de similitud entre juegos y entre usuarios.

## Recomendación de juegos

## df_games_data

In [93]:
# Completar la columna 'app_name'
df_games_data = df_games[['game_id', 'app_name']]

# Completar la columna 'playtime' desde df_items
playtime_by_game = df_items.groupby('game_id')['playtime_forever'].sum()
df_games_data = pd.merge(df_games_data, playtime_by_game, on='game_id', how='left')
df_games_data = df_games_data.rename(columns={'playtime_forever': 'playtime'})

# Completar las columnas 'neg_rec' y 'pos_rec' desde df_reviews
df_reviews['neg_rec'] = ((df_reviews['recommend'] == False) & ((df_reviews['sentiment'] == 0) | df_reviews['sentiment'].isnull())).astype(int)
df_reviews['pos_rec'] = ((df_reviews['recommend'] == True) & ((df_reviews['sentiment'] == 1) | (df_reviews['sentiment'] == 2) | df_reviews['sentiment'].isnull())).astype(int)

# Sumar los valores por 'game_id'
neg_rec_counts = df_reviews.groupby('game_id')['neg_rec'].sum().reset_index()
pos_rec_counts = df_reviews.groupby('game_id')['pos_rec'].sum().reset_index()

# Completar las columnas 'neg_rec' y 'pos_rec' en df_games_data
df_games_data = pd.merge(df_games_data, neg_rec_counts, on='game_id', how='left')
df_games_data = pd.merge(df_games_data, pos_rec_counts, on='game_id', how='left')

# Rellenar con ceros los valores nulos después de la fusión
df_games_data[['neg_rec', 'pos_rec']] = df_games_data[['neg_rec', 'pos_rec']].fillna(0).astype(int)

# Agregar la columna 'total_rec' 
df_games_data['total_rec'] = df_games_data['neg_rec'] + df_games_data['pos_rec']

# Completar las columnas con prefijo 'gnr_' desde df_games
gnr_columns = [col for col in df_games.columns if col.startswith('gnr_')]

# Renombrar las columnas antes de unirlas
# rename_dict = {col: f"{col}_games" for col in gnr_columns}
df_games_data = pd.merge(df_games_data, df_games[['game_id'] + gnr_columns], on='game_id', how='left')

# Completar los campos con valor nulo
# df_games_selection[['app_name']] = df_games_selection[['app_name']].fillna('Sin Datos')
df_games_data = df_games_data.fillna(0)

# Convertir las columnas a los tipos especificados
df_games_data = df_games_data.astype({
    'game_id': 'string',
    'playtime': int
})

# Convertir las columnas gnr_ a tipo int
gnr_columns = [col for col in df_games_data.columns if col.startswith('gnr_')]
df_games_data[gnr_columns] = df_games_data[gnr_columns].astype(int)

In [94]:
# Resetear indice
df_games_data = df_games_data.reset_index(drop=True)
df_games_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31332 entries, 0 to 31331
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    31332 non-null  string
 1   app_name                   31332 non-null  object
 2   playtime                   31332 non-null  int32 
 3   neg_rec                    31332 non-null  int32 
 4   pos_rec                    31332 non-null  int32 
 5   total_rec                  31332 non-null  int32 
 6   gnr_Action                 31332 non-null  int32 
 7   gnr_Adventure              31332 non-null  int32 
 8   gnr_Casual                 31332 non-null  int32 
 9   gnr_Early Access           31332 non-null  int32 
 10  gnr_Free to Play           31332 non-null  int32 
 11  gnr_Indie                  31332 non-null  int32 
 12  gnr_Massively Multiplayer  31332 non-null  int32 
 13  gnr_RPG                    31332 non-null  int32 
 14  gnr_Ra

Reservamos información actual de df_games_data en df_games_selection para luego trabajar sobre esté último.

In [95]:
# Obtener las columnas que comienzan con 'gnr_'
gnr_columns = [col for col in df_games_data.columns if col.startswith('gnr_')]

# Filtrar las filas donde todas las columnas 'gnr_' tengan valor 0
df_games_selection = df_games_data[~(df_games_data[gnr_columns] == 0).all(axis=1)]

Continuamos trabajando sobre df_games_data.

In [96]:
# Tomar solo las columnas necesarias de df_games
cols_to_keep = ['game_id', 'app_name', 'playtime'] + [col for col in df_games.columns if col.startswith('gnr_')]
df_games_data = df_games_data[cols_to_keep].copy()

# Filtrar filas donde todas las columnas 'gnr_' sean 0
df_games_data = df_games_data[df_games_data.filter(like='gnr_').sum(axis=1) > 0]

# Establecer 'game_id' como índice
# df_games_data.set_index('game_id', inplace=True)

# Resetear indice
df_games_data = df_games_data.reset_index(drop=True)


In [97]:
# Calcular el percentil 85 de playtime
playtime_threshold = df_games_data['playtime'].quantile(0.85)

# Filtrar las filas con playtime mayor que el percentil 85
df_games_data = df_games_data[df_games_data['playtime'] > playtime_threshold].reset_index(drop=True)


In [98]:
# Eliminar la columna 'playtime'
df_games_data = df_games_data.drop(columns=['playtime'])

In [99]:
df_games_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4208 entries, 0 to 4207
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    4208 non-null   string
 1   app_name                   4208 non-null   object
 2   gnr_Action                 4208 non-null   int32 
 3   gnr_Adventure              4208 non-null   int32 
 4   gnr_Casual                 4208 non-null   int32 
 5   gnr_Early Access           4208 non-null   int32 
 6   gnr_Free to Play           4208 non-null   int32 
 7   gnr_Indie                  4208 non-null   int32 
 8   gnr_Massively Multiplayer  4208 non-null   int32 
 9   gnr_RPG                    4208 non-null   int32 
 10  gnr_Racing                 4208 non-null   int32 
 11  gnr_Simulation             4208 non-null   int32 
 12  gnr_Sports                 4208 non-null   int32 
 13  gnr_Strategy               4208 non-null   int32 
dtypes: int32

## Filtros

## df_games_selection

Primero se trabaja sobre el dataset con la información necesaria para la comparación entre juegos.

In [100]:
df_games_selection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28050 entries, 0 to 31330
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    28050 non-null  string
 1   app_name                   28050 non-null  object
 2   playtime                   28050 non-null  int32 
 3   neg_rec                    28050 non-null  int32 
 4   pos_rec                    28050 non-null  int32 
 5   total_rec                  28050 non-null  int32 
 6   gnr_Action                 28050 non-null  int32 
 7   gnr_Adventure              28050 non-null  int32 
 8   gnr_Casual                 28050 non-null  int32 
 9   gnr_Early Access           28050 non-null  int32 
 10  gnr_Free to Play           28050 non-null  int32 
 11  gnr_Indie                  28050 non-null  int32 
 12  gnr_Massively Multiplayer  28050 non-null  int32 
 13  gnr_RPG                    28050 non-null  int32 
 14  gnr_Racing 

In [101]:
# Eliminar filas donde neg_rec > pos_rec
df_games_selection = df_games_selection[df_games_selection['neg_rec'] < df_games_selection['pos_rec']]

In [102]:
df_games_selection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2519 entries, 27 to 31314
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    2519 non-null   string
 1   app_name                   2519 non-null   object
 2   playtime                   2519 non-null   int32 
 3   neg_rec                    2519 non-null   int32 
 4   pos_rec                    2519 non-null   int32 
 5   total_rec                  2519 non-null   int32 
 6   gnr_Action                 2519 non-null   int32 
 7   gnr_Adventure              2519 non-null   int32 
 8   gnr_Casual                 2519 non-null   int32 
 9   gnr_Early Access           2519 non-null   int32 
 10  gnr_Free to Play           2519 non-null   int32 
 11  gnr_Indie                  2519 non-null   int32 
 12  gnr_Massively Multiplayer  2519 non-null   int32 
 13  gnr_RPG                    2519 non-null   int32 
 14  gnr_Racing 

In [20]:
# Filtrar y mantener solo las filas donde playtime no sea 0
df_games_selection = df_games_selection[df_games_selection['playtime'] > 0]
df_games_selection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2312 entries, 27 to 31314
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    2312 non-null   string
 1   app_name                   2312 non-null   object
 2   playtime                   2312 non-null   int32 
 3   neg_rec                    2312 non-null   int32 
 4   pos_rec                    2312 non-null   int32 
 5   total_rec                  2312 non-null   int32 
 6   gnr_Action                 2312 non-null   int32 
 7   gnr_Adventure              2312 non-null   int32 
 8   gnr_Casual                 2312 non-null   int32 
 9   gnr_Early Access           2312 non-null   int32 
 10  gnr_Free to Play           2312 non-null   int32 
 11  gnr_Indie                  2312 non-null   int32 
 12  gnr_Massively Multiplayer  2312 non-null   int32 
 13  gnr_RPG                    2312 non-null   int32 
 14  gnr_Racing 

In [21]:
# Seleccionar columnas 'app_name' y 'gnr_'
cols_to_select = ['game_id', 'app_name'] + [col for col in df_games_selection.columns if col.startswith('gnr_')]
df_games_selection = df_games_selection[cols_to_select].copy()
df_games_selection = df_games_selection.reset_index(drop=True)

In [22]:
df_games_selection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2312 entries, 0 to 2311
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    2312 non-null   string
 1   app_name                   2312 non-null   object
 2   gnr_Action                 2312 non-null   int32 
 3   gnr_Adventure              2312 non-null   int32 
 4   gnr_Casual                 2312 non-null   int32 
 5   gnr_Early Access           2312 non-null   int32 
 6   gnr_Free to Play           2312 non-null   int32 
 7   gnr_Indie                  2312 non-null   int32 
 8   gnr_Massively Multiplayer  2312 non-null   int32 
 9   gnr_RPG                    2312 non-null   int32 
 10  gnr_Racing                 2312 non-null   int32 
 11  gnr_Simulation             2312 non-null   int32 
 12  gnr_Sports                 2312 non-null   int32 
 13  gnr_Strategy               2312 non-null   int32 
dtypes: int32

In [23]:
df_games_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4208 entries, 0 to 4207
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   game_id                    4208 non-null   string
 1   app_name                   4208 non-null   object
 2   gnr_Action                 4208 non-null   int32 
 3   gnr_Adventure              4208 non-null   int32 
 4   gnr_Casual                 4208 non-null   int32 
 5   gnr_Early Access           4208 non-null   int32 
 6   gnr_Free to Play           4208 non-null   int32 
 7   gnr_Indie                  4208 non-null   int32 
 8   gnr_Massively Multiplayer  4208 non-null   int32 
 9   gnr_RPG                    4208 non-null   int32 
 10  gnr_Racing                 4208 non-null   int32 
 11  gnr_Simulation             4208 non-null   int32 
 12  gnr_Sports                 4208 non-null   int32 
 13  gnr_Strategy               4208 non-null   int32 
dtypes: int32

## Modelo

In [26]:
df_games_data.shape

(4208, 14)

In [27]:
df_games_selection.shape

(2312, 14)

## df_games_similarity

In [28]:
cols_for_similarity = [col for col in df_games_data.columns if col.startswith('gnr_')]

# Seleccionar las filas que contienen datos para las columnas seleccionadas
df_games_data_model = df_games_data[['game_id'] + cols_for_similarity]

# Seleccionar las columnas que contienen datos para las columnas seleccionadas
df_games_selection_model = df_games_selection[['game_id'] + cols_for_similarity]

# Establecer 'game_id' como índice
df_games_data_model.set_index('game_id', inplace=True)
df_games_selection_model.set_index('game_id', inplace=True)

In [29]:
# Calcular la matriz de similitud del coseno
similarity_matrix = cosine_similarity(df_games_data_model, df_games_selection_model)

# Crear un DataFrame con la matriz de similitud
df_games_similarity = pd.DataFrame(similarity_matrix, index=df_games_data_model.index, columns=df_games_selection_model.index)

In [30]:
df_games_similarity.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4208 entries, 282010 to 80
Columns: 2312 entries, 282010 to 80
dtypes: float64(2312)
memory usage: 74.3 MB


In [31]:
df_games_similarity.head(5)

game_id,282010,70,2400,3800,1520,2420,4000,6210,2990,6220,...,3320,20,50,60,10,40,130,30,13230,80
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
282010,1.0,0.57735,0.666667,0.57735,0.408248,0.666667,0.408248,0.0,0.57735,0.57735,...,0.0,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735
70,0.57735,1.0,0.57735,1.0,0.0,0.57735,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1640,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1630,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2400,0.666667,0.57735,1.0,0.57735,0.408248,1.0,0.408248,0.0,0.0,0.0,...,0.0,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735,0.57735


In [32]:
# Exportar a formato Parquet manteniendo los índices
df_games_similarity.to_parquet('Datasets/Datasets final/df_games_similarity.parquet', index=True)

# Exportar a formato Parquet manteniendo los índices
df_games_similarity.to_parquet('FastAPi/Datasets/df_games_similarity.parquet', index=True)

In [34]:
# Cargar el archivo csv en un DataFrame
df_games_names = pd.read_csv('FastAPI/Datasets/df_games_names.csv')

# Convertir el campo 'game_id' a tipo string
df_games_names['game_id'] = df_games_names['game_id'].astype(str)

df_games_names.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31332 entries, 0 to 31331
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   game_id   31332 non-null  object
 1   app_name  31332 non-null  object
dtypes: object(2)
memory usage: 489.7+ KB


## def recomendacion_juego

In [35]:
def recomendacion_juego(game_id:str, num_recommendations=5):

    game_name = df_games_names.loc[df_games_names['game_id'] == game_id, 'app_name'].values[0]

    # Convertir el campo 'game_id' a tipo string
    df_games_names['game_id'] = df_games_names['game_id'].astype(str)

    # Obtener la serie de juegos similares, excluir el game_id introducido 
    similar_games = df_games_similarity.drop(game_id).loc[:, game_id].sort_values(ascending=False)

    # Tomar las primeras num_recommendations recomendaciones
    recommendations = similar_games.head(num_recommendations).index.tolist()

    recommended_games = df_games_names.loc[df_games_names['game_id'].isin(recommendations), ['game_id', 'app_name']].reset_index(drop=True)

    recommended_games = recommended_games[['game_id', 'app_name']].to_dict(orient='records')

    message = f"Juegos similares a {game_name} (game_id {game_id}):"

    result = f"{message}", recommended_games
    
    return result


# Ejemplo de uso
game_id_input = '282010'
recommendations = recomendacion_juego(game_id_input)
recommendations

('Juegos similares a Carmageddon Max Pack (game_id 282010):',
 [{'game_id': '302710', 'app_name': 'BlazeRush'},
  {'game_id': '324070', 'app_name': 'iO'},
  {'game_id': '331650', 'app_name': 'Carmageddon TDR 2000'},
  {'game_id': '308580', 'app_name': 'Orborun'},
  {'game_id': '220820', 'app_name': 'Zombie Driver HD'}])

## Recomendación usuario - item

In [38]:
df_items.head(1)

Unnamed: 0,game_id,app_name,playtime_forever,user_id
0,10,Counter-Strike,6,76561197970982479


In [39]:
my_functions.describe_df(df_items)

Cantidad Registros:  4568172
Cantidad Campos:  4


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,int64,0,0.0,4568172,100.0,10854,0.24
1,app_name,object,0,0.0,4568172,100.0,10823,0.24
2,playtime_forever,int64,0,0.0,4568172,100.0,9568,0.21
3,user_id,object,0,0.0,4568172,100.0,70827,1.55


## df_usr_data

In [40]:
# Seleccionar solo las columnas requeridas
df_usr_data = df_items[['game_id', 'playtime_forever', 'user_id']]

# Convertir la columna 'game_id' a tipo de dato str
df_usr_data['game_id'] = df_usr_data['game_id'].astype(str)

# Ordenar por user_id y playtime_forever
df_usr_data = df_usr_data.sort_values(by=['user_id', 'playtime_forever'], ascending=[True, False]).groupby('user_id').head(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usr_data['game_id'] = df_usr_data['game_id'].astype(str)


In [42]:
# Configurar la opción para mostrar todas las filas
pd.set_option('display.max_rows', None)
#pd.reset_option('display.max_rows')

# Imprimir el DataFrame resultante
df_usr_data.head(15)

Unnamed: 0,game_id,playtime_forever,user_id
1552359,304930,4037,--000--
1552324,1250,2949,--000--
1552335,113400,2763,--000--
4527742,304930,2959,--ace--
4527734,250900,1706,--ace--
4527723,113200,1269,--ace--
2353460,42690,8335,--ionex--
2353458,105600,842,--ionex--
2353454,10190,694,--ionex--
3453983,239140,3032,-2SV-vuLB-Kg


In [43]:
my_functions.describe_df(df_usr_data)

Cantidad Registros:  203914
Cantidad Campos:  3


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,object,0,0.0,203914,100.0,3467,1.7
1,playtime_forever,int64,0,0.0,203914,100.0,9568,4.69
2,user_id,object,0,0.0,203914,100.0,70827,34.73


In [44]:
# Eliminar la columna 'playtime_forever'
df_usr_data = df_usr_data.drop('playtime_forever', axis=1)


In [45]:
my_functions.describe_df(df_usr_data)

Cantidad Registros:  203914
Cantidad Campos:  2


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,object,0,0.0,203914,100.0,3467,1.7
1,user_id,object,0,0.0,203914,100.0,70827,34.73


In [46]:
# Agrupar por 'user_id' y contar la cantidad de registros para cada grupo
df_usr_data_count = df_usr_data.groupby('user_id').count().reset_index()

pd.reset_option('display.max_rows')
df_usr_data_count

Unnamed: 0,user_id,game_id
0,--000--,3
1,--ace--,3
2,--ionex--,3
3,-2SV-vuLB-Kg,3
4,-404PageNotFound-,3
...,...,...
70822,zzonci,3
70823,zzoptimuszz,3
70824,zzydrax,3
70825,zzyfo,3


In [47]:
# Borrar registros en df_usr_data_count donde game_id sea menor a 3
df_usr_data_count = df_usr_data_count[df_usr_data_count['game_id'] >= 3]

In [48]:
my_functions.describe_df(df_usr_data_count)

Cantidad Registros:  65623
Cantidad Campos:  2


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,user_id,object,0,0.0,65623,100.0,65623,100.0
1,game_id,int64,0,0.0,65623,100.0,1,0.0


In [49]:
# Quedarse solo con registros en df_usr_data que compartan user_id con df_usr_data_count_filtered
df_usr_data = df_usr_data[df_usr_data['user_id'].isin(df_usr_data_count['user_id'])]

In [50]:
my_functions.describe_df(df_usr_data)

Cantidad Registros:  196869
Cantidad Campos:  2


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,object,0,0.0,196869,100.0,3428,1.74
1,user_id,object,0,0.0,196869,100.0,65623,33.33


In [51]:
df_usr_data

Unnamed: 0,game_id,user_id
1552359,304930,--000--
1552324,1250,--000--
1552335,113400,--000--
4527742,304930,--ace--
4527734,250900,--ace--
...,...,...
1599564,110800,zzyfo
1599592,239820,zzyfo
3884987,49520,zzzmidmiss
3884977,50130,zzzmidmiss


In [52]:
# Quedarse con los primeros 3000 usuarios
df_usr_data = df_usr_data.head(9000)

In [53]:
df_usr_data

Unnamed: 0,game_id,user_id
1552359,304930,--000--
1552324,1250,--000--
1552335,113400,--000--
4527742,304930,--ace--
4527734,250900,--ace--
...,...,...
3455196,218620,76561197999375514
3455207,306130,76561197999375514
3455210,220,76561197999382290
3455216,420,76561197999382290


In [54]:
my_functions.describe_df(df_usr_data)

Cantidad Registros:  9000
Cantidad Campos:  2


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,object,0,0.0,9000,100.0,1202,13.36
1,user_id,object,0,0.0,9000,100.0,3000,33.33


In [55]:
pd.reset_option('display.max_rows')

In [56]:
# Exportar 
df_usr_data.to_parquet('Datasets/Datasets final/df_usr_data.parquet', index=False)

# Exportar 
df_usr_data.to_parquet('FastAPI/Datasets/df_usr_data.parquet', index=False)

## df_usr_selection

In [58]:
# Quedarse con el primer registro de cada user_id
df_usr_selection = df_usr_data.drop_duplicates(subset='user_id', keep='first')

# Imprimir el DataFrame resultante
df_usr_selection.head(15)

Unnamed: 0,game_id,user_id
1552359,304930,--000--
4527742,304930,--ace--
2353460,42690,--ionex--
3453983,239140,-2SV-vuLB-Kg
4253522,212910,-404PageNotFound-
43151,33930,-AnimeIsMyThing-
52444,39690,-Azsael-
1724393,304930,-Beave-
2144175,212680,-Encore-
30078,113400,-GM-Dragon


In [59]:
df_usr_selection.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 1552359 to 3455210
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   game_id  3000 non-null   object
 1   user_id  3000 non-null   object
dtypes: object(2)
memory usage: 70.3+ KB


In [60]:
# Realizar la concatenación y el filtrado
df_usr_selection = pd.merge(df_usr_selection, df_games_selection[['game_id']], on='game_id', how='inner')


In [61]:
df_usr_selection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2435 entries, 0 to 2434
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   game_id  2435 non-null   object
 1   user_id  2435 non-null   object
dtypes: object(2)
memory usage: 38.2+ KB


In [62]:
df_usr_selection

Unnamed: 0,game_id,user_id
0,304930,--000--
1,304930,--ace--
2,304930,-Beave-
3,304930,008port
4,304930,12341846
...,...,...
2430,242550,76561197998937521
2431,32440,76561197999140519
2432,6020,76561197999155279
2433,24780,76561197999171631


In [63]:
my_functions.describe_df(df_usr_selection)

Cantidad Registros:  2435
Cantidad Campos:  2


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,object,0,0.0,2435,100.0,490,20.12
1,user_id,object,0,0.0,2435,100.0,2435,100.0


In [64]:
my_functions.describe_df(df_usr_data)

Cantidad Registros:  9000
Cantidad Campos:  2


Unnamed: 0,Campo,Tipo de Dato,Valores Nulos,% Nulos,Valores No Nulos,% No Nulos,Valores Únicos,% Únicos
0,game_id,object,0,0.0,9000,100.0,1202,13.36
1,user_id,object,0,0.0,9000,100.0,3000,33.33


## Modelo

In [66]:
# Agrupar por 'user_id' y generar una lista de distintos valores de 'game_id' para cada usuario
df_usr_data_model = df_usr_data.groupby('user_id')['game_id'].agg(lambda x: list(set(x))).reset_index()

# Renombrar la columna
df_usr_data_model.rename(columns={'game_id': 'games'}, inplace=True)


In [67]:
df_usr_data_model

Unnamed: 0,user_id,games
0,--000--,"[304930, 1250, 113400]"
1,--ace--,"[304930, 250900, 113200]"
2,--ionex--,"[10190, 42690, 105600]"
3,-2SV-vuLB-Kg,"[294020, 338050, 239140]"
4,-404PageNotFound-,"[243870, 212910, 226700]"
...,...,...
2995,76561197999368302,"[22380, 200510, 1250]"
2996,76561197999369109,"[240, 300, 320]"
2997,76561197999370640,"[219640, 200710, 24780]"
2998,76561197999375514,"[306130, 226320, 218620]"


In [68]:
# Establecer 'user_id' como índice
df_usr_data_model.set_index('user_id', inplace=True)

df_usr_data_model

Unnamed: 0_level_0,games
user_id,Unnamed: 1_level_1
--000--,"[304930, 1250, 113400]"
--ace--,"[304930, 250900, 113200]"
--ionex--,"[10190, 42690, 105600]"
-2SV-vuLB-Kg,"[294020, 338050, 239140]"
-404PageNotFound-,"[243870, 212910, 226700]"
...,...
76561197999368302,"[22380, 200510, 1250]"
76561197999369109,"[240, 300, 320]"
76561197999370640,"[219640, 200710, 24780]"
76561197999375514,"[306130, 226320, 218620]"


In [71]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [72]:
# Convertir la columna 'games' a un formato adecuado para la similitud del coseno
mlb = MultiLabelBinarizer()
usr_matrix = mlb.fit_transform(df_usr_data_model['games'])
usr_df = pd.DataFrame(usr_matrix, index=df_usr_data_model.index)


In [73]:
# Calcular la similitud del coseno entre usuarios
cosine_sim = cosine_similarity(usr_df)

# Crear un DataFrame de similitud del coseno
df_user_similarity = pd.DataFrame(cosine_sim, index=df_usr_data_model.index, columns=df_usr_data_model.index)

In [74]:
pd.set_option('display.max_rows', None)

# Imprimir el DataFrame de similitud del coseno
df_user_similarity.head(10)

user_id,--000--,--ace--,--ionex--,-2SV-vuLB-Kg,-404PageNotFound-,-AnimeIsMyThing-,-Azsael-,-Beave-,-Encore-,-GM-Dragon,...,76561197999306529,76561197999320594,76561197999327663,76561197999330728,76561197999351396,76561197999368302,76561197999369109,76561197999370640,76561197999375514,76561197999382290
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,1.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
--ace--,0.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ionex--,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2SV-vuLB-Kg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-404PageNotFound-,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-AnimeIsMyThing-,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
-Azsael-,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Beave-,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Encore-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-GM-Dragon,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Exportar 
df_user_similarity.to_parquet('Datasets/Datasets final/df_user_similarity.parquet', index=True)

# Exportar 
df_user_similarity.to_parquet('FastAPI/Datasets/df_user_similarity.parquet', index=True)

In [77]:
df_user_similarity.shape

(3000, 3000)

## def recomendacion_usuario

In [78]:
pd.reset_option('display.max_rows')

# Cargar el archivo Parquet en un DataFrame
df_usr_data = pd.read_parquet('FastAPI/Datasets/df_usr_data.parquet')

In [79]:
def recomendacion_usuario(user_id: str, num_recommendations = 5):
    # Filtrar df_usr_data por el user_id específico
    user_games = df_usr_data[df_usr_data['user_id'] == user_id]

    # Crear la lista user_games a partir de los valores de df_usr_data
    user_games = user_games['game_id'].tolist()

    # Obtener la serie de usuarios similares, excluir el game_id introducido 
    similar_users = df_user_similarity.drop(user_id).loc[:, user_id].sort_values(ascending=False).index.tolist()

    # Crear DataFrame con similar_users
    df_similar_users = pd.DataFrame({'user_id': similar_users})

    # Realizar la concatenación con df_usr_selection usando el método merge
    df_concatenado = pd.merge(df_similar_users, df_usr_data, on='user_id', how='inner')

    # Filtrar las filas donde el game_id no está en user_games
    df_concatenado = df_concatenado[~df_concatenado['game_id'].isin(user_games)]

    # Utilizar merge para concatenar basándote en la columna game_id
    df_final = pd.merge(df_concatenado, df_games_names, on='game_id', how='inner')

    # Eliminar duplicados y quedarse con el primer registro
    df_final_sin_duplicados = df_final.drop_duplicates(subset='game_id', keep='first').head(num_recommendations) 
        
    recommended_games = df_final_sin_duplicados[['game_id', 'app_name', 'user_id']].to_dict(orient='records')
 
    message = f"A usuarios similares a {user_id} les gustó:"
    
    #result = message, recommended_games
    result = f"{message}", recommended_games
    
    return result

user_id = '-Encore-'
rec_usuario = recomendacion_usuario(user_id)
rec_usuario


('A usuarios similares a -Encore- les gustó:',
 [{'game_id': '42700',
   'app_name': 'Call of Duty®: Black Ops',
   'user_id': '59285912859'},
  {'game_id': '8980', 'app_name': 'Borderlands', 'user_id': '59285912859'},
  {'game_id': '42910', 'app_name': 'Magicka', 'user_id': '76561197998052647'},
  {'game_id': '15620',
   'app_name': 'Warhammer 40,000: Dawn of War II',
   'user_id': '76561197998052647'},
  {'game_id': '377160',
   'app_name': 'Fallout 4',
   'user_id': '76561197961051810'}])