In [60]:
# importamos las librerías
import ast

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
# Load the base games DataFrame from its parquet file.
ruta_parquet = "nuevo/games_API.parquet"
games = pd.read_parquet(ruta_parquet)

In [62]:
# Keep only the columns the recommender needs. The explicit .copy() makes `games`
# an independent frame, so adding or dropping columns on it in later cells does
# not trigger pandas' SettingWithCopyWarning.
games = games[["item_id", "game_name", "description"]].copy()
games

Unnamed: 0,item_id,game_name,description
0,761140,Lost Summoner Kitty,"Action, Casual, Indie, Simulation, Strategy, S..."
1,643980,Ironbound,"Free to Play, Indie, RPG, Strategy, Free to Pl..."
2,670290,Real Pool 3D - Poolians,"Casual, Free to Play, Indie, Simulation, Sport..."
3,767400,弹炸人2222,"Action, Adventure, Casual, Action, Adventure, ..."
4,772540,Battle Royale Trainer,"Action, Adventure, Simulation, Action, Adventu..."
...,...,...,...
22525,745400,Kebab it Up!,"Action, Adventure, Casual, Indie, Action, Indi..."
22526,773640,Colony On Mars,"Casual, Indie, Simulation, Strategy, Strategy,..."
22527,733530,LOGistICAL: South Africa,"Casual, Indie, Strategy, Strategy, Indie, Casu..."
22528,610660,Russian Roads,"Indie, Racing, Simulation, Indie, Simulation, ..."


### Creación de un Vectorizador de Texto (CountVectorizer):

El objetivo es convertir la columna description en vectores numéricos.
El vectorizador asigna un número a cada palabra única presente en el texto y cuenta su frecuencia. Cada juego se representa como un vector.

Esta conversión es necesaria para calcular la similitud del coseno, ya que este método trabaja con vectores numéricos.

In [63]:
# Build a bag-of-words vectorizer and turn every description into a term-count vector.
# The result is kept as the sparse matrix returned by fit_transform: cosine_similarity
# accepts sparse input directly, so densifying it with .toarray() only costs memory.
cv = CountVectorizer()
vectores = cv.fit_transform(games['description'])

### Cálculo de la Similitud del Coseno entre Vectores:

La similitud del coseno mide el coseno del ángulo entre dos vectores. Cuanto más cercano a 1 es su valor, más similares son los vectores.

In [64]:
# Compute the cosine similarity between the game vectors. The result is later
# indexed by row (in `recomendacion`) to read one game's similarity to every game.
similitud = cosine_similarity(vectores)

In [65]:
# Recommendation helper: given a game's item_id, return the names of its most similar games.
def recomendacion(juego, n=5):
    """Return the names of the ``n`` games most similar to ``juego``.

    Parameters
    ----------
    juego : int
        ``item_id`` of the game to get recommendations for. If the id appears
        more than once in ``games``, the first occurrence is used.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Values of ``games["game_name"]`` ordered from most to least similar.
        Games with equal similarity keep their order of appearance in ``games``.
        The input game itself is never part of the result.

    Raises
    ------
    IndexError
        If ``juego`` is not present in ``games["item_id"]``.
    """
    # Locate the game by *position*, not by index label: `similitud` is addressed
    # by row number, so this stays correct even when `games` does not carry a
    # default 0..n-1 index.
    posiciones = (games["item_id"] == juego).to_numpy().nonzero()[0]
    if len(posiciones) == 0:
        raise IndexError(f"item_id {juego!r} not found in games")
    posicion_juego = int(posiciones[0])

    # Similarity of the input game against every game in the dataset.
    puntajes = similitud[posicion_juego]

    # Remove the input game explicitly instead of assuming it sorts first: when
    # another game ties with it (e.g. identical description), slicing off the
    # first element could discard the wrong row and recommend the game to itself.
    candidatos = [par for par in enumerate(puntajes) if par[0] != posicion_juego]
    # list.sort is stable (also with reverse=True), so ties keep dataset order.
    candidatos.sort(key=lambda par: par[1], reverse=True)

    mejores = [posicion for posicion, _ in candidatos[:n]]
    return games["game_name"].iloc[mejores].tolist()

In [66]:
# Run the recommender on every item_id and store the results in a new
# `recomendaciones` column.
games['recomendaciones'] = games['item_id'].map(recomendacion)

In [67]:
# Display the DataFrame, now including the `recomendaciones` column.
games

Unnamed: 0,item_id,game_name,description,recomendaciones
0,761140,Lost Summoner Kitty,"Action, Casual, Indie, Simulation, Strategy, S...","[Trivia Vault: 1980's Trivia 2, Trivia Vault: ..."
1,643980,Ironbound,"Free to Play, Indie, RPG, Strategy, Free to Pl...","[Kingdoms CCG, Chronicle: RuneScape Legends, S..."
2,670290,Real Pool 3D - Poolians,"Casual, Free to Play, Indie, Simulation, Sport...","[Snooker-online multiplayer snooker game!, RoS..."
3,767400,弹炸人2222,"Action, Adventure, Casual, Action, Adventure, ...","[Biozone, CHASER, Turtle Odyssey, Luxor: 5th P..."
4,772540,Battle Royale Trainer,"Action, Adventure, Simulation, Action, Adventu...","[Gabe Newell Simulator 2.0, Oddworld: Stranger..."
...,...,...,...,...
22525,745400,Kebab it Up!,"Action, Adventure, Casual, Indie, Action, Indi...","[Dark Snow, HardBall, Achievement Hunter: Phar..."
22526,773640,Colony On Mars,"Casual, Indie, Simulation, Strategy, Strategy,...","[Trivia Vault: Super Heroes Trivia, Trials of ..."
22527,733530,LOGistICAL: South Africa,"Casual, Indie, Strategy, Strategy, Indie, Casu...","[LOGistICAL: Italy, LOGistICAL: USA - Oregon, ..."
22528,610660,Russian Roads,"Indie, Racing, Simulation, Indie, Simulation, ...","[Deserter Simulator, Anykey Simulator, Sky Val..."


In [68]:
# Keep only item_id and its recommendations. The result of drop() is bound back to
# `games` instead of using inplace=True: in-place mutation of a frame derived from
# another one is what pandas warns about, and it offers no performance benefit.
games = games.drop(columns=['description', "game_name"])
games

Unnamed: 0,item_id,recomendaciones
0,761140,"[Trivia Vault: 1980's Trivia 2, Trivia Vault: ..."
1,643980,"[Kingdoms CCG, Chronicle: RuneScape Legends, S..."
2,670290,"[Snooker-online multiplayer snooker game!, RoS..."
3,767400,"[Biozone, CHASER, Turtle Odyssey, Luxor: 5th P..."
4,772540,"[Gabe Newell Simulator 2.0, Oddworld: Stranger..."
...,...,...
22525,745400,"[Dark Snow, HardBall, Achievement Hunter: Phar..."
22526,773640,"[Trivia Vault: Super Heroes Trivia, Trials of ..."
22527,733530,"[LOGistICAL: Italy, LOGistICAL: USA - Oregon, ..."
22528,610660,"[Deserter Simulator, Anykey Simulator, Sky Val..."


In [69]:
# Inspect the resulting DataFrame: column names, non-null counts and dtypes.
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22530 entries, 0 to 22529
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   item_id          22530 non-null  int64 
 1   recomendaciones  22530 non-null  object
dtypes: int64(1), object(1)
memory usage: 352.2+ KB


In [70]:
# Export the recommendations table to CSV without the row index.
# NOTE: the list-valued `recomendaciones` column comes back as plain text when this
# file is read again, so the consumer has to parse it into a list.
games.to_csv("data_api/recomendaciones.csv", index=False)

API

In [71]:
# Load the table that was just exported, as the API will see it.
rec = pd.read_csv("data_api/recomendaciones.csv")
rec

Unnamed: 0,item_id,recomendaciones
0,761140,"[""Trivia Vault: 1980's Trivia 2"", 'Trivia Vaul..."
1,643980,"['Kingdoms CCG', 'Chronicle: RuneScape Legends..."
2,670290,"['Snooker-online multiplayer snooker game!', '..."
3,767400,"['Biozone', 'CHASER', 'Turtle Odyssey', 'Luxor..."
4,772540,"['Gabe Newell Simulator 2.0', ""Oddworld: Stran..."
...,...,...
22525,745400,"['Dark Snow', 'HardBall', 'Achievement Hunter:..."
22526,773640,"['Trivia Vault: Super Heroes Trivia', 'Trials ..."
22527,733530,"['LOGistICAL: Italy', 'LOGistICAL: USA - Orego..."
22528,610660,"['Deserter Simulator', 'Anykey Simulator', 'Sk..."


In [72]:
# Sample item_id standing in for the parameter of the future API function.
item = 772540

In [201]:
# Prototype of the endpoint logic: fetch the stored recommendations for `item`.
# A single .loc lookup replaces chained indexing (column first, then boolean mask).
recom = rec.loc[rec["item_id"] == item, "recomendaciones"].iloc[0]
# The CSV holds each list as its text representation. ast.literal_eval parses
# Python literals only, so unlike eval() it cannot execute code embedded in the file.
recom = ast.literal_eval(recom)

out = {f"ID del Juego: {item}": recom}
out

{'ID del Juego: 772540': ['Gabe Newell Simulator 2.0',
  "Oddworld: Stranger's Wrath HD",
  'Global Ops: Commando Libya',
  'HotLead',
  'Tomb Raider III']}