In [1]:
import pandas as pd
import ast
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Sistema de recomendacion

Queremos crear un sistema de recomendacion a traves de la similitud del coseno, en el cual, ingresando un item_id me devuelva una lista de los 5 juegos recomendados segun el genero del juego.

Para realizar esto creamos un df con los generos, item_id y titulo del juego.

In [2]:
df_sg = pd.read_csv("steam_games/steam_games.csv")

In [3]:
df_ml = df_sg[['item_id', 'genres', 'title']]

In [4]:
df_ml

Unnamed: 0,item_id,genres,title
0,761140,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty
1,643980,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound
2,670290,"['Casual', 'Free to Play', 'Indie', 'Simulatio...",Real Pool 3D - Poolians
3,767400,"['Action', 'Adventure', 'Casual']",弹炸人2222
4,773570,,
...,...,...,...
32125,773640,"['Casual', 'Indie', 'Simulation', 'Strategy']",Colony On Mars
32126,733530,"['Casual', 'Indie', 'Strategy']",LOGistICAL: South Africa
32127,610660,"['Indie', 'Racing', 'Simulation']",Russian Roads
32128,658870,"['Casual', 'Indie']",EXIT 2 - Directions


Chequeamos si la columna 'genres' hay valores vacios

In [5]:
df_ml[df_ml['genres'].isnull()]

Unnamed: 0,item_id,genres,title
4,773570,,
11,724910,,
19,772590,,
20,640250,,
22,711440,,
...,...,...,...
32113,775640,,Robotpencil Presents: Exercise: Brushwork
32114,777930,,Robotpencil Presents: Creative Composition
32115,775370,,The Gamble House
32116,777950,,Kalen Chock Presents: 2017 Free Tutorial


Para facilitar la creacion de este MVP decidimos eliminar los registros cuyo genero se encuentra vacio

In [6]:
df_ml = df_ml.dropna(subset='genres')

Chequeamos si hay listas vacias

In [7]:
hay_lista_vacia = df_ml['genres'].apply(lambda x: len(x) == 0).any()
hay_lista_vacia

False

Reindexamos

In [8]:
df_ml.reset_index(drop=True, inplace=True)

# Achicamos el dataset

Para simplificar el sistema, ya que solo estamos realizando un MVP, vamos a tomar el top 100 de juegos mas jugados por los usuarios para realizar el sistema de recomendacion.

Para eso, necesitamos encontrar el top 100 de juegos mas jugados. Importamos el csv de user_items en donde podemos ver la cantidad de horas jugadas por usuario y juego. 

In [9]:
df_ui = pd.read_csv('users_items/user_items.csv')

Filtramos el DataFrame unicamente por las columnas que necesitamos, para luego agrupar segun el item_id y la sumatoria de las horas jugadas, ordenamos los valores de mayor a menor y por ultimo filtramos por los primeros 200 items.

In [10]:
df_top100_juegos_jugados = df_ui[['item_id', 'playtime_forever']]
df_top100_juegos_jugados = df_top100_juegos_jugados.groupby('item_id', as_index=False).sum()
df_top100_juegos_jugados = df_top100_juegos_jugados.sort_values(by='playtime_forever', ascending=False)
df_top100_juegos_jugados = df_top100_juegos_jugados.iloc[0:100]
df_top100_juegos_jugados

Unnamed: 0,item_id,playtime_forever
23,730,775784836
163,4000,441871026
1673,105600,152997644
1543,72850,134851450
2403,230410,122726853
...,...,...
3001,255710,7487143
367,10090,7469618
1854,205100,7414669
357,9900,7393345


Convertimos la columna de item_id en una lista

In [11]:
top100_id = df_top100_juegos_jugados['item_id'].to_list()

In [12]:
len(top100_id)

100

# Normalizacion de generos

Ahora queremos crear un dataframe para poder comparar la similitud de los items por medio de los generos. Por eso, vamos a realizar el proceso de one-hot enconding para compararlos.

Para eso primero buscamos el listado de generos unicos.

In [13]:
df_generos = df_ml[['genres']]
df_generos['genres'] = df_generos['genres'].apply(eval)
df_generos = df_generos.explode('genres')
df_generos

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_generos['genres'] = df_generos['genres'].apply(eval)


Unnamed: 0,genres
0,Action
0,Casual
0,Indie
0,Simulation
0,Strategy
...,...
28847,Indie
28847,Racing
28847,Simulation
28848,Casual


In [14]:
lista_generos = df_generos['genres']
generos_unicos = lista_generos.unique()
generos_unicos

array(['Action', 'Casual', 'Indie', 'Simulation', 'Strategy',
       'Free to Play', 'RPG', 'Sports', 'Adventure', 'Racing',
       'Early Access', 'Massively Multiplayer',
       'Animation &amp; Modeling', 'Video Production', 'Utilities',
       'Web Publishing', 'Education', 'Software Training',
       'Design &amp; Illustration', 'Audio Production', 'Photo Editing',
       'Accounting'], dtype=object)

In [15]:
# Convertimos la cadena en una lista
genres = df_ml['genres'].apply(eval)

for genre in generos_unicos:
    df_ml[genre] = 0  # Inicializamos todas las columnas en 0

for genre in generos_unicos:
    df_ml.loc[genres.apply(lambda x: genre in x), genre] = 1  # Asignamos 1 a las filas donde el género está presente

df_ml.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ml[genre] = 0  # Inicializamos todas las columnas en 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ml[genre] = 0  # Inicializamos todas las columnas en 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ml[genre] = 0  # Inicializamos todas las columnas en 0
A value is trying to be set on 

Unnamed: 0,item_id,genres,title,Action,Casual,Indie,Simulation,Strategy,Free to Play,RPG,...,Animation &amp; Modeling,Video Production,Utilities,Web Publishing,Education,Software Training,Design &amp; Illustration,Audio Production,Photo Editing,Accounting
0,761140,"['Action', 'Casual', 'Indie', 'Simulation', 'S...",Lost Summoner Kitty,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,643980,"['Free to Play', 'Indie', 'RPG', 'Strategy']",Ironbound,0,0,1,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


Filtramos el DataFrame para quedarnos unicamente con el top 200 de juegos mas jugados

In [16]:
df_ml = df_ml[df_ml['item_id'].isin(top100_id)]

In [17]:
#Creamos una tabla pivot con los item_id y las columnas de generos
df_pivot = df_ml.set_index('item_id').drop(['title', 'genres'], axis=1)
df_pivot

Unnamed: 0_level_0,Action,Casual,Indie,Simulation,Strategy,Free to Play,RPG,Sports,Adventure,Racing,...,Animation &amp; Modeling,Video Production,Utilities,Web Publishing,Education,Software Training,Design &amp; Illustration,Audio Production,Photo Editing,Accounting
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4000,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48700,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8930,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22380,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42700,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10090,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
240,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
220,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
genero_genero_sim_matrix = pd.DataFrame(cosine_similarity(df_pivot))
genero_genero_sim_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,1.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.288675,0.408248,0.353553,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.0,1.000000,0.0,1.000000,0.707107,0.408248,0.500000,0.577350,0.408248,0.707107,...,0.500000,0.707107,1.000000,0.707107,0.0,0.707107,0.707107,0.707107,0.707107,0.707107
2,0.0,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.0,1.000000,0.0,1.000000,0.707107,0.408248,0.500000,0.577350,0.408248,0.707107,...,0.500000,0.707107,1.000000,0.707107,0.0,0.707107,0.707107,0.707107,0.707107,0.707107
4,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
76,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
77,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
78,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000


In [19]:
genero_genero_sim_matrix.columns = df_pivot.index
genero_genero_sim_matrix.set_index(pd.Index(df_pivot.index), inplace=True)

In [20]:
genero_genero_sim_matrix

item_id,4000,48700,8930,22380,42700,39120,620,99900,35450,113200,...,8190,550,8980,1250,10500,500,10090,240,220,10
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4000,1.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.288675,0.408248,0.353553,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
48700,0.0,1.000000,0.0,1.000000,0.707107,0.408248,0.500000,0.577350,0.408248,0.707107,...,0.500000,0.707107,1.000000,0.707107,0.0,0.707107,0.707107,0.707107,0.707107,0.707107
8930,0.0,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,1.0,0.000000,0.000000,0.000000,0.000000,0.000000
22380,0.0,1.000000,0.0,1.000000,0.707107,0.408248,0.500000,0.577350,0.408248,0.707107,...,0.500000,0.707107,1.000000,0.707107,0.0,0.707107,0.707107,0.707107,0.707107,0.707107
42700,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
500,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
10090,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
240,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000
220,0.0,0.707107,0.0,0.707107,1.000000,0.000000,0.707107,0.408248,0.577350,0.500000,...,0.707107,1.000000,0.707107,1.000000,0.0,1.000000,1.000000,1.000000,1.000000,1.000000


In [30]:
genero_genero_sim_matrix.to_csv('df_ml.csv')

In [22]:
def recomendacion_juego(id_juego):
    juegos_similares = genero_genero_sim_matrix.loc[id_juego].sort_values(ascending=False)

    # Obtener el top 5 excluyendo el primero
    top_juegos_similares = juegos_similares.iloc[1:].nlargest(5)
    lista_de_ids = top_juegos_similares.index.tolist()

    titulos_top = df_ml.loc[df_ml['item_id'].isin(lista_de_ids), 'title']
    lista_de_juegos = [f"{i + 1}- {titulo}" for i, titulo in enumerate(titulos_top.values)]

    return f'Para el id introducido: {id_juego}, te recomendamos los siguientes juegos similares:\n' + '\n'.join(lista_de_juegos)

In [23]:
recomendacion_juego(10)

'Para el id introducido: 10, te recomendamos los siguientes juegos similares:\n1- Counter-Strike: Global Offensive\n2- The Binding of Isaac: Rebirth\n3- DARK SOULS™ III\n4- BioShock Infinite\n5- Left 4 Dead 2'

In [27]:
def recomendacion_juego1(id_juego):

    juegos_similares = genero_genero_sim_matrix.loc[id_juego].sort_values(ascending=False)

    # Obtener el top 5 excluyendo el primero
    top_juegos_similares = juegos_similares.iloc[1:6]  # Cambiado 5 a 6 para obtener el top 5
    lista_de_ids = top_juegos_similares.index.tolist()

    titulos_top = df_ml.loc[df_ml['item_id'].isin(lista_de_ids), 'title']
    lista_de_juegos = [f"{i + 1}- {titulo}" for i, titulo in enumerate(titulos_top.values)]

    return f'Para el ID introducido: {id_juego}, te recomendamos los siguientes juegos similares:\n' + '\n'.join(lista_de_juegos)

In [28]:
recomendacion_juego1(10)

'Para el ID introducido: 10, te recomendamos los siguientes juegos similares:\n1- Counter-Strike: Global Offensive\n2- The Binding of Isaac: Rebirth\n3- DARK SOULS™ III\n4- BioShock Infinite\n5- Left 4 Dead 2'