Machine learning Sistema de Recomendación

In [1]:
# Importar librerías
import csv
import pandas as pd
import json
import numpy as np
import ast
import sklearn as sk
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp

import operator

In [2]:
#Abrir archivo .csv a dataframe
df_2 = pd.read_csv('Third_Users.csv', encoding = "latin-1")

In [3]:
#Ver nombres de columnas
df_2.columns

Index(['user_id', 'items_count', 'steam_id', 'item_id', 'item_name',
       'playtime_forever'],
      dtype='object')

In [4]:
# Dataframe con las columnas de usuario, nombre de juego y tiempo de juego (en minutos)
user_name_time = df_2[["user_id", "item_name", "playtime_forever"]]

In [5]:
# Descripción estadística de los playtime_forever (en minutos)
user_name_time.describe()

Unnamed: 0,playtime_forever
count,5153209.0
mean,991.4951
std,5418.204
min,0.0
25%,0.0
50%,34.0
75%,355.0
max,642773.0


En este caso de los playtime_forever, hay una media de 9,9 x 10 a la 2 minutos, 990 minutos (16,5 horas) +/- 5,4 x 10 a la 3, 5400 minutos (90 horas).
Bajo el 75% de los usuarios, juegan 3,5 x 10 a la 2 minutos, 350 minutos (5,83 horas).

In [6]:
# Ver data
user_name_time

Unnamed: 0,user_id,item_name,playtime_forever
0,76561197970982479,Counter-Strike,6
1,76561197970982479,Team Fortress Classic,0
2,76561197970982479,Day of Defeat,7
3,76561197970982479,Deathmatch Classic,0
4,76561197970982479,Half-Life: Opposing Force,0
...,...,...,...
5153204,76561198329548331,BrainBread 2,0
5153205,76561198329548331,All Is Dust,0
5153206,76561198329548331,One Way To Die: Steam Edition,3
5153207,76561198329548331,You Have 10 Seconds 2,4


In [7]:
# Cambiar tipo de dato y rellenar con 0
user_name_time["playtime_forever"] = user_name_time["playtime_forever"].fillna(0).astype("int32")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_name_time["playtime_forever"] = user_name_time["playtime_forever"].fillna(0).astype("int32")


In [8]:
# Info de data
user_name_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5153209 entries, 0 to 5153208
Data columns (total 3 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   user_id           object
 1   item_name         object
 2   playtime_forever  int32 
dtypes: int32(1), object(2)
memory usage: 98.3+ MB


In [9]:
# Obtenemos una tabla con registros unicos de cada juego
unique_item_ids = user_name_time.drop_duplicates(subset="item_name")

In [10]:
# Reducimos el data por una cuestión de espacio, pero sin que afecte la calidad de los datos
df_modelo = unique_item_ids.sample(frac=0.125, random_state=1)

In [11]:
# Primero se comienza creando la tabla pivote user versus item_name
piv_table = df_modelo.pivot(index=["user_id"], columns=["item_name"], values="playtime_forever").fillna(0)
piv_table

item_name,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),"10,000,000",12 Labours of Hercules IV: Mother Nature,1943 Megami Strike,3 Suicides of Paul Hamilton,3030 Deathwar Redux,3d Bridges,500 MILES,688(I) Hunter/Killer,7 Days to Die,...,Zuma's Revenge! - Adventure,[the Sequence],a Family of Grave Diggers,eXceed - Gun Bullet Children,eXceed 2nd - Vampire REX,iPi Mocap Studio 2,planetarian ~the reverie of a little planet~,sZone-Online,the static speaks my name,ä¸æå®æèï¼Ranger of the jungleï¼
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12549,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13lazer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1920410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19702316748,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2d4nk4m3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wswanderer81,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wyvaud,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xexilex,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xfluttersx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Se normaliza la matriz pivote con la fórmula de normalización
piv_table_norm = piv_table.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

In [13]:
# Se transpone la matriz, para tener juegos en filas
piv_table_norm.fillna(0, inplace=True)
piv_table_norm = piv_table_norm.T
# Se manteniene solo las columnas que tienen al menos un valor distinto de cero
piv_table_norm = piv_table_norm.loc[:, (piv_table_norm != 0).any(axis=0)]

In [14]:
piv_table_norm

user_id,12549,13lazer,1920410,19702316748,3021Daniel,3dster,666NeCrO,678rtv45,750805,76561197961840584,...,thor2424,thugnificent,turneddreamsintoanempire,ulramite,undietaker,uradumbtit,virtueavatar,whodafuqisthisguilao,wswanderer81,xfluttersx
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
10000000,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
12 Labours of Hercules IV: Mother Nature,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
1943 Megami Strike,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
3 Suicides of Paul Hamilton,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iPi Mocap Studio 2,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
planetarian ~the reverie of a little planet~,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
sZone-Online,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349
the static speaks my name,-0.000737,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.005126,-0.000731,...,-0.000731,-0.002151,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.000731,-0.001349


In [15]:
# Reemplazar los valores infinitos con un valor específico
piv_table_norm.replace([np.inf, -np.inf], np.nan, inplace=True)
piv_table_norm.fillna(0, inplace=True)

In [16]:
# Se crea una tabla sparse que solo guarda valores distintos de cero y permite optimizar 
piv_sparse = sp.csr_matrix(piv_table_norm.values)
piv_sparse

<1368x288 sparse matrix of type '<class 'numpy.float64'>'
	with 393984 stored elements in Compressed Sparse Row format>

In [17]:
# Coseno de similitud aplicado a tablas de Item (nombre de juego) y User (id de usuario)
item_simil = cosine_similarity(piv_sparse)  # Item
user_simil = cosine_similarity(piv_sparse.T) # User

In [18]:
# Se ordenan en df
df_item_simil = pd.DataFrame(item_simil, index = piv_table_norm.index, columns = piv_table_norm.index) # Item
df_user_simil = pd.DataFrame(user_simil, index = piv_table_norm.columns, columns = piv_table_norm.columns) # User

In [19]:
df_item_simil

item_name,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),"10,000,000",12 Labours of Hercules IV: Mother Nature,1943 Megami Strike,3 Suicides of Paul Hamilton,3030 Deathwar Redux,3d Bridges,500 MILES,688(I) Hunter/Killer,7 Days to Die,...,Zuma's Revenge! - Adventure,[the Sequence],a Family of Grave Diggers,eXceed - Gun Bullet Children,eXceed 2nd - Vampire REX,iPi Mocap Studio 2,planetarian ~the reverie of a little planet~,sZone-Online,the static speaks my name,ä¸æå®æèï¼Ranger of the jungleï¼
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),1.000000,-0.021787,0.045084,-0.049587,0.999980,-0.001374,-0.000383,-0.000383,0.519988,0.515949,...,-0.000383,-0.767237,0.999980,-0.024083,-0.008292,0.999980,0.753556,0.046402,0.345472,0.999980
10000000,-0.021787,1.000000,-0.003502,-0.002640,-0.021770,-0.001343,-0.001317,-0.001317,-0.013168,-0.013881,...,-0.001317,-0.022396,-0.021770,-0.004415,-0.004760,-0.021770,-0.017296,-0.003442,-0.009806,-0.021770
12 Labours of Hercules IV: Mother Nature,0.045084,-0.003502,1.000000,-0.006196,0.045104,-0.001524,-0.001428,-0.001428,0.021487,0.020453,...,-0.001428,-0.076221,0.045104,-0.006343,-0.005632,0.045104,0.033041,-0.000496,0.013151,0.045104
1943 Megami Strike,-0.049587,-0.002640,-0.006196,1.000000,-0.049560,-0.002090,-0.002064,-0.002064,-0.028676,-0.029734,...,-0.002064,-0.023430,-0.049560,-0.006569,-0.007354,-0.049560,-0.038748,-0.006122,-0.020713,-0.049560
3 Suicides of Paul Hamilton,0.999980,-0.021770,0.045104,-0.049560,1.000000,-0.001364,-0.000373,-0.000373,0.520012,0.515979,...,-0.000373,-0.766964,1.000000,-0.024047,-0.008256,1.000000,0.753577,0.046421,0.345495,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
iPi Mocap Studio 2,0.999980,-0.021770,0.045104,-0.049560,1.000000,-0.001364,-0.000373,-0.000373,0.520012,0.515979,...,-0.000373,-0.766964,1.000000,-0.024047,-0.008256,1.000000,0.753577,0.046421,0.345495,1.000000
planetarian ~the reverie of a little planet~,0.753556,-0.017296,0.033041,-0.038748,0.753577,-0.001545,-0.000780,-0.000780,0.391174,0.387833,...,-0.000780,-0.592682,0.753577,-0.019980,-0.008081,0.753577,1.000000,0.034066,0.259498,0.753577
sZone-Online,0.046402,-0.003442,-0.000496,-0.006122,0.046421,-0.001474,-0.001379,-0.001379,0.022241,0.021231,...,-0.001379,-0.075770,0.046421,-0.006190,-0.005458,0.046421,0.034066,1.000000,0.953447,0.046421
the static speaks my name,0.345472,-0.009806,0.013151,-0.020713,0.345495,-0.001797,-0.001408,-0.001408,0.177878,0.175712,...,-0.001408,-0.302717,0.345495,-0.013075,-0.007620,0.345495,0.259498,0.953447,1.000000,0.345495


In [20]:
df_user_simil

user_id,12549,13lazer,1920410,19702316748,3021Daniel,3dster,666NeCrO,678rtv45,750805,76561197961840584,...,thor2424,thugnificent,turneddreamsintoanempire,ulramite,undietaker,uradumbtit,virtueavatar,whodafuqisthisguilao,wswanderer81,xfluttersx
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12549,1.000000,-0.000738,-0.000738,-0.000738,-0.000738,-0.000738,-0.000738,-0.000738,-0.002391,-0.000738,...,-0.000738,-0.001408,-0.000738,-0.000738,-0.000738,-0.000738,-0.000738,-0.000738,-0.000738,-0.001169
13lazer,-0.000738,1.000000,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.001159
1920410,-0.000738,-0.000732,1.000000,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.001159
19702316748,-0.000738,-0.000732,-0.000732,1.000000,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.001159
3021Daniel,-0.000738,-0.000732,-0.000732,-0.000732,1.000000,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.001159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uradumbtit,-0.000738,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,1.000000,-0.000732,-0.000732,-0.000732,-0.001159
virtueavatar,-0.000738,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,1.000000,-0.000732,-0.000732,-0.001159
whodafuqisthisguilao,-0.000738,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,1.000000,-0.000732,-0.001159
wswanderer81,-0.000738,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.002371,-0.000732,...,-0.000732,-0.001396,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,-0.000732,1.000000,-0.001159


Sistema de recomendación ITEM-ITEM:

In [21]:
# Los 5 juegos más recomendados por juego similar...
def recommended_games_item(game, df_item_simil):
    similar_games = {}
    count = 1
    for item in df_item_simil.sort_values(by=game, ascending=False).index[1:6]:
        similar_games[f"Recomendación {count}"] = item
        count += 1
    return similar_games

In [22]:
recommended_games_item("the static speaks my name", df_item_simil)

{'Recomendación 1': 'Among Ripples',
 'Recomendación 2': 'Rustbucket Rumble',
 'Recomendación 3': 'Istrolid',
 'Recomendación 4': 'Curse of Mermos',
 'Recomendación 5': 'Red Crucible: Firestorm'}

Sistema de recomendación USER-ITEM:

In [23]:
# Los 5 juegos más recomendados similares por usuario...
def similar_user_recs(user):
    
    # Se verifica si el usuario está presente en las columnas de piv_table_norm
    if user not in piv_table_norm.columns:
        return {'message': 'El Usuario no tiene datos disponibles {}'.format(user)}

    # Se obtienen los usuarios más similares 
    sim_users = df_user_simil.sort_values(by=user, ascending=False).index[1:11]

    best = []  
    most_common = {}  

    # Por cada usuario similar, encuentra el juego mejor calificado y lo agrega a la lista 'best'
    for i in sim_users:
        max_score = piv_table_norm.loc[:, i].max()
        best.append(piv_table_norm[piv_table_norm.loc[:, i] == max_score].index.tolist())

    # Se cuenta cuántas veces se recomienda cada juego
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1

    # Se ordenan los juegos de mayor recomendacion
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)

    return dict(sorted_list[:5])

In [24]:
similar_user_recs("virtueavatar")

{'3d Bridges': 1, '500 MILES': 1, '8-Bit Hordes': 1, 'APOX': 1, 'ASRECorp': 1}

In [25]:
# Guardar dataframes a parquet
unique_item_ids.to_parquet('unique_item_ids.parquet')
user_name_time.to_parquet('user_name_time.parquet')
piv_table_norm.to_parquet('piv_table_norm.parquet')
df_item_simil.to_parquet('df_item_simil.parquet')
df_user_simil.to_parquet('df_user_simil.parquet')