In [35]:
# Librerías:
import pandas as pd
import numpy as np

import sklearn as sk
from sklearn.metrics.pairwise import cosine_similarity
import operator
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp

from sklearn.feature_extraction.text import CountVectorizer

In [36]:
# Abrir archivos para ML:
df_ML01 = pd.read_csv("df_unido_ML01.csv", encoding = "latin-1")
df_ML02 = pd.read_csv("df_unido_ML02.csv", encoding = "latin-1")

ML: coseno de similitud, sistema de recomendación.
En un sistema de recomendación, el cálculo de similitud del coseno es una técnica común para determinar la similitud entre elementos (ya sea entre usuarios o entre ítems).

Filtrado Colaborativo:
El filtrado colaborativo se basa en la idea de que los usuarios con preferencias similares tienden a gustar de los mismos ítems.

Hay dos variantes principales:
User-based: Identifica usuarios similares y recomienda ítems en función de las calificaciones dadas por otros usuarios similares.
Item-based: Calcula la similitud entre ítems y recomienda ítems similares a los que un usuario ya ha valorado positivamente

En resumen, item-item collaborative filtering es una buena opción si se desean recomendaciones específicas para cada ítem y no se preocupa tanto del cold start. Sin embargo, ambos enfoques tienen sus ventajas y desventajas, y la elección depende de las necesidades específicas y la cantidad de datos disponibles. Si se tiene una gran cantidad de datos y se quieren personalizar las recomendaciones a nivel de ítem, el cálculo de similitud del coseno es una excelente opción. 

In [37]:
# ML:
# Ver data Tenemos 2965 datos  item_name, user_id y recommend
df_ML01

Unnamed: 0,item_name,user_id,recommend
0,Counter-Strike: Global Offensive,76561198092244787,True
1,PAYDAY 2,76561197970982479,True
2,Theatre of War 2: Africa 1943,acorn13,False
3,Project Zomboid,diabeticfuck,True
4,The Bureau: XCOM Declassified,maxpaynepillkiller,False
...,...,...,...
2960,Duke Nukem 3D: Megaton Edition,LightfootSword7,True
2961,LYNE,Magjiikal,True
2962,Prison Architect,benjamin27,True
2963,Tomb Raider (VI): The Angel of Darkness,76561198083375450,True


In [38]:
# Obtenemos una tabla con registros únicos de cada juego:  
unique_item_name = df_ML01.drop_duplicates(subset="item_name")

In [39]:
# Borramos duplicados, si ya nos entregan la información necesaria:
df_sin_duplicados = df_ML01.drop_duplicates(subset=["user_id", "item_name"])

In [40]:
if df_ML01.index.is_unique:
    print("No hay filas duplicadas en el DataFrame.")
else:
    print("El DataFrame puede contener filas duplicadas.")

No hay filas duplicadas en el DataFrame.


In [41]:
# Primero se comienza creando la tabla pivote user versus item_name
piv_table = df_sin_duplicados.pivot(index=["user_id"], columns=["item_name"], values="recommend").fillna(0)

In [42]:
# Verificar información dataframe:
df_sin_duplicados.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2965 entries, 0 to 2964
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   item_name  2965 non-null   object
 1   user_id    2965 non-null   object
 2   recommend  2965 non-null   bool  
dtypes: bool(1), object(2)
memory usage: 49.4+ KB


In [43]:
# Crear la tabla pivote user_id, item_name y recommend:
piv_table = df_sin_duplicados.pivot_table(index="user_id", columns="item_name", values="recommend", fill_value=0)

In [44]:
# Se normaliza la tabla pivote con la fórmula de normalización
piv_table_norm = piv_table.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

In [45]:
# Se transpone la matriz, para tener juegos en filas
piv_table_norm.fillna(0, inplace=True)
piv_table_norm = piv_table_norm.T
# Se manteniene solo las columnas que tienen al menos un valor distinto de cero
piv_table_norm = piv_table_norm.loc[:, (piv_table_norm != 0).any(axis=0)]  # Tabla normal

In [46]:
piv_table_norm

user_id,-AnimeIsMyThing-,-GM-Dragon,1234865654,12549,1337lolroflmao,19702316748,2768820078,2d4nk4m3,2xDelorean,3456457568,...,uradumbtit,vault_brothers,wantmahbody,washington_,whodafuqisthisguilao,wirlom,xfluttersx,xoFushiox,yookobz,zomboy151
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000000,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
100% Orange Juice,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
1001 Spikes,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,0.984559,-0.002206,-0.008824,-0.002941
1953: NATO vs Warsaw Pact,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
1993 Space Machine,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ibb & obb,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
sZone-Online,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
the static speaks my name,-0.005882,-0.005147,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,0.990441,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941
theHunter,-0.005882,0.994853,-0.002206,-0.002206,-0.005882,-0.003676,-0.003676,-0.009559,-0.000735,-0.003676,...,-0.005882,-0.001471,-0.002206,-0.003676,-0.002206,-0.001471,-0.015441,-0.002206,-0.008824,-0.002941


In [47]:
# Reemplazar los valores infinitos con un valor específico
piv_table_norm.replace([np.inf, -np.inf], np.nan, inplace=True)
piv_table_norm.fillna(0, inplace=True)

In [48]:
# Se crea una tabla sparse que solo guarda valores distintos de cero y permite optimizar 
piv_sparse = sp.csr_matrix(piv_table_norm.values)
piv_sparse

<1360x493 sparse matrix of type '<class 'numpy.float64'>'
	with 670480 stored elements in Compressed Sparse Row format>

In [49]:
# Coseno de similitud aplicado a tablas de Item (nombre de juego) y User (id de usuario)
item_simil = cosine_similarity(piv_sparse)  # Item
user_simil = cosine_similarity(piv_sparse.T) # User

In [50]:
# Se ordenan en df
df_item_simil = pd.DataFrame(item_simil, index = piv_table_norm.index, columns = piv_table_norm.index) # Item
df_user_simil = pd.DataFrame(user_simil, index = piv_table_norm.columns, columns = piv_table_norm.columns) # User

In [51]:
# Ver matriz de item:
df_item_simil.head(2)

item_name,"10,000,000",100% Orange Juice,1001 Spikes,1953: NATO vs Warsaw Pact,1993 Space Machine,3 Stars of Destiny,30 IMPOSSIBLE LEVELS,3SwitcheD,404Sight,7 Days to Die,...,Zombies Monsters Robots,eden*,how do you Do It?,iBomber Defense Pacific,iO,ibb & obb,sZone-Online,the static speaks my name,theHunter,Ã¤Â¾Â Ã¥Â®Â¢Ã©Â£ÂÃ¤ÂºÂÃ¤Â¼Â (Tale of Wuxia)
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000000,1.0,0.001574,-0.009543,-0.024762,-0.016334,-0.016334,-0.020147,-0.002843,-0.004102,0.000595,...,-0.00201,-0.00581,-0.00581,0.002307,-0.020147,-0.00581,0.046021,-0.003583,-0.001215,0.002307
100% Orange Juice,0.001574,1.0,-0.002843,-0.01786,-0.009545,-0.009545,-0.013307,0.003768,0.000593,0.005244,...,0.002664,0.000841,0.000841,0.00885,-0.013307,0.000841,0.096239,0.003038,0.002597,0.00885


In [52]:
# Ver matriz de user:
df_user_simil.head(2)

user_id,-AnimeIsMyThing-,-GM-Dragon,1234865654,12549,1337lolroflmao,19702316748,2768820078,2d4nk4m3,2xDelorean,3456457568,...,uradumbtit,vault_brothers,wantmahbody,washington_,whodafuqisthisguilao,wirlom,xfluttersx,xoFushiox,yookobz,zomboy151
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-AnimeIsMyThing-,1.0,-0.005533,-0.003617,-0.003617,-0.005917,-0.004673,-0.004673,0.091264,-0.002087,-0.004673,...,-0.005917,-0.002952,-0.003617,-0.004673,-0.003617,-0.002952,-0.009633,-0.003617,-0.007258,-0.004178
-GM-Dragon,-0.005533,1.0,-0.003382,-0.003382,-0.005533,-0.004369,-0.004369,-0.007066,-0.001951,0.165411,...,-0.005533,-0.00276,0.215641,-0.004369,-0.003382,-0.00276,0.07433,-0.003382,-0.006786,-0.003907


RECOMENDACIÓN USER-ITEM:

Si es un sistema de recomendación user-item:

def recomendacion_usuario( id de usuario ): Ingresando el id de un usuario, deberíamos recibir una lista con 5 juegos recomendados para dicho usuario.

In [53]:
# Los 5 juegos más recomendados similares recomendados por usuario...
def similar_user_recs(user):
    
    # Se verifica si el usuario está presente en las columnas de piv_table_norm
    if user not in df_user_simil.columns:
        return {'message': 'El Usuario no tiene datos disponibles {}'.format(user)}

    # Se obtienen los usuarios más similares 
    sim_users = df_user_simil.sort_values(by=user, ascending=False).index[1:11]

    best = []  
    most_common = {}  

    # Por cada usuario similar, encuentra el juego mejor calificado y lo agrega a la lista 'best'
    for i in sim_users:
        max_score = piv_table_norm.loc[:, i].max()
        best.append(piv_table_norm[piv_table_norm.loc[:, i] == max_score].index.tolist())

    # Se cuenta cuántas veces se recomienda cada juego
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1

    # Se ordenan los juegos de mayor recomendacion
    #sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    sorted_list = sorted(most_common.items(), key=lambda x: x[1], reverse=True)

    return dict(sorted_list[:5])

In [54]:
similar_user_recs("-GM-Dragon")

{'Robocraft': 7,
 'The Mighty Quest For Epic Loot': 1,
 'theHunter': 1,
 'Counter-Strike: Global Offensive': 1,
 'March of War': 1}

In [55]:
similar_user_recs("2d4nk4m3")

{'Unturned': 4,
 'Left 4 Dead 2 Beta': 3,
 "Garry's Mod": 1,
 'HAWKEN': 1,
 'Tropico 4': 1}

RECOMENDACIÓN ITEM-ITEM:
Si es un sistema de recomendación item-item:

def recomendacion_juego( id de producto ): Ingresando el id de producto, deberíamos recibir una lista con 5 juegos recomendados similares al ingresado.

In [56]:
# Crear una tabla pivote con item_id e user_id, con las recomendaciones:
pivot_table_id = df_ML02.pivot_table(index="item_id", columns="user_id", values="recommend", fill_value=0)

In [57]:
# Calcular un coseno de similitud:
cosine_sim = cosine_similarity(pivot_table_id)

In [58]:
# Convertir la matriz de coseno de similitudto a DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim, index=pivot_table_id.index, columns=pivot_table_id.index)

In [59]:
# La función para obtener el top N de items similares:
def get_similar_items(item_id, top_n=5):
    similar_items = cosine_sim_df[item_id].sort_values(ascending=False).head(top_n + 1).iloc[1:]
    return similar_items

In [60]:
# Ejemplo: Los 5 juegos similares para item_id = 10
similar_items = get_similar_items(10)

In [61]:
# 5 juegos similares para el id de juego 10: Juego con item_id 100, 
# Juego con item_id 80, Juego con item_id 40, Juego con item_id 240, Juego con item_id 60
similar_items 

item_id
242760    0.577350
22300     0.408248
247430    0.408248
70000     0.408248
293180    0.408248
Name: 10, dtype: float64

In [62]:
# Guardar dataframes data_dev a parquet
piv_table_norm.to_parquet("piv_table_norm.parquet")
df_user_simil.to_parquet("df_user_simil.parquet")
cosine_sim_df.to_parquet("cosine_sim_df.parquet")