# Modelo de recomendacion

Repasemos los objetivos del modelo:

<ul><li>def recomendacion_juego( id de producto ): Ingresando el id de producto, deberíamos recibir una lista con 5 juegos recomendados similares al ingresado.</li>
<li>def recomendacion_usuario( id de usuario ): Ingresando el id de un usuario, deberíamos recibir una lista con 5 juegos recomendados para dicho usuario.</li></ul>

Para ello, vamos a usar un enfoque de recomendacion basada en contenido:


In [14]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import Modulo as md

Intentaremos generar un sistema de recomendacion que utilice dos variables de trabajo:

<ul><li>Similitud de puntajes de usuarios.</li>
<li>Similitud de titulos.</li></ul>

In [15]:
df_puntaje = pd.read_parquet('Datasets/parquet/Recomendacion/recomendacion.parquet')
df_puntaje.head()

Unnamed: 0,user_id,item_id,puntaje,item_name
0,76561197970982479,1250,4,Killing Floor
1,76561197970982479,22200,6,Zeno Clash
2,76561197970982479,43110,4,Metro 2033
3,js41637,251610,4,Barbie™ Dreamhouse Party™
4,js41637,227300,4,Euro Truck Simulator 2


In [16]:
unique_titles = df_puntaje['item_name'].sort_values().unique()
tfidf_titles = TfidfVectorizer(stop_words='english')
tfidf_matrix_titles = tfidf_titles.fit_transform(unique_titles)
item_titles_sim = cosine_similarity(tfidf_matrix_titles)
item_titles_sim_df = pd.DataFrame(item_titles_sim, index=unique_titles, columns=unique_titles)
item_titles_sim_df

Unnamed: 0,! That Bastard Is Trying To Steal Our Gold !,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,12 is Better Than 6,123 Slaughter Me Street,140,...,klocki,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theBlu,theHunter,theHunter: Primal
! That Bastard Is Trying To Steal Our Gold !,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
0RBITALIS,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
10000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
100% Orange Juice,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.220663,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1001 Spikes,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000
theBlu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.738334


In [17]:
user_ratings_matrix = df_puntaje.pivot(index='user_id', columns='item_name', values='puntaje')
user_ratings_matrix = user_ratings_matrix.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
user_ratings_matrix.fillna(0, inplace=True)
user_ratings_matrix = user_ratings_matrix.T
user_ratings_matrix = user_ratings_matrix.loc[:, (user_ratings_matrix != 0).any(axis=0)]
user_ratings_matrix

user_id,-2SV-vuLB-Kg,-GM-Dragon,-PRoSlayeR-,-SEVEN-,-_PussyDestroyer_-,00000000000000000001227,00True,011111135489484797,01189958889189157253,022899,...,zombieskiler6969,zomgieee,zoozles,zp3413,zrustz16,zucchin1,zuilde,zuzuga2003,zv_odd,zyr0n1c
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! That Bastard Is Trying To Steal Our Gold !,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
100% Orange Juice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1001 Spikes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
theBlu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.466667,0.0,0.0


In [18]:
piv_sparse = sp.sparse.csr_matrix(user_ratings_matrix.values)
piv_sparse

<3319x8298 sparse matrix of type '<class 'numpy.float64'>'
	with 32098 stored elements in Compressed Sparse Row format>

In [19]:
item_rating_similarity = cosine_similarity(piv_sparse)
user_rating_similarity = cosine_similarity(piv_sparse.T)

In [20]:
#item similarity dataframe
item_rating_sim_df = pd.DataFrame(item_rating_similarity, index = user_ratings_matrix.index, columns = user_ratings_matrix.index)
#user similarity dataframe
user_rating_sim_df = pd.DataFrame(user_rating_similarity, index = user_ratings_matrix.columns, columns = user_ratings_matrix.columns)

In [21]:
item_rating_sim_df

item_name,! That Bastard Is Trying To Steal Our Gold !,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,12 is Better Than 6,123 Slaughter Me Street,140,...,klocki,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theBlu,theHunter,theHunter: Primal
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! That Bastard Is Trying To Steal Our Gold !,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100% Orange Juice,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Spikes,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
theBlu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
user_rating_sim_df

user_id,-2SV-vuLB-Kg,-GM-Dragon,-PRoSlayeR-,-SEVEN-,-_PussyDestroyer_-,00000000000000000001227,00True,011111135489484797,01189958889189157253,022899,...,zombieskiler6969,zomgieee,zoozles,zp3413,zrustz16,zucchin1,zuilde,zuzuga2003,zv_odd,zyr0n1c
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-2SV-vuLB-Kg,1.000000,0.0,0.0,-0.062257,0.0,-6.002450e-01,-0.612372,0.0,0.000000,0.0,...,0.000000,0.000000,0.433013,0.0,-0.516223,0.0,0.0,0.456832,0.0,-0.063844
-GM-Dragon,0.000000,1.0,0.0,0.000000,0.0,0.000000e+00,0.000000,0.0,0.268028,0.0,...,0.000000,0.000000,0.000000,0.0,0.244023,0.0,0.0,0.000000,0.0,0.000000
-PRoSlayeR-,0.000000,0.0,1.0,0.000000,0.0,0.000000e+00,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
-SEVEN-,-0.062257,0.0,0.0,1.000000,0.0,4.982617e-02,0.050833,0.0,0.217994,0.0,...,0.000000,0.000000,-0.035944,0.0,0.284173,0.0,0.0,-0.246490,0.0,0.005300
-_PussyDestroyer_-,0.000000,0.0,0.0,0.000000,1.0,0.000000e+00,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zucchin1,0.000000,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,1.0,0.0,0.000000,0.0,0.000000
zuilde,0.000000,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000,0.0,0.000000,0.0,...,-0.049507,0.000000,0.000000,0.0,-0.006590,0.0,1.0,0.000000,0.0,0.000000
zuzuga2003,0.456832,0.0,0.0,-0.246490,0.0,-3.525573e-01,-0.373002,0.0,-0.049988,0.0,...,0.000000,0.000000,0.263752,0.0,-0.314436,0.0,0.0,1.000000,0.0,-0.106942
zv_odd,0.000000,0.0,0.0,0.000000,0.0,0.000000e+00,0.000000,0.0,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.000000,1.0,0.000000


In [23]:
item_sim_df = item_rating_sim_df.mul(0.5).add(item_titles_sim_df.mul(0.5))
item_sim_df

item_name,! That Bastard Is Trying To Steal Our Gold !,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,12 is Better Than 6,123 Slaughter Me Street,140,...,klocki,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theBlu,theHunter,theHunter: Primal
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! That Bastard Is Trying To Steal Our Gold !,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
0RBITALIS,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
10000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
100% Orange Juice,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.110331,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
1001 Spikes,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.000000,0.000000
theBlu,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.000000,0.000000
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.369167


In [24]:
diccionario_juegos = df_puntaje[['item_id','item_name']].drop_duplicates()
diccionario_juegos['item_id'] = diccionario_juegos['item_id'].apply(int)
md.tipo_datos(diccionario_juegos)

Unnamed: 0,Columna,Tipo_datos,%_nulos,Nulos
0,item_id,[int],0.0,0
1,item_name,[str],0.0,0


## Funcion recomendacion juego - item input

In [25]:
def recomendacion_juego(item_id):
    try:
        item_name = diccionario_juegos[diccionario_juegos.item_id==item_id].item_name.values[0]
    except:
        return print(f'Sin informacion disponible para el juego {item_id}')
    count = 1
    print(f'Los Juegos similares a ID juego {item_id}, nombre juego: {item_name} son:\n')
    for item in item_sim_df.sort_values(by = item_name, ascending = False).index[1:6]:
        print(f'No. {count}: ID {diccionario_juegos[diccionario_juegos.item_name==item].item_id.values[0]}, nombre {item}')
        count +=1

In [26]:
recomendacion_juego(1250)

Los Juegos similares a ID juego 1250, nombre juego: Killing Floor son:

No. 1: ID 232090, nombre Killing Floor 2
No. 2: ID 351570, nombre Killing Floor: Uncovered
No. 3: ID 35420, nombre Killing Floor Mod: Defence Alliance 2
No. 4: ID 521430, nombre Super Switch
No. 5: ID 13250, nombre Unreal Gold


## Funcion recomendacion juego - user input

In [27]:
def recomendacion_usuario(user_id):
    similarity_constant = 0.7
    if user_id not in user_rating_sim_df.columns:
        return(f'Sin informacion disponible para ese usuario {user_id}')
    
    sim_users = user_rating_sim_df[user_rating_sim_df>similarity_constant].sort_values(by=user_id, ascending=False)

    best = []
    most_common = {}

    for i in sim_users:
        user_scores = user_ratings_matrix.loc[:, i]
        max_scores = user_scores[user_scores>similarity_constant]
        for j in max_scores:
            best.append(user_ratings_matrix[user_ratings_matrix.loc[:, i]==j].index.tolist())
        
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    
    print(f'Los juegos mas recomendados segun usuarios relacionados para el usuario {user_id} son:\n')
    count = 1
    for i in sorted_list[:5]:
        print(f'No. {count}: {i[0]}')
        count +=1

In [28]:
recomendacion_usuario('-2SV-vuLB-Kg')

Los juegos mas recomendados segun usuarios relacionados para el usuario -2SV-vuLB-Kg son:

No. 1: Counter-Strike: Global Offensive
No. 2: Garry's Mod
No. 3: Left 4 Dead 2
No. 4: Portal 2
No. 5: Borderlands 2


## Guardado de bases de datos

In [51]:
user_rating_sim_df.to_parquet('Datasets/parquet/Recomendacion/Final/user_rating_sim_df.parquet')
user_ratings_matrix.to_parquet('Datasets/parquet/Recomendacion/Final/user_ratings_matrix.parquet')
diccionario_juegos.to_parquet('Datasets/parquet/Recomendacion/Final/diccionario_juegos.parquet',index=False)
item_sim_df.to_parquet('Datasets/parquet/Recomendacion/Final/item_sim_df.parquet')