# Modelo de recomendacion

A partir de la libreria Surprice, usamos el algoritmo de KNnWhitMeans, que en su caja negra lo que hace es construir una matriz a partir de los ratings puestos por los usuario
sobre los productos. Calcula la distancia por coseno entre vectores(rating) para ponerle valor numerico a la relacion para poder recomendar en base a los productos.

In [None]:
! pip install scikit-surprise

In [1]:
import pandas as pd
from pyspark.sql import SparkSession 
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.functions import col, udf, desc

In [2]:
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from pyspark.sql.types import DoubleType, IntegerType, StringType, FloatType
from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering, KNNBaseline, KNNWithZScore, KNNWithMeans, KNNBasic, BaselineOnly, NormalPredictor



In [3]:
#Iniciamos sesion en Spark
spark = SparkSession.builder.appName('sent').getOrCreate()

In [4]:
#leemos el dataset
dfspark = spark.read.json('../common/final')

In [19]:
dfspark.show(2)

+-------+----------+------------+------+----------+------+--------------------+----------+-------------+---------+---------------+
|calidad|categories|facilidadUso|precio| productId|rating|             related|reviewTime|   reviewerId|sentiment|          title|
+-------+----------+------------+------+----------+------+--------------------+----------+-------------+---------+---------------+
|      0|     Books|           1|     0|0002216973|     5|, 0812823354, 006...|2012-09-03|AESMLAZX4PI6L|        5|Red Adam's Lady|
|      0|     Books|           0|     0|0002216973|     5|, 0812823354, 006...|2009-06-18|AMVV8VYDTLA78|        5|Red Adam's Lady|
+-------+----------+------------+------+----------+------+--------------------+----------+-------------+---------+---------------+
only showing top 2 rows



In [5]:
#Definimos una funcion que va a servir para crear un columna nueva
def prom_rating(val, val1):
    ''' Calcula el valor pormedio entre dos valores '''
    return (val + val1)/2   

In [6]:
col_new = udf(prom_rating)

In [7]:
#Creamos una columna nueva que promedia el ranking dado por el usuario y el establecido por el analizis de testo
dfspark = dfspark.withColumn("average_ranq", col_new(col("sentiment"), col('rating')))

In [8]:
#Lista de los productos con mas reviews
mejores_productos_más_calificados = dfspark.groupBy('categories','productId').count().sort(F.col("count").desc()).limit(7000)


In [63]:
mejores_productos_más_calificados.groupBy('categories').count().sort(F.col('count').desc()).show(30)

+--------------------+-----+
|          categories|count|
+--------------------+-----+
|               Books| 1409|
|         Electronics|  965|
|      Home & Kitchen|  483|
|         Movies & TV|  328|
|Cell Phones & Acc...|  318|
|Health & Personal...|  294|
|            Clothing|  158|
|   Sports & Outdoors|  158|
|        Pet Supplies|  149|
|                Baby|  123|
|     Office Products|  100|
|Tools & Home Impr...|  100|
|              Beauty|   99|
|Grocery & Gourmet...|   83|
|        Toys & Games|   62|
|               Patio|   45|
|         CDs & Vinyl|   41|
|          Automotive|   32|
|                Arts|   28|
|     Digital Content|   12|
|Industrial & Scie...|   11|
|               Other|    2|
+--------------------+-----+



In [9]:
productos = mejores_productos_más_calificados.select('productId').rdd.flatMap(lambda x: x).collect()

In [10]:
#Seleccionamos los campos a utilizar
dfspark1 = dfspark.select('productId', 'reviewerId', 'average_ranq')

In [11]:
#Filtramos el dataframe con los productos con mas reviews
dfsparkFiltro = dfspark1.filter((dfspark.productId).isin(productos))

In [68]:
len(productos)

10000

In [12]:
#Cambiamos los nombres de los campos para que el modelo los pueda procesar
modelo = dfsparkFiltro.withColumnRenamed('reviewerId', 'user').withColumnRenamed('productId','item').withColumnRenamed('average_ranq', 'rating')

In [13]:
#Cambiamos el tipo de dato de la columna 'rating'
modelo = modelo.withColumn('rating', F.col('rating').cast(FloatType()))

In [14]:
#Hacemos dos listas
users = modelo.select('user').distinct()
items = modelo.select('item').distinct()
users = users.select('user').rdd.flatMap(lambda x: x).collect()
items = items.select('item').rdd.flatMap(lambda x: x).collect()


In [17]:
len(items)

10000

In [18]:
len(users)

2004426

In [15]:
#Fortalecemos la conexion de PySpark con Pandas
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [16]:
#Pasamos el dataframe a Pandas
df_pd = modelo.toPandas()

In [21]:
df_pd

Unnamed: 0,item,user,rating
0,0006476155,A3DN4XF7RS53A,5.0
1,0006476155,ANYRXHDVOVS17,5.0
2,0006476155,AT2J7H5TRZM8Z,3.0
3,0006476155,A3A1E05J5PXADW,4.5
4,0006476155,ACCDXTBXVEC6S,5.0
...,...,...,...
5414484,B00FJJ38HY,A19YUII8E4HLTD,5.0
5414485,B00FJJ38HY,A6RX52LSCQ6HR,4.5
5414486,B00FJJ38HY,AKPABVEJHJW1R,5.0
5414487,B00FJJ38HY,A3R0YYKNN0ZD61,3.5


In [17]:
#Instanciamos el modelo
sim_options = {"name": "cosine", "user_based": False,}

algoritmo = KNNWithMeans(sim_options=sim_options)

In [18]:
reader = Reader(rating_scale=(1, 5))

In [19]:
data_n = Dataset.load_from_df(df_pd[["user", "item", "rating"]], reader)

In [20]:
#Definimos la matriz de entrenamiento
trainingSet = data_n.build_full_trainset()

In [21]:
algoritmo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f246c589450>

In [34]:
user = 'A3DN4XF7RS53A'

In [43]:
#El modelo devuelvo los 5 productos mas recomendados para un usuario x
def recomsnd_user(user):
    recomend = []    
    for item in items:
        p1 = algoritmo.predict(user, item[:50])
        if p1.est > 1:
            p2 = [p1.est , p1.iid]
            recomend.append(p2)
    recomend.sort()
    return recomend[-10:]

In [44]:
def item_id_to_name(prod_id):
    name = dfspark.filter((dfspark.productId)==prod_id).collect()[0][10]    
    return name

In [36]:
def display(recomend): 
    con = 0
    while con <= 4:
        calification = recomend[-10:][con][0]
        prod_id = recomend[-10:][con][1]
        print(f'Para el usuario {user} la recomendación número {con+1} : {item_id_to_name(prod_id)}')
        con += 1

In [None]:
recomend = recomsnd_user(user)
display(recomend)

Para el usuario A3DN4XF7RS53A la recomendacion número 1 : Nexus 5 Case, Spigen Slim Armor Case for Nexus 5 - Retail Packaging - Smooth Black (SGP10569)
Para el usuario A3DN4XF7RS53A la recomendacion número 2 : OMOTON Amazon Kindle Paperwhite Case Cover -- The Thinnest and Lightest PU leather Case Cover for Kindle Paperwhite (Both 2012 and 2013 versions with 6&quot; Display and Built-in Light), Black
Para el usuario A3DN4XF7RS53A la recomendacion número 3 : TUDIA Ultra Slim Melody Series TPU Protective Case for LG (LG G2 (T-Mobile &amp; Sprint), Purple)
Para el usuario A3DN4XF7RS53A la recomendacion número 4 : GoPro HERO3+: Silver Edition
Para el usuario A3DN4XF7RS53A la recomendacion número 5 : Motorola Moto G - Global GSM - Unlocked - 8GB (Black)


In [None]:
prod_id = '0006476155'
item_id_to_name(prod_id)

'Along Came a Spider'

In [None]:
dfspark.filter(dfspark.reviewerId == user).select('title').show(10, truncate=False)

+---------------------------------------+
|title                                  |
+---------------------------------------+
|Along Came a Spider                    |
|Blood Secrets (The Vampire Legacy, #1) |
|News from the Edge: Vampires of Vermont|
|The Lord of the Rings                  |
|Dawn Of The Vampire                    |
|Bag of Bones                           |
|Extreme Measures                       |
|Single White Vampire Seeks Same        |
|Johnny Tremain                         |
|Superstition                           |
+---------------------------------------+
only showing top 10 rows

