In [2]:
import pandas as pd
import numpy as np
import datetime 
import time

import pyspark.sql.functions as f
from pyspark.sql.types import IntegerType
from pyspark.sql import Window

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext()
spark = SparkSession(sc)

from pyspark.ml.recommendation import ALS
# from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
import matplotlib.pyplot as plt 

In [3]:
events=spark.read.csv('data/events.csv',  header= True, inferSchema = True)
events = events.withColumn('datetime', f.from_unixtime((events.timestamp.cast('bigint')/1000)).cast('timestamp'))

In [6]:
events.where(f.col("visitorid")==102019).show()

+-------------+---------+-----------+------+-------------+-------------------+
|    timestamp|visitorid|      event|itemid|transactionid|           datetime|
+-------------+---------+-----------+------+-------------+-------------------+
|1433176736375|   102019|transaction|150318|        13556|2015-06-01 12:38:56|
|1433175894837|   102019|       view| 49521|         null|2015-06-01 12:24:54|
|1433176042269|   102019|       view| 49521|         null|2015-06-01 12:27:22|
|1433175812596|   102019|       view|150318|         null|2015-06-01 12:23:32|
|1433175801314|   102019|  addtocart| 49521|         null|2015-06-01 12:23:21|
|1433175871497|   102019|       view| 49521|         null|2015-06-01 12:24:31|
|1433175714335|   102019|       view| 49521|         null|2015-06-01 12:21:54|
|1433175945872|   102019|       view|150318|         null|2015-06-01 12:25:45|
|1433176736422|   102019|transaction| 49521|        13556|2015-06-01 12:38:56|
+-------------+---------+-----------+------+--------

In [4]:
#preprocessing data
df = events.where(f.col('event')=='transaction').groupby("visitorid",'itemid').agg(f.count("event"))

In [6]:
df = df.toDF('visitorid','itemid','rating')

In [7]:
df.show()

+---------+------+------+
|visitorid|itemid|rating|
+---------+------+------+
|   723092|  8641|     2|
|   851973| 40582|     1|
|   207917|449489|     1|
|   561611| 15822|     3|
|   503857|439194|     1|
|   621474| 92361|     1|
|  1119553|420079|     1|
|   952388| 66405|     1|
|   815215|375169|     1|
|   822310|272988|     1|
|   198153|312728|     1|
|   371604|315571|     1|
|   430190|447215|     1|
|   557700|111530|     2|
|   682031|435624|     1|
|  1375192| 17478|     1|
|  1385901|312728|     1|
|   998808|423269|     1|
|  1150086|396265|     1|
|   969827|427128|     1|
+---------+------+------+
only showing top 20 rows



In [24]:
df.agg(f.countDistinct("visitorid")).show()

+----------------+
|count(visitorid)|
+----------------+
|           11719|
+----------------+



In [25]:
df.agg(f.countDistinct("itemid")).show()

+-------------+
|count(itemid)|
+-------------+
|        12025|
+-------------+



In [8]:
# split data to training set and testing set
X_train,X_test = df.randomSplit([0.8, 0.2], seed = 42)

In [9]:
#train using ALS
als = ALS(maxIter=10, regParam=0.01, rank=25, userCol="visitorid", itemCol="itemid", ratingCol="rating", 
          coldStartStrategy="drop", nonnegative=True,implicitPrefs=False)
model = als.fit(X_train)

In [23]:
#evaluate the model on the testing set
Z = X_test.agg({"rating":"max"}).collect()[0][0] - X_test.agg({"rating":"min"}).collect()[0][0]
def predict(model, toPredict):
    return model.transform(toPredict).withColumn('prediction', f.round('prediction'))
predictions = predict(model, X_test)

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Normalized root-mean-square error on testing set = " + str(rmse/Z))

Normalized root-mean-square error on testing set = 0.18937995473443514


In [12]:
predictions.show()

+---------+------+------+----------+
|visitorid|itemid|rating|prediction|
+---------+------+------+----------+
|  1150086|237019|     1|       1.0|
|   272883|237019|     1|       0.0|
|  1078486|446934|     1|       0.0|
|  1293358| 78494|     1|       1.0|
|   841939|122604|     1|       1.0|
|   861299|122604|     1|       1.0|
|   132676|271823|     1|       1.0|
|   723092|273747|     1|       0.0|
|   911093|277119|     1|       1.0|
|  1156525|290993|     1|       0.0|
|   995225| 45731|     1|       0.0|
|   404403| 85579|     1|       1.0|
|  1150086|187549|     1|       0.0|
|     6958|431417|     1|       1.0|
|  1032432| 13188|     1|       0.0|
|   227091| 51629|     1|       1.0|
|    94030|101718|     1|       1.0|
|   994820|110077|     1|       0.0|
|   737053|206317|     1|       0.0|
|   645525|293416|     1|       1.0|
+---------+------+------+----------+
only showing top 20 rows



In [13]:
def get_recs_for_visitor(recs):
    #recs should be for a specific visitor
    recs = recs.select("recommendations.itemid","recommendations.rating")
    items = recs.select("itemid").toPandas().iloc[0,0]
#     events = recs.select("rating").toPandas().iloc[0,0]
    event_matrix = pd.DataFrame(items,columns = ["itemid_recs"])
#     event_matrix["score"] = events
    event_matrix_ps = spark.createDataFrame(event_matrix)
    return event_matrix_ps

from collections import namedtuple

def visitor_rec(visitor):
    user_row = namedtuple('user_row','visitorid')
    data = [user_row(visitor)]
    return spark.createDataFrame(data)

def display_recs(visitor,model):
    visitor = visitor_rec(visitor)
    userSubsetRecs = model.recommendForUserSubset(visitor, 10)
    items_recs = get_recs_for_visitor(userSubsetRecs)
    return items_recs.show()
    

In [14]:
# Generate top 10 item recommendations for a specific visitor
display_recs(102019,model)

+-----------+
|itemid_recs|
+-----------+
|     218612|
|      81944|
|      76196|
|     369661|
|     119736|
|      22161|
|       7236|
|     331244|
|     402462|
|        147|
+-----------+



show properties of items recommended and bought

In [15]:
items=pd.read_csv('data/item_properties_part1.csv')
items1=pd.read_csv('data/item_properties_part2.csv')
items=pd.concat([items1,items])
times=[]
for i in items['timestamp']:
    times.append(datetime.datetime.fromtimestamp(i//1000.0))
items['datatime'] = times

def item_property(datetime, item_set, items):
    temp = items[(items.itemid.isin(item_set)) & (items.timestamp<datetime)]
    temp = temp[temp.timestamp==temp.timestamp.max()]
    return temp
    

In [16]:
pref_item_true = item_property(1433176736375,[150318,49521],items)
pref_item_true

Unnamed: 0,timestamp,itemid,property,value,datatime
1965555,1433041200000,150318,available,1,2015-05-30 23:00:00
2094576,1433041200000,49521,888,222207 927133 1307549,2015-05-30 23:00:00
3198360,1433041200000,49521,365,1116693,2015-05-30 23:00:00
4861168,1433041200000,49521,792,1116693,2015-05-30 23:00:00
5912026,1433041200000,150318,917,331643,2015-05-30 23:00:00
7302564,1433041200000,49521,364,490489,2015-05-30 23:00:00
8676901,1433041200000,150318,888,1265514 1224110 734600 629205,2015-05-30 23:00:00
730641,1433041200000,150318,categoryid,1236,2015-05-30 23:00:00
936625,1433041200000,150318,790,n54120.000,2015-05-30 23:00:00
1283131,1433041200000,49521,1058,n48.000,2015-05-30 23:00:00


In [17]:
recs_item = item_property(1433176736375,[218612],items)
recs_item 

Unnamed: 0,timestamp,itemid,property,value,datatime
1256893,1433041200000,218612,1054,n180.000 639502 n192.000,2015-05-30 23:00:00
2016778,1433041200000,218612,758,n48.000 1187104,2015-05-30 23:00:00
2849028,1433041200000,218612,available,1,2015-05-30 23:00:00
4329432,1433041200000,218612,663,1297729 n156.000 606827,2015-05-30 23:00:00
6789711,1433041200000,218612,962,664227,2015-05-30 23:00:00
6945194,1433041200000,218612,602,663779,2015-05-30 23:00:00
7590722,1433041200000,218612,921,526589 39986 1257525,2015-05-30 23:00:00
9207165,1433041200000,218612,452,1055803 278045,2015-05-30 23:00:00
253026,1433041200000,218612,478,1282402,2015-05-30 23:00:00
2600657,1433041200000,218612,888,1093699 n9060.000 1055803 278045 1282402 n24.0...,2015-05-30 23:00:00


In [18]:
recs_item.property.unique()

array(['1054', '758', 'available', '663', '962', '602', '921', '452',
       '478', '888', '529', '202', '619', '790', '1066', '451'],
      dtype=object)

In [19]:
#common properties
set(pref_item_true.property).intersection(set(recs_item.property))

{'202', '790', '888', 'available'}

In [20]:
pref_item_true.property.unique()

array(['available', '888', '365', '792', '917', '364', 'categoryid',
       '790', '1058', '839', '202', '463'], dtype=object)