Collaborative Filtering

In [0]:
## 01 - Install Spark Packages
# Download the Spark 3.0.0 (Hadoop 2.7 build) tarball from the Apache archive and unpack it
# into the current working directory.
!wget -q https://archive.apache.org/dist/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
!tar xf spark-3.0.0-bin-hadoop2.7.tgz
# NOTE(review): findspark is installed here but never imported in the visible notebook;
# the next cell installs pyspark from pip instead — confirm whether this cell is still needed.
!pip install -q findspark

In [0]:
# Install a headless Java 8 runtime and the pyspark package.
!apt-get -y install openjdk-8-jre-headless
!pip install pyspark
# NOTE(review): wildcard imports hide where SparkSession / SparkContext come from;
# a later cell imports SparkSession and Row explicitly as well.
from pyspark.sql import *
from pyspark import *
# Get (or create) a session using the "local" master, and the SparkContext behind it.
spark = SparkSession.builder.master("local").getOrCreate()
sc = SparkContext.getOrCreate()

In [0]:
import sys
import pandas as pd
import seaborn as sns
from time import time
from pyspark.sql import Row
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit,CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline, PipelineModel

In [0]:
# Location of the JSON file this notebook reads its input from.
OUTPUT_JSON_PATH = "dbfs:/FileStore/shared_uploads/kkctong@connect.ust.hk/outputdata.json"

# Load the JSON file into a Spark DataFrame and preview the first rows.
json_reader = spark.read.format("json")
df_output = json_reader.load(OUTPUT_JSON_PATH)
df_output.show()

In [0]:
# Columns of df_output that the collaborative-filtering model does not use.
unused_columns = [
    "_corrupt_record", "cleaned_words", "label", "features", "indexedLabel",
    "prediction", "probability", "rawFeatures", "rawPrediction", "words",
    "reviewText", "reviewTime", "reviewerName", "unixReviewTime", "summary",
]
df = df_output.drop(*unused_columns)
# Keep item / rating / reviewer only: `asin` is exposed as `item`, `pre_label` as `rating`.
df = df.selectExpr("asin as item", "pre_label as rating", "reviewerID")
# Row count of the reduced frame (shown as the cell output).
df.count()

In [0]:
# Print summary statistics for the columns of the reduced frame.
df.describe().show()

In [0]:
# Encode the string `item` and `reviewerID` columns as numeric `<column>_index` columns.
# The ALS estimator below is pointed at these index columns as its user / item ids.
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Every column except the rating is an identifier to index. Iterating df.columns
# (instead of a set difference) keeps the stage and output-column order deterministic.
id_columns = [column for column in df.columns if column != 'rating']
indexer = [StringIndexer(inputCol=column, outputCol=column+"_index") for column in id_columns]
pipeline = Pipeline(stages=indexer)
transformed = pipeline.fit(df).transform(df)
transformed.show()

# The train/test split and ALS fit that used to follow here were an exact copy of the
# next cell ("Build ALS model"), which rebinds df_train, df_test and model before any of
# them is used; the duplicate only trained a model that was immediately discarded.

In [0]:
#Build ALS model
# Fixed seed so the split and the factor initialisation do not change on every run.
# NOTE(review): randomSplit is only repeatable if `transformed` is produced with the
# same partitioning/order each run — confirm for this data source.
SEED = 42

# 80% of the rows for training, 20% held out for evaluation.
(df_train, df_test) = transformed.randomSplit([0.8, 0.2], seed=SEED)

als = ALS(
    maxIter=5,
    regParam=0.09,
    rank=25,
    userCol="reviewerID_index",
    itemCol="item_index",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
    seed=SEED,
)
model = als.fit(df_train)

In [0]:
# Score the held-out rows with the fitted model.
predictions = model.transform(df_test)

# Root-mean-square error between the `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("RMSE=" + str(rmse))
predictions.show()

In [0]:
# Top-20 recommendations per user. Keep the DataFrame itself in `user_recs`:
# DataFrame.show() only prints and returns None, so chaining it into the assignment
# left `user_recs` bound to None.
user_recs = model.recommendForAllUsers(20)
# Print the first 10 users without truncating the recommendation lists.
user_recs.show(10, False)

In [0]:
import pandas as pd

# Top-10 recommendations per user, collected into pandas (one row per user,
# `recommendations` holding a list of entries).
recs=model.recommendForAllUsers(10).toPandas()

# Spread the list into one column per position, then melt back to one row per
# (user, recommendation) pair; users with fewer entries produce NaN, which is dropped.
nrecs=recs.recommendations.apply(pd.Series) \
            .merge(recs, right_index = True, left_index = True) \
            .drop(["recommendations"], axis = 1) \
            .melt(id_vars = ['reviewerID_index'], value_name = "recommendation") \
            .drop("variable", axis = 1) \
            .dropna()
nrecs=nrecs.sort_values('reviewerID_index')

# Split each recommendation entry into its two fields.
# assumes each entry is an (item index, predicted rating) pair, in that order — TODO confirm
nrecs=pd.concat([nrecs['recommendation'].apply(pd.Series), nrecs['reviewerID_index']], axis = 1)
nrecs.columns = [
        'item_index',
        'rating',
        'reviewerID_index'
     ]

# Lookup tables from the numeric indices back to the original string ids.
md=transformed.select(transformed['reviewerID'],transformed['reviewerID_index'],transformed['item'],transformed['item_index'])
md=md.toPandas()
dict1 =dict(zip(md['reviewerID_index'],md['reviewerID']))
dict2=dict(zip(md['item_index'],md['item']))
nrecs['reviewerID']=nrecs['reviewerID_index'].map(dict1)
nrecs['item']=nrecs['item_index'].map(dict2)
nrecs=nrecs.sort_values('reviewerID')
nrecs.reset_index(drop=True, inplace=True)

# .copy() makes `new` an independent frame, so adding a column below is not a
# chained assignment on a slice of `nrecs` (SettingWithCopyWarning).
new=nrecs[['reviewerID','item','rating']].copy()
new['recommendations'] = list(zip(new.item, new.rating))
res=new[['reviewerID','recommendations']]

# One row per reviewer with the list of (item, rating) tuples.
res_new=res['recommendations'].groupby([res.reviewerID]).apply(list).reset_index()
print(res_new)

In [0]:
#Relationship between user with common items reviewed
# Rows of `transformed` for two specific item ids (the two highest-scoring items in the
# popularity table further down in this notebook).
# NOTE(review): the next cell repeats these two assignments verbatim.
item_0 = transformed.filter("item = 'B001HBHNHE'")
item_1 = transformed.filter("item = 'B001T7QJ9O'")

In [0]:
# Reviews of the two items of interest.
item_0 = transformed.filter("item = 'B001HBHNHE'")
item_1 = transformed.filter("item = 'B001T7QJ9O'")

# Alias both sides so their identically named columns can be told apart in the join.
item_0_a = item_0.alias("item_0_a")
item_1_a = item_1.alias("item_1_a")

# Reviewers who reviewed BOTH items: match the reviewer of one side with the reviewer of
# the other. The condition previously compared item_0_a.reviewerID with itself, which is
# always true and turned the join into a cross product of the two filtered frames.
item_0_a.join(item_1_a, col('item_0_a.reviewerID') == col('item_1_a.reviewerID')).show()

In [0]:
# Show every row of `transformed` belonging to each of the four reviewer indices below.
for user_index in ("262", "31021", "1560", "3549"):
    transformed.filter(f"reviewerID_index = '{user_index}'").show()

In [0]:
#Recommendation list for user 262
# Top-10 recommendations for every user, narrowed to reviewer index 262 and collected to pandas.
recommend = model.recommendForAllUsers(10)
recommend_262 = recommend.filter(recommend.reviewerID_index == "262")
recommend_262 = recommend_262.toPandas()
# None disables column-width truncation so the whole list is visible; the former -1
# spelling is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
recommend_262

Unnamed: 0,reviewerID_index,recommendations
0,262,"[(18696, 9.23350715637207), (19871, 9.138558387756348), (18559, 8.91126537322998), (3924, 8.356781959533691), (2868, 8.314875602722168), (5648, 8.288540840148926), (10424, 8.111278533935547), (29006, 8.094313621520996), (20584, 8.070418357849121), (1649, 8.037528038024902)]"


In [0]:
#Recommendation list for user 31021
# Top-10 recommendations for every user, narrowed to reviewer index 31021 and collected to pandas.
recommend = model.recommendForAllUsers(10)
recommend_31021 = recommend.filter(recommend.reviewerID_index == "31021")
recommend_31021 = recommend_31021.toPandas()
# None disables column-width truncation so the whole list is visible; the former -1
# spelling is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
recommend_31021

Unnamed: 0,reviewerID_index,recommendations
0,31021,"[(18696, 9.333854675292969), (18559, 9.202361106872559), (2868, 9.099369049072266), (9678, 9.02236557006836), (3924, 9.015478134155273), (13454, 9.012986183166504), (5606, 8.955412864685059), (10819, 8.93883991241455), (19846, 8.938278198242188), (22966, 8.913759231567383)]"


In [0]:
#Recommendation list for user 1560
# Top-10 recommendations for every user, narrowed to reviewer index 1560 and collected to pandas.
recommend = model.recommendForAllUsers(10)
recommend_1560 = recommend.filter(recommend.reviewerID_index == "1560")
recommend_1560 = recommend_1560.toPandas()
# None disables column-width truncation so the whole list is visible; the former -1
# spelling is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
recommend_1560

Unnamed: 0,reviewerID_index,recommendations
0,1560,"[(5606, 9.2345609664917), (10819, 9.110645294189453), (5648, 9.037565231323242), (19871, 8.993192672729492), (18696, 8.937543869018555), (22966, 8.925411224365234), (2868, 8.906685829162598), (16623, 8.88484001159668), (13454, 8.852377891540527), (3924, 8.85135269165039)]"


In [0]:
#Recommendation list for user 3549
# Top-10 recommendations for every user, narrowed to reviewer index 3549 and collected to pandas.
recommend = model.recommendForAllUsers(10)
recommend_3549 = recommend.filter(recommend.reviewerID_index == "3549")
recommend_3549 = recommend_3549.toPandas()
# None disables column-width truncation so the whole list is visible; the former -1
# spelling is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
recommend_3549

Unnamed: 0,reviewerID_index,recommendations
0,3549,"[(18696, 9.078761100769043), (19871, 9.066671371459961), (20584, 8.975422859191895), (18559, 8.911794662475586), (4687, 8.714488983154297), (3924, 8.555971145629883), (7041, 8.475933074951172), (19846, 8.474696159362793), (9526, 8.466764450073242), (2868, 8.464468955993652)]"


In [0]:
from functools import reduce
from pyspark.sql.functions import col, lit, when
from graphframes import *

Popularity Recommender model

In [0]:
#import the reqired libraries
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import numpy as np
import pandas as pd
import math
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.externals import joblib
import scipy.sparse
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
import warnings; warnings.simplefilter('ignore')
%matplotlib inline

In [0]:
# Columns of df_output that the popularity model does not use. Unlike the ALS
# preparation, `prediction` is kept here because it is selected below as `comment`.
unused_columns = [
    "_corrupt_record", "cleaned_words", "label", "features", "indexedLabel",
    "probability", "rawFeatures", "rawPrediction", "words",
    "reviewText", "reviewTime", "reviewerName", "unixReviewTime", "summary",
]
df_new = df_output.drop(*unused_columns)
# Expose `asin` as `item`, `prediction` as `comment` and `pre_label` as `rating`.
df_new_output = df_new.selectExpr(
    "asin as item",
    "prediction as comment",
    "pre_label as rating",
    "reviewerID",
)
# Row count of the reduced frame (shown as the cell output).
df_new_output.count()

In [0]:
# Print summary statistics for the columns of the popularity-model frame.
df_new_output.describe().show()

In [0]:
#For popularity recommender
#Without filtering
# Collect the full Spark frame (all `comment` values) into a local pandas DataFrame
# and display it as the cell output.
df_pd = df_new_output.toPandas()
df_pd

Unnamed: 0,item,comment,rating,reviewerID
0,B00002R28C,1.0,4.0,A103B6MQ5IF2BK
1,B00004SVRR,1.0,4.0,A103QIFFX532JU
2,B00004SVXA,1.0,5.0,A105GCSUC8LQ1Z
3,B00004TJCL,1.0,1.0,A105GWGM7PDAI2
4,B00004YRQA,1.0,5.0,A105GWGM7PDAI2
...,...,...,...,...
125205,B00FHB2XB6,1.0,4.0,AYOMAHLWRQHUG
125206,B00F5Y4C7I,1.0,5.0,AYV43L6DGMH5C
125207,B00IVFCGKS,1.0,4.0,AZMISC9FAUBIR
125208,B00FOEHUTQ,1.0,5.0,AZOCGY2PBYHCI


In [0]:
# Rows whose `comment` value is 0 (labelled "negative" by the variable names),
# kept both as a Spark frame and as a local pandas copy.
df_negative = df_new_output.where(df_new_output["comment"] == 0)
df_pd_negative = df_negative.toPandas()
df_pd_negative

Unnamed: 0,item,comment,rating,reviewerID
0,B00004SVMX,0.0,1.0,A1COW0UIB5HA9J
1,B000034DBS,0.0,2.0,A1IQV6AI7L9NOK
2,B00000IODY,0.0,2.0,A1LBAC84TLIGAX
3,B00000JHPT,0.0,1.0,A1LBAC84TLIGAX
4,B000035XPH,0.0,1.0,A1LBAC84TLIGAX
...,...,...,...,...
405,B004LOKRDG,0.0,1.0,ADWXLAKKZP2JC
406,B0074H49NG,0.0,2.0,A25IRWH03ZHRRE
407,B009F7OUPK,0.0,1.0,A2YV6F6DLENNTM
408,B00C6Q9Y2G,0.0,1.0,AX4L6DVN3E0XX


Without filtering

In [0]:
# Recommendation score of an item = number of reviewerID entries recorded for it.
train_data_grouped = (
    df_pd
    .groupby('item')['reviewerID']
    .count()
    .reset_index(name='score')
)
train_data_grouped

Unnamed: 0,item,score
0,0439893577,1
1,048645195X,2
2,0545496470,3
3,0615444172,3
4,0670010936,1
...,...,...
33857,B00L1N7FDK,5
33858,B00L4ZCPRG,1
33859,B00LA12PNI,1
33860,B00LBI9BKA,6


In [0]:
# Order items by descending score (ties broken by ascending item id) and attach a
# rank column; method='first' gives tied scores distinct ranks in row order.
train_data_sort = (
    train_data_grouped
    .sort_values(by=['score', 'item'], ascending=[False, True])
    .assign(rank=lambda frame: frame['score'].rank(ascending=False, method='first'))
)

# The five best-ranked items are the popularity recommendations.
popularity_recommendations = train_data_sort.head(5)
popularity_recommendations

Unnamed: 0,item,score,rank
14126,B001HBHNHE,211,1.0
15248,B001T7QJ9O,156,2.0
31430,B00BGA9WK2,146,3.0
9693,B000S5ODN2,132,4.0
11643,B0015AARJI,119,5.0


In [0]:
# Use popularity based recommender model to make predictions
def recommend(reviewerID, recommendations=None):
    """Return the popularity recommendations labelled with a reviewer id.

    The rows returned do not depend on the reviewer: the id is only attached
    as the first column of the result.

    Parameters
    ----------
    reviewerID
        Value written into every row of the ``reviewerID`` column.
    recommendations : pandas.DataFrame, optional
        Table of recommended items. Defaults to the notebook-level
        ``popularity_recommendations`` (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        A new frame with ``reviewerID`` as first column followed by the
        columns of ``recommendations``. The input frame is never modified.
    """
    if recommendations is None:
        recommendations = popularity_recommendations

    # Work on a new frame: the previous implementation assigned into the shared
    # top-5 table itself (a .head() slice), which left a stale reviewerID column
    # in it after every call. Any reviewerID column already present is replaced.
    user_recommendations = recommendations.drop(columns=['reviewerID'], errors='ignore')

    # Put the reviewerID column in front of the item columns.
    user_recommendations.insert(0, 'reviewerID', reviewerID)

    return user_recommendations

In [0]:
# User ids to print recommendations for (free choice of the notebook user).
find_recom = [10, 50, 100, 150]

for user_id in find_recom:
    heading = "The list of recommendations for the userId: %d\n" % (user_id)
    print(heading)
    print(recommend(user_id))
    print("\n")

In [0]:
# Number of comment == 0 reviews per item, displayed in ascending order of that count.
data = (
    df_pd_negative
    .groupby('item')['reviewerID']
    .count()
    .reset_index(name='score')
)
data.sort_values(by='score')


Unnamed: 0,item,score
0,1589944968,1
255,B003XJ35GG,1
254,B003VWGBCK,1
253,B003VUO6H4,1
252,B003VJP92Q,1
...,...,...
90,B000G0HJ3K,3
330,B007XVTR3K,3
3,B0000296O5,3
22,B00005TNI6,3


With filtering

In [0]:
#With filtering
# Keep only rows whose `comment` value is 1 and collect them into pandas.
df_pd_new = df_new_output.where(df_new_output["comment"] == 1).toPandas()
df_pd_new

Unnamed: 0,item,comment,rating,reviewerID
0,B00002R28C,1.0,4.0,A103B6MQ5IF2BK
1,B00004SVRR,1.0,4.0,A103QIFFX532JU
2,B00004SVXA,1.0,5.0,A105GCSUC8LQ1Z
3,B00004TJCL,1.0,1.0,A105GWGM7PDAI2
4,B00004YRQA,1.0,5.0,A105GWGM7PDAI2
...,...,...,...,...
124795,B00FHB2XB6,1.0,4.0,AYOMAHLWRQHUG
124796,B00F5Y4C7I,1.0,5.0,AYV43L6DGMH5C
124797,B00IVFCGKS,1.0,4.0,AZMISC9FAUBIR
124798,B00FOEHUTQ,1.0,5.0,AZOCGY2PBYHCI


In [0]:
#Split the data randomly into train and test datasets in a 70:30 ratio
#train_data, test_data = train_test_split(
    #df_pd_new, test_size = 0.3, random_state=0)
#train_data.head()

In [0]:
# Recommendation score of an item = number of reviewerID entries recorded for it,
# computed on the comment == 1 subset.
train_data_grouped = (
    df_pd_new
    .groupby('item')['reviewerID']
    .count()
    .reset_index(name='score')
)
train_data_grouped

Unnamed: 0,item,score
0,0439893577,1
1,048645195X,2
2,0545496470,3
3,0615444172,3
4,0670010936,1
...,...,...
33824,B00L1N7FDK,5
33825,B00L4ZCPRG,1
33826,B00LA12PNI,1
33827,B00LBI9BKA,6


In [0]:
# Order items by descending score (ties broken by ascending item id) and attach a
# rank column; method='first' gives tied scores distinct ranks in row order.
train_data_sort = (
    train_data_grouped
    .sort_values(by=['score', 'item'], ascending=[False, True])
    .assign(rank=lambda frame: frame['score'].rank(ascending=False, method='first'))
)

# The five best-ranked items are the popularity recommendations.
popularity_recommendations = train_data_sort.head(5)
popularity_recommendations

Unnamed: 0,item,score,rank
14110,B001HBHNHE,211,1.0
15232,B001T7QJ9O,156,2.0
31399,B00BGA9WK2,146,3.0
9681,B000S5ODN2,132,4.0
11630,B0015AARJI,119,5.0


In [0]:
# User ids to print recommendations for (free choice of the notebook user).
find_recom = [10, 50, 100, 150]

for user_id in find_recom:
    heading = "The list of recommendations for the userId: %d\n" % (user_id)
    print(heading)
    print(recommend(user_id))
    print("\n")