In [1]:
# import libraries
from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Application name handed to the session builder below
appName = "Collaborative Filtering with PySpark"
# Build the SparkSession; getOrCreate() hands back an existing session if one is active
spark = (
    SparkSession.builder
    .appName(appName)
    .getOrCreate()
)
# Expose the session's underlying SparkContext as `sc`
sc = spark.sparkContext

In [17]:
import random
import time

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from surprise import Dataset, Reader, KNNWithMeans, SVD
from surprise.model_selection import GridSearchCV

In [2]:
# Column layout of the headerless ratings CSV: item id, user id, rating, timestamp.
# The rating is declared as a string here and cast to an integer after loading.
schema = (
    StructType()
    .add("item", StringType(), True)
    .add("user", StringType(), True)
    .add("rating", StringType(), True)
    .add("timestamp", IntegerType(), True)
)
# Load the ratings file; it has no header row, so column names and types come from `schema`
df = spark.read.csv("Gift_Cards.csv", header=False, schema=schema)
# Schema as loaded (rating is still a string at this point)
df.printSchema()
# Report the number of rows. A bare `df.count()` in the middle of a cell runs the
# Spark job but its result is never displayed, so print it explicitly.
print("Number of rows:", df.count())
# Convert the rating column from string to integer
df = df.withColumn("rating", col("rating").cast(IntegerType()))
df.show(n=5)
# Map each distinct item / user id string to a numeric index.
# The resulting itemIndex / userIndex columns display as doubles (e.g. 39.0), not integers.
stringIndexer = StringIndexer(
    inputCols=["item", "user"],
    outputCols=["itemIndex", "userIndex"],
)
# Learn the id -> index mapping from df, then append the two index columns to it
model = stringIndexer.fit(df)
df_indexed = model.transform(df)
df_indexed.show(5)

root
 |-- item: string (nullable = true)
 |-- user: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: integer (nullable = true)

+----------+--------------+------+----------+
|      item|          user|rating| timestamp|
+----------+--------------+------+----------+
|B001GXRQW0| APV13CM0919JD|     1|1229644800|
|B001GXRQW0|A3G8U1G1V082SN|     5|1229472000|
|B001GXRQW0| A11T2Q0EVTUWP|     5|1229472000|
|B001GXRQW0| A9YKGBH3SV22C|     5|1229472000|
|B001GXRQW0|A34WZIHVF3OKOL|     1|1229472000|
+----------+--------------+------+----------+
only showing top 5 rows

+----------+--------------+------+----------+---------+---------+
|      item|          user|rating| timestamp|itemIndex|userIndex|
+----------+--------------+------+----------+---------+---------+
|B001GXRQW0| APV13CM0919JD|     1|1229644800|     39.0| 120184.0|
|B001GXRQW0|A3G8U1G1V082SN|     5|1229472000|     39.0|  87652.0|
|B001GXRQW0| A11T2Q0EVTUWP|     5|1229472000|     39.0|  13165.0|
|B001GX

In [9]:
# Bring the ratings to the driver as a pandas DataFrame.
# This converts `df` (raw string ids), not `df_indexed`.
# Only the three columns used downstream are selected before collecting, so the
# unused timestamp column is never transferred. Column order stays item, user, rating.
pandas_df = df.select("item", "user", "rating").toPandas()
pandas_df.head()

Unnamed: 0,item,user,rating
0,B001GXRQW0,APV13CM0919JD,1
1,B001GXRQW0,A3G8U1G1V082SN,5
2,B001GXRQW0,A11T2Q0EVTUWP,5
3,B001GXRQW0,A9YKGBH3SV22C,5
4,B001GXRQW0,A34WZIHVF3OKOL,1


In [12]:
# Reader tells Surprise that ratings lie on a 1-5 scale
reader = Reader(rating_scale=(1, 5))
# Wrap the pandas frame as a Surprise Dataset; the column order is spelled out
# explicitly because Surprise interprets the columns by position.
# NOTE(review): Surprise's load_from_df documentation says the three columns are
# read as user id, item id, rating, in that order. This frame is ordered
# item, user, rating, so the "user" role is presumably filled by the item column
# and vice versa. Confirm whether that is intended before reordering: any
# "user_based" similarity matrix is sized by whichever column comes first.
data = Dataset.load_from_df(pandas_df[["item", "user", "rating"]], reader)

In [13]:
# Similarity settings for the neighbourhood models. Both use cosine similarity;
# "user_based" selects whether similarities are computed between users or between items.
sim_options_user = dict(name="cosine", user_based=True)   # similarities between users
sim_options_item = dict(name="cosine", user_based=False)  # similarities between items

In [14]:
# Use every available rating for training (no hold-out split here)
trainingSet = data.build_full_trainset()
# Mean-centred KNN configured with the user-based cosine similarity options
algo = KNNWithMeans(sim_options=sim_options_user)
# Fit on the full training set; the fitted estimator is the cell's displayed value
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7faa9f2d8a60>

In [15]:
# Seed the RNGs so the random 5-fold split is repeatable; without this the
# reported RMSE changes from run to run. Surprise's FAQ recommends seeding both
# the `random` module and NumPy.
random.seed(0)
np.random.seed(0)

# Similarity options for the grid search: each value is a list of candidates.
# With one candidate per key, a single configuration is cross-validated.
sim_options_gs = {
    "name": ["cosine"],
    "min_support": [5],
    "user_based": [True],
}

param_grid = {"sim_options": sim_options_gs}
gs = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=5)
gs.fit(data)

# Best RMSE found and the parameter combination that produced it
print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.9492619721671044
{'sim_options': {'name': 'cosine', 'min_support': 5, 'user_based': True}}


In [18]:
# Seed the RNGs so the random fold split and SVD's random initialisation are
# repeatable; without this the reported RMSE changes from run to run. Surprise's
# FAQ recommends seeding both the `random` module and NumPy.
random.seed(0)
np.random.seed(0)

# Wall-clock start, used to report how long the grid search takes
start_time = time.time()

# Hyper-parameter grid for matrix factorization (SVD): each value is a list of
# candidates. With one candidate per key, a single configuration is cross-validated.
param_grid = {
    "n_epochs": [10],
    "lr_all": [0.01],
    "reg_all": [0.5]
}

gs2 = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=5)

gs2.fit(data)

# Best RMSE found, the parameters that produced it, and the elapsed time
print(gs2.best_score["rmse"])
print(gs2.best_params["rmse"])
print("--- %s seconds ---" % (time.time() - start_time))

0.9286156303881361
{'n_epochs': 10, 'lr_all': 0.01, 'reg_all': 0.5}
--- 23.315891981124878 seconds ---
