In [1]:
# import libraries
import time
from pyspark import SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Name given to this application when the Spark session is built
appName = "Collaborative Filtering with PySpark"
# Build the Spark session, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName(appName)
    .getOrCreate()
)
# Keep a handle on the SparkContext that belongs to the session
sc = spark.sparkContext

In [2]:
# Define the schema. The rating is read as a string here and cast to an
# integer further down in this cell.
schema = StructType([
    StructField("item", StringType(), True),
    StructField("user", StringType(), True),
    StructField("rating", StringType(), True),
    StructField("timestamp", IntegerType(), True)])
# Read the headerless CSV file as a DataFrame
df = spark.read.csv("Gift_Cards.csv", header=False, schema=schema)
# Print the schema as it was read (rating is still a string at this point)
df.printSchema()
# Number of rows. Only the last expression of a cell is displayed, so the
# count is printed explicitly instead of being computed and thrown away.
print("rows:", df.count())
# Convert the rating column from string to integer
df = df.withColumn("rating", df["rating"].cast(IntegerType()))
df.show(n=5)
# Provide numeric index values for the string item and user identifiers
stringIndexer = StringIndexer(inputCols=["item", "user"], outputCols=["itemIndex", "userIndex"])
# The fitted indexer gets its own name so that it cannot overwrite, or be
# mistaken for, any other fitted model held in a variable called `model`.
indexer_model = stringIndexer.fit(df)
df_indexed = indexer_model.transform(df)
df_indexed.show(n=5)

root
 |-- item: string (nullable = true)
 |-- user: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: integer (nullable = true)

+----------+--------------+------+----------+
|      item|          user|rating| timestamp|
+----------+--------------+------+----------+
|B001GXRQW0| APV13CM0919JD|     1|1229644800|
|B001GXRQW0|A3G8U1G1V082SN|     5|1229472000|
|B001GXRQW0| A11T2Q0EVTUWP|     5|1229472000|
|B001GXRQW0| A9YKGBH3SV22C|     5|1229472000|
|B001GXRQW0|A34WZIHVF3OKOL|     1|1229472000|
+----------+--------------+------+----------+
only showing top 5 rows

+----------+--------------+------+----------+---------+---------+
|      item|          user|rating| timestamp|itemIndex|userIndex|
+----------+--------------+------+----------+---------+---------+
|B001GXRQW0| APV13CM0919JD|     1|1229644800|     39.0| 120184.0|
|B001GXRQW0|A3G8U1G1V082SN|     5|1229472000|     39.0|  87652.0|
|B001GXRQW0| A11T2Q0EVTUWP|     5|1229472000|     39.0|  13165.0|
|B001GX

In [3]:
# Fixed seed so that the split, the trained factors and therefore the
# displayed predictions are the same on every run of the notebook.
SEED = 42
# Split the data into a training set (80%) and a testing set (20%)
(training, test) = df_indexed.randomSplit([0.8, 0.2], seed=SEED)
# Define the model parameters.
# coldStartStrategy="drop" removes rows whose prediction would be NaN
# (users or items that did not appear in the training split).
als = ALS(maxIter=5,
          implicitPrefs=False,
          userCol="userIndex",
          itemCol="itemIndex",
          ratingCol="rating",
          coldStartStrategy="drop",
          seed=SEED)
# Start time
start_time = time.time()
# Train the model
model = als.fit(training)
# Predict using the testing dataset.
# NOTE: transform() is lazy and only builds the query plan; the predictions
# are actually computed by predictions.show() below, so the elapsed time
# printed here is essentially the training time.
predictions = model.transform(test)
# End time
print("--- %s seconds ---" % (time.time() - start_time))
predictions.show()

--- 14.222917079925537 seconds ---
+----------+--------------+------+----------+---------+---------+-----------+
|      item|          user|rating| timestamp|itemIndex|userIndex| prediction|
+----------+--------------+------+----------+---------+---------+-----------+
|B00MV9O08G|A10H4SOMLYPYS0|     5|1458000000|    148.0|   1457.0|  1.2122207|
|B00MV9O08G| ATZ9ZSR664K54|     5|1469404800|    148.0|     61.0|  3.0067074|
|B00MV9O08G| ALQUMN89FI5HD|     5|1511740800|    148.0|      7.0|   2.729206|
|B00MV9O08G|A3PFH1ZJH5DAXN|     4|1463961600|    148.0|   9201.0|0.026842471|
|B00MV9O08G|A241GI5D9EVJI6|     5|1482278400|    148.0|    482.0|  1.5886235|
|B00MV9O08G| AA47T2SDZBR8A|     5|1487808000|    148.0|   9960.0|-0.88929456|
|B00MV9O08G|A1GKK3NBBZ9W6I|     5|1453420800|    148.0|   4371.0|  1.3437798|
|B00MV9O08G| AA0XW0HGIL0XK|     5|1501113600|    148.0|    357.0|   3.972007|
|B00BT1XEB6|A1DC5RDWVSJBXQ|     5|1419120000|    463.0|    256.0|  3.1039681|
|B00BXLV9NM|A13OFOB1394G31|  

In [4]:
def topLikes(dataframe,userIndex,limit):
    """Return the highest-rated rows of one user.

    Parameters:
        dataframe: DataFrame with `userIndex`, `itemIndex` and `rating` columns.
        userIndex: value of `userIndex` identifying the user to look up.
        limit: maximum number of rows to keep.

    Returns:
        A DataFrame with the columns `userIndex`, `itemIndex` and `rating`,
        sorted by descending rating and cut off after `limit` rows.
    """
    # Keep only the rows that belong to the requested user
    user_rows = dataframe.filter(dataframe.userIndex == userIndex)
    # Best ratings first
    by_rating = user_rows.sort(dataframe.rating.desc())
    # Narrow down to the three columns of interest, then cap the row count
    trimmed = by_rating.select(dataframe.userIndex, dataframe.itemIndex, dataframe.rating)
    return trimmed.limit(limit)
# Show, without truncating cell values, up to ten of the highest-rated
# items of the user whose index is 9386
top_liked = topLikes(df_indexed, 9386, 10)
top_liked.show(truncate=False)

+---------+---------+------+
|userIndex|itemIndex|rating|
+---------+---------+------+
|9386.0   |463.0    |5     |
|9386.0   |53.0     |4     |
+---------+---------+------+



In [5]:
def recommendedItems(userIndex,limit):
    """Return the top `limit` recommended items for a single user.

    Relies on the notebook-level names `spark` (the SparkSession) and
    `model` (the fitted ALS model whose user column is "userIndex").

    Parameters:
        userIndex: index of the user to produce recommendations for.
        limit: number of items to recommend.

    Returns:
        The collected rows as a list. Each row has two array fields:
        `itemIndex` (the recommended item indices) and `rating` (the
        predicted rating of each of those items). The list is empty when
        the model has no factors for the requested user.
    """
    # Only one user is of interest, so hand the model a one-row DataFrame
    # instead of scoring every user and filtering afterwards.
    user_subset = spark.createDataFrame([(userIndex,)], ["userIndex"])
    # Honour the caller's `limit` rather than a hard-coded number of items.
    recommendations = model.recommendForUserSubset(user_subset, limit)
    return (
        recommendations
        .select(["recommendations.itemIndex", "recommendations.rating"])
        .collect()
    )
# display the top 5 recommended items for the user whose index is 9386
recommendedItems(9386,5)

[Row(itemIndex=[884, 1163, 1196, 905, 612], rating=[9.235403060913086, 7.620708465576172, 7.283617973327637, 7.173076629638672, 7.070064544677734])]