In [1]:
from pyspark.sql import SparkSession

# Notebook-wide Spark entry point: getOrCreate() hands back an already
# running session when there is one and only builds a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [2]:
import os

# Location of ratings_Beauty.csv. Set the RATINGS_CSV environment variable to
# run the notebook on another machine; the original author's local path is
# kept as the fallback so existing behaviour is unchanged.
RATINGS_CSV = os.environ.get(
    "RATINGS_CSV",
    r"C:\Users\Britt\Documents\RUG\Year 1\Big Data Analystics\ratings_Beauty.csv",
)

# header=true: column names come from the first line of the file.
# inferSchema=true: Spark scans the data to guess each column's type.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv(RATINGS_CSV)
)

In [3]:
# Peek at the first rows to check that the header and columns were parsed.
df.show(n=5)

+--------------+----------+------+----------+
|        UserId| ProductId|Rating| Timestamp|
+--------------+----------+------+----------+
|A39HTATAQ9V7YF|0205616461|   5.0|1369699200|
|A3JM6GV9MNOF9X|0558925278|   3.0|1355443200|
|A1Z513UWSAAO0F|0558925278|   5.0|1404691200|
|A1WMRR494NWEWV|0733001998|   4.0|1382572800|
|A3IAAVS479H7M7|0737104473|   1.0|1274227200|
+--------------+----------+------+----------+
only showing top 5 rows



In [4]:
# Make the ratings DataFrame queryable from Spark SQL as "dfTable".
# The view is bound to the DataFrame as it exists at this point; rebinding
# the Python name `df` later does not change what the view returns.
df.createOrReplaceTempView(name="dfTable")

In [5]:
# Discard every row that has a null/NaN in any column.
# The "dfTable" view was registered before this step, so it still exposes
# the unfiltered rows.
df = df.dropna()

In [6]:
# Keep only the columns the recommender uses and cast ProductId to an integer.
#
# Project from the current `df` instead of querying the "dfTable" view: the
# view was registered before the null rows were dropped, so selecting from it
# would silently bring those rows back and discard the clean-up step.
#
# NOTE(review): CAST(... AS INT) produces NULL for IDs that are not purely
# numeric or that exceed the 32-bit integer range. Confirm every ProductId is
# a small numeric string; otherwise index the column (e.g. with StringIndexer)
# instead of casting it.
df = df.selectExpr(
    "UserId",
    "CAST(ProductId AS INT) AS ProductId",
    "Rating",
)

In [7]:
# Total number of rating rows. Compared with the distinct-UserId count, this
# shows whether any user id is duplicated in the UserId column.
row_count = df.selectExpr("count(*)")
row_count.show()

+--------+
|count(1)|
+--------+
| 2023070|
+--------+



In [8]:
# Number of distinct user ids, to check the UserId column for duplicates.
distinct_user_count = df.selectExpr("count(distinct(UserId))")
distinct_user_count.show()
# Fewer distinct ids than rows, so some UserIds occur in more than one row.

+----------------------+
|count(DISTINCT UserId)|
+----------------------+
|               1210271|
+----------------------+



In [9]:
from pyspark.ml.feature import StringIndexer

# Map each string UserId to a numeric index stored in a new "Users" column,
# which is the column later passed to ALS as userCol.
indexer = StringIndexer(inputCol="UserId", outputCol="Users")
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

In [10]:
# Spot-check the new "Users" index column next to the original UserId.
df.show(n=3)

+--------------+---------+------+--------+
|        UserId|ProductId|Rating|   Users|
+--------------+---------+------+--------+
|A39HTATAQ9V7YF|205616461|   5.0| 70392.0|
|A3JM6GV9MNOF9X|558925278|   3.0|265306.0|
|A1Z513UWSAAO0F|558925278|   5.0|552933.0|
+--------------+---------+------+--------+
only showing top 3 rows



In [11]:
# Print column names, types and nullability to verify the cast and indexing.
df.printSchema()

root
 |-- UserId: string (nullable = true)
 |-- ProductId: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Users: double (nullable = false)



## ALS Model

In [12]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [13]:
from pyspark.ml.feature import VectorAssembler

# Pack the user index and product id into a single "features" vector column.
# handleInvalid="skip" filters out rows whose input columns hold null/NaN.
# NOTE(review): the ALS estimator in this notebook is configured with the
# Users / ProductId / Rating columns directly and never reads "features";
# this step appears to act only as that row filter -- confirm it is needed.
assembler = VectorAssembler(
    inputCols=["Users", "ProductId"],
    outputCol="features",
    handleInvalid="skip",
)
df = assembler.transform(df)

In [14]:
# Random 80% / 20% train/test split.
# A fixed seed makes the sampling repeatable, so the reported RMSE can be
# reproduced after a kernel restart instead of changing on every run.
SPLIT_SEED = 42
train, test = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [15]:
# Alternating Least Squares recommender over (Users, ProductId, Rating).
# coldStartStrategy="drop" removes rows with NaN predictions (users or items
# unseen during training) so they do not turn the evaluation metric into NaN.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="Users",
    itemCol="ProductId",
    ratingCol="Rating",
    coldStartStrategy="drop",
)

In [None]:
# Fit the ALS factorisation on the training portion of the ratings.
model = als.fit(dataset=train)

In [None]:
# Score the held-out ratings with the fitted model and preview the result.
predictions = model.transform(dataset=test)
predictions.show(n=5)

In [None]:
# RMSE between the true ratings and the model output.
# The ALS estimator above does not override predictionCol, so its output
# column carries the default name "prediction" (singular); asking for
# "predictions" fails because no such column exists. labelCol is spelled
# exactly like the DataFrame column ("Rating") so that it also resolves when
# Spark is configured to be case-sensitive.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)

In [None]:
# Compute the evaluator's metric on the scored test rows and report it.
rmse = evaluator.evaluate(dataset=predictions)
print('RMSE:', rmse)