In [14]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row, SparkSession

In [15]:
spark = SparkSession.builder.appName('Recommender-app').getOrCreate()

In [16]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

schema = StructType([
    StructField("user_id", IntegerType(), True),
    StructField("book_id", IntegerType(), True),
    StructField("is_read", IntegerType(), True),
    StructField("rating", FloatType(), True)])


ratings = spark.read.csv("../data/books_ratings.csv", header=True, schema=schema)

In [17]:
ratings.show()

+-------+-------+-------+------+
|user_id|book_id|is_read|rating|
+-------+-------+-------+------+
|      0|    948|      1|   5.0|
|      0|    947|      1|   5.0|
|      0|    946|      1|   5.0|
|      0|    945|      1|   5.0|
|      0|    944|      1|   5.0|
|      0|    943|      1|   5.0|
|      0|    942|      1|   5.0|
|      0|    941|      1|   5.0|
|      0|    940|      1|   5.0|
|      0|    939|      1|   5.0|
|      0|    938|      1|   5.0|
|      0|    937|      1|   4.0|
|      0|    936|      1|   4.0|
|      0|    935|      1|   4.0|
|      0|    934|      1|   5.0|
|      0|    933|      1|   4.0|
|      0|    932|      1|   4.0|
|      0|    931|      1|   5.0|
|      0|    930|      1|   2.0|
|      0|    929|      1|   4.0|
+-------+-------+-------+------+
only showing top 20 rows



22/02/01 14:36:08 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 5, schema size: 4
CSV file: file:///home/oussama/Desktop/workspace/matrix_factorization_recommender/data/books_ratings.csv


In [18]:
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="book_id", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(ratings)

                                                                                

In [6]:
# predictions = model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
#                                 predictionCol="prediction")
# rmse = evaluator.evaluate(predictions)
# print("Root-mean-square error = " + str(rmse))

22/02/01 13:04:33 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 5, schema size: 4
CSV file: file:///home/oussama/Desktop/workspace/matrix_factorization_recommender/data/books_ratings.csv

Root-mean-square error = 2.225046600971715


                                                                                

In [7]:
# movieRecs = model.recommendForAllItems(1)



In [8]:
# movieRecs.show()



+-------+-------------------+
|book_id|    recommendations|
+-------+-------------------+
|      1|         [{0, 0.0}]|
|     12|[{1775, 5.1165895}]|
|     13|  [{58, 10.170677}]|
|     22| [{545, 3.8608067}]|
|     26| [{329, 5.6786933}]|
|     27|         [{0, 0.0}]|
|     28|  [{58, 12.186274}]|
|     31|  [{58, 7.4835396}]|
|     34|         [{0, 0.0}]|
|     44|         [{0, 0.0}]|
|     47|  [{181, 8.949086}]|
|     52|[{1249, 4.8477545}]|
|     53|[{1760, 14.496603}]|
|     65|         [{0, 0.0}]|
|     76|         [{0, 0.0}]|
|     78|   [{509, 9.35182}]|
|     81|         [{0, 0.0}]|
|     85| [{545, 6.4346776}]|
|     91|         [{0, 0.0}]|
|     93|[{1111, 10.883413}]|
+-------+-------------------+
only showing top 20 rows



                                                                                

In [19]:
import pandas as pd

df_user_0 = pd.DataFrame([0], columns=['user_id'])

In [20]:
df_user_0.head()

Unnamed: 0,user_id
0,0


In [23]:
user_0_ratings = model.recommendForUserSubset(spark.createDataFrame(pd.DataFrame([0], columns=['user_id'])), 10)



In [None]:
from pyspark.sql.functions import explode

user_0_ratings.select("user_id", explode("recommendations")
  .alias("recommendation")).select("user_id", "recommendation.*").drop('user_id').toPandas()

In [111]:
user_0_ratings.select("recommendations.book_id", "recommendations.rating").first().rating[0]

                                                                                

11.715402603149414

                                                                                

'{"book_id":{"0":146016,"1":18787,"2":101359,"3":76684,"4":89758,"5":83432,"6":76414,"7":173268,"8":50156,"9":116020},"rating":{"0":11.7154026031,"1":10.4560403824,"2":10.2674369812,"3":10.2039899826,"4":10.1534767151,"5":9.6832637787,"6":9.5981225967,"7":9.5338859558,"8":9.5289888382,"9":9.5102005005}}'