In [1]:
#ch12_ALS_51.ipynb, Collaborative Filtering- spark.ml, https://spark.apache.org/docs/2.0.0-preview/ml-collaborative-filtering.html

In [2]:
#Collaborative filtering is commonly used for recommender systems. 
#spark.ml currently supports model-based collaborative filtering, in which users and products are described by a small set of latent factors 
#that can be used to predict missing entries.
#spark.ml uses the alternating least squares (ALS) algorithm to learn these latent factors.

#https://spark.apache.org/docs/2.0.0-preview/api/python/pyspark.ml.html#pyspark.ml.recommendation.ALS

In [3]:
#The implementation in spark.ml has the following parameters:
#numBlocks --> is the number of blocks the users and items will be partitioned into in order to parallelize computation (defaults to 10).
#           rank --> is the number of latent factors in the model (defaults to 10).
#     maxIter --> is the maximum number of iterations to run (defaults to 10).
#  regParam --> specifies the regularization parameter in ALS (defaults to 1.0).
#implicitPrefs --> specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data
#                          (defaults to false which means using explicit feedback).
#        alpha  --> is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations 
#                         (defaults to 1.0).
#nonnegative --> specifies whether or not to use nonnegative constraints for least squares (defaults to false).

In [4]:
#1. example:
#In the following example, we load rating data from the MovieLens dataset, each row consisting of a user, a movie, a rating and a timestamp.
#We then train an ALS model which assumes, by default, that the ratings are explicit (implicitPrefs is False). 
#We evaluate the recommendation model by measuring the root-mean-square error of rating prediction.

In [5]:
#

In [6]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [9]:
#create SparkSession "spark01"
from pyspark.sql import SparkSession
spark01 = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [11]:
#以 SparkSession.read.text() 讀取 text file, 轉成DataFrame "df01"
df01=spark01.read.text("sample_movielens_ratings.txt")

In [20]:
df01.printSchema()

root
 |-- value: string (nullable = true)



In [21]:
df01.show()

+--------------------+
|               value|
+--------------------+
| 0::2::3::1424380312|
| 0::3::1::1424380312|
| 0::5::2::1424380312|
| 0::9::4::1424380312|
|0::11::1::1424380312|
|0::12::2::1424380312|
|0::15::1::1424380312|
|0::17::1::1424380312|
|0::19::1::1424380312|
|0::21::1::1424380312|
|0::23::1::1424380312|
|0::26::3::1424380312|
|0::27::1::1424380312|
|0::28::1::1424380312|
|0::29::1::1424380312|
|0::30::1::1424380312|
|0::31::1::1424380312|
|0::34::1::1424380312|
|0::37::1::1424380312|
|0::41::2::1424380312|
+--------------------+
only showing top 20 rows



In [23]:
#以 DataFrame.rdd 將 DataFrame "df01" 轉成RDD "rawDataRDD"
rawDataRDD=df01.rdd

In [24]:
rawDataRDD.first() #檢視 rawDataRDD 第一筆資料

Row(value=u'0::2::3::1424380312')

In [25]:
#以RDD.map(lambda) 將 RDD "rawDataRDD" 依 "::" 分割 map 成 RDD "partsRDD"
partsRDD = rawDataRDD.map(lambda r: r.value.split("::"))

In [26]:
partsRDD.first() #['0','2','3','143......'] 

[u'0', u'2', u'3', u'1424380312']

In [27]:
#資料型別轉換, 使用 Row type
ratingsRDD = partsRDD.map(lambda x: Row(userId=int(x[0]), movieId=int(x[1]),
                                     rating=float(x[2]), timestamp=long(x[3])))

In [30]:
ratingsRDD.first()

Row(movieId=2, rating=3.0, timestamp=1424380312L, userId=0)

In [31]:
#最後,再以SparkSeeeion.createDataFrame(RDD), 將RDD "ratingsRDD" 轉回成 DataFrame "ratingsDF"
ratingsDF = spark01.createDataFrame(ratingsRDD)

In [33]:
ratingsDF.show(5)

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      2|   3.0|1424380312|     0|
|      3|   1.0|1424380312|     0|
|      5|   2.0|1424380312|     0|
|      9|   4.0|1424380312|     0|
|     11|   1.0|1424380312|     0|
+-------+------+----------+------+
only showing top 5 rows



In [34]:
#於是,我們可以用 DataFrame-Based Collaborative filtering 運算

In [35]:
#step 1: 以DataFrame.randomSplit() 將 DataFrame "ratingsDF" 依設定機率比例 (8:2) 分成訓練資料 "trainingDF" 及 測試資料 "testDF"
(trainingDF, testDF) = ratingsDF.randomSplit([0.8, 0.2])

In [37]:
print('訓練資料筆數: '+str(trainingDF.count())+' 測試資料筆數: '+str(testDF.count()))

訓練資料筆數: 1187 測試資料筆數: 314


In [38]:
#step 2:  Build the recommendation model  " alsModel" using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating") #als, Alternating Least Squares (ALS) matrix factorization.

In [41]:
#以 Estimator.fit(training Data), 訓練階段, 得到模型 alsModel
alsModel=als.fit(trainingDF)  #alsModel, pyspark.ml.recommendation.ALSModel

In [43]:
type(alsModel)

pyspark.ml.recommendation.ALSModel

In [44]:
#step 3:  Evaluate the model by computing the RMSE on the test data
#             We evaluate the recommendation model by measuring the root-mean-square error of rating prediction.

In [45]:
#使用 Transformer.transform(test data), 預測test 資料 "testDF", 回傳的是 DataFrame "predictionsDF"
predictionsDF = alsModel.transform(testDF)

In [46]:
predictionsDF.show(5)

+-------+------+----------+------+----------+
|movieId|rating| timestamp|userId|prediction|
+-------+------+----------+------+----------+
|     31|   3.0|1424380312|     7|0.02137813|
|     31|   1.0|1424380312|    18| 1.6260848|
|     85|   1.0|1424380312|    12| 2.9139917|
|     85|   3.0|1424380312|     6| 2.6702542|
|     85|   5.0|1424380312|     8| 5.3887396|
+-------+------+----------+------+----------+
only showing top 5 rows



In [47]:
#以 RegressionEvaluator() 建立評估器 "evaluator01", pyspark.ml.evaluation.RegressionEvaluator
evaluator01 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [49]:
#以 RegressionEvaluator.evaluate() 計算 rmse
rmse = evaluator01.evaluate(predictionsDF)

In [51]:
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.89281881552
