In [27]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [28]:
#create SparkSession "spark01"
from pyspark.sql import SparkSession
spark01 = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [29]:
#以 SparkSession.read.text() 讀取 text file, 轉成DataFrame "df01"
df01=spark01.read.text("file:///home/hadoop/sample_movielens_ratings.txt")

In [30]:
df01.printSchema()

root
 |-- value: string (nullable = true)



In [31]:
df01.show()

+--------------------+
|               value|
+--------------------+
| 0::2::3::1424380312|
| 0::3::1::1424380312|
| 0::5::2::1424380312|
| 0::9::4::1424380312|
|0::11::1::1424380312|
|0::12::2::1424380312|
|0::15::1::1424380312|
|0::17::1::1424380312|
|0::19::1::1424380312|
|0::21::1::1424380312|
|0::23::1::1424380312|
|0::26::3::1424380312|
|0::27::1::1424380312|
|0::28::1::1424380312|
|0::29::1::1424380312|
|0::30::1::1424380312|
|0::31::1::1424380312|
|0::34::1::1424380312|
|0::37::1::1424380312|
|0::41::2::1424380312|
+--------------------+
only showing top 20 rows



In [32]:
#以 DataFrame.rdd 將 DataFrame "df01" 轉成RDD "rawDataRDD"
rawDataRDD=df01.rdd

In [33]:
rawDataRDD.first() #檢視 rawDataRDD 第一筆資料

Row(value='0::2::3::1424380312')

In [34]:
#以RDD.map(lambda) 將 RDD "rawDataRDD" 依 "::" 分割 map 成 RDD "partsRDD"
partsRDD = rawDataRDD.map(lambda r: r.value.split("::"))

In [35]:
partsRDD.first() #['0','2','3','143......']

['0', '2', '3', '1424380312']

In [38]:
#資料型別轉換, 使用 Row type
ratingsRDD = partsRDD.map(lambda x: Row(userId=int(x[0]), movieId=int(x[1]),
                                     rating=float(x[2]), timestamp=int(x[3])))

In [39]:
ratingsRDD.first()

Row(movieId=2, rating=3.0, timestamp=1424380312, userId=0)

In [40]:
#最後,再以SparkSeeeion.createDataFrame(RDD), 將RDD "ratingsRDD" 轉回成 DataFrame "ratingsDF"
ratingsDF = spark01.createDataFrame(ratingsRDD)

In [41]:
ratingsDF.show(5)

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      2|   3.0|1424380312|     0|
|      3|   1.0|1424380312|     0|
|      5|   2.0|1424380312|     0|
|      9|   4.0|1424380312|     0|
|     11|   1.0|1424380312|     0|
+-------+------+----------+------+
only showing top 5 rows



In [42]:
#於是,我們可以用 DataFrame-Based Collaborative filtering 運算

In [43]:
#step 1: 以DataFrame.randomSplit() 將 DataFrame "ratingsDF" 依設定機率比例 (8:2) 分成訓練資料 "trainingDF" 及 測試資料 "testDF"
(trainingDF, testDF) = ratingsDF.randomSplit([0.8, 0.2])

In [44]:
print('訓練資料筆數: '+str(trainingDF.count())+' 測試資料筆數: '+str(testDF.count()))

訓練資料筆數: 1183 測試資料筆數: 318


In [45]:
#step 2:  Build the recommendation model  " alsModel" using ALS on the training data
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating") #als, Alternating Least Squares (ALS) matrix factorization.

In [46]:
#以 Estimator.fit(training Data), 訓練階段, 得到模型 alsModel
alsModel=als.fit(trainingDF)  #alsModel, pyspark.ml.recommendation.ALSModel

In [47]:
type(alsModel)

pyspark.ml.recommendation.ALSModel

In [48]:
#step 3:  Evaluate the model by computing the RMSE on the test data
#             We evaluate the recommendation model by measuring the root-mean-square error of rating prediction.

In [49]:
#使用 Transformer.transform(test data), 預測test 資料 "testDF", 回傳的是 DataFrame "predictionsDF"
predictionsDF = alsModel.transform(testDF)

In [50]:
predictionsDF.show(5)

+-------+------+----------+------+----------+
|movieId|rating| timestamp|userId|prediction|
+-------+------+----------+------+----------+
|     31|   1.0|1424380312|    13|  1.167619|
|     31|   3.0|1424380312|     7| 1.1293797|
|     31|   3.0|1424380312|    14| 1.7049236|
|     31|   1.0|1424380312|     0| 1.3734546|
|     85|   1.0|1424380312|    13| 1.2992957|
+-------+------+----------+------+----------+
only showing top 5 rows



In [51]:
#以 RegressionEvaluator() 建立評估器 "evaluator01", pyspark.ml.evaluation.RegressionEvaluator
evaluator01 = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")

In [52]:
#以 RegressionEvaluator.evaluate() 計算 rmse
rmse = evaluator01.evaluate(predictionsDF)

In [53]:
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.7657409802124948
