In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Start (or reuse) a Spark session in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-mllib-recommendation")
    .getOrCreate()
)
# Last expression of the cell: displays the session's SparkContext.
spark.sparkContext

In [3]:
# Short alias for the session's SparkContext; used below for RDD creation and model save/load.
sc = spark.sparkContext

In [4]:
# Read the ratings file (u.data) as an RDD with one string per input line.
movielens=sc.textFile("/home/jovyan/dataset/u.data")

In [5]:
# Peek at the first raw line. Columns are tab-separated:
# user id, item id, rating, timestamp
movielens.first()

'196\t242\t3\t881250949'

In [6]:
# Total number of rating records in the file.
movielens.count()

100000

In [7]:
# Handle the delimiter: split every tab-separated line into a list of string fields.
clean_data = movielens.map(lambda line: line.split('\t'))

In [8]:
# Inspect one parsed record to confirm the split produced four string fields.
clean_data.take(1)

[['196', '242', '3', '881250949']]

In [9]:
# Take the third field (the rating) of each record as an integer.
rate = clean_data.map(lambda fields: int(fields[2]))

In [10]:
# Average rating across all records.
rate.mean()

3.529859999999947

In [11]:
# Count how many distinct users appear in the data (first field = user id).
users = clean_data.map(lambda fields: int(fields[0]))
users.distinct().count()

943

In [12]:
# Count how many distinct products appear in the data (second field = item id).
(
    clean_data
    .map(lambda fields: int(fields[1]))
    .distinct()
    .count()
)

1682

In [13]:
#ALS recommendation algorithm
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [14]:
# Prepare the data in the format required by the Rating type.
# A Rating record has the layout (user, item, rating); the timestamp field is not used.
mls = movielens.map(lambda line: line.split('\t'))
ratings = mls.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)

In [15]:
# Split the ratings roughly 70% / 30% into training and test sets.
# The fixed seed keeps the split the same from run to run.
train, test = ratings.randomSplit(weights=[0.7, 0.3], seed=7856)

In [16]:
# Inspect one training record to confirm it is a Rating(user, product, rating).
train.take(1)

[Rating(user=196, product=242, rating=3.0)]

In [17]:
# Number of records in the training split.
train.count()

69958

In [18]:
# Number of records in the test split.
test.count()

30042

In [19]:
# Cache both splits in memory
# so that later uses of them are faster.
train.cache()
test.cache()

PythonRDD[20] at RDD at PythonRDD.scala:53

In [20]:
# Number of latent features per user/product; start with 5.
rank=5

# Number of iterations to run while the factorization converges.
numIterations=10

# Build the model.
# ALS initialises its factors randomly, so pass an explicit seed; without one the
# learned features, recommendations and error metrics differ on every run.
# The value matches the seed used for the train/test split.
model = ALS.train(train, rank, numIterations, seed=7856)

In [21]:
# The 5 latent feature values learned for a product (first entry shown as (product id, features)).
model.productFeatures().first()

(1,
 array('d', [-0.40243133902549744, 0.20456479489803314, -1.7681992053985596, -0.6547878980636597, 0.8563587069511414]))

In [22]:
# The 5 latent feature values learned for a user (first entry shown as (user id, features)).
model.userFeatures().first()

(1,
 array('d', [0.018600981682538986, 0.3614598214626312, -1.7710529565811157, -0.9418526887893677, -0.13516807556152344]))

In [23]:
# For user 196: the top 10 recommended products and their scores (highest first).
model.recommendProducts(196,10)

[Rating(user=196, product=634, rating=6.0896945945802425),
 Rating(user=196, product=1160, rating=6.084288504969073),
 Rating(user=196, product=850, rating=5.986961408581385),
 Rating(user=196, product=1664, rating=5.965617009384317),
 Rating(user=196, product=1140, rating=5.517412905333394),
 Rating(user=196, product=1275, rating=5.505260294750222),
 Rating(user=196, product=361, rating=5.501632058760078),
 Rating(user=196, product=1159, rating=5.467000901668468),
 Rating(user=196, product=1269, rating=5.426614211716),
 Rating(user=196, product=915, rating=5.422986113889523)]

In [24]:
# For product 242: the top 10 recommended users and their scores (highest first).
model.recommendUsers(242,10)

[Rating(user=928, product=242, rating=5.947215033417579),
 Rating(user=219, product=242, rating=5.851296766565415),
 Rating(user=697, product=242, rating=5.806991712425756),
 Rating(user=68, product=242, rating=5.7767404014294295),
 Rating(user=691, product=242, rating=5.583135445084304),
 Rating(user=808, product=242, rating=5.545778647649928),
 Rating(user=440, product=242, rating=5.510175904262549),
 Rating(user=4, product=242, rating=5.480801918707131),
 Rating(user=75, product=242, rating=5.434570083270194),
 Rating(user=725, product=242, rating=5.417899612803919)]

In [25]:
# Look up a single prediction: the predicted rating of product 242 for user 196.
model.predict(196,242)

3.352108932428621

In [26]:
# Strip the ratings off the training records, keeping only (user, product) pairs,
# then ask the model to predict a rating for each pair.
pred_input = train.map(lambda r: (r.user, r.product))
pred = model.predictAll(pred_input)

In [29]:
# Show a sample of the (user, product) inputs and a sample of the predictions.
# Only the last expression of a cell is displayed, so the input sample is printed
# explicitly; otherwise its Spark job runs and the result is thrown away.
print(pred_input.take(2))
pred.take(2)

[Rating(user=195, product=1084, rating=4.378518709746748),
 Rating(user=58, product=1084, rating=3.715786220803074)]

In [30]:
# Re-key both datasets by (user, product) so they can be joined.
# true_reorg: the actual ratings from the training split
# pred_reorg: the model's predicted ratings for those same training pairs
true_reorg = train.map(lambda r: ((r.user, r.product), r.rating))
pred_reorg = pred.map(lambda r: ((r.user, r.product), r.rating))

In [31]:
# Join the two datasets on their (user, product) key.
# Each joined value is a pair (actual rating, predicted rating).
true_pred=true_reorg.join(pred_reorg)

In [39]:
# Actual vs. predicted results: each entry is ((user, product), (actual, predicted)).
true_pred.take(2)

[((196, 242), (3.0, 3.352108932428621)),
 ((186, 302), (3.0, 3.2152710532629647))]

In [32]:
from math import sqrt

In [33]:
# MSE = sum of (predicted rating - actual rating)^2 / number of samples
# The joined values are (actual, predicted) pairs; the key is not needed here.
MSE = (
    true_pred
    .values()
    .map(lambda pair: (pair[0] - pair[1]) ** 2)
    .mean()
)

In [34]:
# Root mean squared error on the training pairs: the square root of MSE.
RMSE=sqrt(MSE)

In [35]:
# Report the training-set error metrics, one per line.
print("MSE={}".format(MSE), "RMSE={}".format(sqrt(MSE)), sep="\n")

MSE=0.5883659152677971
RMSE=0.7670501386922481


In [36]:
# Model performance on the test split.
# Predict a rating for every (user, product) pair in the test split.
test_input=test.map(lambda x:(x[0],x[1]))
pred_test=model.predictAll(test_input)
# Key actual and predicted ratings by (user, product) so they can be joined.
test_reorg=test.map(lambda x:((x[0],x[1]),x[2]))
# Use a test-specific name here: rebinding `pred_reorg` would make the training-set
# join silently pick up test predictions if that earlier cell were re-run.
test_pred_reorg = pred_test.map(lambda x:((x[0],x[1]), x[2]))
# NOTE(review): join keeps only keys present on both sides, so any test pair for
# which predictAll returned no prediction is left out of the metric - confirm
# whether that is acceptable.
test_pred = test_reorg.join(test_pred_reorg)
test_MSE = test_pred.map(lambda r: (r[1][0] - r[1][1])**2).mean()
test_RMSE = sqrt(test_MSE)
# Print both metrics so the test error can be compared with the training error.
print("test MSE={}".format(test_MSE))
print("test RMSE={}".format(test_RMSE))

In [37]:
# Save the trained model to disk.
# NOTE(review): presumably this fails when the target directory already exists, so
# re-running the notebook may require removing the old directory first - confirm.
model.save(sc,"/home/jovyan/dataset/ml-recommandation-model")

In [38]:
# Load the model back from the same path it was saved to.
sameModel=MatrixFactorizationModel.load(sc, "/home/jovyan/dataset/ml-recommandation-model")

In [40]:
# Predict straight away with the reloaded model, for the single pair (user 196, product 242).
data = sc.parallelize([(196, 242)])
predictions = sameModel.predictAll(data)
# `pred` is rebound to the same RDD; from here on it no longer refers to the
# training-set predictions computed earlier.
pred = predictions

In [41]:
# Show the reloaded model's prediction for the single (user, product) pair.
pred.take(1)

[Rating(user=196, product=242, rating=3.352108932428621)]

In [42]:
# Stop the SparkContext; this is the last cell, so no further Spark calls follow.
sc.stop()