In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Build (or reuse) a Spark session running in local mode for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-mllib-recommendation")
    .getOrCreate()
)
# Show the session's SparkContext as the cell output.
spark.sparkContext

In [3]:
# Short alias for the session's SparkContext; the RDD-based calls below use it.
sc = spark.sparkContext

In [4]:
# Load the raw ratings file as an RDD with one text line per record.
movielens=sc.textFile("/home/jovyan/dataset/u.data")

In [5]:
# Peek at the first record.
# Tab-separated fields: user id, item id, rating, timestamp
movielens.first()

'196\t242\t3\t881250949'

In [6]:
# Total number of rating records in the file.
movielens.count()

100000

In [7]:
# Handle the delimiter: split every raw line on tabs into a list of fields.
clean_data = movielens.map(lambda line: line.split('\t'))

In [8]:
# Check one parsed record: a list of string fields.
clean_data.take(1)

[['196', '242', '3', '881250949']]

In [10]:
# Take the third column (the rating) and convert it to an integer.
rate = clean_data.map(lambda fields: int(fields[2]))

In [11]:
# Average rating across all records.
rate.mean()

3.529859999999947

In [12]:
# See how many distinct users there are (first column is the user id).
users = clean_data.map(lambda fields: int(fields[0]))
users.distinct().count()

943

In [13]:
# See how many distinct products there are (second column is the item id).
(
    clean_data
    .map(lambda fields: int(fields[1]))
    .distinct()
    .count()
)

1682

In [14]:
# ALS recommendation algorithm
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [15]:
# Prepare the data in the format the Rating object requires.
# A Rating is laid out as (user, item, rating).
mls = movielens.map(lambda line: line.split('\t'))
ratings = mls.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)

In [16]:
# Split the data: 70% for training, 30% for testing, with a fixed seed.
train, test = ratings.randomSplit([0.7, 0.3], seed=7856)

In [40]:
# Inspect one training record (a Rating object).
train.take(1)

[Rating(user=196, product=242, rating=3.0)]

In [17]:
# Number of records in the training split.
train.count()

69958

In [18]:
# Number of records in the test split.
test.count()

30042

In [19]:
# Cache both splits in memory
# so that later calls on them are faster.
train.cache()
test.cache()

PythonRDD[19] at RDD at PythonRDD.scala:53

In [23]:
# Number of latent features; start with 5.
rank = 5

# How many iterations to run while the factorization converges.
numIterations = 10

# Seed for ALS's random factor initialisation. Without it every run trains a
# different model, so the recommendations and error metrics below could not
# be reproduced even though the train/test split itself is seeded.
als_seed = 7856

# Build the model.
model = ALS.train(train, rank, numIterations, seed=als_seed)

In [24]:
# The 5 latent feature values of a product (first entry only).
model.productFeatures().first()

(1,
 array('d', [0.5213958621025085, 0.13763175904750824, -0.017594898119568825, 1.8354758024215698, 0.40850913524627686]))

In [25]:
# The 5 latent feature values of a user (first entry only).
model.userFeatures().first()

(1,
 array('d', [0.021873801946640015, 0.5708807706832886, 0.04599589854478836, 2.074838638305664, 0.2731609046459198]))

In [26]:
# For user 196: the top 10 recommended products with their scores (highest first).
model.recommendProducts(196,10)

[Rating(user=196, product=1426, rating=7.237634527305012),
 Rating(user=196, product=867, rating=7.192165136068159),
 Rating(user=196, product=593, rating=7.183379122634411),
 Rating(user=196, product=793, rating=6.719495557695396),
 Rating(user=196, product=394, rating=6.658248992680534),
 Rating(user=196, product=1294, rating=6.544150037831152),
 Rating(user=196, product=1153, rating=6.384086541982359),
 Rating(user=196, product=1327, rating=6.355679960337517),
 Rating(user=196, product=1410, rating=6.340010332289142),
 Rating(user=196, product=850, rating=6.336501945554332)]

In [28]:
# For product 242: the top 10 recommended users with their scores (highest first).
model.recommendUsers(242,10)

[Rating(user=153, product=242, rating=7.499225878582978),
 Rating(user=341, product=242, rating=6.267978213321133),
 Rating(user=415, product=242, rating=5.720972375940892),
 Rating(user=310, product=242, rating=5.7013117262324435),
 Rating(user=519, product=242, rating=5.667413133641783),
 Rating(user=4, product=242, rating=5.6000255372331935),
 Rating(user=731, product=242, rating=5.582284195579756),
 Rating(user=219, product=242, rating=5.57167447872445),
 Rating(user=550, product=242, rating=5.382509048782568),
 Rating(user=34, product=242, rating=5.355218000933439)]

In [29]:
# Single lookup: predicted rating of user 196 for product 242.
model.predict(196,242)

3.7152653226588477

In [34]:
# Ask the model for a prediction on every (user, product) pair of the
# training set, so the predictions can be compared with the known ratings.
pred_input = train.map(lambda r: (r[0], r[1]))
pred = model.predictAll(pred_input)

In [44]:
# Re-key both RDDs as ((user, product), rating) so they can be joined.
# true_reorg: the actual ratings from the training set
# pred_reorg: the model's predicted ratings for those training pairs
true_reorg = train.map(lambda r: ((r[0], r[1]), r[2]))
pred_reorg = pred.map(lambda r: ((r[0], r[1]), r[2]))

In [47]:
# Join the two datasets on their (user, product) key, giving
# ((user, product), (actual rating, predicted rating)) records.
true_pred=true_reorg.join(pred_reorg)

In [48]:
from math import sqrt

In [50]:
# MSE = sum of (predicted rating - actual rating)^2 / number of samples
MSE = (
    true_pred
    .map(lambda pair: (pair[1][0] - pair[1][1]) ** 2)
    .mean()
)

In [51]:
# RMSE is the square root of the MSE.
RMSE=sqrt(MSE)

In [52]:
# Report the training-set error metrics, one per line.
print(
    "MSE={}".format(MSE),
    "RMSE={}".format(sqrt(MSE)),
    sep="\n",
)

MSE=0.5782882600292798
RMSE=0.7604526678428315


In [53]:
# Model performance on the held-out test set.
# Every name here carries a test prefix so the training-set variables from the
# earlier cells (in particular pred_reorg) are not overwritten; re-running the
# training-set join after this cell therefore still joins the right data.
test_input = test.map(lambda r: (r[0], r[1]))
pred_test = model.predictAll(test_input)

# Re-key actual and predicted ratings as ((user, product), rating).
test_reorg = test.map(lambda r: ((r[0], r[1]), r[2]))
test_pred_reorg = pred_test.map(lambda r: ((r[0], r[1]), r[2]))

# Inner join: any test pair for which predictAll returned no prediction is
# dropped here and does not contribute to the metrics.
# NOTE(review): presumably those are users/products unseen in training — confirm.
test_pred = test_reorg.join(test_pred_reorg)
test_MSE = test_pred.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()
test_RMSE = sqrt(test_MSE)

# Show the results so they can be compared with the training-set metrics.
print("test MSE={}".format(test_MSE))
print("test RMSE={}".format(test_RMSE))

In [54]:
# Save the model.
# model.save fails when the target directory already exists, so delete any
# copy left by a previous run first; otherwise this cell breaks on re-run.
# NOTE(review): assumes the path resolves to the local filesystem (the session
# runs with master "local") — confirm before using a distributed filesystem.
import shutil

model_path = "/home/jovyan/dataset/ml-recommandation-model"
shutil.rmtree(model_path, ignore_errors=True)
model.save(sc, model_path)

In [55]:
# Load the model back from the directory it was saved to.
sameModel=MatrixFactorizationModel.load(sc, "/home/jovyan/dataset/ml-recommandation-model")

In [56]:
#讀取模型後立即預測
# Score one (user, product) pair with the reloaded model.
data = sc.parallelize([(196, 242)])
predictions = sameModel.predictAll(data)
pred = predictions  # both names refer to the same prediction RDD

In [57]:
# Show the prediction produced by the reloaded model.
pred.take(1)

[Rating(user=196, product=242, rating=3.7152653226588477)]

In [58]:
# Shut down the SparkContext now that the notebook is finished.
sc.stop()