In [1]:
from pyspark.sql import SparkSession 

In [2]:
# Start (or reuse) a Spark session running on the local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-recomm-1")
    .getOrCreate()
)
spark.sparkContext

In [3]:
# Short alias for the session's SparkContext; used below for the RDD API.
sc = spark.sparkContext

In [9]:
# Read the raw ratings file as an RDD of text lines.
ratings_path = "/home/jovyan/dataset/u.data"
movielens = sc.textFile(ratings_path)

In [10]:
# Each line is tab-separated: user id | item id | rating | timestamp
# Peek at the first record to confirm the layout.
movielens.first()

'196\t242\t3\t881250949'

In [11]:
# Number of lines (rating records) in the file.
movielens.count()

100000

In [12]:
# Split each line on the tab delimiter into a list of string fields.
clean_data = movielens.map(lambda line: line.split('\t'))

In [13]:
# Inspect one parsed record (fields are still strings at this point).
clean_data.take(1)

[['196', '242', '3', '881250949']]

In [14]:
# Third field is the rating; convert it to an integer.
rate = clean_data.map(lambda fields: int(fields[2]))

In [17]:
# Average rating over all records.
rate.mean()

3.529859999999947

In [21]:
# First field is the user id; count how many distinct users appear.
users = clean_data.map(lambda fields: int(fields[0]))
users.distinct().count()

943

In [23]:
# Second field is the item id; count how many distinct items appear.
item_ids = clean_data.map(lambda fields: int(fields[1]))
item_ids.distinct().count()

1682

In [24]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [25]:
# Prepare the data in the format required by the Rating object.
# A Rating is a (user, item, rating) triple.
mls = movielens.map(lambda line: line.split('\t'))
ratings = mls.map(
    lambda fields: Rating(int(fields[0]), int(fields[1]), float(fields[2]))
)

In [26]:
# 70/30 train/test split; the fixed seed keeps the split reproducible.
train, test = ratings.randomSplit(weights=[0.7, 0.3], seed=7856)

In [27]:
# Size of the training split.
train.count()

69958

In [28]:
# Size of the test split.
test.count()

30042

In [29]:
# Cache both splits in memory, since they are reused by several later cells.
train.cache()
test.cache()

PythonRDD[36] at RDD at PythonRDD.scala:53

In [31]:
# Setting up the parameters for ALS

# number of latent factors learned per user and per product
rank = 5

# number of ALS iterations to run
numIterations = 10

# Fixed seed for the random factor initialisation. Without it every run
# trains a different model and the outputs of the cells below change.
als_seed = 7856

# create the model on the training data
model = ALS.train(train, rank, numIterations, seed=als_seed)



In [32]:
# Examine the latent features for one product.
# The result is a (product id, factor array) pair; the array has `rank` entries.
model.productFeatures().first()

(1,
 array('d', [1.0611644983291626, -0.3682734966278076, 0.8250443339347839, -1.5765724182128906, 0.3025120198726654]))

In [33]:
# Examine the latent features for one user.
# The result is a (user id, factor array) pair; the array has `rank` entries.
model.userFeatures().first()

(1,
 array('d', [0.8092861175537109, 0.035514868795871735, 0.272292822599411, -1.825940728187561, -0.4982907474040985]))

In [34]:
# For user 196, find the top 10 products to promote.
# NOTE(review): the predicted scores in the output exceed the 1-5 rating scale,
# which may indicate overfitting / weak regularisation — confirm.
model.recommendProducts(196,10)

[Rating(user=196, product=1426, rating=10.267291555774372),
 Rating(user=196, product=1131, rating=9.773806810157895),
 Rating(user=196, product=536, rating=9.149057931645125),
 Rating(user=196, product=1288, rating=9.093548768685203),
 Rating(user=196, product=593, rating=9.025447828274348),
 Rating(user=196, product=1164, rating=8.799332434096375),
 Rating(user=196, product=1256, rating=8.673851959043471),
 Rating(user=196, product=1438, rating=8.641675414602329),
 Rating(user=196, product=960, rating=7.868645752414352),
 Rating(user=196, product=867, rating=7.693642080122711)]

In [37]:
# For product 242, find the top 100 users to sell to.
# NOTE(review): this prints 100 Rating rows; consider a smaller N for display.
model.recommendUsers(242,100)

[Rating(user=362, product=242, rating=5.991686728647808),
 Rating(user=180, product=242, rating=5.663905182138553),
 Rating(user=240, product=242, rating=5.410674476543293),
 Rating(user=148, product=242, rating=5.388165293143466),
 Rating(user=941, product=242, rating=5.355764045984312),
 Rating(user=310, product=242, rating=5.342369057568135),
 Rating(user=686, product=242, rating=5.3248222960986515),
 Rating(user=732, product=242, rating=5.304250103307785),
 Rating(user=688, product=242, rating=5.2433280418538235),
 Rating(user=563, product=242, rating=5.232785722718464),
 Rating(user=4, product=242, rating=5.229012756772483),
 Rating(user=415, product=242, rating=5.214984396327335),
 Rating(user=923, product=242, rating=5.133650187942909),
 Rating(user=565, product=242, rating=5.092504052307568),
 Rating(user=317, product=242, rating=5.0163693751905045),
 Rating(user=770, product=242, rating=5.010972999248235),
 Rating(user=9, product=242, rating=5.010693103063883),
 Rating(user=81

In [38]:
# Predict the rating of a single product (242) for a single user (196).
model.predict(196,242)

3.621867639596772

In [45]:
# Batch prediction input: the (user, product) pairs of the training split.
pred_input = train.map(lambda rating: (rating[0], rating[1]))
pred_input.take(1)

[(196, 242)]

In [47]:
# Predict ratings for every (user, product) pair in pred_input.
# Returns an RDD of Rating(user, product, predicted rating).
pred = model.predictAll(pred_input) 
pred.take(1)

[Rating(user=195, product=1084, rating=3.842449993928904)]

In [50]:
# Performance estimate on the training split.
# Re-key both RDDs as ((user, product), rating) so they can be joined.


def _key_by_user_product(rating):
    """Return ((user, product), rating) for one Rating record."""
    return ((rating[0], rating[1]), rating[2])


true_reorg = train.map(_key_by_user_product)
pred_reorg = pred.map(_key_by_user_product)

In [51]:
# Join actual and predicted ratings on the (user, product) key.
# Each element becomes ((user, product), (actual, predicted)).
true_pred=true_reorg.join(pred_reorg)

In [54]:
from math import sqrt

In [55]:
# Mean squared error between actual and predicted ratings (training split).
squared_errors = true_pred.map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
MSE = squared_errors.mean()

In [56]:
# Root mean squared error, in the same units as the ratings.
RMSE=sqrt(MSE)

In [61]:
# Report the training-split error metrics.
print("MSE={}".format(MSE))
print("RMSE={}".format(sqrt(MSE)))

MSE=0.5826547535558756
RMSE=0.763318251816289


In [65]:
# Evaluation on the held-out test split.
test_input = test.map(lambda x: (x[0], x[1]))
pred_test = model.predictAll(test_input)

# Key both RDDs by (user, product) so they can be joined.
test_reorg = test.map(lambda x: ((x[0], x[1]), x[2]))
# Use a distinct name so the training-set `pred_reorg` is not overwritten;
# otherwise re-running the training join would mix in test predictions.
test_pred_reorg = pred_test.map(lambda x: ((x[0], x[1]), x[2]))

# NOTE(review): pairs with no prediction drop out of this inner join, so the
# metrics cover only the pairs the model could score — presumably users or
# products missing from the training split; confirm coverage.
test_pred = test_reorg.join(test_pred_reorg)
test_MSE = test_pred.map(lambda r: (r[1][0] - r[1][1])**2).mean()
test_RMSE = sqrt(test_MSE)

# Show the test metrics so they can be compared with the training ones.
print("test MSE={}".format(test_MSE))
print("test RMSE={}".format(test_RMSE))

In [66]:
# Save the trained model to disk.
# NOTE(review): re-running this cell presumably fails if the target path
# already exists — remove the directory or use a fresh path; confirm.
model.save(sc,"/home/jovyan/dataset/ml-recommandation-model")

In [67]:
# Load the model back from the path it was saved to above.
sameModel = MatrixFactorizationModel.load(sc, "/home/jovyan/dataset/ml-recommandation-model")

In [78]:
# Score a single (user, product) pair with the reloaded model.
data = sc.parallelize([(196, 242)])
predictions = sameModel.predictAll(data)
# `pred` is rebound to the same RDD, replacing the earlier training predictions.
pred = predictions

In [81]:
# Show the prediction; it matches the earlier model.predict(196, 242) output.
pred.take(1)

[Rating(user=196, product=242, rating=3.621867639596772)]