In [3]:
from pyspark.sql import SparkSession 

In [4]:
# Start (or reuse) a Spark session that runs on the local master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("pyspark-recomm-1")
    .getOrCreate()
)
# Display the underlying SparkContext as the cell output.
spark.sparkContext

In [5]:
# Short handle to the SparkContext; the RDD-based cells below go through it.
sc = spark.sparkContext

In [6]:
# Load the raw ratings file as an RDD of text lines (one record per line).
movielens=sc.textFile("/home/jovyan/dataset/u.data")

In [7]:
# Peek at the first raw line; fields are tab-separated in the order
# user id | item id | rating | timestamp
movielens.first()

'196\t242\t3\t881250949'

In [8]:
# Number of lines (rating records) in the raw file.
movielens.count()

100000

In [9]:
# Split every raw line on the tab delimiter into a list of string fields.
clean_data = movielens.map(
    lambda line: line.split('\t')
)

In [10]:
# Inspect one parsed record: a list of string fields.
clean_data.take(1)

[['196', '242', '3', '881250949']]

In [11]:
# Keep only the rating column (third field), converted to int.
rate = clean_data.map(
    lambda fields: int(fields[2])
)

In [12]:
# Average rating over all records.
rate.mean()

3.529859999999947

In [13]:
# Count the distinct user ids (first field of every record).
users = clean_data.map(
    lambda fields: int(fields[0])
)
users.distinct().count()

943

In [14]:
# Count the distinct item ids (second field of every record).
(
    clean_data
    .map(lambda fields: int(fields[1]))
    .distinct()
    .count()
)

1682

In [15]:
from pyspark.mllib.recommendation import ALS,MatrixFactorizationModel, Rating

In [16]:
# Build the records in the layout the Rating object requires:
# Rating(user, item, rating)
mls = movielens.map(lambda line: line.split('\t'))
ratings = mls.map(
    lambda fields: Rating(
        int(fields[0]),    # user id
        int(fields[1]),    # item id
        float(fields[2]),  # rating
    )
)

In [17]:
# 70/30 train/test split; the fixed seed keeps the split the same across runs.
train, test = ratings.randomSplit(
    [0.7, 0.3],
    seed=7856,
)

In [18]:
# Number of ratings that landed in the training split.
train.count()

69958

In [19]:
# Number of ratings that landed in the test split.
test.count()

30042

In [20]:
# Cache both splits in memory
train.cache()
test.cache()

PythonRDD[19] at RDD at PythonRDD.scala:53

In [21]:
# Setting up the parameters for ALS

# number of latent factors to learn per user / per product
rank=5

# number of ALS iterations to run
numIterations=10

# Fixed seed for the factor initialisation. Without it ALS.train picks a new
# seed on every run, so the numbers in the cells below change on each re-run.
# Same value as the seed used for the train/test split.
alsSeed=7856

# fit the matrix-factorization model on the training split only
model = ALS.train(train, rank, numIterations, seed=alsSeed)



In [22]:
# examine the latent features for one product:
# first (product id, feature vector) entry of the product-factor RDD
model.productFeatures().first()

(1,
 array('d', [1.7188128232955933, -0.019026046618819237, 0.7569171190261841, 0.2330550104379654, -0.3280242085456848]))

In [23]:
# examine the latent features for one user:
# first (user id, feature vector) entry of the user-factor RDD
model.userFeatures().first()

(1,
 array('d', [1.4764412641525269, -0.0729249119758606, 1.3866300582885742, 0.462444931268692, -0.6250641345977783]))

In [24]:
# for user Y find N products to promote
# here: the top 10 products for user 196
model.recommendProducts(196,10)

[Rating(user=196, product=1434, rating=8.483746248652208),
 Rating(user=196, product=899, rating=7.19221208167518),
 Rating(user=196, product=1324, rating=7.049903990227193),
 Rating(user=196, product=1512, rating=7.025723133404712),
 Rating(user=196, product=745, rating=6.715743133553768),
 Rating(user=196, product=394, rating=6.69689580053476),
 Rating(user=196, product=589, rating=6.527601022770478),
 Rating(user=196, product=1385, rating=6.515465647363044),
 Rating(user=196, product=1472, rating=6.515231027601235),
 Rating(user=196, product=1279, rating=6.478517705616138)]

In [25]:
# for product X find N users to sell to
# here: the top 100 users for product 242
model.recommendUsers(242,100)

[Rating(user=471, product=242, rating=6.661716974806898),
 Rating(user=928, product=242, rating=6.0578018749782245),
 Rating(user=98, product=242, rating=5.994626918190361),
 Rating(user=240, product=242, rating=5.597313982227215),
 Rating(user=53, product=242, rating=5.5562673578912705),
 Rating(user=516, product=242, rating=5.456702612145154),
 Rating(user=274, product=242, rating=5.37396717354893),
 Rating(user=362, product=242, rating=5.360063082894397),
 Rating(user=180, product=242, rating=5.345888410391948),
 Rating(user=739, product=242, rating=5.275808446369879),
 Rating(user=4, product=242, rating=5.268901132570653),
 Rating(user=310, product=242, rating=5.221365004636517),
 Rating(user=848, product=242, rating=5.1792662765830535),
 Rating(user=419, product=242, rating=5.17166694890166),
 Rating(user=686, product=242, rating=5.146540012479504),
 Rating(user=153, product=242, rating=5.12077486273501),
 Rating(user=513, product=242, rating=5.0842307748023625),
 Rating(user=768,

In [26]:
# predict single product for single user
# here: the predicted rating of product 242 by user 196
model.predict(196,242)

3.8833548721460325

In [27]:
# predict multi users and multi products:
# strip the rating from every training record, keeping (user, product) pairs
pred_input = train.map(
    lambda record: (record[0], record[1])
)
pred_input.take(1)

[(196, 242)]

In [28]:
# Score the (user, product) pairs in pred_input with the trained model.
# returns Ratings(user, item, prediction)
pred = model.predictAll(pred_input) 
pred.take(1)

[Rating(user=195, product=1084, rating=3.865942608674886)]

In [29]:
# Get Performance Estimate
# Re-key both RDDs as ((user, product), rating) so that the true and the
# predicted ratings can be joined on the (user, product) pair.

true_reorg = train.map(
    lambda record: ((record[0], record[1]), record[2])
)
pred_reorg = pred.map(
    lambda record: ((record[0], record[1]), record[2])
)

In [30]:
# Join the two datasets on their (user, product) key;
# each value becomes a (true rating, predicted rating) pair.
true_pred=true_reorg.join(pred_reorg)

In [31]:
from math import sqrt

In [32]:
# Mean squared error over the joined (true, predicted) rating pairs.
MSE = (
    true_pred
    .map(lambda kv: (kv[1][0] - kv[1][1]) ** 2)
    .mean()
)

In [33]:
# Root mean squared error on the training pairs.
RMSE=sqrt(MSE)

In [34]:
# Report the training-set error metrics, one per line.
print(
    "MSE={}".format(MSE),
    "RMSE={}".format(sqrt(MSE)),
    sep="\n",
)

MSE=0.5840948079485064
RMSE=0.7642609553997289


In [35]:
# evaluation on the held-out test split
# (user, product) pairs to score
test_input=test.map(lambda x:(x[0],x[1]))
pred_test=model.predictAll(test_input)
# key the true and the predicted ratings by (user, product) so they can be joined
test_reorg=test.map(lambda x:((x[0],x[1]),x[2]))
# NOTE: this rebinds pred_reorg, which the training-error cell also assigns
pred_reorg = pred_test.map(lambda x:((x[0],x[1]), x[2]))
test_pred = test_reorg.join(pred_reorg)
test_MSE = test_pred.map(lambda r: (r[1][0] - r[1][1])**2).mean()
test_RMSE = sqrt(test_MSE)
# report the held-out error; previously it was computed but never shown
print("test_MSE={}".format(test_MSE))
print("test_RMSE={}".format(test_RMSE))

In [36]:
# save model
# MatrixFactorizationModel.save refuses to write into an existing directory,
# so re-running the notebook would fail here. Remove any previous export first.
# NOTE(review): assumes the path is on the driver's local filesystem (local
# master, no HDFS default FS) — confirm before running against a cluster.
import shutil

model_path = "/home/jovyan/dataset/ml-recommandation-model"
shutil.rmtree(model_path, ignore_errors=True)  # no-op when nothing was saved yet
model.save(sc, model_path)

In [37]:
# load model
# Read the factorization model back from the directory written by the save cell.
sameModel = MatrixFactorizationModel.load(sc, "/home/jovyan/dataset/ml-recommandation-model")

In [38]:
# prediction
# Score a single (user, product) pair with the reloaded model.
data = sc.parallelize([(196, 242)])
predictions = sameModel.predictAll(data)
# `pred` refers to the same RDD; this rebinds the name that earlier held the
# training-set predictions.
pred = predictions

In [39]:
# Show the prediction produced by the reloaded model.
pred.take(1)

[Rating(user=196, product=242, rating=3.8833548721460325)]