# Spark MLlib

In [1]:
from __future__ import print_function

import pandas
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [2]:
# Reuse the active SparkContext when this cell is re-run: constructing a
# second SparkContext in the same kernel raises a ValueError, which made the
# cell non-idempotent. NOTE: despite the name, `spark` is a SparkContext,
# not a SparkSession.
spark = SparkContext.getOrCreate(SparkConf().setAppName("Pspark mllib Example"))

#### Load data

In [3]:
# Read the ratings CSV as an RDD of raw text lines (one string per line).
ds = spark.textFile('D:/data/csv/sparkmllib_test.csv')

# Split every line on commas, then turn the first three fields into an
# MLlib Rating(user, product, rating) record.
ds_rdd = (
    ds.map(lambda line: line.split(','))
      .map(lambda cols: Rating(int(cols[0]), int(cols[1]), float(cols[2])))
)

# Preview the first ten parsed ratings.
ds_rdd.take(10)

[Rating(user=1, product=1, rating=5.0),
 Rating(user=1, product=2, rating=1.0),
 Rating(user=1, product=3, rating=5.0),
 Rating(user=1, product=4, rating=1.0),
 Rating(user=2, product=1, rating=5.0),
 Rating(user=2, product=2, rating=1.0),
 Rating(user=2, product=3, rating=5.0),
 Rating(user=2, product=4, rating=1.0),
 Rating(user=3, product=1, rating=1.0),
 Rating(user=3, product=2, rating=5.0)]

#### Build & train model
Build the recommendation model using Alternating Least Squares (ALS).

In [4]:
# ALS hyper-parameters.
rank = 10    # number of latent factors in the factorization
epochs = 10  # forwarded to ALS as its `iterations` argument

# Train the matrix-factorization model on the full ratings RDD.
# A fixed seed is passed so the random factor initialisation (and therefore
# the predictions and MSE below) is the same on every Restart & Run All.
model = ALS.train(ds_rdd, rank, iterations=epochs, seed=42)

#### Test

In [5]:
# Keep only the (user, product) key of every rating; these pairs are what
# the model is asked to score. They come from the same RDD the model was
# trained on, so this is not a held-out evaluation set.
test_ds = ds_rdd.map(lambda rating: (rating.user, rating.product))
test_ds.take(10)

[(1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (3, 1),
 (3, 2)]

In [6]:
# Score every (user, product) pair and re-key each prediction as
# ((user, product), predicted_rating) so it can be joined with the actuals.
test_ds_pred = (
    model.predictAll(test_ds)
         .map(lambda pred: ((pred.user, pred.product), pred.rating))
)
test_ds_pred.take(10)

[((1, 1), 4.996853911927506),
 ((1, 2), 1.0005066589656035),
 ((1, 3), 4.996853911927506),
 ((1, 4), 1.0005066589656035),
 ((2, 1), 4.996853911927506),
 ((2, 2), 1.0005066589656035),
 ((2, 3), 4.996853911927506),
 ((2, 4), 1.0005066589656035),
 ((3, 1), 1.0004866575445481),
 ((3, 2), 4.996953923842356)]

In [7]:
# Key the observed ratings the same way as the predictions, then join on
# (user, product) to get ((user, product), (actual, predicted)) records.
test_ds_valid = (
    ds_rdd.map(lambda rating: ((rating.user, rating.product), rating.rating))
          .join(test_ds_pred)
)
test_ds_valid.take(10)

[((2, 2), (1.0, 1.0005066589656035)),
 ((1, 2), (1.0, 1.0005066589656035)),
 ((1, 4), (1.0, 1.0005066589656035)),
 ((3, 4), (5.0, 4.996953923842356)),
 ((2, 4), (1.0, 1.0005066589656035)),
 ((4, 2), (5.0, 4.996953923842356)),
 ((4, 3), (1.0, 1.0004866575445481)),
 ((2, 1), (5.0, 4.996853911927506)),
 ((4, 1), (1.0, 1.0004866575445481)),
 ((3, 1), (1.0, 1.0004866575445481))]

In [22]:
# Mean squared error over the joined (actual, predicted) rating pairs.
mse = test_ds_valid.values().map(lambda pair: (pair[0] - pair[1]) ** 2).mean()
print("MSE = ", mse)

MSE =  6.5692100820873525e-06


#### Save & Load

In [45]:
# Persist the trained model and read it back.
# The SparkContext in this notebook is bound to `spark`; the original cell
# passed an undefined name `sc`, which raises NameError on a fresh kernel.
# NOTE(review): save() is expected to fail if `model_file` already exists,
# so re-running this cell may require removing the directory first - confirm.
model_file = 'D:/data/model/spmllib1'
model.save(spark, model_file)
sameModel = MatrixFactorizationModel.load(spark, model_file)

In [23]:
# Shut down the SparkContext created at the top of the notebook; cells that
# use `spark` cannot be run again until the context is re-created.
spark.stop()

## Credits & Links

https://www.tutorialspoint.com/pyspark/pyspark_mllib.htm