数据读取

In [26]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.appName("ALS_demo").getOrCreate()
ratings = spark.read.csv('hdfs://localhost:9000/input/ml_data/ratings.csv', header=True)
ratings = ratings.withColumn("userId", ratings.userId.astype("int"))
ratings = ratings.withColumn("movieId", ratings.userId.astype("int"))
ratings = ratings.withColumn("rating", ratings.userId.astype("float"))
ratings = ratings.withColumn("timestamp", ratings.userId.astype("int"))

(training, test) = ratings.randomSplit([0.8, 0.2])
print("Read successfully!")

Read successfully!


ALS模型的构建与学习

In [27]:
als = ALS(
    rank=20,
    maxIter=10,
    regParam=0.1,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy="drop",
    seed=0
)
als_model = als.fit(training)

print("Train successfully!")

Train successfully!


模型的预测与评估

In [28]:
predictions = als_model.transform(test)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)
rmse = evaluator.evaluate(predictions)

print("Predict successfully!")
print("RMS error = " + str(rmse))

Predict successfully!
RMS error = 0.011343638354936717


模型输出

In [29]:
import pandas as pd
users = ratings.select(als.getUserCol()).distinct().limit(3)
movies = ratings.select(als.getItemCol()).distinct().limit(3)
user_output = als_model.recommendForUserSubset(users, 2)
movie_output = als_model.recommendForItemSubset(movies, 2)

print(user_output.show(3, False))
print(movie_output.show(3, False))

# user_recs = user_output.toPandas().to_json(orient = 'records')
# movie_recs = movie_output.toPandas().to_json(orient = 'records')

# print(user_recs)
# print(movie_recs)

+------+------------------------------------------+
|userId|recommendations                           |
+------+------------------------------------------+
|471   |[{257565, 203324.53}, {278340, 203005.73}]|
|463   |[{267032, 198604.31}, {275279, 198525.1}] |
|148   |[{267663, 203747.0}, {275506, 196118.67}] |
+------+------------------------------------------+

None
+-------+---------------------------------------+
|movieId|recommendations                        |
+-------+---------------------------------------+
|471    |[{471, 470.99933}, {57342, 383.8851}]  |
|463    |[{463, 462.99936}, {134990, 369.27213}]|
|148    |[{148, 147.99792}, {21845, 123.926765}]|
+-------+---------------------------------------+

None
