In [1]:
import findspark
# Must run before the pyspark imports in the following cells: findspark.init()
# locates the local Spark installation and makes pyspark importable here.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every cell below.
# The application name is the label attached to this job, so it names the
# ALS recommendation workload this notebook actually runs (the previous
# "Frequent Itemsets" label was left over from a different example).
spark = (
    SparkSession.builder
    .appName("Python Spark ALS Recommendation Example")
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x000001EF02246C88>


In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

## Praproses

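Tahap praproses membaca data rating pengguna dari berkas teks dan mengubahnya menjadi DataFrame Spark. Sesuaikan nama berkas dan pemisah kolom (`::`) dengan dataset yang digunakan.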

In [11]:
# Load the ratings file as plain text: each line is a single
# "userId::movieId::rating::timestamp" record held in a `value` column.
ratings_text = spark.read.text("sample_movielens_ratings.txt")

# Switch to the RDD API so each record can be parsed with plain Python.
lines = ratings_text.rdd
print(lines.take(5))

[Row(value='0::2::3::1424380312'), Row(value='0::3::1::1424380312'), Row(value='0::5::2::1424380312'), Row(value='0::9::4::1424380312'), Row(value='0::11::1::1424380312')]


In [17]:
def _split_fields(row):
    """Split one raw text Row into its "::"-separated string fields."""
    return row.value.split("::")


parts = lines.map(_split_fields)

In [24]:
# Turn each list of string fields into a typed Row:
# positions 0, 1 and 3 are integers, position 2 (the rating) is a float.
ratingsRDD = parts.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3])
    )
)

In [25]:
# Sanity check: print the first five parsed Row records to confirm that the
# fields were converted to numeric types.
print(ratingsRDD.take(5))

[Row(movieId=2, rating=3.0, timestamp=1424380312, userId=0), Row(movieId=3, rating=1.0, timestamp=1424380312, userId=0), Row(movieId=5, rating=2.0, timestamp=1424380312, userId=0), Row(movieId=9, rating=4.0, timestamp=1424380312, userId=0), Row(movieId=11, rating=1.0, timestamp=1424380312, userId=0)]


In [27]:
# Build a DataFrame from the RDD of Row records; the column names come from
# the Row field names (movieId, rating, timestamp, userId in the output below).
ratings = spark.createDataFrame(ratingsRDD)
# Preview the resulting table.
ratings.show()

+-------+------+----------+------+
|movieId|rating| timestamp|userId|
+-------+------+----------+------+
|      2|   3.0|1424380312|     0|
|      3|   1.0|1424380312|     0|
|      5|   2.0|1424380312|     0|
|      9|   4.0|1424380312|     0|
|     11|   1.0|1424380312|     0|
|     12|   2.0|1424380312|     0|
|     15|   1.0|1424380312|     0|
|     17|   1.0|1424380312|     0|
|     19|   1.0|1424380312|     0|
|     21|   1.0|1424380312|     0|
|     23|   1.0|1424380312|     0|
|     26|   3.0|1424380312|     0|
|     27|   1.0|1424380312|     0|
|     28|   1.0|1424380312|     0|
|     29|   1.0|1424380312|     0|
|     30|   1.0|1424380312|     0|
|     31|   1.0|1424380312|     0|
|     34|   1.0|1424380312|     0|
|     37|   1.0|1424380312|     0|
|     41|   2.0|1424380312|     0|
+-------+------+----------+------+
only showing top 20 rows



## Create Model

In [28]:
# Hold out roughly 20% of the ratings for evaluation.
# A fixed seed keeps the split (and therefore the reported RMSE) reproducible
# across kernel restarts; without it every run samples a different split.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [29]:
# Build the recommendation model using ALS on the training data.
# - coldStartStrategy="drop" removes predictions for users/items unseen during
#   training, so the evaluation metric does not come out as NaN.
# - seed fixes the random initialization of the factor matrices so that
#   repeated runs on the same training data yield the same model.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [30]:
# Score the held-out test set and measure how far the predicted ratings are
# from the actual ones using root-mean-square error (lower is better).
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse"
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = {}".format(rmse))

Root-mean-square error = 1.7234586840615076


In [31]:
# Number of recommendations to produce per user and per movie.
num_recs = 10

# For every user: the top `num_recs` movies.
userRecs = model.recommendForAllUsers(num_recs)
# For every movie: the top `num_recs` users.
movieRecs = model.recommendForAllItems(num_recs)

In [32]:
# Preview the per-user recommendations: one row per userId with a list of
# [movie, score] pairs (show() truncates the list and prints the top 20 rows).
userRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    28|[[81, 4.8676157],...|
|    26|[[90, 5.7242403],...|
|    27|[[69, 4.5157194],...|
|    12|[[17, 4.8915787],...|
|    22|[[39, 5.1836114],...|
|     1|[[51, 4.562784], ...|
|    13|[[76, 3.2516534],...|
|     6|[[25, 4.856877], ...|
|    16|[[85, 5.1414394],...|
|     3|[[85, 4.76359], [...|
|    20|[[22, 4.468463], ...|
|     5|[[90, 4.2194633],...|
|    19|[[90, 3.9035926],...|
|    15|[[46, 4.9867916],...|
|    17|[[46, 5.169761], ...|
|     9|[[31, 4.4160566],...|
|     4|[[77, 4.1378527],...|
|     8|[[29, 5.15897], [...|
|    23|[[17, 5.540481], ...|
|     7|[[25, 4.7176924],...|
+------+--------------------+
only showing top 20 rows



In [33]:
# Preview the per-movie recommendations: one row per movieId with a list of
# [user, score] pairs (show() truncates the list and prints the top 20 rows).
movieRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     31|[[9, 4.4160566], ...|
|     85|[[16, 5.1414394],...|
|     65|[[23, 4.8290615],...|
|     53|[[22, 5.088476], ...|
|     78|[[26, 1.8592147],...|
|     34|[[2, 3.986309], [...|
|     81|[[28, 4.8676157],...|
|     28|[[18, 5.100177], ...|
|     76|[[14, 4.9362936],...|
|     26|[[0, 2.913889], [...|
|     27|[[11, 4.890229], ...|
|     44|[[25, 4.750662], ...|
|     12|[[22, 5.0110507],...|
|     91|[[12, 3.1462245],...|
|     22|[[26, 5.1236525],...|
|     93|[[14, 3.0236852],...|
|     47|[[7, 4.0010138], ...|
|      1|[[15, 3.775174], ...|
|     52|[[24, 5.166397], ...|
|     13|[[11, 4.1079755],...|
+-------+--------------------+
only showing top 20 rows

