## Ony Novianti
#### 2041720029 (TI-3A)


In [None]:
# Source: https://github.com/cloudxlab/bigdata/blob/master/spark/examples/mllib/ml-recommender.scala

In [None]:
# Mount Google Drive at /content/drive so the data stored there can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install PySpark
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=238e84fd597bd5e3fe4816d7ea558ac7540d4edf8a4b2d75761922b7f17665c6
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [None]:
# Import Library
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Movie Lens")
    .getOrCreate()
)

In [None]:
# Parse a "::"-delimited string into a rating tuple
def parseRating(str):
    """Convert one ``user::movie::rating::timestamp`` line into a tuple.

    Args:
        str: A line containing exactly four fields separated by ``"::"``.

    Returns:
        ``(userId, movieId, rating, timestamp)`` as ``(int, int, float, int)``.

    Raises:
        AssertionError: If the line does not split into exactly four fields.
        ValueError: If a field cannot be converted to its numeric type.
    """
    parts = str.split("::")
    assert len(parts) == 4
    user_id, movie_id, score, stamp = parts
    return (int(user_id), int(movie_id), float(score), int(stamp))


In [None]:
# Read the ratings file: each text row carries one raw line in its first column,
# which is parsed into (userId, movieId, rating, timestamp).
raw = (
    spark.read
    .text("/content/drive/MyDrive/TI-3A/ml-1m/ratings.dat")
    .rdd
    .map(lambda row: row[0])
)
ratings = raw.map(parseRating).toDF(
    ["userId", "movieId", "rating", "timestamp"]
)
ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|   1193|   5.0|978300760|
|     1|    661|   3.0|978302109|
|     1|    914|   3.0|978301968|
|     1|   3408|   4.0|978300275|
|     1|   2355|   5.0|978824291|
+------+-------+------+---------+
only showing top 5 rows



In [None]:
# Split the ratings into 80% training data and 20% test data.
# A fixed seed keeps the split repeatable between runs on the same input;
# without it every run trains and evaluates on a different partition.
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Build the ALS recommendation model.
# coldStartStrategy="drop": by default ALS emits NaN predictions for users or
# movies that did not appear in the training split, which turns any aggregate
# error metric into NaN. Dropping those rows keeps evaluation meaningful.
# seed: fixes the random factor initialisation so training is repeatable.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

# Overwrite any model saved by a previous run; a plain save() raises when the
# target path already exists, which breaks re-running the notebook.
model.write().overwrite().save("mymodel")

In [None]:
# Predict ratings for the test split
predictions = model.transform(test)

# Mean squared error over the rows that actually received a prediction.
# - Rows whose prediction is null or NaN (users/movies unseen during training)
#   are dropped first; isNull() alone does not catch NaN, so the metric used
#   to come out as NaN.
# - The squared differences are averaged ("avg"); summing them would give the
#   total squared error, not the mean.
mse = (
    predictions
    .na.drop(subset=["prediction"])
    .select(((col("rating") - col("prediction")) ** 2).alias("squared_diff"))
    .agg({"squared_diff": "avg"})
    .collect()[0][0]
)
print("Mean Squared Error:", mse)

predictions.show(10)

Mean Squared Error: nan
+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     1|    527|   5.0|978824195|  4.426703|
|     1|    594|   4.0|978302268| 4.0526624|
|     1|    595|   5.0|978824268|  4.909911|
|     1|    661|   3.0|978302109| 2.5657692|
|     1|    919|   4.0|978301368| 4.2621155|
|     1|   1028|   5.0|978301777|  4.359399|
|     1|   1035|   5.0|978301753|  4.582177|
|     1|   1097|   4.0|978301953| 4.1121674|
|     1|   1836|   5.0|978300172| 3.2417402|
|     1|   1907|   4.0|978824330| 4.5896945|
+------+-------+------+---------+----------+
only showing top 10 rows



In [None]:
# Save the predictions as CSV.
# mode("overwrite") replaces the output of a previous run; the default save
# mode raises an error when "ml-predictions.csv" already exists, so the
# notebook could not be re-run end to end.
predictions.write.format("csv").mode("overwrite").save("ml-predictions.csv")