In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType,ArrayType,LongType
import logging
import sys
import traceback
import ast
import json
import os
import sys
import traceback
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr,when,to_date ,udf, concat_ws,posexplode, from_json
from pyspark.sql.types import StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType
from pyspark.sql import functions as F

In [2]:
spark = SparkSession.builder \
    .appName("MinIO with Delta Lake") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g")  \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar") \
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/") \
    .config("spark.sql.pivotMaxValues", 100000) \
    .getOrCreate()


In [3]:
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Đọc dữ liệu movie_names và ratings_data từ Delta
movies  = spark.read.format("delta").load("s3a://lakehouse/gold/dim_movie")
movies  = movies .select("id", "title")
movies  = movies .withColumnRenamed("id", "movieId")

ratings = spark.read.format("delta").load("s3a://lakehouse/silver/ratings")
ratings = ratings.select("userId", "movieId", "rating")
ratings = ratings.limit(1000000)


In [4]:
ratings = ratings.select("userId", "movieId", "rating")
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [5]:
df = ratings.withColumn('userId', ratings['userId'].cast('int')).\
withColumn('movieId', ratings['movieId'].cast('int')).withColumn('rating', ratings['rating'].cast('float'))
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: float (nullable = true)



In [6]:
train, validation, test = df.randomSplit([0.6,0.2,0.2], seed = 0)
print("The number of ratings in each set: {}, {}, {}".format(train.count(), validation.count(), test.count()))

The number of ratings in each set: 600468, 199309, 200223


In [7]:
from pyspark.sql.functions import col, sqrt
def RMSE(predictions):
    squared_diff = predictions.withColumn("squared_diff", pow(col("rating") - col("prediction"), 2))
    mse = squared_diff.selectExpr("mean(squared_diff) as mse").first().mse
    return mse ** 0.5

In [8]:
# implement the model using ALS algorithm and find the right hyperparameters using Grid Search
from pyspark.ml.recommendation import ALS
import time  

def GridSearch(train, valid, num_iterations, reg_param, n_factors):
    min_rmse = float('inf')
    best_n = -1
    best_reg = 0
    best_model = None
    # run Grid Search for all the parameter defined in the range in a loop
    for n in n_factors:
        for reg in reg_param:
            als = ALS(rank = n, 
                      maxIter = num_iterations, 
                      seed = 0, 
                      regParam = reg,
                      userCol="userId", 
                      itemCol="movieId", 
                      ratingCol="rating", 
                      coldStartStrategy="drop")            
            model = als.fit(train)
            predictions = model.transform(valid)
            rmse = RMSE(predictions)     
            print('{} latent factors and regularization = {}: validation RMSE is {}'.format(n, reg, rmse))
            # track the best model using RMSE
            if rmse < min_rmse:
                min_rmse = rmse
                best_n = n
                best_reg = reg
                best_model = model
                
    pred = best_model.transform(train)
    train_rmse = RMSE(pred)
    # best model and its metrics
    print('\nThe best model has {} latent factors and regularization = {}:'.format(best_n, best_reg))
    print('traning RMSE is {}; validation RMSE is {}'.format(train_rmse, min_rmse))
    return best_model

In [9]:
# build the model using different ranges for Grid Search
from pyspark.sql.functions import col, sqrt
num_iterations = 10
ranks = [6, 8, 10, 12]
reg_params = [0.05, 0.1, 0.2, 0.4, 0.8]

start_time = time.time()
final_model = GridSearch(train, validation, num_iterations, reg_params, ranks)
print('Total Runtime: {:.2f} seconds'.format(time.time() - start_time))

6 latent factors and regularization = 0.05: validation RMSE is 0.8560102329688704
6 latent factors and regularization = 0.1: validation RMSE is 0.8424453107085027
6 latent factors and regularization = 0.2: validation RMSE is 0.8665058463950894
6 latent factors and regularization = 0.4: validation RMSE is 0.9518573820543371
6 latent factors and regularization = 0.8: validation RMSE is 1.1770926108102437
8 latent factors and regularization = 0.05: validation RMSE is 0.8557048412606626
8 latent factors and regularization = 0.1: validation RMSE is 0.8388843064545881
8 latent factors and regularization = 0.2: validation RMSE is 0.8646939342217799
8 latent factors and regularization = 0.4: validation RMSE is 0.9515318661395938
8 latent factors and regularization = 0.8: validation RMSE is 1.1770993896790543
10 latent factors and regularization = 0.05: validation RMSE is 0.8588868809703648
10 latent factors and regularization = 0.1: validation RMSE is 0.8397100311799146
10 latent factors and r

In [10]:
pred_test = final_model.transform(test)
print('The testing RMSE is ' + str(RMSE(pred_test)))

The testing RMSE is 0.8411055768066206


In [11]:
single_user = test.filter(test['userId']==12).select(['movieId','userId'])
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     50|    12|
|     82|    12|
|    157|    12|
|    194|    12|
|    198|    12|
|    247|    12|
|    301|    12|
|    334|    12|
|    363|    12|
|    492|    12|
|    589|    12|
|    590|    12|
|    728|    12|
|    759|    12|
|    903|    12|
|    940|    12|
|    994|    12|
|   1023|    12|
|   1059|    12|
|   1177|    12|
+-------+------+
only showing top 20 rows



In [12]:
single_user.join(movies, single_user.movieId == movies.movieId, 'inner').show()


+-------+------+-------+--------------------+
|movieId|userId|movieId|               title|
+-------+------+-------+--------------------+
|     82|    12|     82|          Miami Vice|
|    157|    12|    157|Star Trek III: Th...|
|    194|    12|    194|              Amélie|
|    198|    12|    198|  To Be or Not to Be|
|    247|    12|    247|         The Killing|
|    301|    12|    301|           Rio Bravo|
|    334|    12|    334|            Magnolia|
|    363|    12|    363|             Head-On|
|    492|    12|    492|Being John Malkovich|
|    590|    12|    590|           The Hours|
|    759|    12|    759|Gentlemen Prefer ...|
|    903|    12|    903|      Cool Hand Luke|
|    940|    12|    940|   The Lady Vanishes|
|    994|    12|    994|          Straw Dogs|
|   1023|    12|   1023|       Adam's Apples|
|   1059|    12|   1059| The Hidden Fortress|
|   1639|    12|   1639|Speed 2: Cruise C...|
|   1641|    12|   1641|    Forces of Nature|
|   1643|    12|   1643| Last Tang

In [13]:
reccomendations = final_model.transform(single_user)
reccomendations.orderBy('prediction',ascending=False).show()

+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|   3077|    12| 4.6133533|
|   1952|    12| 4.4621687|
|     50|    12|  4.459092|
|    903|    12| 4.2934875|
|    247|    12|  4.289667|
|    194|    12|   4.28108|
|   2020|    12|   4.23416|
|   1299|    12|  4.185307|
|   1648|    12|  4.159925|
|   1293|    12| 4.1201406|
|   1960|    12| 4.1085896|
|     82|    12|  4.091055|
|    363|    12|  4.085567|
|   1264|    12|  4.062988|
|    994|    12|  4.050111|
|   1834|    12| 3.9850245|
|   1643|    12| 3.9663813|
|   1641|    12| 3.9311318|
|   1214|    12| 3.9072285|
|   2580|    12| 3.9030151|
+-------+------+----------+
only showing top 20 rows



In [14]:
reccomendations.join(movies, reccomendations.movieId == movies.movieId, 'inner').show()


+-------+------+----------+-------+--------------------+
|movieId|userId|prediction|movieId|               title|
+-------+------+----------+-------+--------------------+
|     82|    12|  4.091055|     82|          Miami Vice|
|    157|    12| 2.3696418|    157|Star Trek III: Th...|
|    194|    12|   4.28108|    194|              Amélie|
|    198|    12|  3.441933|    198|  To Be or Not to Be|
|    247|    12|  4.289667|    247|         The Killing|
|    301|    12| 2.4510481|    301|           Rio Bravo|
|    334|    12| 3.8907888|    334|            Magnolia|
|    363|    12|  4.085567|    363|             Head-On|
|    492|    12|  3.868984|    492|Being John Malkovich|
|    590|    12| 3.2765384|    590|           The Hours|
|    759|    12| 3.8330197|    759|Gentlemen Prefer ...|
|    903|    12| 4.2934875|    903|      Cool Hand Luke|
|    940|    12| 3.4327934|    940|   The Lady Vanishes|
|    994|    12|  4.050111|    994|          Straw Dogs|
|   1023|    12| 3.4472713|   1

In [15]:
from pyspark.sql.functions import col, lit

# select a single user from the test set
user_id = 12
single_user_ratings = test.filter(test['userId'] == user_id).select(['movieId', 'userId', 'rating'])

# display the movies the user has liked
print("Movies liked by user with ID", user_id)
single_user_ratings.join(movies, 'movieId').select('movieId', 'title', 'rating').show()

# generate recommendations for the user
all_movies = df.select('movieId').distinct()
user_movies = single_user_ratings.select('movieId').distinct()
movies_to_recommend = all_movies.subtract(user_movies)

# predict ratings for movies the user has not rated yet
recommendations = final_model.transform(movies_to_recommend.withColumn('userId', lit(user_id)))

# filter out the movies that the user has already rated or seen (this filters out the movies that the user has not liked as well)
recommendations = recommendations.filter(col('prediction') > 0)

# display the recommendations with movie names
print("Recommended movies for user with ID", user_id)
recommended_movies = recommendations.join(movies, 'movieId').select('movieId', 'title', 'prediction')

# Sort recommended movies by prediction in descending order
ordered_recommendations = recommended_movies.orderBy(col('prediction').desc())

# Display the ordered recommendations
ordered_recommendations.show()

Movies liked by user with ID 12
+-------+--------------------+------+
|movieId|               title|rating|
+-------+--------------------+------+
|     82|          Miami Vice|   4.0|
|    157|Star Trek III: Th...|   1.0|
|    194|              Amélie|   5.0|
|    198|  To Be or Not to Be|   1.0|
|    247|         The Killing|   5.0|
|    301|           Rio Bravo|   2.0|
|    334|            Magnolia|   3.0|
|    363|             Head-On|   4.0|
|    492|Being John Malkovich|   4.0|
|    590|           The Hours|   4.0|
|    759|Gentlemen Prefer ...|   5.0|
|    903|      Cool Hand Luke|   5.0|
|    940|   The Lady Vanishes|   3.0|
|    994|          Straw Dogs|   5.0|
|   1023|       Adam's Apples|   5.0|
|   1059| The Hidden Fortress|   4.0|
|   1639|Speed 2: Cruise C...|   5.0|
|   1641|    Forces of Nature|   3.0|
|   1643| Last Tango in Paris|   4.0|
|   1648|Bill & Ted's Exce...|   5.0|
+-------+--------------------+------+
only showing top 20 rows

Recommended movies for user wi