In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
import os
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import os


In [None]:
# Pin the Spark release in one place and export it to the process environment,
# so the `!` shell lines in the install cell can expand $SPARK_VERSION.
spark_version = "spark-3.5.5"
os.environ.update(SPARK_VERSION=spark_version)

In [None]:
# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,000 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,533 kB]
Get:13 http://security.ubuntu.com/ubun

In [None]:
# Let findspark locate the Spark installation, then create (or reuse) the
# session used by every later cell.
import findspark

findspark.init()

spark = (
    SparkSession.builder
    .appName("MovieRecommendation")
    .getOrCreate()
)

In [None]:
# --- Load the raw inputs ---------------------------------------------------
csv_options = {"header": True, "inferSchema": True}
ratings_df = spark.read.csv("/content/ratings.csv", **csv_options)
links_df = spark.read.csv("/content/links.csv", **csv_options)
tmdb_df = spark.read.json("/content/movie_results.json")

# --- Attach TMDB metadata to each rating -----------------------------------
# ratings -> links (on movieId) gives the tmdbId; tmdbId -> tmdb_df.id gives
# the movie record. Both joins are inner, so unmatched rows are discarded.
same_movie = ratings_df.movieId == links_df.movieId
ratings_with_tmdb_df = ratings_df.join(links_df, same_movie, how="inner")

same_tmdb_entry = ratings_with_tmdb_df.tmdbId == tmdb_df.id
final_df = ratings_with_tmdb_df.join(tmdb_df, same_tmdb_entry, how="inner")

# Drop the join keys and imdbId, which are not needed downstream.
final_cleaned_df = final_df.drop("movieId", "id", "imdbId")

# --- Per-user mean-centred rating ------------------------------------------
# NOTE(review): the ALS cell trains on `rating`, not `normalized_rating`, and
# this per-user mean is computed before the train/test split -- confirm that
# both are intended.
average_rating = final_cleaned_df.groupBy("userId").agg(
    F.avg("rating").alias("avg_rating")
)
rating_offset = final_cleaned_df["rating"] - average_rating["avg_rating"]
normalized_ratings_df = (
    final_cleaned_df
    .join(average_rating, on="userId")
    .withColumn("normalized_rating", rating_offset)
)



In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
split_weights = [0.8, 0.2]
training_df, testing_df = normalized_ratings_df.randomSplit(split_weights, seed=42)


In [None]:
# ALS estimator: users are keyed by userId, items by tmdbId, and the target is
# the raw `rating` column. coldStartStrategy="drop" and nonnegative=True are
# passed through unchanged.
als = ALS(
    userCol="userId",
    itemCol="tmdbId",
    ratingCol="rating",
    coldStartStrategy="drop",
    nonnegative=True,
)


In [None]:
# Hyper-parameter grid for cross-validation. Each list holds a single value,
# so the grid contains exactly one candidate; add values to widen the search.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(als.rank, [5])          # low rank
grid_builder.addGrid(als.maxIter, [5])       # few iterations
grid_builder.addGrid(als.regParam, [0.01])   # light regularization
paramGrid = grid_builder.build()

# Scores predictions against the true `rating` using mean absolute error.
evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="rating",
    predictionCol="prediction",
)


In [None]:
# Cross-validate ALS over the parameter grid.
# numFolds must be >= 2. With a single fold, PySpark's k-fold split assigns
# every row to the validation slice and leaves the training slice empty, so
# ALS aborts with "requirement failed: No ratings available". Two folds is the
# cheapest valid setting; the seed makes the fold assignment repeatable.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    seed=42,
)

# Fit on a 5% sample of the training data to keep the run time manageable.
training_subset = training_df.sample(withReplacement=False, fraction=0.05, seed=42)

# Run cross-validation on the sampled subset.
cvModel = cv.fit(training_subset)

IllegalArgumentException: requirement failed: No ratings available from MapPartitionsRDD[15489] at map at ALS.scala:729

In [None]:
# Pick the winning model out of the cross-validation result.
bestModel = cvModel.bestModel

# The hyper-parameters are read from the model's JVM-side parent object
# (presumably the ALS estimator that produced it -- confirm). Fetch it once
# and reuse it for all three getters.
best_estimator = bestModel._java_obj.parent()
print(f"Best Model Rank: {best_estimator.getRank()}")
print(f"Best Model Max Iterations: {best_estimator.getMaxIter()}")
print(f"Best Model Regularization Parameter: {best_estimator.getRegParam()}")

In [None]:
# Score the held-out test set with the best model.
predictions = bestModel.transform(testing_df)

# Evaluate the model's RMSE (Root Mean Squared Error).
# The shared `evaluator` is configured with metricName="mae", so calling it
# as-is would report MAE under an RMSE label. Override the metric for this one
# call; label and prediction columns are inherited from `evaluator`.
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
print(f"Root Mean Squared Error (RMSE) of the best model: {rmse}")

# Top-10 recommendations for every user known to the model.
user_recommendations = bestModel.recommendForAllUsers(10)

+------+------+---------+-------------------------------------+
|userId|tmdbId|rating   |title                                |
+------+------+---------+-------------------------------------+
|1     |25237 |5.655685 |Come and See                         |
|1     |17360 |5.602119 |Escape to Victory                    |
|1     |17529 |5.5713644|True Grit                            |
|1     |34647 |5.488819 |Enter the Void                       |
|1     |39324 |5.4854193|Dragon Ball Z: The History of Trunks |
|1     |5925  |5.4283996|The Great Escape                     |
|1     |961   |5.39575  |The General                          |
|1     |7984  |5.357944 |In the Name of the Father            |
|1     |4593  |5.3478084|The Discreet Charm of the Bourgeoisie|
|1     |3009  |5.3188953|The Trial                            |
+------+------+---------+-------------------------------------+
only showing top 10 rows



In [None]:
# Turn the per-user recommendation arrays into one row per (user, movie) and
# attach the movie title from the TMDB metadata.
from pyspark.sql import functions as F

# One row per recommended movie, keeping only the id and predicted rating.
flat_recommendations = (
    user_recommendations
    .withColumn("movie", F.explode("recommendations"))
    .select("userId", "movie.tmdbId", "movie.rating")
)

# Inner join on the TMDB id: recommendations with no metadata row are dropped.
title_match = flat_recommendations.tmdbId == tmdb_df.id
final_recommendations = (
    flat_recommendations
    .join(tmdb_df, title_match, how="inner")
    .select("userId", "tmdbId", "rating", "title")
)

# Preview the first 10 rows without truncating long titles.
final_recommendations.show(10, truncate=False)

# Persist the best model for later use.
# A plain `save(path)` fails when the target path already exists, which breaks
# every re-run of the notebook; write().overwrite() replaces the previous copy.
bestModel.write().overwrite().save("best_movie_recommendation_model")

# Stop the Spark session after the process is complete.
spark.stop()

R² Score: 0.3131933460729662
