In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.recommendation import ALS
import os


In [2]:
# Pin the Spark release used by this notebook. The release name is kept in a
# Python variable and also exported as the SPARK_VERSION environment variable,
# which the `!` shell commands in the install cell expand.
spark_version = 'spark-3.5.5'
os.environ.update(SPARK_VERSION=spark_version)

In [3]:
# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,375 kB]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,753 kB]
Get

In [4]:
# findspark is imported here rather than in the top import cell because it is
# only pip-installed by the setup cell that precedes this one.
import findspark

# Let findspark locate the Spark installation (it consults SPARK_HOME) and make
# it usable from this Python process.
findspark.init()

# Create the notebook's SparkSession, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("MovieRecommendation")
    .getOrCreate()
)

In [7]:
# Read the two CSV inputs; the first row is the header and column types are
# inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
ratings_df = csv_reader.csv("/content/ratings.csv")
links_df = csv_reader.csv("/content/links.csv")

# Read the TMDB movie metadata from movie_results.json.
tmdb_df = spark.read.format("json").load("/content/movie_results.json")

# Attach each rating's tmdbId by matching ratings to links on movieId.
same_movie = ratings_df["movieId"] == links_df["movieId"]
ratings_with_tmdb_df = ratings_df.join(links_df, same_movie, how="inner")

# Attach the TMDB metadata by matching tmdbId to the metadata's id column.
# Being an inner join, ratings with no matching metadata row are discarded.
same_tmdb_entry = ratings_with_tmdb_df["tmdbId"] == tmdb_df["id"]
final_df = ratings_with_tmdb_df.join(tmdb_df, same_tmdb_entry, how="inner")

# Remove the identifier columns that are no longer needed once tmdbId is the
# item key: movieId, the metadata id, and imdbId.
redundant_columns = ["movieId", "id", "imdbId"]
final_cleaned_df = final_df.drop(*redundant_columns)

In [8]:
# Randomly partition the joined ratings into roughly 80% training rows and 20%
# testing rows; the fixed seed makes the partition repeatable.
training_df, testing_df = final_cleaned_df.randomSplit(weights=[0.8, 0.2], seed=42)


In [9]:
# Configure the ALS collaborative-filtering estimator: users are identified by
# userId, items by tmdbId, and the observed feedback is the rating column.
# coldStartStrategy="drop" makes the fitted model drop rows whose prediction
# would be NaN (users/items it has no factors for).
# An explicit seed is passed because ALS initialises its factors randomly; the
# train/test split already uses seed=42, and without a seed here the fitted
# model (and every recommendation derived from it) may differ from one session
# to the next.
als = ALS(
    userCol="userId",
    itemCol="tmdbId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

# Fit the factorisation on the training split only.
model = als.fit(training_df)

In [13]:
# Ask the fitted model for the 10 highest-scoring items per user.
user_recommendations = model.recommendForAllUsers(10)

# Turn each user's "recommendations" array into one row per recommended item,
# then pull the tmdbId and predicted rating out of the exploded struct.
flat_recommendations = (
    user_recommendations
    .withColumn("movie", F.explode("recommendations"))
    .select("userId", "movie.tmdbId", "movie.rating")
)

# Look up each recommended item's title in the TMDB metadata and keep only the
# columns of interest: userId, tmdbId, predicted rating, and title.
title_match = flat_recommendations["tmdbId"] == tmdb_df["id"]
final_recommendations = (
    flat_recommendations
    .join(tmdb_df, title_match, how="inner")
    .select("userId", "tmdbId", "rating", "title")
)

# Print 10 rows without truncating long titles. No ordering is applied, so
# which rows appear is whatever Spark happens to return first.
final_recommendations.show(n=10, truncate=False)

+------+------+---------+-------------------------------------+
|userId|tmdbId|rating   |title                                |
+------+------+---------+-------------------------------------+
|1     |25237 |5.655685 |Come and See                         |
|1     |17360 |5.602119 |Escape to Victory                    |
|1     |17529 |5.5713644|True Grit                            |
|1     |34647 |5.488819 |Enter the Void                       |
|1     |39324 |5.4854193|Dragon Ball Z: The History of Trunks |
|1     |5925  |5.4283996|The Great Escape                     |
|1     |961   |5.39575  |The General                          |
|1     |7984  |5.357944 |In the Name of the Father            |
|1     |4593  |5.3478084|The Discreet Charm of the Bourgeoisie|
|1     |3009  |5.3188953|The Trial                            |
+------+------+---------+-------------------------------------+
only showing top 10 rows

