In [1]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, rand
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import hash, col
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, abs

In [2]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.5'
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.83)] [Connected to cloud.r-                                                                                                    Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
0% [2 InRelease 88.0 kB/128 kB 69%] [Waiting for headers] [Connected to cloud.r-project.org (108.1380% [Waiting for headers] [Connected to cloud.r-project.org (108.138.128.85)] [Connected to r2u.stat.                                                                                                    Get:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:

In [4]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

In [14]:
# Load datasets using Spark
ratings_df = spark.read.csv("/content/ratings.csv", header=True, inferSchema=True)
links_df = spark.read.csv("/content/links.csv", header=True, inferSchema=True)
tmdb_df = spark.read.csv("/content/tmdb_movie_metadata.csv", header=True, inferSchema=True)

# Merge ratings with links to get the tmdbId
ratings_with_tmdb_df = ratings_df.join(links_df, ratings_df.movieId == links_df.movieId, how="inner")

# Merge the result with TMDB movie metadata using tmdbId
final_df = ratings_with_tmdb_df.join(tmdb_df, ratings_with_tmdb_df.tmdbId == tmdb_df.id, how="inner")

# Drop unnecessary columns (like imdbId and original movieId)
final_cleaned_df = final_df.drop("movieId", "id", "imdbId")

# Show cleaned DataFrame
final_cleaned_df.show(5)

+------+------+-------------------+------+----+-----+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+----------+
|userId|rating|          timestamp|tmdbId| _c0|adult|       backdrop_path|           genre_ids|original_language|      original_title|            overview|          popularity|         poster_path|        release_date|               title|               video|vote_average|vote_count|
+------+------+-------------------+------+----+-----+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+----------+
|     1|   3.5|2005-04-02 23:53:47|  8844|2301|False|/pb0FOvhen6lVcNJ7...|['Adventure', 'Fa...|               en|             Jumanji|When siblin

In [15]:
final_cleaned_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- tmdbId: integer (nullable = true)
 |-- _c0: string (nullable = true)
 |-- adult: string (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- genre_ids: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: string (nullable = true)
 |-- vote_average: string (nullable = true)
 |-- vote_count: string (nullable = true)



In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession


In [18]:
# Take a sample of the dataset (e.g., 10% of the data)
sampled_df = final_cleaned_df.sample(withReplacement=False, fraction=0.1, seed=42)

# Now train the model on the sampled data
als = ALS(userCol="userId", itemCol="tmdbId", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(sampled_df)

In [19]:
# Generate recommendations for all users
user_recommendations = model.recommendForAllUsers(10)

# Show the recommendations for the first user to check the format
user_recommendations.show(1, False)

+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                                                        |
+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1     |[{27040, 4.7850823}, {110416, 4.6502905}, {69735, 4.6469326}, {9702, 4.5780654}, {65229, 4.5767784}, {110420, 4.5746064}, {31011, 4.540538}, {252178, 4.4971094}, {19955, 4.492777}, {77561, 4.405632}]|
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# Flatten the recommendations into individual rows
flat_recommendations = user_recommendations.withColumn("movie", F.explode("recommendations"))

# Extract movie details (tmdbId and rating) from the recommendation column
flat_recommendations = flat_recommendations.select("userId", "movie.tmdbId", "movie.rating")

# Join the recommendations with the tmdb dataset to get the movie title
final_recommendations = flat_recommendations.join(tmdb_df, flat_recommendations.tmdbId == tmdb_df.id, how="inner")

# Select relevant columns (userId, movie title, and rating)
final_recommendations = final_recommendations.select("userId", "title", "rating")

# Show the recommendations for the first user
final_recommendations.show(10, False)

+------+-----------------------+---------+
|userId|title                  |rating   |
+------+-----------------------+---------+
|1     |Meshes of the Afternoon|4.7850823|
|1     |Song of the Sea        |4.6502905|
|1     |Batman: Year One       |4.6469326|
|1     | Miklo."               |4.5780654|
|1     |Tomboy                 |4.5767784|
|1     |Wolf Children          |4.5746064|
|1     |Mr. Nobody             |4.540538 |
|1     |'71                    |4.4971094|
|1     |I Do                   |4.492777 |
|1     |EVA                    |4.405632 |
+------+-----------------------+---------+
only showing top 10 rows



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer on 'overview' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['overview'])

# Compute cosine similarity between movies based on their overview
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to recommend similar movies based on movie title
def recommend_movies(title, cosine_sim=cosine_sim):
    idx = merged_df.index[merged_df['title'] == title].tolist()[0]  # Get the movie index
    sim_scores = list(enumerate(cosine_sim[idx]))  # Get similarity scores for the given movie
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Sort based on similarity scores
    sim_scores = sim_scores[1:11]  # Get top 10 recommendations
    movie_indices = [i[0] for i in sim_scores]
    return merged_df['title'].iloc[movie_indices]  # Return the recommended movie titles

# Example: Recommend movies similar to 'Toy Story'
recommended_movies = recommend_movies('Toy Story')
print(recommended_movies)