In [1]:
import pandas as pd

# Load TMDB movie metadata (assuming 'id' is the movie identifier)
tmdb_movies_df = pd.read_csv('tmdb_movie_metadata.csv')

# Load MovieLens ratings data (assuming 'movieId' and 'userId' are the relevant columns)
ratings_df = pd.read_csv('ratings.csv')

# Preview the datasets to see their structure
print("TMDB Movie Metadata:")
print(tmdb_movies_df.head())

print("MovieLens Ratings:")
print(ratings_df.head())

TMDB Movie Metadata:
   Unnamed: 0  adult                     backdrop_path  \
0           0  False  /zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg   
1           1  False  /tmU7GeKVybMWFButWEGl2M4GeiP.jpg   
2           2  False  /kGzFbGhp99zva6oZODW5atUtnqi.jpg   
3           3  False  /zb6fM1CX41D9rF9hdgclu0peUmy.jpg   
4           4  False  /bxgTSUenZDHNFerQ1whRKplrMKF.jpg   

                     genre_ids   id original_language  \
0           ['Drama', 'Crime']  278                en   
1           ['Drama', 'Crime']  238                en   
2           ['Drama', 'Crime']  240                en   
3  ['Drama', 'History', 'War']  424                en   
4                    ['Drama']  389                en   

             original_title  \
0  The Shawshank Redemption   
1             The Godfather   
2     The Godfather Part II   
3          Schindler's List   
4              12 Angry Men   

                                            overview  popularity  \
0  Imprisoned in the 1940s for t

In [2]:
# Clean TMDB dataset: Drop any rows with missing 'id' or 'overview' and fill missing values if necessary
tmdb_movies_df = tmdb_movies_df.dropna(subset=['id', 'overview'])
tmdb_movies_df['overview'] = tmdb_movies_df['overview'].fillna('')

# Clean MovieLens dataset: Drop rows with missing values in the 'movieId' or 'rating'
ratings_df = ratings_df.dropna(subset=['movieId', 'rating'])

# Make sure column names match (e.g., 'id' in TMDB corresponds to 'movieId' in MovieLens)
ratings_df = ratings_df.rename(columns={'movieId': 'id'})

In [4]:
# Merge the datasets on 'id' (from TMDB) and 'movieId' (from MovieLens)
merged_df = pd.merge(ratings_df, tmdb_movies_df, on='id', how='inner')

# Preview the merged dataset
print("Merged DataFrame:")
merged_df.head()

Merged DataFrame:


Unnamed: 0.1,userId,id,rating,timestamp,Unnamed: 0,adult,backdrop_path,genre_ids,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,1,2,3.5,2005-04-02 23:53:47,2864,False,/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg,"['Comedy', 'Drama', 'Romance', 'Crime']",fi,Ariel,A Finnish man goes to the city to find a job a...,10.454,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,1988-10-21,Ariel,False,7.1,339
1,1,223,4.0,2005-04-02 23:46:13,443,False,/i8fiqKaQklF02VmQYhSxmwa8KOH.jpg,"['Mystery', 'Romance', 'Thriller', 'Drama']",en,Rebecca,Story of a young woman who marries a fascinati...,21.327,/1qz3qUOHnVy7dL7M7G8jSErxE4b.jpg,1940-03-23,Rebecca,False,7.892,1772
2,1,253,4.0,2005-04-02 23:35:40,5645,False,/hzmmXx6UeYQeylioNbliFKjSbV7.jpg,"['Adventure', 'Action', 'Thriller']",en,Live and Let Die,James Bond must investigate a mysterious murde...,28.271,/39qkrjqMZs6utwNmihVImC3ghas.jpg,1973-06-27,Live and Let Die,False,6.513,2056
3,1,260,4.0,2005-04-02 23:33:46,1863,False,/aCUka2PmLUIUINsiTNzOIqjS3sW.jpg,"['Mystery', 'Thriller']",en,The 39 Steps,Richard Hanney has a rude awakening when a gla...,43.708,/yRnl3nTtKVTIBcLHHyXrrXPZWVS.jpg,1935-06-06,The 39 Steps,False,7.344,950
4,1,293,4.0,2005-04-02 23:31:43,3204,False,/a9bChCqxLBixVknDXicw2Z30I8H.jpg,"['Drama', 'Family']",en,A River Runs Through It,"The Maclean brothers, Paul and Norman, live a ...",23.5,/aVP45oS2cBL4WtZ1kB7r8uarruB.jpg,1992-10-09,A River Runs Through It,False,7.029,1077


In [7]:
# Keep only relevant columns: 'userId', 'id', 'rating', 'title', 'genre_ids', 'overview'
merged_clean_df = merged_df[['userId', 'id', 'rating', 'title', 'genre_ids', 'overview']]

# Preview the cleaned dataset
merged_clean_df.head()

Unnamed: 0,userId,id,rating,title,genre_ids,overview
0,1,2,3.5,Ariel,"['Comedy', 'Drama', 'Romance', 'Crime']",A Finnish man goes to the city to find a job a...
1,1,223,4.0,Rebecca,"['Mystery', 'Romance', 'Thriller', 'Drama']",Story of a young woman who marries a fascinati...
2,1,253,4.0,Live and Let Die,"['Adventure', 'Action', 'Thriller']",James Bond must investigate a mysterious murde...
3,1,260,4.0,The 39 Steps,"['Mystery', 'Thriller']",Richard Hanney has a rude awakening when a gla...
4,1,293,4.0,A River Runs Through It,"['Drama', 'Family']","The Maclean brothers, Paul and Norman, live a ..."


In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession


In [None]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.5'
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:


# Start Spark session
spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

# Create Spark DataFrame from the merged Pandas DataFrame
merged_spark_df = spark.createDataFrame(merged_df)

# Build the ALS model (Collaborative Filtering)
als = ALS(userCol="userId", itemCol="id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(merged_spark_df)

# Generate recommendations for all users
recommendations = model.recommendForAllUsers(10)

recommendations.show()

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: org.apache.spark.SparkException: Invalid Spark URL: spark://HeartbeatReceiver@nitu_bola:57335
	at org.apache.spark.rpc.RpcEndpointAddress$.apply(RpcEndpointAddress.scala:66)
	at org.apache.spark.rpc.netty.NettyRpcEnv.asyncSetupEndpointRefByURI(NettyRpcEnv.scala:140)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.executor.Executor.<init>(Executor.scala:301)
	at org.apache.spark.scheduler.local.LocalEndpoint.<init>(LocalSchedulerBackend.scala:64)
	at org.apache.spark.scheduler.local.LocalSchedulerBackend.start(LocalSchedulerBackend.scala:132)
	at org.apache.spark.scheduler.TaskSchedulerImpl.start(TaskSchedulerImpl.scala:235)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:599)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorizer on 'overview' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['overview'])

# Compute cosine similarity between movies based on their overview
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to recommend similar movies based on movie title
def recommend_movies(title, cosine_sim=cosine_sim):
    idx = merged_df.index[merged_df['title'] == title].tolist()[0]  # Get the movie index
    sim_scores = list(enumerate(cosine_sim[idx]))  # Get similarity scores for the given movie
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Sort based on similarity scores
    sim_scores = sim_scores[1:11]  # Get top 10 recommendations
    movie_indices = [i[0] for i in sim_scores]
    return merged_df['title'].iloc[movie_indices]  # Return the recommended movie titles

# Example: Recommend movies similar to 'Toy Story'
recommended_movies = recommend_movies('Toy Story')
print(recommended_movies)