In [None]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, rand
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import hash, col
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, abs

In [None]:
# Step 1: Initialize Spark Session
#spark = SparkSession.builder \
 #   .appName("MovieRecommendation") \
 #   .getOrCreate()

In [None]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.5'
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [69.0 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 330 kB in 2s (193 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u

In [None]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("sparkDates").getOrCreate()

In [None]:
# Step 2: Load the dataset (replace with your actual dataset path)

df = spark.read.json("/content/movie_results.json")

In [None]:
# Step 3: Data Cleaning & Filtering
# Step 2: Data Preprocessing
df_filtered = df.select("id", "title", "vote_average", "vote_count").filter(
    (col("vote_average").isNotNull()) & (col("vote_count") >= 50)
)
    # Check the first few rows of the filtered data
df_filtered.show(5)

+-------+--------------------+------------+----------+
|     id|               title|vote_average|vote_count|
+-------+--------------------+------------+----------+
| 950396|           The Gorge|       7.775|      1813|
|1126166|         Flight Risk|       6.086|       429|
|1064213|               Anora|       7.066|      1601|
| 762509|Mufasa: The Lion ...|         7.5|      1601|
|1241982|             Moana 2|         7.2|      1809|
+-------+--------------------+------------+----------+
only showing top 5 rows



In [None]:
# Step 3: Simulate Users for ALS Model
# Create user_id by hashing the movie_id to ensure overlap and make it positive
df_filtered = df_filtered.withColumn("user_id", abs((hash("id") % 1000)).cast(IntegerType()))

# Check the first few rows to ensure the user_id is created
df_filtered.select("user_id", "id", "vote_average").show(5)

+-------+-------+------------+
|user_id|     id|vote_average|
+-------+-------+------------+
|    610| 950396|       7.775|
|    671|1126166|       6.086|
|    860|1064213|       7.066|
|    718| 762509|         7.5|
|    579|1241982|         7.2|
+-------+-------+------------+
only showing top 5 rows



In [None]:
# Step 4: Prepare Data for ALS (ALS expects columns: user_id, movie_id, rating)
# Step 4: Prepare Data for ALS (ALS expects columns: user_id, movie_id, rating)
df_ratings = df_filtered.withColumnRenamed("vote_average", "rating") \
                        .select("user_id", "id", "rating")

In [None]:
# Step 5: Train-Test Split
train_data, test_data = df_ratings.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Step 2: Check for overlapping user-movie pairs between train and test
train_user_movie_pairs = train_data.select("user_id", "id").distinct()
test_user_movie_pairs = test_data.select("user_id", "id").distinct()

# Find overlapping pairs by joining
overlapping_pairs = train_user_movie_pairs.join(test_user_movie_pairs, on=["user_id", "id"], how="inner")

# Output the number of overlapping pairs
print(f"Overlapping user-movie pairs: {overlapping_pairs.count()}")

Overlapping user-movie pairs: 0


In [None]:
# Show the column names in both DataFrames
print("df_ratings columns:", df_ratings.columns)
print("df_filtered columns:", df_filtered.columns)

df_ratings columns: ['user_id', 'id', 'rating']
df_filtered columns: ['id', 'title', 'vote_average', 'vote_count', 'user_id']


In [None]:
# Step 1: Perform inner join between df_ratings and df_filtered on movie 'id'
common_movies = df_ratings.join(df_filtered, 'id', 'inner')

# Step 2: Resolve column name ambiguity
# Since both df_ratings and df_filtered have 'user_id', let's rename 'user_id' in df_filtered
df_filtered = df_filtered.withColumnRenamed('user_id', 'movie_user_id')

# Join again after renaming to avoid ambiguity
common_movies = df_ratings.join(df_filtered, 'id', 'inner')

# Step 3: Select only relevant columns
common_movies = common_movies.select(
    'user_id', 'title', 'rating', 'vote_average', 'vote_count'
)

# Step 4: Show the common movies
common_movies.show(10)

+-------+--------------------+------+------------+----------+
|user_id|               title|rating|vote_average|vote_count|
+-------+--------------------+------+------------+----------+
|    610|           The Gorge| 7.775|       7.775|      1813|
|    671|         Flight Risk| 6.086|       6.086|       429|
|    860|               Anora| 7.066|       7.066|      1601|
|    718|Mufasa: The Lion ...|   7.5|         7.5|      1601|
|    579|             Moana 2|   7.2|         7.2|      1809|
|    328|Sonic the Hedgehog 3|   7.7|         7.7|      2123|
|    599|Captain America: ...| 6.156|       6.156|       967|
|      2|                Flow| 8.298|       8.298|      1461|
|    660|              Amaran|   7.4|         7.4|       188|
|    431|           Mickey 17| 7.039|       7.039|       384|
+-------+--------------------+------+------------+----------+
only showing top 10 rows



In [None]:
# Step 5: Analyzing the ratings
# Find the average rating per movie
avg_ratings = common_movies.groupBy('title').agg({'rating': 'avg'}).withColumnRenamed('avg(rating)', 'avg_rating')
avg_ratings.show(10)

# Step 6: Recommend top 5 movies based on highest average rating
top_movies = avg_ratings.orderBy(col('avg_rating').desc()).limit(5)
top_movies.show()

+--------------------+----------+
|               title|avg_rating|
+--------------------+----------+
|Raya and the Last...|     7.841|
|Penguins of Madag...|     6.525|
|            Warcraft|      6.38|
|            Poseidon|     5.856|
|     My Name Is Khan|       8.0|
|  The Last Airbender|     4.637|
|My Cousin the Sex...|     6.938|
|          The Ritual|     6.257|
|Before I Go to Sleep|       6.5|
|    The Last Warrior|       6.9|
+--------------------+----------+
only showing top 10 rows

+--------------------+----------+
|               title|avg_rating|
+--------------------+----------+
|                Nude|       9.5|
|            Succubus|       9.5|
|           Kill Shot|     9.201|
|The Shawshank Red...|     8.708|
|       The Godfather|     8.689|
+--------------------+----------+



In [None]:
print(f"Train data user-movie pairs count: {train_user_movie_pairs.count()}")
print(f"Test data user-movie pairs count: {test_user_movie_pairs.count()}")

Train data user-movie pairs count: 5093
Test data user-movie pairs count: 1197


In [None]:
# Step 6: Build ALS Model
als = ALS(rank=10, maxIter=10, regParam=0.1, userCol="user_id", itemCol="id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(train_data)

In [None]:
# Step 8: Generate Predictions
predictions = model.transform(test_data).na.drop()
print(f"Total predictions: {predictions.count()}")

Total predictions: 0


In [None]:
# If there are no predictions, print a warning
# Step 9: Show Prediction Data
print(f"Total predictions: {predictions.count()}")

# Corrected column names: Use 'id' for movie_id instead of 'movie_id'
predictions.select("id", "user_id", "rating", "prediction").show(10, truncate=False)

Total predictions: 0
+---+-------+------+----------+
|id |user_id|rating|prediction|
+---+-------+------+----------+
+---+-------+------+----------+



In [None]:
# Step 10: Evaluate Model Performance
if predictions.count() > 0:
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print(f"Root Mean Squared Error (RMSE): {rmse}")
else:
    print("RMSE cannot be computed as no predictions were generated.")

RMSE cannot be computed as no predictions were generated.


In [None]:
print(df_enriched.columns)

['movie_id', 'title', 'rating', 'num_votes', 'user_id']


In [None]:
# Visualizing the 'num_votes' vs 'rating' (acting as popularity vs rating)
df_pandas = df_enriched.toPandas()
plt.figure(figsize=(10, 6))
plt.scatter(df_pandas['num_votes'], df_pandas['rating'], alpha=0.6)
plt.title("Popularity (num_votes) vs Rating")
plt.xlabel("Popularity (num_votes)")
plt.ylabel("Rating")
plt.grid(True)
plt.show()

AttributeError: 'NoneType' object has no attribute 'setCallSite'

In [None]:
from pyspark.sql.functions import col

# Function to get movie recommendations
def get_movie_recommendations(user_id, num_recommendations=5):
    # Get the top N movie recommendations for the given user
    user_recs = model.recommendForAllUsers(num_recommendations)

    # Filter for the specific user
    user_recs = user_recs.filter(col("user_id") == user_id)

    # Flatten the movie recommendations
    user_recs_flat = user_recs.select("user_id", "recommendations.movie_id", "recommendations.rating")

    # Join with the movie titles to get the names
    movie_titles = df_filtered.select("id", "title")
    recommendations_with_titles = user_recs_flat.join(movie_titles, user_recs_flat.movie_id == movie_titles.id, "inner")

    # Show the results
    recommendations_with_titles.select("title", "rating").show(num_recommendations)

# Example usage: Get top 5 recommendations for user with user_id = 1
get_movie_recommendations(1)

AnalysisException: [FIELD_NOT_FOUND] No such struct field `movie_id` in `id`, `rating`.