In [1]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.5'
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [69.9 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,378 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelea

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import json

In [3]:
# Initialize Spark session
spark = SparkSession.builder.appName("MoviesData").getOrCreate()

# Load the JSON file into a PySpark DataFrame
df = spark.read.json("movie_results.json")  # Replace with your JSON file path

# Show the structure of the JSON to understand the data
df.printSchema()

root
 |-- adult: boolean (nullable = true)
 |-- backdrop_path: string (nullable = true)
 |-- belongs_to_collection: struct (nullable = true)
 |    |-- backdrop_path: string (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- poster_path: string (nullable = true)
 |-- budget: long (nullable = true)
 |-- error: long (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: long (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- origin_country: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: double (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production

In [4]:
# Extract the genre names (explode the array of genres to a new row per genre)
df_genres = df.select(
    col("id").alias("movieId"),
    explode(col("genres")).alias("genre")
).select(
    "movieId",
    col("genre.name").alias("genre_name")
)
# Index the genre names (convert them to numerical values)
indexer = StringIndexer(inputCol="genre_name", outputCol="genre_index")

# Encode the genre indices using OneHotEncoder
encoder = OneHotEncoder(inputCol="genre_index", outputCol="genre_vec")

# Combine indexing and encoding in a pipeline
pipeline = Pipeline(stages=[indexer, encoder])
genre_model = pipeline.fit(df_genres)
df_genres_encoded = genre_model.transform(df_genres)

# Now join the original movie DataFrame with the encoded genres
df_selected = df.select(
    col("id").alias("movieId"),
    col("title").alias("name"),
    col("popularity")
)

df_with_genre = df_selected.join(df_genres_encoded, on="movieId", how="left")

In [5]:
# Now join with ratings DataFrame (assuming 'ratings.csv' exists)
rating_df = spark.read.csv('ratings.csv', header=True, inferSchema=True)
links_df = spark.read.csv('links.csv', header=True, inferSchema=True)

# Join df_with_genre with links_df to add tmdbId and then join with rating_df
df_with_tmdb = df_with_genre.join(links_df, 'movieId', 'inner')
final_df = df_with_tmdb.join(rating_df, 'movieId', 'inner')

# Select the minimal required columns
final_df_selected = final_df.select(
    col("userId"),
    col("tmdbId"),
    col("name"),
    col("popularity"),
    col("rating"),
    col("genre_vec")  # Include the genre vector
)

# Show the final DataFrame to verify
final_df_selected.show(truncate=False)

+------+------+------------------------------------+----------+------+---------------+
|userId|tmdbId|name                                |popularity|rating|genre_vec      |
+------+------+------------------------------------+----------+------+---------------+
|104   |61337 |Ray                                 |3.581     |2.0   |(18,[0],[1.0]) |
|104   |61337 |Ray                                 |3.581     |2.0   |(18,[16],[1.0])|
|156   |61337 |Ray                                 |3.581     |3.0   |(18,[0],[1.0]) |
|156   |61337 |Ray                                 |3.581     |3.0   |(18,[16],[1.0])|
|156   |24086 |Harley Davidson and the Marlboro Man|3.623     |3.0   |(18,[1],[1.0]) |
|156   |24086 |Harley Davidson and the Marlboro Man|3.623     |3.0   |(18,[3],[1.0]) |
|196   |24086 |Harley Davidson and the Marlboro Man|3.623     |4.0   |(18,[1],[1.0]) |
|196   |24086 |Harley Davidson and the Marlboro Man|3.623     |4.0   |(18,[3],[1.0]) |
|267   |24086 |Harley Davidson and the Marl

In [6]:
# Ensure there are no missing ratings
final_df_selected = final_df_selected.dropna(subset=['rating'])

# Split data into training and test sets
(training_data, test_data) = final_df_selected.randomSplit([0.8, 0.2], seed=1234)

In [10]:
def train_evaluate_als(rank_value=100):
    als = ALS(
        userCol="userId",
        itemCol="tmdbId",
        ratingCol="rating",
        maxIter=20,  # Keeping iterations constant
        rank=rank_value,  # Fixed rank at 100
        regParam=0.05,  # Regularization
        alpha=0.5,  # For implicit feedback (if applicable)
        coldStartStrategy="drop"
    )

    # Train the model
    model = als.fit(training_data)

    # Get predictions
    predictions = model.transform(test_data)

    # Evaluate R-squared
    evaluator_r2 = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
    r2 = evaluator_r2.evaluate(predictions)

    print(f"R-squared for rank={rank_value}: {r2}")

    # Save the trained model
    model.write().overwrite().save("als_movie_model")
    print("Model saved successfully")

    return r2

# Train and evaluate ALS with rank=100
r2_value = train_evaluate_als(rank_value=100)

# Display the R-squared value for rank=100
print(f"\nR-squared for rank=100: {r2_value}")

R-squared for rank=100: 0.7688095109185988
Model saved successfully

R-squared for rank=100: 0.7688095109185988


In [13]:
from pyspark.ml.recommendation import ALSModel

# Load the saved model
model_path = "als_movie_model"  # Update with your actual path if different
model = ALSModel.load(model_path)
print("Model loaded successfully!")

# Get predictions using the loaded model
predictions = model.transform(test_data)

# Evaluate R-squared
evaluator_r2 = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
r2 = evaluator_r2.evaluate(predictions)

print(f"R-squared from loaded model: {r2}")

Model loaded successfully!
R-squared from loaded model: 0.7688095109185988


In [45]:
# Evaluate RMSE
evaluator_rmse = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator_rmse.evaluate(predictions)
print(f"RMSE: {rmse}")

# Evaluate MAE
evaluator_mae = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")
mae = evaluator_mae.evaluate(predictions)
print(f"MAE: {mae}")


RMSE: 0.40584829697679436
MAE: 0.2973501348507395


In [44]:
from pyspark.sql import functions as F  # Import F for functions
predictions.select(F.min("prediction").alias("min_prediction"),
                   F.max("prediction").alias("max_prediction")).show()

+--------------+--------------+
|min_prediction|max_prediction|
+--------------+--------------+
|     0.4065936|     5.3228064|
+--------------+--------------+



In [50]:
# Generate top N recommendations for all users (e.g., Top 10)
top_n_recommendations = model.recommendForAllUsers(10)

# Explode the recommendations to get one row per movie recommendation
exploded_recommendations = top_n_recommendations.select(
    "userId",
    explode("recommendations").alias("recommendation")
)

# Extract tmdbId and predicted rating from the exploded recommendations
final_recommendations = exploded_recommendations.select(
    "userId",
    col("recommendation.tmdbId").alias("tmdbId"),
    col("recommendation.rating").alias("predicted_rating")
)

# Now, join the recommendations with the movie metadata (using the correct column names)
final_with_metadata = final_recommendations.join(df_with_tmdb, "tmdbId", "inner")

# Show the recommendations with movie details (name, popularity, genre, etc.)
final_with_metadata.select(
    "userId",
    "tmdbId",
    "predicted_rating",
    "name",  # 'name' instead of 'title'
    "genre_name"  # 'genre_name' instead of 'genre'
).show(truncate=False)


+------+------+----------------+----------------------------------+---------------+
|userId|tmdbId|predicted_rating|name                              |genre_name     |
+------+------+----------------+----------------------------------+---------------+
|1     |16133 |3.9899487       |Four Lions                        |Comedy         |
|1     |16133 |3.9899487       |Four Lions                        |Crime          |
|1     |16133 |3.9899487       |Four Lions                        |Drama          |
|1     |11    |3.9896953       |The 39 Steps                      |Mystery        |
|1     |11    |3.9896953       |The 39 Steps                      |Thriller       |
|1     |101   |3.9872937       |A River Runs Through It           |Drama          |
|1     |101   |3.9872937       |A River Runs Through It           |Family         |
|1     |914   |3.967852        |Bean                              |Family         |
|1     |914   |3.967852        |Bean                              |Adventure