In [1]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.5'
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Connecting to security.ubuntu.com] [Connected to cloud.r-project.org (3.166.118.85)] [Connected                                                                                                     Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 257 kB in 2s (139 kB/s)
Reading p

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import functions as F

# Initialize Spark session
spark = SparkSession.builder.appName("MovieRecommendation").getOrCreate()

# Load ratings and movies data
ratings_df = spark.read.csv("ratings.csv", header=True, inferSchema=True)
movies_df = spark.read.csv("movies.csv", header=True, inferSchema=True)  # movies.csv should contain movieId, genre
ratings_df.show()
movies_df.show()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
|     1|     70|   3.0|964982400|
|     1|    101|   5.0|964980868|
|     1|    110|   4.0|964982176|
|     1|    151|   5.0|964984041|
|     1|    157|   5.0|964984100|
|     1|    163|   5.0|964983650|
|     1|    216|   5.0|964981208|
|     1|    223|   3.0|964980985|
|     1|    231|   5.0|964981179|
|     1|    235|   4.0|964980908|
|     1|    260|   5.0|964981680|
|     1|    296|   3.0|964982967|
|     1|    316|   3.0|964982310|
|     1|    333|   5.0|964981179|
|     1|    349|   4.0|964982563|
+------+-------+------+---------+
only showing top 20 rows

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
| 

In [3]:
# Step 1: Process the genre column by splitting it (assuming genres are in a single column like "Adventure|Animation|Children|Comedy|Fantasy")
# Split the genre column using the '|' delimiter
movies_df = movies_df.withColumn("genre", F.split(F.col("genres"), "\|"))

# Step 2: Explode the genre column to have one genre per row (multiple rows for each movie)
movies_df = movies_df.withColumn("genre", F.explode(F.col("genre")))

# Step 3: Use StringIndexer to convert genres into numerical indices
indexer = StringIndexer(inputCol="genre", outputCol="genreIndex")
encoder = OneHotEncoder(inputCol="genreIndex", outputCol="genreVec")

# Step 4: Create a pipeline for genre encoding
pipeline = Pipeline(stages=[indexer, encoder])
movies_with_genre = pipeline.fit(movies_df).transform(movies_df)

# Step 5: Join the ratings data with the genre information (ratings_df and movies_with_genre)
ratings_with_genre = ratings_df.join(movies_with_genre, on="movieId", how="inner")

In [4]:
# Step 6: Train-test split
train_data, test_data = ratings_with_genre.randomSplit([0.8, 0.2], seed=42)

# Step 7: Build the ALS model
# Define ALS model with hyperparameters
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    maxIter=10,  # can be adjusted
    rank=20,     # Adjust rank if needed
    regParam=0.1,
    coldStartStrategy="drop"
)


# Step 8: Train the ALS model on the training data
model = als.fit(train_data)


In [5]:
# Step 9: Get predictions on the test data
predictions = model.transform(test_data)
# Select the columns: user, movie, actual rating, predicted rating
predicted_vs_actual = predictions.select("userid", "movieid", "rating", "prediction")

# Show a few rows to compare actual vs predicted ratings
predicted_vs_actual.show(10)


+------+-------+------+----------+
|userid|movieid|rating|prediction|
+------+-------+------+----------+
|     1|      1|   4.0|  4.465457|
|     5|      1|   4.0| 3.9757326|
|     5|      1|   4.0| 3.9757326|
|     7|      1|   4.5|  4.148654|
|    15|      1|   2.5| 2.8565345|
|    17|      1|   4.5| 4.0303845|
|    18|      1|   3.5| 3.8686466|
|    21|      1|   3.5|  3.439942|
|    31|      1|   5.0|  4.249846|
|    31|      1|   5.0|  4.249846|
+------+-------+------+----------+
only showing top 10 rows



In [6]:


# Step 10: Evaluate the model using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

Root Mean Squared Error (RMSE) = 0.6346458391040313


In [7]:
# Step 11: Get top recommendations for users
user_recs = model.recommendForAllUsers(10)  # Get top 10 recommendations for each user
user_recs.show(5)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{26171, 5.83101}...|
|     2|[{131724, 4.89435...|
|     3|[{70946, 4.941532...|
|     4|[{2300, 5.0318265...|
|     5|[{1262, 4.8456035...|
+------+--------------------+
only showing top 5 rows



In [8]:
from pyspark.sql import functions as F
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Define a threshold for relevant ratings (e.g., ratings > 3.0 are considered relevant)
threshold = 3.0

# Step 1: Filter predictions and actual ratings based on the threshold
predictions = predictions.withColumn("prediction", F.when(F.col("prediction") > threshold, 1).otherwise(0))
actual_ratings = test_data.withColumn("actual", F.when(F.col("rating") > threshold, 1).otherwise(0))

# Step 2: Join predictions and actual ratings to get a single DataFrame for comparison
predictions_and_actuals = predictions.join(actual_ratings, ["userId", "movieId"], "inner")

# Step 3: Calculate True Positives, False Positives, False Negatives
# True Positive (TP): The item is relevant and it was predicted as relevant
# False Positive (FP): The item is not relevant but was predicted as relevant
# False Negative (FN): The item is relevant but was not predicted as relevant

TP = predictions_and_actuals.filter((col("prediction") == 1) & (col("actual") == 1)).count()
FP = predictions_and_actuals.filter((col("prediction") == 1) & (col("actual") == 0)).count()
FN = predictions_and_actuals.filter((col("prediction") == 0) & (col("actual") == 1)).count()

# Step 4: Calculate Precision, Recall, and F1-Score
precision = TP / (TP + FP) if (TP + FP) > 0 else 0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Step 5: Print the results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")


Precision: 0.7845894837085913
Recall: 0.9444819750660198
F1-Score: 0.8571428571428573


In [18]:
from pyspark.sql import functions as F

def get_movie_recommendations_for_user(user_id, model, ratings_df, movies_df, top_n=10):
    """
    Get movie recommendations for a specific user and return them with scaled ratings.

    :param user_id: The ID of the user for whom recommendations are generated.
    :param model: The trained recommendation model.
    :param ratings_df: The ratings DataFrame containing user ratings.
    :param movies_df: The movies DataFrame containing movie information (movieId and title).
    :param top_n: The number of recommendations to return (default is 10).

    :return: DataFrame with movie recommendations including movie title and scaled ratings.
    """
    # Generate movie recommendations for the specified user
    user_recs = model.recommendForUserSubset(ratings_df.filter(ratings_df.userId == user_id), top_n)

    # Exploding the recommendations to get individual movie rows
    user_recs_flat = user_recs.select("userId", F.explode("recommendations").alias("movie_recommendation"))

    # Join with the movies_df to get movie names
    user_recs_with_names = user_recs_flat.join(movies_df, user_recs_flat.movie_recommendation.movieId == movies_df.movieId, how="inner") \
        .select("userId", "movieId", "title", "movie_recommendation.rating")

    # Drop duplicates based on userId and movieId
    user_recs_with_names = user_recs_with_names.dropDuplicates(["userId", "movieId"])

    # Scale the ratings from 0-6 to 1-5 range
    user_recs_with_names = user_recs_with_names.withColumn(
        "scaled_rating",
        1 + (F.col("rating") / 6) * 4  # This scales the ratings to the 1-5 range
    )

    # Return the recommendations with scaled ratings
    return user_recs_with_names.select("userId", "movieId", "title", "scaled_rating")

# Example of calling the function for user 1
user_1_recs = get_movie_recommendations_for_user(1, model, ratings_df, movies_df, top_n=10)

# Show the recommended movies for user 1
user_1_recs.show(truncate=False)



+------+-------+---------------------------------------------------+------------------+
|userId|movieId|title                                              |scaled_rating     |
+------+-------+---------------------------------------------------+------------------+
|1     |123    |Chungking Express (Chung Hing sam lam) (1994)      |4.6438093185424805|
|1     |1262   |Great Escape, The (1963)                           |4.6117149988810215|
|1     |3494   |True Grit (1969)                                   |4.584958712259928 |
|1     |5490   |The Big Bus (1976)                                 |4.644354184468588 |
|1     |5915   |Victory (a.k.a. Escape to Victory) (1981)          |4.631017684936523 |
|1     |6375   |Gigantic (A Tale of Two Johns) (2002)              |4.573835372924805 |
|1     |26171  |Play Time (a.k.a. Playtime) (1967)                 |4.88733990987142  |
|1     |92475  |All Watched Over by Machines of Loving Grace (2011)|4.648160934448242 |
|1     |132333 |Seve (2014)     

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Make predictions on the test data
predictions = model.transform(test_data)

# Instantiate a RegressionEvaluator to compute R2
evaluator = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")

# Calculate R2
r2_value = evaluator.evaluate(predictions)

# Print the R2 value
print(f"R² Value: {r2_value}")


R² Value: 0.6244224782581024


# New section