<a href="https://colab.research.google.com/github/parthasaratheeg/Analytics_Vidhya/blob/master/10M_short.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!/usr/bin/env python3
"""
Hybrid Recommendation System - MovieLens 10M
Cleaned, robust and ready-to-run script.

Notes:
- Ensure Java (JDK 11+) and PySpark are installed in the environment.
- The script downloads and extracts the MovieLens 10M dataset to /tmp/ml-10M100K if not present.
- The script is defensive: it handles missing tags/movies files and uses safe defaults.
- GBTRegressor is instantiated without unsupported parameters to avoid version incompatibilities.
"""

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, Bucketizer
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import os
import warnings
warnings.filterwarnings('ignore')

# ----------------------
# Spark session (tweak resources to your cluster)
# ----------------------
spark = SparkSession.builder \
    .appName("HybridRecommendationSystemMovieLens10M") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.default.parallelism", "200") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

# ----------------------
# Main class
# ----------------------
class HybridRecommendationSystem:
    def __init__(self):
        self.results = {}
        self.dataset_name = "MovieLens 10M"
        self.feature_importance = None

    # ---------- dataset download / load ----------
    def download_dataset(self):
        """Download and extract MovieLens 10M to /tmp/ml-10M100K"""
        try:
            import urllib.request
            import zipfile

            dataset_url = "https://files.grouplens.org/datasets/movielens/ml-10m.zip"
            zip_path = "/tmp/ml-10m.zip"
            extract_path = "/tmp/ml-10M100K"

            if not os.path.exists(extract_path):
                os.makedirs(extract_path, exist_ok=True)
                print("Downloading MovieLens 10M dataset...")
                urllib.request.urlretrieve(dataset_url, zip_path)
                with zipfile.ZipFile(zip_path, 'r') as z:
                    z.extractall("/tmp/")
                print("Dataset downloaded and extracted to /tmp/ml-10M100K")
            else:
                print("Dataset already present at /tmp/ml-10M100K")

            return extract_path
        except Exception as e:
            print(f"Error downloading dataset: {e}")
            return None

    def load_movielens_data(self, dataset_path):
        """Load ratings, movies and tags robustly. Returns ratings_df, unified_df"""
        try:
            ratings_path = os.path.join(dataset_path, "ratings.dat")
            movies_path = os.path.join(dataset_path, "movies.dat")
            tags_path = os.path.join(dataset_path, "tags.dat")

            # Helper: safe read CSV with '::' splitter
            def safe_read(path, expected_cols):
                if not os.path.exists(path):
                    return None
                # Spark's CSV reader will create _c0.. if no header
                return spark.read.option("sep", "::").option("inferSchema", True).option("header", False).csv(path)

            print("Loading ratings data...")
            ratings_raw = safe_read(ratings_path, 4)
            if ratings_raw is None:
                print(f"Ratings file not found at {ratings_path}")
                return None, None

            # Map to consistent column names (handle _cN or _1 style)
            cols = ratings_raw.columns
            # expect 4 columns: userId, movieId, rating, timestamp
            ratings_df = ratings_raw.select(
                col(cols[0]).cast('int').alias('userId'),
                col(cols[1]).cast('int').alias('movieId'),
                col(cols[2]).cast('double').alias('rating'),
                col(cols[3]).cast('long').alias('timestamp')
            ).coalesce(200)

            print("Loading movies data (if available)...")
            movies_raw = safe_read(movies_path, 3)
            if movies_raw is None:
                # create minimal movies_df with movieId only
                movies_df = ratings_df.select('movieId').distinct().withColumn('movieTitle', lit(None).cast('string')).withColumn('genres', lit(None).cast('string'))
            else:
                mcols = movies_raw.columns
                movies_df = movies_raw.select(
                    col(mcols[0]).cast('int').alias('movieId'),
                    col(mcols[1]).alias('movieTitle'),
                    col(mcols[2]).alias('genres')
                )

            print("Loading tags data (if available)...")
            tags_raw = safe_read(tags_path, 4)
            if tags_raw is None:
                # empty tags
                tags_df = spark.createDataFrame([], schema=['userId', 'movieId', 'tag', 'tag_timestamp'])
            else:
                tcols = tags_raw.columns
                tags_df = tags_raw.select(
                    col(tcols[0]).cast('int').alias('userId'),
                    col(tcols[1]).cast('int').alias('movieId'),
                    col(tcols[2]).alias('tag'),
                    col(tcols[3]).cast('long').alias('tag_timestamp')
                )

            # Compute tag counts per movie (if tags exist)
            if len(tags_df.columns) > 0:
                tag_counts = tags_df.groupBy('movieId').agg(
                    count('tag').alias('tag_count'),
                    countDistinct('userId').alias('distinct_taggers')
                )
            else:
                tag_counts = spark.createDataFrame([], schema=['movieId', 'tag_count', 'distinct_taggers'])

            # Left join: ensure every rating row has movie metadata + tag counts (fill missing with 0)
            unified = ratings_df.join(movies_df, 'movieId', how='left')\
                                .join(tag_counts, 'movieId', how='left')\
                                .fillna({'tag_count': 0, 'distinct_taggers': 0})

            print(f"Total ratings: {ratings_df.count():,}")
            print(f"Unique users: {ratings_df.select('userId').distinct().count():,}")
            print(f"Unique movies: {ratings_df.select('movieId').distinct().count():,}")

            return ratings_df, unified
        except Exception as e:
            print(f"Error loading data: {e}")
            return None, None

    # ---------- statistics & features ----------
    def compute_statistical_features(self, df):
        """Compute user/movie/global statistics and return (user_stats, movie_stats, global_stats)"""
        print("Computing statistical features...")

        user_stats = df.groupBy('userId').agg(
            mean('rating').alias('user_mean_rating'),
            approx_percentile('rating', 0.5).alias('user_median_rating'),
            stddev('rating').alias('user_stddev_rating'),
            (max('rating') - min('rating')).alias('user_range_rating'),
            count('rating').alias('user_rating_count'),
            min('rating').alias('user_min_rating'),
            max('rating').alias('user_max_rating')
        ).fillna(0)

        movie_stats = df.groupBy('movieId').agg(
            mean('rating').alias('movie_mean_rating'),
            approx_percentile('rating', 0.5).alias('movie_median_rating'),
            stddev('rating').alias('movie_stddev_rating'),
            (max('rating') - min('rating')).alias('movie_range_rating'),
            count('rating').alias('movie_rating_count'),
            min('rating').alias('movie_min_rating'),
            max('rating').alias('movie_max_rating')
        ).fillna(0)

        global_row = df.agg(
            mean('rating').alias('global_mean'),
            approx_percentile('rating', 0.5).alias('global_median'),
            stddev('rating').alias('global_stddev'),
            min('rating').alias('global_min'),
            max('rating').alias('global_max')
        ).collect()[0]

        global_stats = {
            'global_mean': global_row['global_mean'],
            'global_median': global_row['global_median'],
            'global_stddev': global_row['global_stddev'],
            'global_min': global_row['global_min'],
            'global_max': global_row['global_max']
        }

        return user_stats, movie_stats, global_stats

    def compute_item_features(self, df, user_stats, movie_stats, global_stats):
        """Join stats and create engineered features"""
        print("Computing item-related features...")

        df_enriched = df.join(movie_stats, 'movieId', how='left')\
                        .join(user_stats, 'userId', how='left')\
                        .fillna(0)

        # log counts
        df_enriched = df_enriched.withColumn('log_movie_count', when(col('movie_rating_count') > 1, log(col('movie_rating_count'))).otherwise(lit(0.0)))\
                                 .withColumn('log_user_count', when(col('user_rating_count') > 1, log(col('user_rating_count'))).otherwise(lit(0.0)))\
                                 .withColumn('log_tag_count', when(col('tag_count') > 0, log(col('tag_count') + 1)).otherwise(lit(0.0)))

        # binning
        movie_bins = [0.0, 10.0, 50.0, 100.0, 500.0, 1000.0, float('inf')]
        user_bins = movie_bins

        bucket_movie = Bucketizer(splits=movie_bins, inputCol='movie_rating_count', outputCol='movie_count_bin')
        bucket_user = Bucketizer(splits=user_bins, inputCol='user_rating_count', outputCol='user_count_bin')

        df_enriched = bucket_movie.transform(df_enriched)
        df_enriched = bucket_user.transform(df_enriched)

        # scaling (min-max using global min/max)
        gmin = global_stats['global_min'] if global_stats['global_min'] is not None else 0.0
        gmax = global_stats['global_max'] if global_stats['global_max'] is not None else 5.0
        rng = gmax - gmin if gmax != gmin else 1.0

        scale_cols = ['user_mean_rating', 'movie_mean_rating', 'user_stddev_rating', 'movie_stddev_rating', 'user_range_rating', 'movie_range_rating']
        for c in scale_cols:
            scaled = f"{c}_scaled"
            df_enriched = df_enriched.withColumn(scaled, (col(c) - lit(gmin)) / lit(rng)).fillna(0)

        # time features
        df_enriched = df_enriched.withColumn('rating_year', year(from_unixtime(col('timestamp'))))\
                                   .withColumn('rating_month', month(from_unixtime(col('timestamp'))))\
                                   .withColumn('rating_day_of_week', dayofweek(from_unixtime(col('timestamp'))))\
                                   .withColumn('rating_hour', hour(from_unixtime(col('timestamp'))))

        # impute missing
        imputation_cols = ['tag_count', 'distinct_taggers', 'log_tag_count']
        for c in imputation_cols:
            if c in [x[0] for x in df_enriched.dtypes]:
                df_enriched = df_enriched.withColumn(c, when(col(c).isNull(), lit(0)).otherwise(col(c)))

        return df_enriched

    def compute_user_features(self, df):
        print("Computing user confidence features...")
        user_stats = df.groupBy('userId').agg(
            mean('rating').alias('user_mean'),
            stddev('rating').alias('user_stddev'),
            count('rating').alias('user_n')
        ).fillna(0)

        z = 1.96
        user_stats = user_stats.withColumn('std_error', when(col('user_n') > 0, col('user_stddev') / sqrt(col('user_n'))).otherwise(lit(0.0)))\
                               .withColumn('lower_confidence_level', col('user_mean') - lit(z) * col('std_error'))\
                               .withColumn('upper_confidence_level', col('user_mean') + lit(z) * col('std_error'))\
                               .withColumn('confidence_interval_width', col('upper_confidence_level') - col('lower_confidence_level'))\
                               .withColumn('confidence_multiplier', when(col('confidence_interval_width') > 0, lit(1.0) / col('confidence_interval_width')).otherwise(lit(1.0)))\
                               .fillna(0)

        return user_stats

    def prepare_training_data(self, df_enriched, user_conf_features):
        print("Preparing training dataset...")

        df_final = df_enriched.join(user_conf_features.select('userId', 'lower_confidence_level', 'upper_confidence_level', 'confidence_interval_width', 'confidence_multiplier'), 'userId', how='left').fillna(0)

        feature_cols = [
            'user_mean_rating','user_median_rating','user_stddev_rating','user_range_rating',
            'movie_mean_rating','movie_median_rating','movie_stddev_rating','movie_range_rating',
            'user_rating_count','movie_rating_count','log_movie_count','log_user_count',
            'movie_count_bin','user_count_bin',
            'movie_mean_rating_scaled','user_mean_rating_scaled','user_stddev_rating_scaled','movie_stddev_rating_scaled',
            'rating_year','rating_month','rating_day_of_week','rating_hour',
            'tag_count','log_tag_count','distinct_taggers',
            'lower_confidence_level','upper_confidence_level','confidence_interval_width','confidence_multiplier'
        ]

        # keep only columns that exist to avoid assembly errors
        input_cols = [c for c in feature_cols if c in [x[0] for x in df_final.dtypes]]

        assembler = VectorAssembler(inputCols=input_cols, outputCol='features', handleInvalid='skip')
        df_final = assembler.transform(df_final).select(col('rating').alias('label'), 'features').filter(col('label').isNotNull())

        df_final = df_final.repartition(200).cache()
        return df_final, input_cols

    def train_gbt_model(self, train_data, test_data):
        print('Training GBTRegressor...')
        gbt = GBTRegressor(maxIter=50, maxDepth=6, stepSize=0.1, seed=42, subsamplingRate=0.8, minInstancesPerNode=5)
        model = gbt.fit(train_data)
        self.feature_importance = model.featureImportances
        return model

    def evaluate_model(self, model, test_data):
        print('Evaluating model...')
        predictions = model.transform(test_data).cache()
        evaluator_rmse = RegressionEvaluator(metricName='rmse')
        evaluator_mae = RegressionEvaluator(metricName='mae')
        evaluator_mse = RegressionEvaluator(metricName='mse')
        evaluator_r2 = RegressionEvaluator(metricName='r2')
        rmse = evaluator_rmse.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)
        mse = evaluator_mse.evaluate(predictions)
        r2 = evaluator_r2.evaluate(predictions)
        print(f"Root Mean Squared Error (RMSE):{rmse:>15.6f}")
        print(f"R² Score:                      {r2:>15.6f}")
        predictions.unpersist()
        return {'RMSE': rmse, 'MAE': mae,'mse':mse,'r2':r2}

    def process_dataset(self):
        print('Starting pipeline...')
        dataset_path = self.download_dataset()
        if dataset_path is None:
            return

        ratings_df, unified_df = self.load_movielens_data(dataset_path)
        if unified_df is None:
            return

        # compute stats
        user_stats, movie_stats, global_stats = self.compute_statistical_features(unified_df)

        # item features
        df_enriched = self.compute_item_features(unified_df, user_stats, movie_stats, global_stats)

        # split enriched data
        train_enriched, test_enriched = df_enriched.randomSplit([0.8, 0.2], seed=42)

        user_conf = self.compute_user_features(ratings_df)

        train_data, feature_cols = self.prepare_training_data(train_enriched, user_conf)
        test_data, _ = self.prepare_training_data(test_enriched, user_conf)

        model = self.train_gbt_model(train_data, test_data)
        metrics = self.evaluate_model(model, test_data)

        self.results[self.dataset_name] = metrics
        print('Pipeline finished')
        return metrics

    def print_summary(self):
        print('\nFINAL RESULTS SUMMARY')
        for k, v in self.results.items():
            print(f"Dataset: {k} -> {v}")


if __name__ == '__main__':
    system = HybridRecommendationSystem()
    system.process_dataset()
    system.print_summary()
    spark.stop()


Starting pipeline...
Downloading MovieLens 10M dataset...
Dataset downloaded and extracted to /tmp/ml-10M100K
Loading ratings data...
Loading movies data (if available)...
Loading tags data (if available)...
Total ratings: 10,000,054
Unique users: 69,878
Unique movies: 10,677
Computing statistical features...
Computing item-related features...
Computing user confidence features...
Preparing training dataset...
Preparing training dataset...
Training GBTRegressor...
Evaluating model...
Root Mean Squared Error (RMSE):       0.862001
R² Score:                             0.338847
Pipeline finished

FINAL RESULTS SUMMARY
Dataset: MovieLens 10M -> {'RMSE': 0.8620009491830785, 'MAE': 0.6657621321555833, 'mse': 0.7430456363925283, 'r2': 0.3388471298271255}
