# Validation and Test Prediction Preparation

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, udf, lit
from pyspark.sql.types import StringType, IntegerType
from unidecode import unidecode
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import col, lit, when, udf

import os
import pandas as pd



In [29]:
# Create the Spark session used by the rest of the notebook, or reuse one
# that is already running under the same builder configuration.
spark = (
    SparkSession.builder
    .appName("IMDB Prediction")
    .getOrCreate()
)


In [None]:

# Locations of the hidden validation and test splits; both live in the same
# project data directory.
_hidden_data_dir = '/Users/bognarlili/Big_Data_IMDb-1/data'
validation_path = _hidden_data_dir + '/validation_hidden.csv'
test_path = _hidden_data_dir + '/test_hidden.csv'

In [28]:
# Both hidden splits are read with the same options: the first line is the
# header and column types are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
validation = spark.read.csv(validation_path, **csv_options)
test = spark.read.csv(test_path, **csv_options)

# Preview the validation split.
validation.show()


+---+---------+--------------------+-------------------+---------+-------+--------------+--------+
|_c0|   tconst|        primaryTitle|      originalTitle|startYear|endYear|runtimeMinutes|numVotes|
+---+---------+--------------------+-------------------+---------+-------+--------------+--------+
|  0|tt0003740|             Cabiria|               NULL|     1914|     \N|           148|  3452.0|
|  1|tt0008663|     A Man There Was|        Terje Vigen|     1917|     \N|            65|  1882.0|
|  3|tt0010307|           J'accuse!|               NULL|     1919|     \N|           166|  1692.0|
| 18|tt0014429|        Safety Last!|       Safety Last!|     1923|     \N|            74| 19898.0|
| 27|tt0015175|Die Nibelungen: S...|               NULL|     1924|     \N|           143|  5676.0|
| 39|tt0016332|       Seven Chances|               NULL|     1925|     \N|            56|  9914.0|
| 61|tt0018737|       Pandora's Box|               NULL|       \N|   1929|           109| 10475.0|
| 65|tt001

25/03/19 21:26:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes
Expected: _c0 but found: 
CSV file: file:///Users/bognarlili/Big_Data_IMDb-1/data/validation_hidden.csv


In [27]:
# Preview the test split read in the previous cell.
test.show()

+---+---------+--------------------+-------------------+---------+-------+--------------+--------+
|_c0|   tconst|        primaryTitle|      originalTitle|startYear|endYear|runtimeMinutes|numVotes|
+---+---------+--------------------+-------------------+---------+-------+--------------+--------+
| 22|tt0014972| He Who Gets Slapped|He Who Gets Slapped|     1924|     \N|            95|  3654.0|
| 23|tt0015016|      The Iron Horse|               NULL|     1924|     \N|           150|  2136.0|
| 26|tt0015174|Die Nibelungen: K...|               NULL|     1924|     \N|           129|  4341.0|
| 28|tt0015214|             At 3:25|               NULL|       \N|   1925|            59|  1724.0|
| 34|tt0015863|             Go West|               NULL|     1925|     \N|            69|  4188.0|
| 40|tt0016481|             Variety|            Varieté|     1925|     \N|           104|  1188.0|
| 46|tt0017136|          Metropolis|               NULL|     1927|     \N|           153|168372.0|
| 66|tt001

25/03/19 21:26:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes
Expected: _c0 but found: 
CSV file: file:///Users/bognarlili/Big_Data_IMDb-1/data/test_hidden.csv


# Preprocess the validation and the test data

In [None]:
class DataPreprocessor:
    """Bring the hidden validation and test splits into the training-set layout.

    ``process`` runs the whole pipeline: load both CSV files through pandas,
    merge ``startYear``/``endYear`` into ``year``, normalise the titles with
    ``unidecode``, add the columns the hidden files lack, fill nulls with
    placeholder values, reorder the columns and write the result as CSV.

    The ``load_*`` methods use the module-level ``spark`` session, so it must
    exist before they are called.
    """

    # Columns that are created with a placeholder when a frame lacks them.
    _MISSING_COLUMNS = (
        'label', 'genre', 'content_rating', 'production_company',
        'tomatometer_status', 'tomatometer_rating', 'audience_status',
        'audience_rating', 'review_score', 'like_count', 'label_int',
        'reviews', 'review_lemmatized',
    )

    # Subset of _MISSING_COLUMNS created as integer -1; the rest become 'Unknown'.
    _INT_PLACEHOLDER_COLUMNS = frozenset({
        'like_count', 'label_int', 'tomatometer_status', 'tomatometer_rating',
        'audience_status', 'audience_rating',
    })

    def __init__(self, data_dir="data"):
        """Set up empty frames and the schema description.

        Args:
            data_dir: Directory holding ``validation_hidden.csv`` and
                ``test_hidden.csv``. Defaults to ``"data"``, the location the
                loaders previously had hard-coded.
        """
        self.validation_df = None
        self.test_df = None
        self.data_dir = data_dir

        # Final column order; reorder_columns() keeps only these columns.
        self.expected_columns = [
            'tconst', 'movie_title', 'year', 'numVotes', 'genre',
            'content_rating', 'production_company', 'tomatometer_status',
            'tomatometer_rating', 'audience_status', 'audience_rating',
            'review_score', 'like_count', 'label_int', 'reviews', 'review_lemmatized'
        ]

        # Replacement for nulls, per column. Integer values are cast to
        # IntegerType, everything else to StringType.
        self.fill_values = {
            'movie_title': 'Unknown',
            'genre': 'Unknown',
            'content_rating': 'Unknown',
            'production_company': 'Unknown',
            'tomatometer_status': -1,
            'tomatometer_rating': -1,
            'audience_status': -1,
            'audience_rating': -1,
            'review_score': 'Unknown',
            'like_count': -1,
            'label_int': -1,
            'reviews': 'Unknown',
            'review_lemmatized': 'Unknown'
        }

        # Spark UDF wrapper around normalize_text (special characters removal).
        self.normalize_text_udf = udf(self.normalize_text, StringType())

    @staticmethod
    def _null_out_missing_text(pdf):
        """Return a copy of *pdf* whose object-dtype columns hold None instead of NaN.

        pandas marks a missing text cell with a float NaN. Left in place, a
        column then mixes strings and floats, which Spark's schema inference
        may refuse to merge. Numeric columns are left untouched.
        """
        cleaned = pdf.copy()
        text_columns = [name for name, dtype in cleaned.dtypes.items() if dtype == object]
        for name in text_columns:
            cleaned[name] = cleaned[name].where(cleaned[name].notna(), None)
        return cleaned

    def _load_hidden_csv(self, filename):
        """Read *filename* from ``self.data_dir`` and return it as a Spark DataFrame.

        ``runtimeMinutes`` is dropped and ``primaryTitle`` is renamed to
        ``movie_title``.
        """
        # The first CSV column is used as the pandas index rather than as data.
        pdf = pd.read_csv(os.path.join(self.data_dir, filename), index_col=[0])
        pdf = pdf.drop(columns="runtimeMinutes")
        pdf = self._null_out_missing_text(pdf)

        # Handle \N values as None
        sdf = spark.createDataFrame(pdf).replace(to_replace='\\N', value=None)
        return sdf.withColumnRenamed("primaryTitle", "movie_title")

    # Validation Data
    def load_validation_data(self):
        """Load ``validation_hidden.csv`` into ``self.validation_df``."""
        self.validation_df = self._load_hidden_csv("validation_hidden.csv")
        print(" Validation data loaded")

    # Test Data
    def load_test_data(self):
        """Load ``test_hidden.csv`` into ``self.test_df``."""
        self.test_df = self._load_hidden_csv("test_hidden.csv")
        print("Test data loaded")

    # Handle Year Data
    def handle_years(self, df):
        """Merge ``startYear`` and ``endYear`` into a single ``year`` column.

        ``year`` takes ``startYear`` when it is not null and ``endYear``
        otherwise; both source columns are then dropped. A frame that lacks
        either source column is returned unchanged.
        """
        if 'startYear' in df.columns and 'endYear' in df.columns:
            df = df.withColumn("year", F.coalesce(col("startYear"), col("endYear")))
            df = df.drop("startYear", "endYear")
        return df

    # Normalize Titles
    @staticmethod
    def normalize_text(text):
        """Return ``unidecode(text)``, or None when *text* is None."""
        if text is None:
            return None
        return unidecode(text)

    def clean_titles(self, df):
        """Normalise ``movie_title`` and fold ``originalTitle`` into it.

        When ``originalTitle`` exists it is normalised as well, joined onto
        ``movie_title`` with ``' - '`` and then dropped.
        """
        df = df.withColumn("movie_title", self.normalize_text_udf(col("movie_title")))

        if 'originalTitle' in df.columns:
            df = df.withColumn("originalTitle", self.normalize_text_udf(col("originalTitle")))
            df = df.withColumn("movie_title",
                               F.concat_ws(' - ', F.col('movie_title'), F.col('originalTitle')))
            df = df.drop("originalTitle")
        return df

    def _with_missing_columns(self, df):
        """Return *df* with every absent column of _MISSING_COLUMNS added as a placeholder."""
        for col_name in self._MISSING_COLUMNS:
            if col_name in df.columns:
                continue
            if col_name in self._INT_PLACEHOLDER_COLUMNS:
                placeholder = lit(-1).cast(IntegerType())
            else:
                placeholder = lit('Unknown').cast(StringType())
            df = df.withColumn(col_name, placeholder)
        return df

    # Add Missing Columns to Match Training Set
    def add_missing_columns(self):
        """Add the placeholder columns to both the validation and the test frame."""
        self.validation_df = self._with_missing_columns(self.validation_df)
        self.test_df = self._with_missing_columns(self.test_df)
        print("Missing columns added")

    def _with_filled_values(self, df):
        """Return *df* with nulls replaced according to ``self.fill_values``."""
        for key, value in self.fill_values.items():
            if key not in df.columns:
                continue
            spark_type = IntegerType() if isinstance(value, int) else StringType()
            df = df.withColumn(key, F.coalesce(col(key), lit(value).cast(spark_type)))
        return df

    # Fill Missing Values
    def fill_missing_values(self):
        """Replace nulls in both frames with the configured placeholder values."""
        self.validation_df = self._with_filled_values(self.validation_df)
        self.test_df = self._with_filled_values(self.test_df)
        print(" Missing values filled")

    def _in_expected_order(self, df):
        """Select the columns of *df* listed in ``self.expected_columns``, in that order."""
        present = set(df.columns)
        return df.select(*[name for name in self.expected_columns if name in present])

    # Reorder Columns to Match Training Set
    def reorder_columns(self):
        """Reorder both frames; columns outside ``expected_columns`` are dropped."""
        self.validation_df = self._in_expected_order(self.validation_df)
        self.test_df = self._in_expected_order(self.test_df)
        print("Columns reordered")

    # Write to CSV
    def save_to_csv(self, validation_path, test_path):
        """Write both frames as CSV with a header, overwriting existing output.

        Each frame is coalesced to a single partition before writing.
        """
        self.validation_df.coalesce(1).write.csv(validation_path, header=True, mode='overwrite', emptyValue="")
        self.test_df.coalesce(1).write.csv(test_path, header=True, mode='overwrite', emptyValue="")
        print("Final validation and test data saved!")

    # Process Everything
    def process(self, validation_path, test_path):
        """Run the full pipeline and write the results.

        Args:
            validation_path: Output location for the processed validation data.
            test_path: Output location for the processed test data.
        """
        self.load_validation_data()
        self.load_test_data()
        self.validation_df = self.handle_years(self.validation_df)
        self.test_df = self.handle_years(self.test_df)
        self.validation_df = self.clean_titles(self.validation_df)
        self.test_df = self.clean_titles(self.test_df)
        self.add_missing_columns()
        self.fill_missing_values()
        self.reorder_columns()
        self.save_to_csv(validation_path, test_path)



# Adding the columns missing from the validation and test data


In [None]:


class DataPreprocessor:
    """Align already-loaded validation and test DataFrames with the training layout.

    ``process`` adds the columns the frames lack, fills nulls with placeholder
    values, reorders the columns and writes both frames as CSV.
    """

    # Columns that are created with a placeholder when a frame lacks them.
    _MISSING_COLUMNS = (
        'label', 'genre', 'content_rating', 'production_company',
        'tomatometer_status', 'tomatometer_rating', 'audience_status',
        'audience_rating', 'review_score', 'like_count', 'label_int',
        'reviews', 'review_lemmatized',
    )

    # Subset of _MISSING_COLUMNS created as integer -1; the rest become 'Unknown'.
    _INT_PLACEHOLDER_COLUMNS = frozenset({
        'like_count', 'label_int', 'tomatometer_status', 'tomatometer_rating',
        'audience_status', 'audience_rating',
    })

    def __init__(self, validation_df, test_df):
        """Store the two frames and the schema description.

        Args:
            validation_df: Spark DataFrame holding the validation split.
            test_df: Spark DataFrame holding the test split.
        """
        self.validation_df = validation_df
        self.test_df = test_df

        # Final column order; reorder_columns() keeps only these columns.
        self.expected_columns = [
            'tconst', 'movie_title', 'year', 'numVotes', 'genre',
            'content_rating', 'production_company', 'tomatometer_status',
            'tomatometer_rating', 'audience_status', 'audience_rating',
            'review_score', 'like_count', 'label_int', 'reviews', 'review_lemmatized'
        ]

        # Replacement for nulls, per column. Integer values are cast to
        # IntegerType, everything else to StringType.
        self.fill_values = {
            'movie_title': 'Unknown',
            'genre': 'Unknown',
            'content_rating': 'Unknown',
            'production_company': 'Unknown',
            'tomatometer_status': -1,
            'tomatometer_rating': -1,
            'audience_status': -1,
            'audience_rating': -1,
            'review_score': 'Unknown',
            'like_count': -1,
            'label_int': -1,
            'reviews': 'Unknown',
            'review_lemmatized': 'Unknown'
        }

    def _with_missing_columns(self, df):
        """Return *df* with every absent column of _MISSING_COLUMNS added as a placeholder."""
        for col_name in self._MISSING_COLUMNS:
            if col_name in df.columns:
                continue
            if col_name in self._INT_PLACEHOLDER_COLUMNS:
                placeholder = lit(-1).cast(IntegerType())
            else:
                placeholder = lit('Unknown').cast(StringType())
            df = df.withColumn(col_name, placeholder)
        return df

    def add_missing_columns(self):
        """Add the placeholder columns to both the validation and the test frame."""
        self.validation_df = self._with_missing_columns(self.validation_df)
        self.test_df = self._with_missing_columns(self.test_df)

        print("Missing columns added")

    def _with_filled_values(self, df):
        """Return *df* with nulls replaced according to ``self.fill_values``."""
        for key, value in self.fill_values.items():
            if key not in df.columns:
                continue
            spark_type = IntegerType() if isinstance(value, int) else StringType()
            df = df.withColumn(key, F.coalesce(col(key), lit(value).cast(spark_type)))
        return df

    def fill_missing_values(self):
        """Replace nulls in both frames with the configured placeholder values."""
        self.validation_df = self._with_filled_values(self.validation_df)
        self.test_df = self._with_filled_values(self.test_df)

        print(" Missing values filled")

    def _in_expected_order(self, df):
        """Select the columns of *df* listed in ``self.expected_columns``, in that order."""
        present = set(df.columns)
        return df.select(*[name for name in self.expected_columns if name in present])

    def reorder_columns(self):
        """Reorder both frames; columns outside ``expected_columns`` are dropped."""
        self.validation_df = self._in_expected_order(self.validation_df)
        self.test_df = self._in_expected_order(self.test_df)
        print("Columns reordered")

    def save_to_csv(self, validation_path, test_path):
        """Write both frames as CSV with a header, overwriting existing output.

        Each frame is coalesced to a single partition before writing.
        """
        self.validation_df.coalesce(1).write.csv(validation_path, header=True, mode='overwrite', emptyValue="")
        self.test_df.coalesce(1).write.csv(test_path, header=True, mode='overwrite', emptyValue="")
        print("Final validation and test data saved!")

    def process(self, validation_path, test_path):
        """Add columns, fill nulls, reorder and save both frames.

        Args:
            validation_path: Output location for the processed validation data.
            test_path: Output location for the processed test data.
        """
        self.add_missing_columns()
        self.fill_missing_values()
        self.reorder_columns()
        self.save_to_csv(validation_path, test_path)



In [17]:

# Where the aligned validation and test data are written.
output_locations = {
    "validation_path": '/Users/bognarlili/Downloads/new_validation.csv',
    "test_path": '/Users/bognarlili/Downloads/new_test.csv',
}

# Run the pipeline on the validation and test frames and save the results.
processor = DataPreprocessor(validation_df, test_df)
processor.process(**output_locations)


Missing columns added
 Missing values filled
Columns reordered
Final validation and test data saved!


# Generate predictions

Assemble the numeric columns into a single feature vector, which is the input format the model requires.

In [None]:
# Columns packed into the "features" vector that is fed to the model.
feature_cols = [
    'numVotes', 'year',
    'tomatometer_status', 'tomatometer_rating',
    'audience_status', 'audience_rating',
    'like_count',
]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
train_data, validation_data, test_data = (
    assembler.transform(frame) for frame in (train_df, validation_df, test_df)
)

### model
# NOTE(review): `model` is not defined in this cell; it has to exist before
# the lines below are run.


def _true_false_labels(scored):
    """Collect the ``prediction`` column of *scored* as "True"/"False" strings."""
    raw_predictions = scored.select("prediction").rdd.flatMap(lambda row: row).collect()
    # A prediction of exactly 1.0 maps to "True"; anything else maps to "False".
    return ["True" if value == 1.0 else "False" for value in raw_predictions]


# Generate predictions for both splits, then convert them to strings.
validation_preds = model.transform(validation_data)
test_preds = model.transform(test_data)

validation_results = _true_false_labels(validation_preds)
test_results = _true_false_labels(test_preds)
