In [98]:
import pandas as pd
import os
import json
import re
import requests
from pathlib import Path

from tqdm import tqdm
from unidecode import unidecode
# import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, when, udf, regexp_replace, lower, trim, lit, coalesce

from pyspark.ml import Pipeline

In [5]:
# !git clone https://github.com/meralegre/Big_Data_IMDb.git
# %cd Big_Data_IMDb/
# %pwd

### Load data with Spark

In [6]:
# Create (or reuse) a local-mode Spark session whose driver binds to loopback.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

# spark = SparkSession.builder.appName("BERT-FineTuning").getOrCreate()

In [69]:
def load_train_data():
    """
    Loads every ``train-*.csv`` file under ``data/train/`` into one Spark DataFrame.

    The CSV parts are read with pandas in sorted file-name order, concatenated
    once, and stripped of the ``Unnamed: 0`` and ``runtimeMinutes`` columns.
    In the resulting Spark DataFrame every ``\\N`` placeholder is replaced by
    null and ``primaryTitle`` is renamed to ``movie_title``.

    Returns:
    - PySpark DataFrame with the concatenated training rows.

    Raises:
    - FileNotFoundError: if no ``train-*.csv`` file exists in the directory.
    """
    path = "data/train/"

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order identical from run to run
    train_files = sorted(
        file for file in os.listdir(path=path)
        if file.startswith('train-') and file.endswith('.csv')
    )
    if not train_files:
        raise FileNotFoundError(f"no train-*.csv files found in {path}")

    # read all parts first and concatenate once, instead of re-copying the
    # accumulated frame (and re-dropping columns) on every iteration
    frames = [pd.read_csv(f"{path}/{file}") for file in train_files]
    train_df = pd.concat(frames, ignore_index=False)
    train_df = train_df.drop(columns=["Unnamed: 0", "runtimeMinutes"])

    spark_train_df = spark.createDataFrame(train_df).replace(to_replace='\\N', value=None)
    spark_train_df = spark_train_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_train_df

In [70]:
def load_validation_data():
    """
    Reads ``validation_hidden.csv`` from ``data/`` and returns it as a Spark DataFrame.

    The first CSV column is used as the pandas index, ``runtimeMinutes`` is
    dropped, ``\\N`` placeholders become null and ``primaryTitle`` is renamed
    to ``movie_title``.
    """
    path = "data/"
    hidden_df = (
        pd.read_csv(f"{path}/validation_hidden.csv", index_col=[0])
        .drop(columns="runtimeMinutes")
    )
    return (
        spark.createDataFrame(hidden_df)
        .replace(to_replace='\\N', value=None)
        .withColumnRenamed("primaryTitle", "movie_title")
    )

def load_test_data():
    """
    Reads ``test_hidden.csv`` from ``data/`` and returns it as a Spark DataFrame.

    The first CSV column is used as the pandas index, ``runtimeMinutes`` is
    dropped, ``\\N`` placeholders become null and ``primaryTitle`` is renamed
    to ``movie_title``.
    """
    path = "data/"
    hidden_df = (
        pd.read_csv(f"{path}/test_hidden.csv", index_col=[0])
        .drop(columns="runtimeMinutes")
    )
    return (
        spark.createDataFrame(hidden_df)
        .replace(to_replace='\\N', value=None)
        .withColumnRenamed("primaryTitle", "movie_title")
    )


In [71]:
# Build the training DataFrame and print a preview of its rows.
train_data = load_train_data()
train_data.show()

+---------+--------------------+--------------------+---------+-------+--------+-----+
|   tconst|         movie_title|       originalTitle|startYear|endYear|numVotes|label|
+---------+--------------------+--------------------+---------+-------+--------+-----+
|tt0011439|   The Mark of Zorro|   The Mark of Zorro|     1920|   NULL|  2439.0| true|
|tt0012532|Ớrpháns ớf thé Stớrm|                 NaN|     1921|   NULL|     NaN| true|
|tt0013933|  The Faithful Heart|        Coeur fidèle|     1923|   NULL|  1252.0| true|
|tt0015400| The Thief of Bagdad|                 NaN|     1924|   NULL|  6001.0| true|
|tt0015842|  The Joyless Street|                 NaN|     1925|   NULL|  1554.0| true|
|tt0016544|    The Wizard of Oz|                 NaN|     1925|   NULL|  1497.0|false|
|tt0016641|Ben-Hur: A Tale o...|Ben-Hur: A Tale o...|     1925|   NULL|  7539.0| true|
|tt0017463|           3 Bad Men|           3 Bad Men|     1926|   NULL|  1165.0| true|
|tt0018379|          7th Heaven|          7

In [96]:
# Build the validation DataFrame, preview it, and display its row count
# (the trailing count() is the cell's output value).
validation_data = load_validation_data()
validation_data.show()
validation_data.count()

+---------+--------------------+-------------------+---------+-------+--------+
|   tconst|         movie_title|      originalTitle|startYear|endYear|numVotes|
+---------+--------------------+-------------------+---------+-------+--------+
|tt0003740|             Cabiria|                NaN|     1914|   NULL|  3452.0|
|tt0008663|     A Man There Was|        Terje Vigen|     1917|   NULL|  1882.0|
|tt0010307|           J'accuse!|                NaN|     1919|   NULL|  1692.0|
|tt0014429|        Safety Last!|       Safety Last!|     1923|   NULL| 19898.0|
|tt0015175|Die Nibelungen: S...|                NaN|     1924|   NULL|  5676.0|
|tt0016332|       Seven Chances|                NaN|     1925|   NULL|  9914.0|
|tt0018737|       Pandora's Box|                NaN|     NULL|   1929| 10475.0|
|tt0018839|The Docks of New ...|                NaN|     1928|   NULL|  4339.0|
|tt0019421| Steamboat Bill, Jr.|Steamboat Bill, Jr.|     1928|   NULL| 14166.0|
|tt0019901|   Woman in the Moon|        

955

In [97]:
# Build the test DataFrame, preview it, and display its row count
# (the trailing count() is the cell's output value).
test_data = load_test_data()
test_data.show()
test_data.count()

+---------+--------------------+-------------------+---------+-------+--------+
|   tconst|         movie_title|      originalTitle|startYear|endYear|numVotes|
+---------+--------------------+-------------------+---------+-------+--------+
|tt0014972| He Who Gets Slapped|He Who Gets Slapped|     1924|   NULL|  3654.0|
|tt0015016|      The Iron Horse|                NaN|     1924|   NULL|  2136.0|
|tt0015174|Die Nibelungen: K...|                NaN|     1924|   NULL|  4341.0|
|tt0015214|             At 3:25|                NaN|     NULL|   1925|  1724.0|
|tt0015863|             Go West|                NaN|     1925|   NULL|  4188.0|
|tt0016481|             Variety|            Varieté|     1925|   NULL|  1188.0|
|tt0017136|          Metropolis|                NaN|     1927|   NULL|168372.0|
|tt0018876|   The Farmer's Wife|                NaN|     1928|   NULL|  2741.0|
|tt0019074| Laugh, Clown, Laugh|Laugh, Clown, Laugh|     1928|   NULL|  1934.0|
|tt0021730|           The Champ|        

1086

### Cleaning

In [74]:
# def handle_years():
#     """
#     Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
#     Drops 'startYear' and 'endYear' after merging.
#     """
#     spark_train_df = spark_train_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_validation_df = spark_validation_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_test_df = spark_test_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     # drop original startYear and endYear
#     spark_train_df = spark_train_df.drop("startYear", "endYear")
#     spark_validation_df = spark_validation_df.drop("startYear", "endYear")
#     spark_test_df = spark_test_df.drop("startYear", "endYear")

#     return spark_train_df, spark_validation_df, spark_test_df

def handle_years(df):
    """
    Collapses 'startYear' and 'endYear' into a single 'year' column.

    'year' takes the value of 'startYear' whenever that is not null and falls
    back to 'endYear' otherwise; both source columns are removed from the result.
    """
    start_year = col("startYear")
    merged_year = when(start_year.isNotNull(), start_year).otherwise(col("endYear"))

    # add the merged column, then discard the two columns it was built from
    return df.withColumn("year", merged_year).drop("startYear", "endYear")

In [75]:
# Load the training data again under the name used by the cleaning steps and
# merge startYear/endYear into a single year column.
spark_train_df = load_train_data()
spark_train_df = handle_years(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------------------+--------+-----+----+
|   tconst|         movie_title|       originalTitle|numVotes|label|year|
+---------+--------------------+--------------------+--------+-----+----+
|tt0011439|   The Mark of Zorro|   The Mark of Zorro|  2439.0| true|1920|
|tt0012532|Ớrpháns ớf thé Stớrm|                 NaN|     NaN| true|1921|
|tt0013933|  The Faithful Heart|        Coeur fidèle|  1252.0| true|1923|
|tt0015400| The Thief of Bagdad|                 NaN|  6001.0| true|1924|
|tt0015842|  The Joyless Street|                 NaN|  1554.0| true|1925|
|tt0016544|    The Wizard of Oz|                 NaN|  1497.0|false|1925|
|tt0016641|Ben-Hur: A Tale o...|Ben-Hur: A Tale o...|  7539.0| true|1925|
|tt0017463|           3 Bad Men|           3 Bad Men|  1165.0| true|1926|
|tt0018379|          7th Heaven|          7th Heaven|  3499.0| true|1927|
|tt0018528|         The Unknown|                 NaN|  7850.0| true|1927|
|tt0018684|     Beggars of Life|     B

In [76]:
# converts special characters to ASCII
def normalize_text(text):
    """Returns the ``unidecode`` transliteration of ``text``; ``None`` is passed through unchanged."""
    return None if text is None else unidecode(text)

# Spark UDF wrapper around normalize_text, declared with a string return type.
normalize_text_udf = udf(normalize_text, StringType())

def clean_titles(df):
    """
    Transliterates 'movie_title' to ASCII and removes 'originalTitle'.

    Parameters:
    - df: PySpark DataFrame with a 'movie_title' column; 'originalTitle' is
      dropped when present.

    Returns:
    - PySpark DataFrame with the normalized 'movie_title' and without
      'originalTitle'.
    """
    df = df.withColumn("movie_title", normalize_text_udf(col("movie_title")))

    # originalTitle contains NaN values and movie_title already carries the
    # clean name, so the column is dropped outright; normalizing it first
    # would only compute a value that is immediately thrown away
    df = df.drop("originalTitle")
    return df

In [77]:
# Normalize the training titles and drop originalTitle, then preview the result.
spark_train_df = clean_titles(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------+-----+----+
|   tconst|         movie_title|numVotes|label|year|
+---------+--------------------+--------+-----+----+
|tt0011439|   The Mark of Zorro|  2439.0| true|1920|
|tt0012532|Orphans of the Storm|     NaN| true|1921|
|tt0013933|  The Faithful Heart|  1252.0| true|1923|
|tt0015400| The Thief of Bagdad|  6001.0| true|1924|
|tt0015842|  The Joyless Street|  1554.0| true|1925|
|tt0016544|    The Wizard of Oz|  1497.0|false|1925|
|tt0016641|Ben-Hur: A Tale o...|  7539.0| true|1925|
|tt0017463|           3 Bad Men|  1165.0| true|1926|
|tt0018379|          7th Heaven|  3499.0| true|1927|
|tt0018528|         The Unknown|  7850.0| true|1927|
|tt0018684|     Beggars of Life|  1093.0| true|1928|
|tt0021884|        Frankenstein| 69780.0| true|1931|
|tt0022787|      Wooden Crosses|  1409.0| true|1932|
|tt0023303|   One Hour with You|  2389.0| true|1932|
|tt0023937|       Passing Fancy|  1306.0| true|1933|
|tt0024127|Our Flags Lead Us...|  1733.0|false

In [78]:
# number of rows of training data after concatenating everything together
spark_train_df.count()

7959

### Merge with movie_reviews data

In [88]:
def load_reviews_data():
    """
    Reads ``final_movie_reviews.csv`` from ``movie_reviews/`` and returns it as
    a Spark DataFrame in which ``\\N`` placeholders are replaced by null.
    """
    path = "movie_reviews/"
    reviews_pdf = pd.read_csv(f"{path}/final_movie_reviews.csv")
    return spark.createDataFrame(reviews_pdf).replace(to_replace='\\N', value=None)

In [89]:
# Load the reviews table and print a preview of its rows.
spark_reviews_df = load_reviews_data()
spark_reviews_df.show()

+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+
|         movie_title|tomatometer_status|tomatometer_rating|         review_type|        review_score|      review_content|review_label|Release Year|               genre|
+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+
|Percy Jackson & t...|            Rotten|              49.0|['Fresh', 'Rotten...|['3.5/5', '1/4', ...|["Whether audienc...|      Rotten|      2010.0|Action & Adventur...|
|         Please Give|             Fresh|              87.0|['Fresh', 'Fresh'...|['3/4', '3/5', '4...|['Holofcener alwa...|       Fresh|      2010.0|              Comedy|
|                  10|             Fresh|              67.0|['Fresh', 'Fresh'...|['4/5', '3/5', '3...|['Obvious but ent...|       Fresh|      197

In [90]:
spark_reviews_df.count()

19095

In [91]:
def convert_year_column_to_int(df, column_name):
    """
    Converts a float-based year column to an integer column.

    Parameters:
    - df: PySpark DataFrame containing `column_name`.
    - column_name: name of the year column to convert in place.

    Returns:
    - PySpark DataFrame where `column_name` is an IntegerType column; values
      that were NaN in a float/double column become null.
    """
    year = col(column_name)

    # A float NaN (how pandas encodes a missing number) is not null in Spark,
    # and casting it straight to int does not produce null (it yields 0 in
    # non-ANSI mode), so map NaN to null explicitly before the cast.
    # isnan is only applied to float/double columns; other types are cast as before.
    if dict(df.dtypes).get(column_name) in ("float", "double"):
        year = when(F.isnan(year), None).otherwise(year)

    df = df.withColumn(column_name, year.cast(IntegerType()))
    return df

# Cast the reviews' "Release Year" column to integer.
spark_reviews_df = convert_year_column_to_int(spark_reviews_df, "Release Year")

In [92]:
spark_reviews_df.show()

+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+
|         movie_title|tomatometer_status|tomatometer_rating|         review_type|        review_score|      review_content|review_label|Release Year|               genre|
+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+--------------------+
|Percy Jackson & t...|            Rotten|              49.0|['Fresh', 'Rotten...|['3.5/5', '1/4', ...|["Whether audienc...|      Rotten|        2010|Action & Adventur...|
|         Please Give|             Fresh|              87.0|['Fresh', 'Fresh'...|['3/4', '3/5', '4...|['Holofcener alwa...|       Fresh|        2010|              Comedy|
|                  10|             Fresh|              67.0|['Fresh', 'Fresh'...|['4/5', '3/5', '3...|['Obvious but ent...|       Fresh|        1

### Time to merge

In [111]:
def join_training_with_reviews(df_movies, df_reviews):
    """
    Left-joins movie metadata with reviews on (`movie_title`, `year`).

    Before joining, `movie_title` is lower-cased and trimmed on both sides,
    the reviews' 'Release Year' column is renamed to `year`, and `year` is
    cast to int on both sides.

    Parameters:
    - df_movies: PySpark DataFrame containing training movie metadata.
    - df_reviews: PySpark DataFrame containing reviews.

    Returns:
    - PySpark DataFrame restricted to the metadata and review columns listed
      below, where a missing `review_content` is "No Reviews" and a missing
      `review_label` is "Unknown".
    """
    # the same unresolved column expressions are applied to both inputs
    normalized_title = trim(lower(col("movie_title")))
    year_as_int = col("year").cast("int")

    movies = (
        df_movies
        .withColumn("movie_title", normalized_title)
        .withColumn("year", year_as_int)
    )
    reviews = (
        df_reviews
        .withColumn("movie_title", normalized_title)
        .withColumnRenamed("Release Year", "year")
        .withColumn("year", year_as_int)
    )

    # keep every movie, attaching review columns where title and year match
    joined = movies.join(reviews, on=["movie_title", "year"], how="left")

    kept_columns = (
        "tconst", "movie_title", "numVotes", "year", "label",
        "tomatometer_status", "review_type", "genre",
        "review_label", "review_content",
    )
    placeholders = {"review_content": "No Reviews", "review_label": "Unknown"}

    return joined.select(*kept_columns).fillna(placeholders)


In [112]:
# Preview the joined training + reviews table (the result is not stored).
join_training_with_reviews(spark_train_df, spark_reviews_df).show()

+---------+--------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|   tconst|         movie_title|numVotes|year|label|tomatometer_status|         review_type|               genre|review_label|      review_content|
+---------+--------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|tt0017463|           3 bad men|  1165.0|1926| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0018379|          7th heaven|  3499.0|1927| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0018684|     beggars of life|  1093.0|1928| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0016641|ben-hur: a tale o...|  7539.0|1925| true|              NULL|                NULL|                NULL

In [114]:
# Count training movies that have no review row with the same title and year
# (left_anti keeps only the unmatched rows of spark_train_df).
# NOTE(review): unlike join_training_with_reviews, this join does not
# lower-case/trim movie_title on either side, so the count is not directly
# comparable with the unmatched rows of that function's output - confirm
# whether that is intended.
spark_reviews_df = spark_reviews_df.withColumnRenamed("Release Year", "year")
df_missing_movies = spark_train_df.join(spark_reviews_df, on=["movie_title", "year"], how="left_anti")
df_missing_movies.count()

5892

In [109]:
def clean_labels(df):
    """
    Merges the boolean `label` and the categorical `review_label` into a
    single numeric `combined_label` column.

    `label` is mapped to 1/0 and `review_label` to 1 ("Fresh") / 0 ("Rotten");
    any other `review_label` value (e.g. "Unknown") and a missing `label` are
    treated as null. `combined_label` is the average when both values exist,
    the one available value when only one exists, and null when neither does.

    Parameters:
    - df: PySpark DataFrame with `label` and `review_label` columns.

    Returns:
    - PySpark DataFrame with `combined_label` added and `label`,
      `review_label` and the intermediate columns removed.
    """
    # convert boolean column `label` to integer; a null label stays null so
    # that the coalesce below can fall back to the review label instead of
    # silently counting the missing label as 0
    df = df.withColumn("label_int",
                       when(col("label").isNotNull(),
                            when(col("label") == True, 1).otherwise(0)))

    # convert categorical `review_label` to numerical (null for anything else)
    df = df.withColumn("review_label_int", when(col("review_label") == "Fresh", 1)
                                          .when(col("review_label") == "Rotten", 0)
                                          .otherwise(None))
    # combine both labels:
    # both exist: average (the sum is null if either side is null)
    # only one exists: take that one
    df = df.withColumn("combined_label",
                       coalesce((col("label_int") + col("review_label_int")) / 2, col("label_int"), col("review_label_int")))

    # Drop the source and intermediate columns
    df = df.drop("label", "label_int", "review_label", "review_label_int")

    return df

In [110]:
# clean_labels(final_df).show()

+----------+--------------------+--------+------------------+------------------+--------------------+--------------------+------------+--------------+
|    tconst|         movie_title|numVotes|tomatometer_status|tomatometer_rating|      review_content|               genre|release_year|combined_label|
+----------+--------------------+--------+------------------+------------------+--------------------+--------------------+------------+--------------+
|tt10534500|               #home| 10591.0|              NULL|              NULL|          No Reviews|                NULL|        NULL|           1.0|
| tt0090556|      'night, mother|  2244.0|              NULL|              NULL|          No Reviews|                NULL|        NULL|           1.0|
| tt0349047|(t)raumschiff sur...| 15303.0|              NULL|              NULL|          No Reviews|                NULL|        NULL|           0.0|
| tt2395385|                  +1|  8065.0|             Fresh|              82.0|["Love it or h