In [79]:
import pandas as pd
import os
import json
import re
import requests
from pathlib import Path

from tqdm import tqdm
from unidecode import unidecode
# import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, when, udf, regexp_replace

from pyspark.ml import Pipeline

In [68]:
# !git clone https://github.com/meralegre/Big_Data_IMDb.git
# %cd Big_Data_IMDb/
# %pwd

### Load data with Spark

In [48]:
# Local-master Spark session with the driver bound to the loopback address.
spark = (
    SparkSession.builder
    .master("local")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

# spark = SparkSession.builder.appName("BERT-FineTuning").getOrCreate()

In [53]:
def load_train_data():
    """Load all ``train-*.csv`` shards from ``data/train/`` into one Spark DataFrame.

    Each shard is read with pandas, its ``Unnamed: 0`` column (if present) is
    dropped, and the shards are concatenated once. The combined frame is then
    converted with the module-level ``spark`` session, and every ``'\\N'``
    string is replaced by null.

    Shards are read in sorted filename order so the resulting row order is
    reproducible across runs (``os.listdir`` gives no ordering guarantee).

    Returns:
        The Spark DataFrame holding the rows of every training shard.

    Raises:
        ValueError: if no ``train-*.csv`` file exists under ``data/train/``.
    """
    path = "data/train/"
    shard_names = sorted(
        name
        for name in os.listdir(path=path)
        if name.startswith('train-') and name.endswith('.csv')
    )
    if not shard_names:
        # Fail early with a clear message instead of letting Spark choke on an
        # empty pandas frame during schema inference.
        raise ValueError(f"No 'train-*.csv' files found in {path!r}")

    # Read every shard first and concatenate once: growing a DataFrame with
    # pd.concat inside the loop copies all accumulated rows on each iteration.
    shards = [
        pd.read_csv(os.path.join(path, name)).drop(columns="Unnamed: 0", errors="ignore")
        for name in shard_names
    ]
    train_df = pd.concat(shards, ignore_index=False)

    # train_df = train_df.sort_index()
    spark_train_df = spark.createDataFrame(train_df).replace(to_replace='\\N', value=None)
    return spark_train_df

In [52]:
def load_validation_data():
    """Load ``data/validation_hidden.csv`` into a Spark DataFrame.

    The CSV is read with its first column as the pandas index, a leftover
    ``Unnamed: 0`` column is dropped when present, and the frame is converted
    with the module-level ``spark`` session. Every ``'\\N'`` string is then
    replaced by null.

    Returns:
        The Spark DataFrame holding the validation rows.
    """
    path = "data/"
    # Fixed: the frame was previously bound to a misspelled name
    # ('validation_dfdf'), so the next line raised UnboundLocalError.
    validation_df = pd.read_csv(f"{path}/validation_hidden.csv", index_col=[0])
    #validation_df = validation_df.sort_index()
    # index_col=[0] may already have consumed the unnamed column; ignore a
    # missing 'Unnamed: 0' rather than raising KeyError.
    validation_df = validation_df.drop(columns="Unnamed: 0", errors="ignore")
    spark_validation_df = spark.createDataFrame(validation_df).replace(to_replace='\\N', value=None)
    return spark_validation_df

def load_test_data():
    """Load ``data/test_hidden.csv`` into a Spark DataFrame.

    The CSV is read with its first column as the pandas index, a leftover
    ``Unnamed: 0`` column is dropped when present, and the frame is converted
    with the module-level ``spark`` session. Every ``'\\N'`` string is then
    replaced by null.

    Returns:
        The Spark DataFrame holding the test rows.
    """
    path = "data/"
    test_df = pd.read_csv(f"{path}/test_hidden.csv", index_col=[0])
    # test_df = test_df.sort_index()
    # index_col=[0] may already have consumed the unnamed column; ignore a
    # missing 'Unnamed: 0' rather than raising KeyError.
    test_df = test_df.drop(columns="Unnamed: 0", errors="ignore")
    spark_test_df = spark.createDataFrame(test_df).replace(to_replace='\\N', value=None)
    return spark_test_df


In [69]:
# Load the training shards into a Spark DataFrame and print a preview.
train_data = load_train_data()
train_data.show()

+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|   tconst|        primaryTitle|       originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|tt0011439|   The Mark of Zorro|   The Mark of Zorro|     1920|   NULL|            79|  2439.0| true|
|tt0012532|Ớrpháns ớf thé Stớrm|                 NaN|     1921|   NULL|           150|     NaN| true|
|tt0013933|  The Faithful Heart|        Coeur fidèle|     1923|   NULL|            87|  1252.0| true|
|tt0015400| The Thief of Bagdad|                 NaN|     1924|   NULL|           155|  6001.0| true|
|tt0015842|  The Joyless Street|                 NaN|     1925|   NULL|           125|  1554.0| true|
|tt0016544|    The Wizard of Oz|                 NaN|     1925|   NULL|            95|  1497.0|false|
|tt0016641|Ben-Hur: A Tale o...|Ben-Hur: A Tale o...|     1925|   NULL|           

### Cleaning

In [76]:
# def handle_years():
#     """
#     Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
#     Drops 'startYear' and 'endYear' after merging.
#     """
#     spark_train_df = spark_train_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_validation_df = spark_validation_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_test_df = spark_test_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     # drop original startYear and endYear
#     spark_train_df = spark_train_df.drop("startYear", "endYear")
#     spark_validation_df = spark_validation_df.drop("startYear", "endYear")
#     spark_test_df = spark_test_df.drop("startYear", "endYear")

#     return spark_train_df, spark_validation_df, spark_test_df

def handle_years(df):
    """
    Collapse 'startYear' and 'endYear' into a single 'year' column.

    'year' takes the value of 'startYear' whenever it is not null and falls
    back to 'endYear' otherwise. Both source columns are removed from the
    returned DataFrame.
    """
    start_year = col("startYear")
    end_year = col("endYear")

    # Prefer startYear; only use endYear when startYear is null.
    year = when(start_year.isNotNull(), start_year).otherwise(end_year)

    return df.withColumn("year", year).drop("startYear", "endYear")

In [77]:
# Reload the training data, merge startYear/endYear into a single 'year'
# column via handle_years, and print a preview of the result.
spark_train_df = load_train_data()
spark_train_df = handle_years(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------------------+--------------+--------+-----+----+
|   tconst|        primaryTitle|       originalTitle|runtimeMinutes|numVotes|label|year|
+---------+--------------------+--------------------+--------------+--------+-----+----+
|tt0011439|   The Mark of Zorro|   The Mark of Zorro|            79|  2439.0| true|1920|
|tt0012532|Ớrpháns ớf thé Stớrm|                 NaN|           150|     NaN| true|1921|
|tt0013933|  The Faithful Heart|        Coeur fidèle|            87|  1252.0| true|1923|
|tt0015400| The Thief of Bagdad|                 NaN|           155|  6001.0| true|1924|
|tt0015842|  The Joyless Street|                 NaN|           125|  1554.0| true|1925|
|tt0016544|    The Wizard of Oz|                 NaN|            95|  1497.0|false|1925|
|tt0016641|Ben-Hur: A Tale o...|Ben-Hur: A Tale o...|           143|  7539.0| true|1925|
|tt0017463|           3 Bad Men|           3 Bad Men|            92|  1165.0| true|1926|
|tt0018379|          

In [80]:
# converts special characters to ASCII
def normalize_text(text):
    """Return ``unidecode(text)``; a ``None`` input is passed through as ``None``."""
    return None if text is None else unidecode(text)

# Wrap normalize_text as a Spark UDF with a declared StringType return type.
normalize_text_udf = udf(normalize_text, StringType())

def clean_titles(df):
    """Apply ``normalize_text_udf`` to 'primaryTitle' and 'originalTitle' in place of their current values."""
    for title_column in ("primaryTitle", "originalTitle"):
        df = df.withColumn(title_column, normalize_text_udf(col(title_column)))
    return df

In [81]:
# Run both title columns through clean_titles and print a preview.
spark_train_df = clean_titles(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------------------+--------------+--------+-----+----+
|   tconst|        primaryTitle|       originalTitle|runtimeMinutes|numVotes|label|year|
+---------+--------------------+--------------------+--------------+--------+-----+----+
|tt0011439|   The Mark of Zorro|   The Mark of Zorro|            79|  2439.0| true|1920|
|tt0012532|Orphans of the Storm|                 NaN|           150|     NaN| true|1921|
|tt0013933|  The Faithful Heart|        Coeur fidele|            87|  1252.0| true|1923|
|tt0015400| The Thief of Bagdad|                 NaN|           155|  6001.0| true|1924|
|tt0015842|  The Joyless Street|                 NaN|           125|  1554.0| true|1925|
|tt0016544|    The Wizard of Oz|                 NaN|            95|  1497.0|false|1925|
|tt0016641|Ben-Hur: A Tale o...|Ben-Hur: A Tale o...|           143|  7539.0| true|1925|
|tt0017463|           3 Bad Men|           3 Bad Men|            92|  1165.0| true|1926|
|tt0018379|          