In [None]:
import pandas as pd
import os
import json
import re
import requests
from pathlib import Path

from tqdm import tqdm
from unidecode import unidecode
# import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, when, udf, regexp_replace, lower, trim, lit, coalesce

from pyspark.ml import Pipeline

In [None]:
# !git clone https://github.com/meralegre/Big_Data_IMDb.git
# %cd Big_Data_IMDb/
# %pwd

### Load data with Spark

In [None]:
spark = SparkSession.builder \
    .master("local") \
    .config("spark.driver.bindAddress","127.0.0.1") \
    .getOrCreate()

# spark = SparkSession.builder.appName("BERT-FineTuning").getOrCreate()

In [None]:
def load_train_data():
    path = "data/train/"
    train_files = os.listdir(path=path)

    train_df = pd.DataFrame()
    for file in train_files:
        if file.startswith('train-') and file.endswith('.csv'):
            df = pd.read_csv(f"{path}/{file}")
            train_df = pd.concat([train_df, df], ignore_index=False)
            train_df = train_df.drop(columns=["Unnamed: 0", "runtimeMinutes"])

    # train_df = train_df.sort_index()
    spark_train_df = spark.createDataFrame(train_df).replace(to_replace='\\N', value=None)
    spark_train_df = spark_train_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_train_df

In [None]:
def load_validation_data():
    path = "data/"
    validation_df = pd.read_csv(f"{path}/validation_hidden.csv", index_col=[0])
    # validation_df = validation_df.sort_index()
    validation_df = validation_df.drop(columns="runtimeMinutes")
    spark_validation_df = spark.createDataFrame(validation_df).replace(to_replace='\\N', value=None)
    spark_validation_df = spark_validation_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_validation_df

def load_test_data():
    path = "data/"
    test_df = pd.read_csv(f"{path}/test_hidden.csv", index_col=[0])
    # test_df = test_df.sort_index()
    test_df = test_df.drop(columns="runtimeMinutes")
    spark_test_df = spark.createDataFrame(test_df).replace(to_replace='\\N', value=None)
    spark_test_df = spark_test_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_test_df


In [None]:
train_data = load_train_data()
train_data.show()

In [None]:
validation_data = load_validation_data()
validation_data.show()
validation_data.count()

In [None]:
test_data = load_test_data()
test_data.show()
test_data.count()

### Cleaning

In [None]:
# def handle_years():
#     """
#     Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
#     Drops 'startYear' and 'endYear' after merging.
#     """
#     spark_train_df = spark_train_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_validation_df = spark_validation_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_test_df = spark_test_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     # drop original startYear and endYear
#     spark_train_df = spark_train_df.drop("startYear", "endYear")
#     spark_validation_df = spark_validation_df.drop("startYear", "endYear")
#     spark_test_df = spark_test_df.drop("startYear", "endYear")

#     return spark_train_df, spark_validation_df, spark_test_df

def handle_years(df):
    """
    Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
    Drops 'startYear' and 'endYear' after merging.
    """
    df = df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
                                  .otherwise(col("endYear")))

    # drop original startYear and endYear
    df = df.drop("startYear", "endYear")

    return df

In [None]:
spark_train_df = load_train_data()
spark_train_df = handle_years(spark_train_df)
spark_train_df.show()

In [None]:
# converts special characters to ASCII
def normalize_text(text):
    if text is None:
        return None
    return unidecode(text)

normalize_text_udf = udf(normalize_text, StringType())

def clean_titles(df):
    df = df.withColumn("movie_title", normalize_text_udf(col("movie_title")))
    df = df.withColumn("originalTitle", normalize_text_udf(col("originalTitle")))

    # maybe we can drop the original name since there are some NaN values present
    # and we already have the clean primaryTitle column
    df = df.drop("originalTitle")
    return df

In [None]:
spark_train_df = clean_titles(spark_train_df)
spark_train_df.show()

In [None]:
# number of rows of training data after concatenating everything together
spark_train_df.count()

### Merge with movie_reviews data

In [None]:
def load_reviews_data():
    path = "movie_reviews"
    reviews_df = pd.read_csv(f"{path}/final_movie_reviews.csv")
    # reviews_df = reviews_df.sort_index()
    spark_reviews_df = spark.createDataFrame(reviews_df).replace(to_replace='\\N', value=None)
    return spark_reviews_df

In [112]:
spark_reviews_df = load_reviews_data()
spark_reviews_df.show()

+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|         movie_title|tomatometer_status|tomatometer_rating|         review_type|        review_score|      review_content|review_label|Release Year|Clean_Review_date|Clean_Comment Count|               genre|
+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|Percy Jackson & t...|            Rotten|              49.0|['Fresh', 'Rotten...|['3.5/5', '1/4', ...|["Whether audienc...|      Rotten|      2010.0|              NaN|                NaN|Action & Adventur...|
|         Please Give|             Fresh|              87.0|['Fresh', 'Fresh'...|['3/4', '3/5', '4...|['Holofcener alwa...|       Fresh|      2010.0|              N

In [113]:
spark_reviews_df.count()

19095

In [114]:
def convert_year_column_to_int(df, column_name):
    """
    converts a float-based year column to an integer
    """
    df = df.withColumn(column_name, col(column_name).cast(IntegerType()))
    return df

spark_reviews_df = convert_year_column_to_int(spark_reviews_df, "Release Year")

In [115]:
spark_reviews_df.show()

+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|         movie_title|tomatometer_status|tomatometer_rating|         review_type|        review_score|      review_content|review_label|Release Year|Clean_Review_date|Clean_Comment Count|               genre|
+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|Percy Jackson & t...|            Rotten|              49.0|['Fresh', 'Rotten...|['3.5/5', '1/4', ...|["Whether audienc...|      Rotten|        2010|              NaN|                NaN|Action & Adventur...|
|         Please Give|             Fresh|              87.0|['Fresh', 'Fresh'...|['3/4', '3/5', '4...|['Holofcener alwa...|       Fresh|        2010|              N

### Time to merge

In [116]:
def join_training_with_reviews(df_movies, df_reviews):
    """
    joins training data with reviews based on `movie_title` and `year`.

    Parameters:
    - df_movies: PySpark DataFrame containing training movie metadata.
    - df_reviews: PySpark DataFrame containing reviews.

    Returns:
    - Merged PySpark DataFrame with movie metadata + reviews.
    """

    # Standardize `movie_title` (trim + lowercase) for better matching
    df_movies = df_movies.withColumn("movie_title", trim(lower(col("movie_title"))))
    df_reviews = df_reviews.withColumn("movie_title", trim(lower(col("movie_title"))))

    # Rename 'Release Year' in reviews to 'year' for consistent joins
    df_reviews = df_reviews.withColumnRenamed("Release Year", "year")

    # Ensure `year` is cast as an integer
    df_movies = df_movies.withColumn("year", col("year").cast("int"))
    df_reviews = df_reviews.withColumn("year", col("year").cast("int"))

    # Perform a LEFT JOIN on `movie_title` and `year`
    df_merged = df_movies.join(df_reviews, on=["movie_title", "year"], how="left")

    # Select relevant columns
    selected_columns = [
        "tconst", "movie_title", "numVotes", "year", "label",
        "tomatometer_status", "review_type", "genre",
        "review_label", "review_content"
    ]

    df_final = df_merged.select(*selected_columns)

    # Fill missing reviews with "No Reviews" and "Unknown"
    df_final = df_final.fillna({"review_content": "No Reviews", "review_label": "Unknown"})

    return df_final

In [117]:
join_training_with_reviews(spark_train_df, spark_reviews_df).show()
final_df = join_training_with_reviews(spark_train_df, spark_reviews_df)

+---------+--------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|   tconst|         movie_title|numVotes|year|label|tomatometer_status|         review_type|               genre|review_label|      review_content|
+---------+--------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|tt0022626|    american madness|  1996.0|1932| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0017271|          by the law|  1057.0|1926| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0023876|           cavalcade|  5038.0|1933|false|            Rotten|['Fresh', 'Fresh'...|Classics, Drama, ...|      Rotten|['If nothing else...|
|tt0028773|            dead end|     NaN|1937| true|             Fresh|['Fresh', 'Fresh'...|Classics, Drama, ...

In [118]:
spark_reviews_df = spark_reviews_df.withColumnRenamed("Release Year", "year")
df_missing_movies = spark_train_df.join(spark_reviews_df, on=["movie_title", "year"], how="left_anti")
df_missing_movies.count()

5892

In [119]:
def clean_labels(df):
    """
    Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
    Drops 'startYear' and 'endYear' after merging.
    """
    # convert boolean column `label` to integer
    df = df.withColumn("label_int", when(col("label") == True, 1).otherwise(0))

    # convert categorical `review_label` to numerical
    df = df.withColumn("review_label_int", when(col("review_label") == "Fresh", 1)
                                          .when(col("review_label") == "Rotten", 0)
                                          .otherwise(None))
    # combine both labels:
    # both exist: average
    # only one exists: take that one
    df = df.withColumn("combined_label",
                       coalesce((col("label_int") + col("review_label_int")) / 2, col("label_int"), col("review_label_int")))

    # Drop unnecessary intermediate columns
    df = df.drop("label", "label_int", "review_label", "review_label_int")

    df.withColumnRenamed("label", "combined_label")

    return df

In [120]:
clean_labels(final_df).show()

+----------+--------------------+--------+----+------------------+--------------------+--------------------+--------------------+--------------+
|    tconst|         movie_title|numVotes|year|tomatometer_status|         review_type|               genre|      review_content|combined_label|
+----------+--------------------+--------+----+------------------+--------------------+--------------------+--------------------+--------------+
|tt10534500|               #home| 10591.0|2021|              NULL|                NULL|                NULL|          No Reviews|           1.0|
| tt0090556|      'night, mother|  2244.0|1986|              NULL|                NULL|                NULL|          No Reviews|           1.0|
| tt0349047|(t)raumschiff sur...| 15303.0|2004|              NULL|                NULL|                NULL|          No Reviews|           0.0|
| tt2395385|                  +1|  8065.0|2013|             Fresh|['Fresh', 'Fresh'...|Horror, Mystery &...|["Love it or hate...| 

In [121]:
spark_reviews_df = spark_reviews_df.withColumnRenamed("Release Year", "year")
df_missing_movies = spark_train_df.join(spark_reviews_df, on=["movie_title", "year"], how="left_anti")
df_missing_movies.count()

5892

In [122]:
tconst_list = [row["tconst"] for row in df_missing_movies.select("tconst").collect()]
tconst_list

['tt11691722',
 'tt2645104',
 'tt7838536',
 'tt9562694',
 'tt0937347',
 'tt0092501',
 'tt0421051',
 'tt2277834',
 'tt2585562',
 'tt3860294',
 'tt5458088',
 'tt9100028',
 'tt0047444',
 'tt0108350',
 'tt2615584',
 'tt6692354',
 'tt0064482',
 'tt0084630',
 'tt0103791',
 'tt0127626',
 'tt0841150',
 'tt2370228',
 'tt2725076',
 'tt3398048',
 'tt4849176',
 'tt5535942',
 'tt8819596',
 'tt11374902',
 'tt2449638',
 'tt3509426',
 'tt0074252',
 'tt0116308',
 'tt0078243',
 'tt0091658',
 'tt0093415',
 'tt0476550',
 'tt10695464',
 'tt1678040',
 'tt1773109',
 'tt5157456',
 'tt6772946',
 'tt0045911',
 'tt0068883',
 'tt0075811',
 'tt0100050',
 'tt0334416',
 'tt7917178',
 'tt0055499',
 'tt0075730',
 'tt0093207',
 'tt1276962',
 'tt0017961',
 'tt0068282',
 'tt0085328',
 'tt0112492',
 'tt0387462',
 'tt12492650',
 'tt1656170',
 'tt8000908',
 'tt0790662',
 'tt0902967',
 'tt10569922',
 'tt1684548',
 'tt0067814',
 'tt0330536',
 'tt10734864',
 'tt2900822',
 'tt0066728',
 'tt0450426',
 'tt0486544',
 'tt1601792',


## IBDM Review Scraping

In [123]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [124]:
my_url = "https://www.imdb.com/title/tt0073781/reviews/"

In [130]:
def scrape_first_five_reviews_with_text(imdb_id):
    """
    Fetches the IMDb reviews page for the given imdb_id
    and returns ONLY the first 5 reviews that actually contain text.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/reviews"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    review_cards = soup.find_all("div", {"data-testid": "review-card-parent"})

    reviews_data = []

    for card in review_cards:
        # extract review text
        content_div = card.select_one("div.ipc-html-content-inner-div")
        review_text = content_div.get_text(strip=True) if content_div else ""

        if not review_text:
            continue

        reviews_data.append(review_text)

        if len(reviews_data) == 5:
            break

    return reviews_data

In [131]:
import time

def scrape_reviews_for_imdb_ids(imdb_ids):
    """
    Takes a list of IMDb IDs ,
    scrapes the first 5 text reviews for each, then returns a Spark DataFrame with:
        - imdb_id
        - reviews (all reviews joined into one string separated by commas)
    """
    rows = []

    for imdb_id in imdb_ids:
        reviews_list = scrape_first_five_reviews_with_text(imdb_id)

        reviews_joined = ", ".join(reviews_list)

        rows.append((imdb_id, reviews_joined))

        time.sleep(1)

    columns = ["tconst", "reviews"]
    df = spark.createDataFrame(rows, columns)
    return df

In [132]:
df_reviews = scrape_reviews_for_imdb_ids(tconst_list)

df_reviews.show()

+----------+--------------------+
|    tconst|             reviews|
+----------+--------------------+
|tt11691722|A great person an...|
| tt2645104|Even the name "Ro...|
| tt7838536|Now I see many re...|
| tt9562694|Where do you star...|
| tt0937347|It seems strange ...|
| tt0092501|Sequels are a cap...|
| tt0421051|A week ago i watc...|
| tt2277834|OK, it was made f...|
| tt2585562|While most of the...|
| tt3860294|                    |
| tt5458088|Kammattipadam, a ...|
| tt9100028|"Little Switzerla...|
| tt0047444|This movie doesn'...|
| tt0108350|I don't know much...|
| tt2615584|Was pleasantly su...|
| tt6692354|Not your conventi...|
| tt0064482|Worth the time to...|
| tt0084630|Satte Pe Satta (1...|
| tt0103791|Comical (farce at...|
| tt0127626|A great story abo...|
+----------+--------------------+
only showing top 20 rows

