In [3]:
import pandas as pd
import os
import json
import re
import requests
from pathlib import Path

from tqdm import tqdm
from unidecode import unidecode
# import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, when, udf, regexp_replace, lower, trim, lit, coalesce

from pyspark.ml import Pipeline

In [4]:
!git clone https://github.com/meralegre/Big_Data_IMDb.git
%cd Big_Data_IMDb/
# %pwd

Cloning into 'Big_Data_IMDb'...
remote: Enumerating objects: 62, done.[K
remote: Counting objects: 100% (62/62), done.[K
remote: Compressing objects: 100% (56/56), done.[K
remote: Total 62 (delta 17), reused 36 (delta 4), pack-reused 0 (from 0)[K
Receiving objects: 100% (62/62), 897.17 KiB | 13.80 MiB/s, done.
Resolving deltas: 100% (17/17), done.
/content/Big_Data_IMDb


### Load data with Spark

In [5]:
spark = SparkSession.builder \
    .master("local") \
    .config("spark.driver.bindAddress","127.0.0.1") \
    .getOrCreate()

# spark = SparkSession.builder.appName("BERT-FineTuning").getOrCreate()

In [6]:
def load_train_data():
    path = "data/train/"
    train_files = os.listdir(path=path)

    train_df = pd.DataFrame()
    for file in train_files:
        if file.startswith('train-') and file.endswith('.csv'):
            df = pd.read_csv(f"{path}/{file}")
            train_df = pd.concat([train_df, df], ignore_index=False)
            train_df = train_df.drop(columns=["Unnamed: 0", "runtimeMinutes"])

    # train_df = train_df.sort_index()
    spark_train_df = spark.createDataFrame(train_df).replace(to_replace='\\N', value=None)
    spark_train_df = spark_train_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_train_df

In [7]:
def load_validation_data():
    path = "data/"
    validation_df = pd.read_csv(f"{path}/validation_hidden.csv", index_col=[0])
    # validation_df = validation_df.sort_index()
    validation_df = validation_df.drop(columns="runtimeMinutes")
    spark_validation_df = spark.createDataFrame(validation_df).replace(to_replace='\\N', value=None)
    spark_validation_df = spark_validation_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_validation_df

def load_test_data():
    path = "data/"
    test_df = pd.read_csv(f"{path}/test_hidden.csv", index_col=[0])
    # test_df = test_df.sort_index()
    test_df = test_df.drop(columns="runtimeMinutes")
    spark_test_df = spark.createDataFrame(test_df).replace(to_replace='\\N', value=None)
    spark_test_df = spark_test_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_test_df


In [8]:
train_data = load_train_data()
train_data.show()

+---------+--------------------+----------------+---------+-------+--------+-----+
|   tconst|         movie_title|   originalTitle|startYear|endYear|numVotes|label|
+---------+--------------------+----------------+---------+-------+--------+-----+
|tt0014109|The Saga of Gösta...|             NaN|     1924|   NULL|  1231.0| true|
|tt0015064|      The Last Laugh| Der letzte Mann|     1924|   NULL|     NaN| true|
|tt0015841|        The Freshman|    The Freshman|     1925|   NULL|  5374.0| true|
|tt0017271|          By the Law|             NaN|     NULL|   1926|  1057.0| true|
|tt0018451|The Student Princ...|             NaN|     1927|   NULL|  1459.0| true|
|tt0018742|       The Cameraman|   The Cameraman|     1928|   NULL| 11388.0| true|
|tt0019379|         Show People|             NaN|     1928|   NULL|  3695.0| true|
|tt0020018|      In Old Arizona|             NaN|     1928|   NULL|  1049.0|false|
|tt0020793|Escape from Dartmoor|             NaN|     1929|   NULL|  1102.0| true|
|tt0

In [9]:
validation_data = load_validation_data()
validation_data.show()
validation_data.count()

+---------+--------------------+-------------------+---------+-------+--------+
|   tconst|         movie_title|      originalTitle|startYear|endYear|numVotes|
+---------+--------------------+-------------------+---------+-------+--------+
|tt0003740|             Cabiria|                NaN|     1914|   NULL|  3452.0|
|tt0008663|     A Man There Was|        Terje Vigen|     1917|   NULL|  1882.0|
|tt0010307|           J'accuse!|                NaN|     1919|   NULL|  1692.0|
|tt0014429|        Safety Last!|       Safety Last!|     1923|   NULL| 19898.0|
|tt0015175|Die Nibelungen: S...|                NaN|     1924|   NULL|  5676.0|
|tt0016332|       Seven Chances|                NaN|     1925|   NULL|  9914.0|
|tt0018737|       Pandora's Box|                NaN|     NULL|   1929| 10475.0|
|tt0018839|The Docks of New ...|                NaN|     1928|   NULL|  4339.0|
|tt0019421| Steamboat Bill, Jr.|Steamboat Bill, Jr.|     1928|   NULL| 14166.0|
|tt0019901|   Woman in the Moon|        

955

In [10]:
test_data = load_test_data()
test_data.show()
test_data.count()

+---------+--------------------+-------------------+---------+-------+--------+
|   tconst|         movie_title|      originalTitle|startYear|endYear|numVotes|
+---------+--------------------+-------------------+---------+-------+--------+
|tt0014972| He Who Gets Slapped|He Who Gets Slapped|     1924|   NULL|  3654.0|
|tt0015016|      The Iron Horse|                NaN|     1924|   NULL|  2136.0|
|tt0015174|Die Nibelungen: K...|                NaN|     1924|   NULL|  4341.0|
|tt0015214|             At 3:25|                NaN|     NULL|   1925|  1724.0|
|tt0015863|             Go West|                NaN|     1925|   NULL|  4188.0|
|tt0016481|             Variety|            Varieté|     1925|   NULL|  1188.0|
|tt0017136|          Metropolis|                NaN|     1927|   NULL|168372.0|
|tt0018876|   The Farmer's Wife|                NaN|     1928|   NULL|  2741.0|
|tt0019074| Laugh, Clown, Laugh|Laugh, Clown, Laugh|     1928|   NULL|  1934.0|
|tt0021730|           The Champ|        

1086

### Cleaning

In [11]:
# def handle_years():
#     """
#     Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
#     Drops 'startYear' and 'endYear' after merging.
#     """
#     spark_train_df = spark_train_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_validation_df = spark_validation_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_test_df = spark_test_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     # drop original startYear and endYear
#     spark_train_df = spark_train_df.drop("startYear", "endYear")
#     spark_validation_df = spark_validation_df.drop("startYear", "endYear")
#     spark_test_df = spark_test_df.drop("startYear", "endYear")

#     return spark_train_df, spark_validation_df, spark_test_df

def handle_years(df):
    """
    Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
    Drops 'startYear' and 'endYear' after merging.
    """
    df = df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
                                  .otherwise(col("endYear")))

    # drop original startYear and endYear
    df = df.drop("startYear", "endYear")

    return df

In [12]:
spark_train_df = load_train_data()
spark_train_df = handle_years(spark_train_df)
spark_train_df.show()

+---------+--------------------+----------------+--------+-----+----+
|   tconst|         movie_title|   originalTitle|numVotes|label|year|
+---------+--------------------+----------------+--------+-----+----+
|tt0014109|The Saga of Gösta...|             NaN|  1231.0| true|1924|
|tt0015064|      The Last Laugh| Der letzte Mann|     NaN| true|1924|
|tt0015841|        The Freshman|    The Freshman|  5374.0| true|1925|
|tt0017271|          By the Law|             NaN|  1057.0| true|1926|
|tt0018451|The Student Princ...|             NaN|  1459.0| true|1927|
|tt0018742|       The Cameraman|   The Cameraman| 11388.0| true|1928|
|tt0019379|         Show People|             NaN|  3695.0| true|1928|
|tt0020018|      In Old Arizona|             NaN|  1049.0|false|1928|
|tt0020793|Escape from Dartmoor|             NaN|  1102.0| true|1929|
|tt0022125|              Marius|             NaN|  2251.0| true|1931|
|tt0022626|    American Madness|             NaN|  1996.0| true|1932|
|tt0023622| Trouble 

In [13]:
# converts special characters to ASCII
def normalize_text(text):
    if text is None:
        return None
    return unidecode(text)

normalize_text_udf = udf(normalize_text, StringType())

def clean_titles(df):
    df = df.withColumn("movie_title", normalize_text_udf(col("movie_title")))
    df = df.withColumn("originalTitle", normalize_text_udf(col("originalTitle")))

    # maybe we can drop the original name since there are some NaN values present
    # and we already have the clean primaryTitle column
    df = df.drop("originalTitle")
    return df

In [14]:
spark_train_df = clean_titles(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------+-----+----+
|   tconst|         movie_title|numVotes|label|year|
+---------+--------------------+--------+-----+----+
|tt0014109|The Saga of Gosta...|  1231.0| true|1924|
|tt0015064|      The Last Laugh|     NaN| true|1924|
|tt0015841|        The Freshman|  5374.0| true|1925|
|tt0017271|          By the Law|  1057.0| true|1926|
|tt0018451|The Student Princ...|  1459.0| true|1927|
|tt0018742|       The Cameraman| 11388.0| true|1928|
|tt0019379|         Show People|  3695.0| true|1928|
|tt0020018|      In Old Arizona|  1049.0|false|1928|
|tt0020793|Escape from Dartmoor|  1102.0| true|1929|
|tt0022125|              Marius|  2251.0| true|1931|
|tt0022626|    American Madness|  1996.0| true|1932|
|tt0023622| Trouble in Paradise| 14090.0| true|1932|
|tt0023876|           Cavalcade|  5038.0|false|1933|
|tt0024593|         Son of Kong|  4501.0|false|1933|
|tt0025164|    The Gay Divorcee|     NaN| true|1934|
|tt0027532|           Dodsworth|  8817.0| true

In [15]:
# number of rows of training data after concatenating everything together
spark_train_df.count()

7959

### Merge with movie_reviews data

In [21]:
def load_reviews_data():
    path = "movie_reviews"
    reviews_df = pd.read_csv(f"{path}/final_movie_reviews.csv")
    # reviews_df = reviews_df.sort_index()
    spark_reviews_df = spark.createDataFrame(reviews_df).replace(to_replace='\\N', value=None)
    return spark_reviews_df

In [22]:
spark_reviews_df = load_reviews_data()
spark_reviews_df.show()

+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|         movie_title|tomatometer_status|tomatometer_rating|         review_type|        review_score|      review_content|review_label|Release Year|Clean_Review_date|Clean_Comment Count|               genre|
+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|Percy Jackson & t...|            Rotten|              49.0|['Fresh', 'Rotten...|['3.5/5', '1/4', ...|["Whether audienc...|      Rotten|      2010.0|              NaN|                NaN|Action & Adventur...|
|         Please Give|             Fresh|              87.0|['Fresh', 'Fresh'...|['3/4', '3/5', '4...|['Holofcener alwa...|       Fresh|      2010.0|              N

In [23]:
spark_reviews_df.count()

19095

In [24]:
def convert_year_column_to_int(df, column_name):
    """
    converts a float-based year column to an integer
    """
    df = df.withColumn(column_name, col(column_name).cast(IntegerType()))
    return df

spark_reviews_df = convert_year_column_to_int(spark_reviews_df, "Release Year")

In [25]:
spark_reviews_df.show()

+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|         movie_title|tomatometer_status|tomatometer_rating|         review_type|        review_score|      review_content|review_label|Release Year|Clean_Review_date|Clean_Comment Count|               genre|
+--------------------+------------------+------------------+--------------------+--------------------+--------------------+------------+------------+-----------------+-------------------+--------------------+
|Percy Jackson & t...|            Rotten|              49.0|['Fresh', 'Rotten...|['3.5/5', '1/4', ...|["Whether audienc...|      Rotten|        2010|              NaN|                NaN|Action & Adventur...|
|         Please Give|             Fresh|              87.0|['Fresh', 'Fresh'...|['3/4', '3/5', '4...|['Holofcener alwa...|       Fresh|        2010|              N

### Time to merge

In [26]:
def join_training_with_reviews(df_movies, df_reviews):
    """
    joins training data with reviews based on `movie_title` and `year`.

    Parameters:
    - df_movies: PySpark DataFrame containing training movie metadata.
    - df_reviews: PySpark DataFrame containing reviews.

    Returns:
    - Merged PySpark DataFrame with movie metadata + reviews.
    """

    # Standardize `movie_title` (trim + lowercase) for better matching
    df_movies = df_movies.withColumn("movie_title", trim(lower(col("movie_title"))))
    df_reviews = df_reviews.withColumn("movie_title", trim(lower(col("movie_title"))))

    # Rename 'Release Year' in reviews to 'year' for consistent joins
    df_reviews = df_reviews.withColumnRenamed("Release Year", "year")

    # Ensure `year` is cast as an integer
    df_movies = df_movies.withColumn("year", col("year").cast("int"))
    df_reviews = df_reviews.withColumn("year", col("year").cast("int"))

    # Perform a LEFT JOIN on `movie_title` and `year`
    df_merged = df_movies.join(df_reviews, on=["movie_title", "year"], how="left")

    # Select relevant columns
    selected_columns = [
        "tconst", "movie_title", "numVotes", "year", "label",
        "tomatometer_status", "review_type", "genre",
        "review_label", "review_content"
    ]

    df_final = df_merged.select(*selected_columns)

    # Fill missing reviews with "No Reviews" and "Unknown"
    df_final = df_final.fillna({"review_content": "No Reviews", "review_label": "Unknown"})

    return df_final

In [31]:
join_training_with_reviews(spark_train_df, spark_reviews_df).show()
final_df = join_training_with_reviews(spark_train_df, spark_reviews_df)

+---------+--------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|   tconst|         movie_title|numVotes|year|label|tomatometer_status|         review_type|               genre|review_label|      review_content|
+---------+--------------------+--------+----+-----+------------------+--------------------+--------------------+------------+--------------------+
|tt0022626|    american madness|  1996.0|1932| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0017271|          by the law|  1057.0|1926| true|              NULL|                NULL|                NULL|     Unknown|          No Reviews|
|tt0023876|           cavalcade|  5038.0|1933|false|            Rotten|['Fresh', 'Fresh'...|Classics, Drama, ...|      Rotten|['If nothing else...|
|tt0028773|            dead end|     NaN|1937| true|             Fresh|['Fresh', 'Fresh'...|Classics, Drama, ...

In [32]:
spark_reviews_df = spark_reviews_df.withColumnRenamed("Release Year", "year")
df_missing_movies = spark_train_df.join(spark_reviews_df, on=["movie_title", "year"], how="left_anti")
df_missing_movies.count()

5892

In [33]:
def clean_labels(df):
    """
    Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
    Drops 'startYear' and 'endYear' after merging.
    """
    # convert boolean column `label` to integer
    df = df.withColumn("label_int", when(col("label") == True, 1).otherwise(0))

    # convert categorical `review_label` to numerical
    df = df.withColumn("review_label_int", when(col("review_label") == "Fresh", 1)
                                          .when(col("review_label") == "Rotten", 0)
                                          .otherwise(None))
    # combine both labels:
    # both exist: average
    # only one exists: take that one
    df = df.withColumn("combined_label",
                       coalesce((col("label_int") + col("review_label_int")) / 2, col("label_int"), col("review_label_int")))

    # Drop unnecessary intermediate columns
    df = df.drop("label", "label_int", "review_label", "review_label_int")

    df.withColumnRenamed("label", "combined_label")

    return df

In [34]:
clean_labels(final_df).show()

+----------+--------------------+--------+----+------------------+--------------------+--------------------+--------------------+--------------+
|    tconst|         movie_title|numVotes|year|tomatometer_status|         review_type|               genre|      review_content|combined_label|
+----------+--------------------+--------+----+------------------+--------------------+--------------------+--------------------+--------------+
|tt10534500|               #home| 10591.0|2021|              NULL|                NULL|                NULL|          No Reviews|           1.0|
| tt0090556|      'night, mother|  2244.0|1986|              NULL|                NULL|                NULL|          No Reviews|           1.0|
| tt0349047|(t)raumschiff sur...| 15303.0|2004|              NULL|                NULL|                NULL|          No Reviews|           0.0|
| tt2395385|                  +1|  8065.0|2013|             Fresh|['Fresh', 'Fresh'...|Horror, Mystery &...|["Love it or hate...| 

In [35]:
spark_reviews_df = spark_reviews_df.withColumnRenamed("Release Year", "year")
df_missing_movies = spark_train_df.join(spark_reviews_df, on=["movie_title", "year"], how="left_anti")
df_missing_movies.count()

5892

In [36]:
tconst_list = [row["tconst"] for row in df_missing_movies.select("tconst").collect()]
tconst_list

['tt11691722',
 'tt2645104',
 'tt7838536',
 'tt9562694',
 'tt0937347',
 'tt0092501',
 'tt0421051',
 'tt2277834',
 'tt2585562',
 'tt3860294',
 'tt5458088',
 'tt9100028',
 'tt0047444',
 'tt0108350',
 'tt2615584',
 'tt6692354',
 'tt0064482',
 'tt0084630',
 'tt0103791',
 'tt0127626',
 'tt0841150',
 'tt2370228',
 'tt2725076',
 'tt3398048',
 'tt4849176',
 'tt5535942',
 'tt8819596',
 'tt11374902',
 'tt2449638',
 'tt3509426',
 'tt0074252',
 'tt0116308',
 'tt0078243',
 'tt0091658',
 'tt0093415',
 'tt0476550',
 'tt10695464',
 'tt1678040',
 'tt1773109',
 'tt5157456',
 'tt6772946',
 'tt0045911',
 'tt0068883',
 'tt0075811',
 'tt0100050',
 'tt0334416',
 'tt7917178',
 'tt0055499',
 'tt0075730',
 'tt0093207',
 'tt1276962',
 'tt0017961',
 'tt0068282',
 'tt0085328',
 'tt0112492',
 'tt0387462',
 'tt12492650',
 'tt1656170',
 'tt8000908',
 'tt0790662',
 'tt0902967',
 'tt10569922',
 'tt1684548',
 'tt0067814',
 'tt0330536',
 'tt10734864',
 'tt2900822',
 'tt0066728',
 'tt0450426',
 'tt0486544',
 'tt1601792',


## IBDM Review Scraping

In [37]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [38]:
my_url = "https://www.imdb.com/title/tt0073781/reviews/"

In [39]:
import requests
from bs4 import BeautifulSoup

def scrape_reviews(imdb_id):
    """
    Fetch the reviews page for a given IMDb ID (e.g., 'tt0111161')
    and parse out each review's rating and text from
    the new "review-card-parent" structure.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/reviews"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    review_cards = soup.find_all("div", {"data-testid": "review-card-parent"})

    reviews_data = []
    for card in review_cards:
        # extract the review text
        content_div = card.select_one("div.ipc-html-content-inner-div")
        review_text = content_div.get_text(strip=True) if content_div else ""

        reviews_data.append({
            "text": review_text,
        })

    return reviews_data


imdb_id = "tt0111161"
reviews = scrape_reviews(imdb_id)

for i, rev in enumerate(reviews, start=1):
    print(f"--- Review #{i} ---")
    print("Text:  ", rev["text"])
    print()

--- Review #1 ---
Text:   

--- Review #2 ---
Text:   It is no wonder that the film has such a high rating, it is quite literally breathtaking. What can I say that hasn't said before? Not much, it's the story, the acting, the premise, but most of all, this movie is about how it makes you feel. Sometimes you watch a film, and can't remember it days later, this film loves with you, once you've seen it, you don't forget.The ultimate story of friendship, of hope, and of life, and overcoming adversity.I understand why so many class this as the best film of all time, it isn't mine, but I get it. If you haven't seen it, or haven't seen it for some time, you need to watch it, it's amazing. 10/10.

--- Review #3 ---
Text:   I'm trying to save you money; this is the last film title that you should consider borrowing. Renting Shawshank will cost you five bucks... just plunk down the $25 and own the title. You'll wind up going back to this gem time and time again. This is one of few movies that ar

In [42]:
def scrape_first_five_reviews_with_text(imdb_id):
    """
    Fetches the IMDb reviews page for the given imdb_id
    and returns ONLY the first 5 reviews that actually contain text.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/reviews"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    review_cards = soup.find_all("div", {"data-testid": "review-card-parent"})

    reviews_data = []

    for card in review_cards:
        # extract review text
        content_div = card.select_one("div.ipc-html-content-inner-div")
        review_text = content_div.get_text(strip=True) if content_div else ""

        if not review_text:
            continue

        reviews_data.append(review_text)

        if len(reviews_data) == 5:
            break

    return reviews_data

In [43]:
def scrape_reviews_for_imdb_ids(imdb_ids):
    """
    Takes a list of IMDb IDs ,
    scrapes the first 5 text reviews for each, then returns a Spark DataFrame with:
        - imdb_id
        - reviews (all reviews joined into one string separated by commas)
    """
    rows = []

    for imdb_id in imdb_ids:
        reviews_list = scrape_first_five_reviews_with_text(imdb_id)

        reviews_joined = ", ".join(reviews_list)

        rows.append((imdb_id, reviews_joined))

    columns = ["tconst", "reviews"]
    df = spark.createDataFrame(rows, columns)
    return df

In [46]:
df_reviews = scrape_reviews_for_imdb_ids(tconst_list)

df_reviews.show()

KeyboardInterrupt: 