In [112]:
import pandas as pd
import os
import json
import re
import requests
from pathlib import Path

from tqdm import tqdm
from unidecode import unidecode
# import pyspark
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, IntegerType, FloatType
from pyspark.sql.functions import col, when, udf, regexp_replace, lower, trim, lit, coalesce, array, concat_ws, concat, split

from pyspark.ml import Pipeline

In [3]:
# !git clone https://github.com/meralegre/Big_Data_IMDb.git
# %cd Big_Data_IMDb/

# # add the csv files here otherwise the code wont work, cannot push it to GitHub
# # sadly, too large
# !mkdir movie_reviews
# %pwd

Cloning into 'Big_Data_IMDb'...
remote: Enumerating objects: 77, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (70/70), done.[K
remote: Total 77 (delta 26), reused 41 (delta 5), pack-reused 0 (from 0)[K
Receiving objects: 100% (77/77), 1.04 MiB | 10.32 MiB/s, done.
Resolving deltas: 100% (26/26), done.
/content/Big_Data_IMDb


'/content/Big_Data_IMDb'

### Load data with Spark

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .config("spark.driver.bindAddress","127.0.0.1") \
    .getOrCreate()

In [63]:
def load_train_data():
    path = "data/train/"
    train_files = os.listdir(path=path)

    train_df = pd.DataFrame()
    for file in train_files:
        if file.startswith('train-') and file.endswith('.csv'):
            df = pd.read_csv(f"{path}/{file}")
            train_df = pd.concat([train_df, df], ignore_index=False)
            train_df = train_df.drop(columns=["Unnamed: 0", "runtimeMinutes"])

    # train_df = train_df.sort_index()
    spark_train_df = spark.createDataFrame(train_df).replace(to_replace='\\N', value=None)
    spark_train_df = spark_train_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_train_df

In [64]:
def load_validation_data():
    path = "data/"
    validation_df = pd.read_csv(f"{path}/validation_hidden.csv", index_col=[0])
    # validation_df = validation_df.sort_index()
    validation_df = validation_df.drop(columns="runtimeMinutes")
    spark_validation_df = spark.createDataFrame(validation_df).replace(to_replace='\\N', value=None)
    spark_validation_df = spark_validation_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_validation_df

def load_test_data():
    path = "data/"
    test_df = pd.read_csv(f"{path}/test_hidden.csv", index_col=[0])
    # test_df = test_df.sort_index()
    test_df = test_df.drop(columns="runtimeMinutes")
    spark_test_df = spark.createDataFrame(test_df).replace(to_replace='\\N', value=None)
    spark_test_df = spark_test_df.withColumnRenamed("primaryTitle", "movie_title")
    return spark_test_df


In [65]:
train_data = load_train_data()
train_data.show()

+---------+--------------------+--------------------+---------+-------+--------+-----+
|   tconst|         movie_title|       originalTitle|startYear|endYear|numVotes|label|
+---------+--------------------+--------------------+---------+-------+--------+-----+
|tt0009369|              Mickey|              Mickey|     1918|   NULL|  1119.0|false|
|tt0014142|The Hunchback of ...|                 NaN|     NULL|   1923|  5288.0| true|
|tt0014945|            Girl Shy|            Girl Shy|     1924|   NULL|  3327.0| true|
|tt0017048|   A Page of Madness|                 NaN|     1926|   NULL|  3357.0| true|
|tt0017350|  The Scarlet Letter|                 NaN|     1926|   NULL|  1768.0| true|
|tt0017961|           Happiness|                 NaN|     1935|   NULL|  1080.0| true|
|tt0018054|   Thé King ớf Kings|   The King of Kings|     1927|   NULL|  2081.0| true|
|tt0018578|               Wings|                 NaN|     1927|   NULL|     NaN| true|
|tt0019429|        Street Angel|        Str

In [66]:
validation_data = load_validation_data()
validation_data.show()
validation_data.count()

+---------+--------------------+-------------------+---------+-------+--------+
|   tconst|         movie_title|      originalTitle|startYear|endYear|numVotes|
+---------+--------------------+-------------------+---------+-------+--------+
|tt0003740|             Cabiria|                NaN|     1914|   NULL|  3452.0|
|tt0008663|     A Man There Was|        Terje Vigen|     1917|   NULL|  1882.0|
|tt0010307|           J'accuse!|                NaN|     1919|   NULL|  1692.0|
|tt0014429|        Safety Last!|       Safety Last!|     1923|   NULL| 19898.0|
|tt0015175|Die Nibelungen: S...|                NaN|     1924|   NULL|  5676.0|
|tt0016332|       Seven Chances|                NaN|     1925|   NULL|  9914.0|
|tt0018737|       Pandora's Box|                NaN|     NULL|   1929| 10475.0|
|tt0018839|The Docks of New ...|                NaN|     1928|   NULL|  4339.0|
|tt0019421| Steamboat Bill, Jr.|Steamboat Bill, Jr.|     1928|   NULL| 14166.0|
|tt0019901|   Woman in the Moon|        

955

In [67]:
test_data = load_test_data()
test_data.show()
test_data.count()

+---------+--------------------+-------------------+---------+-------+--------+
|   tconst|         movie_title|      originalTitle|startYear|endYear|numVotes|
+---------+--------------------+-------------------+---------+-------+--------+
|tt0014972| He Who Gets Slapped|He Who Gets Slapped|     1924|   NULL|  3654.0|
|tt0015016|      The Iron Horse|                NaN|     1924|   NULL|  2136.0|
|tt0015174|Die Nibelungen: K...|                NaN|     1924|   NULL|  4341.0|
|tt0015214|             At 3:25|                NaN|     NULL|   1925|  1724.0|
|tt0015863|             Go West|                NaN|     1925|   NULL|  4188.0|
|tt0016481|             Variety|            Varieté|     1925|   NULL|  1188.0|
|tt0017136|          Metropolis|                NaN|     1927|   NULL|168372.0|
|tt0018876|   The Farmer's Wife|                NaN|     1928|   NULL|  2741.0|
|tt0019074| Laugh, Clown, Laugh|Laugh, Clown, Laugh|     1928|   NULL|  1934.0|
|tt0021730|           The Champ|        

1086

### Cleaning

In [68]:
# def handle_years():
#     """
#     Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
#     Drops 'startYear' and 'endYear' after merging.
#     """
#     spark_train_df = spark_train_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_validation_df = spark_validation_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     spark_test_df = spark_test_df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
#                                   .otherwise(col("endYear")))

#     # drop original startYear and endYear
#     spark_train_df = spark_train_df.drop("startYear", "endYear")
#     spark_validation_df = spark_validation_df.drop("startYear", "endYear")
#     spark_test_df = spark_test_df.drop("startYear", "endYear")

#     return spark_train_df, spark_validation_df, spark_test_df

def handle_years(df):
    """
    Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
    Drops 'startYear' and 'endYear' after merging.
    """
    df = df.withColumn("year", when(col("startYear").isNotNull(), col("startYear"))
                                  .otherwise(col("endYear")))

    # drop original startYear and endYear
    df = df.drop("startYear", "endYear")

    return df

In [69]:
spark_train_df = load_train_data()
spark_train_df = handle_years(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------------------+--------+-----+----+
|   tconst|         movie_title|       originalTitle|numVotes|label|year|
+---------+--------------------+--------------------+--------+-----+----+
|tt0009369|              Mickey|              Mickey|  1119.0|false|1918|
|tt0014142|The Hunchback of ...|                 NaN|  5288.0| true|1923|
|tt0014945|            Girl Shy|            Girl Shy|  3327.0| true|1924|
|tt0017048|   A Page of Madness|                 NaN|  3357.0| true|1926|
|tt0017350|  The Scarlet Letter|                 NaN|  1768.0| true|1926|
|tt0017961|           Happiness|                 NaN|  1080.0| true|1935|
|tt0018054|   Thé King ớf Kings|   The King of Kings|  2081.0| true|1927|
|tt0018578|               Wings|                 NaN|     NaN| true|1927|
|tt0019429|        Street Angel|        Street Angel|  2314.0| true|1928|
|tt0020768|           City Girl|                 NaN|  3199.0| true|1930|
|tt0022599|   À Nous la Liberté|   À n

In [70]:
# converts special characters to ASCII
def normalize_text(text):
    if text is None:
        return None
    return unidecode(text)

normalize_text_udf = udf(normalize_text, StringType())

def clean_titles(df):
    df = df.withColumn("movie_title", normalize_text_udf(col("movie_title")))
    df = df.withColumn("originalTitle", normalize_text_udf(col("originalTitle")))

    # maybe we can drop the original name since there are some NaN values present
    # and we already have the clean primaryTitle column
    df = df.drop("originalTitle")
    return df

In [71]:
spark_train_df = clean_titles(spark_train_df)
spark_train_df.show()

+---------+--------------------+--------+-----+----+
|   tconst|         movie_title|numVotes|label|year|
+---------+--------------------+--------+-----+----+
|tt0009369|              Mickey|  1119.0|false|1918|
|tt0014142|The Hunchback of ...|  5288.0| true|1923|
|tt0014945|            Girl Shy|  3327.0| true|1924|
|tt0017048|   A Page of Madness|  3357.0| true|1926|
|tt0017350|  The Scarlet Letter|  1768.0| true|1926|
|tt0017961|           Happiness|  1080.0| true|1935|
|tt0018054|   The King of Kings|  2081.0| true|1927|
|tt0018578|               Wings|     NaN| true|1927|
|tt0019429|        Street Angel|  2314.0| true|1928|
|tt0020768|           City Girl|  3199.0| true|1930|
|tt0022599|   A Nous la Liberte|  4392.0| true|1931|
|tt0027075|A Tale of Two Cities|  5596.0| true|1935|
|tt0027441|Charlie Chan at t...|  1461.0| true|1936|
|tt0028575|               Angel|  2698.0| true|1937|
|tt0031002|  The Young in Heart|  1332.0| true|1938|
|tt0031022|The Adventures of...|  6616.0| true

In [72]:
# number of rows of training data after concatenating everything together
spark_train_df.count()

7959

### Merge with movie_reviews data

In [50]:
def load_reviews_data():
    path = "movie_reviews"
    reviews_df = pd.read_csv(f"{path}/final_movie_reviews.csv")
    # reviews_df = reviews_df.sort_index()
    spark_reviews_df = spark.createDataFrame(reviews_df).replace(to_replace='\\N', value=None)
    return spark_reviews_df

In [86]:
spark_reviews_df = spark.read \
    .option("header", True) \
    .option("mode", "DROPMALFORMED") \
    .option("inferSchema", True) \
    .csv("/content/Big_Data_IMDb/movie_reviews/final_movie_reviews.csv")


# Show DataFrame
spark_reviews_df.show(truncate=False)

+--------------------------------------------------+------+------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|movie_title                                       |year  |genre                                                       |reviews                                                                                                                                                                                                                                    

In [88]:
spark_reviews_df.count()

825410

In [89]:
def convert_columns_to_int(df, column_name):
    """
    converts a float-based year column to an integer
    """
    df = df.withColumn(column_name, col(column_name).cast(IntegerType()))
    return df

spark_reviews_df = convert_columns_to_int(spark_reviews_df, "year")
spark_reviews_df = convert_columns_to_int(spark_reviews_df, "tomatometer_rating")
spark_reviews_df = convert_columns_to_int(spark_reviews_df, "audience_rating")

DataFrame[movie_title: string, year: int, genre: string, review_content: string, content_rating: string, production_company: string, tomatometer_status: string, tomatometer_rating: int, audience_status: string, audience_rating: int, review_score: string, review_label: string, Like count: string]

In [94]:
spark_reviews_df = spark_reviews_df.withColumnRenamed("Like count", "like_count")
spark_reviews_df = spark_reviews_df.withColumnRenamed("reviews", "review_content")
spark_reviews_df.show()

+--------------------+----+--------------------+--------------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|         movie_title|year|               genre|      review_content|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|review_label|like_count|
+--------------------+----+--------------------+--------------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|Percy Jackson & t...|1970|Action & Adventur...|Whether audiences...|            PG|  20th Century Fox|            Rotten|                49|        Spilled|             53|       3.5/5|       Fresh|      NULL|
|Percy Jackson & t...|1970|Action & Adventur...|Harry Potter knoc...|            PG|  20th Century Fox|            Rotten|                49|        Spilled

### Time to merge

For now we will use a LEFT join on the training data to avoid having null values on the tconst identifier column. Maybe later we can scrape the tconst of the movies missing and have bigger data

In [95]:
def join_training_with_reviews(df_movies, df_reviews):
    """
    joins training data with reviews based on `movie_title` and `year`.

    Parameters:
    - df_movies: PySpark DataFrame containing training movie metadata.
    - df_reviews: PySpark DataFrame containing reviews.

    Returns:
    - Merged PySpark DataFrame with movie metadata + reviews.
    """

    # Standardize `movie_title` (trim + lowercase) for better matching
    df_movies = df_movies.withColumn("movie_title", trim(lower(col("movie_title"))))
    df_reviews = df_reviews.withColumn("movie_title", trim(lower(col("movie_title"))))

    # Ensure `year` is cast as an integer
    df_movies = df_movies.withColumn("year", col("year").cast("int"))
    df_reviews = df_reviews.withColumn("year", col("year").cast("int"))

    # Perform a LEFT JOIN on `movie_title` and `year`
    df_merged = df_movies.join(df_reviews, on=["movie_title", "year"], how="left")

    # Select relevant columns
    # selected_columns = [
    #     "tconst", "movie_title", "numVotes", "year", "label",
    #     "tomatometer_status", "review_type", "genre",
    #     "review_label", "review_content"
    # ]

    # df_final = df_merged.select(*selected_columns)

    # # Fill missing reviews with None and "Unknown"
    # df_final = df_final.fillna({"review_content": None, "review_label": "Unknown"})

    return df_merged

In [96]:
join_training_with_reviews(spark_train_df, spark_reviews_df).show(40)
final_df = join_training_with_reviews(spark_train_df, spark_reviews_df)

+--------------------+----+---------+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|         movie_title|year|   tconst|numVotes|label|genre|review_content|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|review_label|like_count|
+--------------------+----+---------+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|   a nous la liberte|1931|tt0022599|  4392.0| true| NULL|          NULL|          NULL|              NULL|              NULL|              NULL|           NULL|           NULL|        NULL|        NULL|      NULL|
|a tale of two cities|1935|tt0027075|  5596.0| true| NULL|          NULL|          NULL|              NULL|              NULL|              

In [97]:
final_df = convert_columns_to_int(final_df, "numVotes")
final_df.show()

+--------------------+----+---------+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|         movie_title|year|   tconst|numVotes|label|genre|review_content|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|review_label|like_count|
+--------------------+----+---------+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+------------+----------+
|   a nous la liberte|1931|tt0022599|    4392| true| NULL|          NULL|          NULL|              NULL|              NULL|              NULL|           NULL|           NULL|        NULL|        NULL|      NULL|
|a tale of two cities|1935|tt0027075|    5596| true| NULL|          NULL|          NULL|              NULL|              NULL|              

In [98]:
final_df.count()

8294

In [99]:
def clean_labels(df):
    """
    Creates a 'year' column using 'startYear' if available, otherwise 'endYear'.
    Drops 'startYear' and 'endYear' after merging.
    """
    # convert boolean column `label` to integer
    df = df.withColumn("label_int", when(col("label") == True, 1)
                                      .when(col("label") == False, 0)
                                      .otherwise(None))

    # convert categorical `review_label` to numerical
    df = df.withColumn("tomatoes_label", when(col("review_label") == "Fresh", 1)
                                          .when(col("review_label") == "Rotten", 0)
                                          .otherwise(None))

    df = df.withColumn("tomatometer_status", when(col("tomatometer_status") == "Fresh", 1)
                                          .when(col("tomatometer_status") == "Rotten", 0)
                                          .otherwise(None))

    df = df.withColumn("audience_status", when(col("audience_status") == "Upright", 1)
                                          .when(col("audience_status") == "Spilled", 0)
                                          .otherwise(None))

    # Drop unnecessary intermediate columns
    df = df.drop("review_label")

    df.withColumnRenamed("label_int", "label")

    return df

In [100]:
clean_labels(final_df).show()
final_df = clean_labels(final_df)

+--------------------+----+---------+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+
|         movie_title|year|   tconst|numVotes|label|genre|review_content|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|like_count|label_int|tomatoes_label|
+--------------------+----+---------+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+
|   a nous la liberte|1931|tt0022599|    4392| true| NULL|          NULL|          NULL|              NULL|              NULL|              NULL|           NULL|           NULL|        NULL|      NULL|        1|          NULL|
|a tale of two cities|1935|tt0027075|    5596| true| NULL|          NULL|          NULL|    

In [101]:
df_missing_movies = spark_train_df.join(spark_reviews_df, on=["movie_title", "year"], how="left_anti")
df_missing_movies.count()

7911

In [110]:
tconst_list = [row["tconst"] for row in df_missing_movies.select("tconst").collect()]
tconst_list

['tt10534500',
 'tt0090556',
 'tt0349047',
 'tt2395385',
 'tt0473567',
 'tt0381838',
 'tt2416424',
 'tt0109000',
 'tt2518788',
 'tt4685428',
 'tt4875960',
 'tt0494724',
 'tt4126694',
 'tt1179933',
 'tt5128266',
 'tt1993391',
 'tt0147800',
 'tt0443649',
 'tt3488056',
 'tt10949778',
 'tt2990126',
 'tt4080598',
 'tt8774798',
 'tt0115433',
 'tt0211181',
 'tt6580564',
 'tt1712159',
 'tt10309552',
 'tt1160368',
 'tt2024544',
 'tt0417385',
 'tt0475169',
 'tt0276744',
 'tt2991516',
 'tt1385824',
 'tt5895028',
 'tt0449159',
 'tt14878948',
 'tt1663673',
 'tt1139085',
 'tt0094593',
 'tt1407927',
 'tt7692434',
 'tt0074084',
 'tt1797346',
 'tt8579674',
 'tt5638500',
 'tt0078723',
 'tt3213684',
 'tt7675404',
 'tt0094594',
 'tt0983990',
 'tt6363436',
 'tt3506970',
 'tt0087803',
 'tt0085124',
 'tt0322259',
 'tt0422460',
 'tt0046672',
 'tt15145772',
 'tt7150512',
 'tt0212712',
 'tt3481210',
 'tt4054004',
 'tt0315733',
 'tt2363047',
 'tt0280381',
 'tt1131724',
 'tt0248661',
 'tt3720634',
 'tt0017463',
 

## IBDM Review Scraping

In [107]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

In [None]:
my_url = "https://www.imdb.com/title/tt0073781/reviews/"

In [108]:
def scrape_imdb_reviews(imdb_id):
    """
    Fetches the IMDb reviews page for the given imdb_id
    and returns ONLY the first 5 reviews that actually contain text.
    """
    url = f"https://www.imdb.com/title/{imdb_id}/reviews"
    headers = {"User-Agent": "Mozilla/5.0"}
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    review_cards = soup.find_all("div", {"data-testid": "review-card-parent"})

    reviews_data = []

    for card in review_cards:
        # extract review text
        content_div = card.select_one("div.ipc-html-content-inner-div")
        review_text = content_div.get_text(strip=True) if content_div else ""

        if not review_text:
            continue

        reviews_data.append(review_text)

        if len(reviews_data) == 4:
            break

    return reviews_data

In [109]:
import time

def scrape_reviews_for_imdb_ids(imdb_ids):
    """
    Takes a list of IMDb IDs,
    scrapes the first 4 text reviews for each, then returns a Spark DataFrame with:
        - imdb_id
        - review (each review appears in a separate row)
    """
    rows = []

    for imdb_id in imdb_ids:
        reviews_list = scrape_imdb_reviews(imdb_id)

        for review in reviews_list:
            rows.append((imdb_id, review))

        # time.sleep(1)  # Uncomment if rate limiting is needed

    columns = ["tconst", "review"]
    df = spark.createDataFrame(rows, columns)
    return df

In [113]:
df_reviews = scrape_reviews_for_imdb_ids(tconst_list)

df_reviews.show()

+----------+--------------------+
|    tconst|              review|
+----------+--------------------+
|tt10534500|Rojin Thomas, who...|
|tt10534500|Nothing to say.. ...|
|tt10534500|What a performanc...|
|tt10534500|OMG. After a long...|
| tt0090556|Wow, here it is a...|
| tt0090556|Marsha Norman's p...|
| tt0090556|It's gut wrenchin...|
| tt0090556|"'Night Mother" i...|
| tt0349047|"(T)Raumschiff Su...|
| tt0349047|A year and a half...|
| tt0349047|After having read...|
| tt0349047|Three hundred yea...|
| tt2395385|'Shadow Walkers' ...|
| tt2395385|I really liked th...|
| tt2395385|"I just saw anoth...|
| tt2395385|Time Travel is a ...|
| tt0473567|Shoojit Sircar's ...|
| tt0473567|Yahaan is a wonde...|
| tt0473567|This was the only...|
| tt0473567|Yahaan takes a re...|
+----------+--------------------+
only showing top 20 rows



In [119]:
df_reviews.coalesce(1).write.json("movie_reviews/imdb_reviews.csv", mode='overwrite')

# Final Data

In [126]:
df_review = spark.read.json("/content/Big_Data_IMDb/movie_reviews/ibdm_reviews.json")

In [128]:
df_review.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [124]:
merged_df = final_df.join(df_reviews, on='tconst', how='left')
merged_df.show()

+---------+--------------------+----+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+--------------------+
|   tconst|         movie_title|year|numVotes|label|genre|review_content|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|like_count|label_int|tomatoes_label|              review|
+---------+--------------------+----+--------+-----+-----+--------------+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+--------------------+
|tt0009369|              mickey|1918|    1119|false| NULL|          NULL|          NULL|              NULL|              NULL|              NULL|           NULL|           NULL|        NULL|      NULL|        0|          NULL|Mack Sennett had ...|
|tt00093

In [105]:
merged_df.count()

8294

In [106]:
# merged_df = merged_df.withColumn(
#     "reviews",
#     when(
#         col("reviews").isNotNull() & (col("reviews") != "No Reviews"),
#         array(col("reviews"))
#     )
# )

merged_df = merged_df.withColumn(
    "reviews",
    when(
        (col("reviews").isNotNull()) & (col("reviews") != "No Reviews"),
        concat(lit("['"), concat_ws("', '", split(col("reviews"), ",\s*")), lit("']"))
    ))

merged_df.show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/socket.py", line 718, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [130]:
def handle_reviews(df):
    """
    Creates a 'review' column using 'review_content' if available, otherwise 'reviews'.
    Drops 'review_content' and 'reviews' after merging.
    """
    df = df.withColumn("reviews", when(col("review_content").isNotNull(), col("review_content"))
                                  .otherwise(col("review")))

    # drop original startYear and endYear
    df = df.drop("review_content", "review")

    #df = df.withColumnRenamed("review", "reviews")

    return df

df_final = handle_reviews(merged_df)
df_final.show()

+---------+--------------------+----+--------+-----+-----+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+--------------------+
|   tconst|         movie_title|year|numVotes|label|genre|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|like_count|label_int|tomatoes_label|             reviews|
+---------+--------------------+----+--------+-----+-----+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+--------------------+
|tt0009369|              mickey|1918|    1119|false| NULL|          NULL|              NULL|              NULL|              NULL|           NULL|           NULL|        NULL|      NULL|        0|          NULL|Mack Sennett had ...|
|tt0009369|              mickey|1918|    1119|false| NULL|          

In [131]:
df_final.coalesce(1).write.csv("movie_reviews/final_reviews_data.csv", header=True, mode='overwrite')

In [134]:
df_final = spark.read.csv("movie_reviews/final_reviews_data.csv", header=True)

df_final.select("reviews").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [135]:
from pyspark.sql.functions import isnan, when, count, col

df_final.select([count(when(isnan(c), c)).alias(c) for c in df_final.columns]).show()

+------+-----------+----+--------+-----+-----+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+-------+
|tconst|movie_title|year|numVotes|label|genre|content_rating|production_company|tomatometer_status|tomatometer_rating|audience_status|audience_rating|review_score|like_count|label_int|tomatoes_label|reviews|
+------+-----------+----+--------+-----+-----+--------------+------------------+------------------+------------------+---------------+---------------+------------+----------+---------+--------------+-------+
|     0|          0|   0|       0|    0|    0|             0|                 0|                 0|                 0|              0|              0|           0|         0|        0|             0|      0|
+------+-----------+----+--------+-----+-----+--------------+------------------+------------------+------------------+---------------+---------------+------------+-----

## Cleaning reviews for tokenization (IGNORE THIS)

In [46]:
import re
import unicodedata
import pyspark.sql.functions as F
from pyspark.sql.types import ArrayType, StringType

def robust_review_extractor(text):
    # Unicode normalization
    text = unicodedata.normalize("NFKD", text)
    text = text.replace("“", '"').replace("”", '"').replace("’", "'").replace("‘", "'")
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

    # Remove starting and ending square brackets, if present
    text = text.strip()
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]

    reviews = []
    current_review = ''
    in_review = False
    quote_char = ''

    i = 0
    while i < len(text):
        char = text[i]

        if not in_review:
            if char in ['"', "'"]:
                in_review = True
                quote_char = char
                current_review = ''
        else:
            if char == quote_char:
                # Check if next char is comma or whitespace indicating end of review
                if (i + 1 == len(text)) or text[i + 1] in [',', ' ']:
                    reviews.append(current_review.strip())
                    in_review = False
                    quote_char = ''
                    current_review = ''
                    # Skip the comma if present
                    if i + 1 < len(text) and text[i + 1] == ',':
                        i += 1
                else:
                    current_review += char
            else:
                current_review += char
        i += 1

    # Final clean-up to remove extra spaces and empty strings
    cleaned_reviews = [re.sub(r'\s+', ' ', r).strip() for r in reviews if r.strip()]
    return cleaned_reviews

# Register UDF
robust_review_extractor_udf = F.udf(robust_review_extractor, ArrayType(StringType()))

# Apply the UDF
df_final_fixed = df_final.withColumn(
    "clean_reviews_list",
    robust_review_extractor_udf(F.col("reviews"))
)

df_final_fixed.select("clean_reviews_list").show(truncate=False, n=10)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [39]:
df_final_fixed.printSchema()


root
 |-- tconst: string (nullable = true)
 |-- movie_title: string (nullable = true)
 |-- year: string (nullable = true)
 |-- numVotes: string (nullable = true)
 |-- label: string (nullable = true)
 |-- tomatometer_status: string (nullable = true)
 |-- tomatometer_rating: string (nullable = true)
 |-- audience_status: string (nullable = true)
 |-- audience_rating: string (nullable = true)
 |-- review_type: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- Like count: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- label_int: string (nullable = true)
 |-- tomatoes_label: string (nullable = true)
 |-- reviews: string (nullable = true)
 |-- clean_reviews_list: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [42]:
from pyspark.sql.functions import col, explode, concat_ws, split

# Step 1: Convert the list of reviews into a single string with "|" as the separator
df_fixed = df_final_fixed.withColumn("clean_reviews_list", concat_ws("|", col("clean_reviews_list")))

# Step 2: Split the newly formatted column back into an array using "|"
df_fixed = df_fixed.withColumn("clean_reviews_list", split(col("clean_reviews_list"), "\\|"))

# Step 3: Explode the list to create one row per review
df_exploded = df_fixed.withColumn("review", explode(col("clean_reviews_list")))

# Step 4: Drop the original list column (optional)
df_exploded = df_exploded.drop("clean_reviews_list")

# Show the transformed DataFrame
df_exploded.show(10, truncate=False)

+---------+-----------+----+--------+-----+------------------+------------------+---------------+---------------+-----------+------------+----------+-----+---------+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
df_final_fixed.write.json("movie_reviews/final_cleaned_reviews.json", mode="overwrite")

Py4JJavaError: An error occurred while calling o307.json.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 20.0 failed 1 times, most recent failure: Lost task 0.0 in stage 20.0 (TID 20) (fc4ce7013ce9 executor driver): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to file:/content/movie_reviews/final_cleaned_reviews.json.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "<ipython-input-19-2dd57505eaa1>", line 8, in robust_review_extractor
TypeError: normalize() argument 2 must be str, not None

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:91)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:410)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.json(DataFrameWriter.scala:784)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to file:/content/movie_reviews/final_cleaned_reviews.json.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:775)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:420)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "<ipython-input-19-2dd57505eaa1>", line 8, in robust_review_extractor
TypeError: normalize() argument 2 must be str, not None

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:91)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:403)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:410)
	... 17 more
