# Ingest Data Warehouse

### Import


In [1]:
import os
import glob
from typing import Optional
from lib.duckdbcontext import DuckDBContext
import polars as pl
import pyspark
import opendatasets as od
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    explode,
    split,
    udf,
    size,
    regexp_replace,
    when,
    array,
    countDistinct,
    col,
)
from pyspark.ml.feature import StringIndexer, OneHotEncoder
import ast
from pyspark.sql.types import ArrayType, StringType
from tqdm import tqdm
import urllib.request
import duckdb
import json

In [2]:
print(pyspark.__version__)
print(duckdb.__version__)

3.5.1
0.9.2


### Config


In [3]:
duckdb_database = "../orchestration/db/bigdata.duckdb"

#### Setting up Cluster Connection


In [4]:
# Connect to Existing Spark Cluster
# spark = (
#     SparkSession.builder.master("spark://spark:7077")
#     .appName("Spark-ETL")
#     .config("spark.sql.debug.maxToStringFields", 1000)
#     .getOrCreate()
# )

# Connect to local Spark Sessions
spark = (
    SparkSession.builder.master("local").appName("Spark-ETL")
    # .config("spark.sql.debug.maxToStringFields", 1000)
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/17 16:49:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Add to Data Warehouse

## Initial Data

#### Train


In [5]:
# Get a list of all CSV files that match the pattern
csv_files = glob.glob("../../data/train-*.csv")
print(csv_files)

# Load all CSV files in the data directory into a dataframe
# Specify '\\N' as a null value
# Ignore the header and infer the schema from data
train_spark_df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .csv("../../data/train-*.csv", nullValue="\\N")
)
# Drop the first column
train_spark_df = train_spark_df.drop("_c0")
train_spark_df.describe().show()
# Print the dataframe
train_spark_df.show(5)

['../../data/train-8.csv', '../../data/train-2.csv', '../../data/train-7.csv', '../../data/train-5.csv', '../../data/train-3.csv', '../../data/train-4.csv', '../../data/train-1.csv', '../../data/train-6.csv']


24/03/17 16:49:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+---------+--------------------+------------------+------------------+------------------+------------------+------------------+
|summary|   tconst|        primaryTitle|     originalTitle|         startYear|           endYear|    runtimeMinutes|          numVotes|
+-------+---------+--------------------+------------------+------------------+------------------+------------------+------------------+
|  count|     7959|                7959|              3971|              7173|               786|              7946|              7169|
|   mean|     NULL|   1231.388888888889|            1128.0|1997.9960964728843|1998.7633587786258|105.68713818273345| 29520.51081043381|
| stddev|     NULL|   954.6947857755001|1038.0438333712118| 21.99534723241901|   21.895931761063| 25.39634772412447|114449.99384975343|
|    min|tt0009369|"Drágớn Báll Z: R...|     'A' gai wak 2|              1918|              1921|                45|            1001.0|
|    max|tt9911196|             Ớútcást|        

#### Directing


In [6]:
# Using Polars to retrieve the directing data
# Load and parse the JSON file
with open("../../data/directing.json") as f:
    data = json.load(f)

movies_polars_df = pl.from_dict(data["movie"]).transpose().rename({"column_0": "movie"})
directors_polars_df = (
    pl.from_dict(data["director"]).transpose().rename({"column_0": "director"})
)
directing_polars_df = pl.concat(
    [
        movies_polars_df,
        directors_polars_df,
    ],
    how="horizontal",
)
directing_polars_df.head(5)

movie,director
str,str
"""tt0003740""","""nm0665163"""
"""tt0008663""","""nm0803705"""
"""tt0009369""","""nm0428059"""
"""tt0009369""","""nm0949648"""
"""tt0010307""","""nm0304098"""


#### Writing


In [7]:
with open("../../data/writing.json") as f:
    data = json.load(f)
writing_json = spark.sparkContext.parallelize(data)
writing_spark_df = spark.read.json(writing_json)
writing_spark_df.show(5)

                                                                                

+---------+---------+
|    movie|   writer|
+---------+---------+
|tt0003740|nm0195339|
|tt0003740|nm0515385|
|tt0003740|nm0665163|
|tt0003740|nm0758215|
|tt0008663|nm0406585|
+---------+---------+
only showing top 5 rows



### Load into DuckDB Database


In [9]:
# DuckDBContext to add pyspark tables to DuckDB
with DuckDBContext(duckdb_database) as ctx:
    ctx.save_to_duckdb(train_spark_df, "imdb_train")
    ctx.show_n("imdb_train", 5)

    ctx.save_to_duckdb(directing_polars_df, "imdb_directors")
    ctx.show_n("imdb_directors", 5)

    ctx.save_to_duckdb(writing_spark_df, "imdb_writing")
    ctx.show_n("imdb_writing", 5)

CREATED TABLE: imdb_train WITH 7959 ROWS!
shape: (5, 8)
┌───────────┬───────────────┬──────────────┬───────────┬─────────┬──────────────┬──────────┬───────┐
│ tconst    ┆ primaryTitle  ┆ originalTitl ┆ startYear ┆ endYear ┆ runtimeMinut ┆ numVotes ┆ label │
│ ---       ┆ ---           ┆ e            ┆ ---       ┆ ---     ┆ es           ┆ ---      ┆ ---   │
│ str       ┆ str           ┆ ---          ┆ i32       ┆ i32     ┆ ---          ┆ f64      ┆ bool  │
│           ┆               ┆ str          ┆           ┆         ┆ i32          ┆          ┆       │
╞═══════════╪═══════════════╪══════════════╪═══════════╪═════════╪══════════════╪══════════╪═══════╡
│ tt0014109 ┆ The Saga of   ┆ null         ┆ 1924      ┆ null    ┆ 183          ┆ 1231.0   ┆ true  │
│           ┆ Gösta Berling ┆              ┆           ┆         ┆              ┆          ┆       │
│ tt0015064 ┆ The Last      ┆ Der letzte   ┆ 1924      ┆ null    ┆ 77           ┆ null     ┆ true  │
│           ┆ Laugh         ┆ Mann 

## Extra Data


### IMDB Datasets


#### Helpers


In [10]:
def get_table_name(file_name: str) -> str:
    return file_name.rsplit(".", 2)[0]


def create_url(endpoint: str) -> str:
    """
    Create Url

    :param str endpoint: download endpoint
    :return str: full url
    """
    return f"https://datasets.imdbws.com/{endpoint}"


def download_file(url, filename):
    print(f"Downloading file: {filename}")

    response = urllib.request.urlopen(url)

    # Get the total file size
    file_size = int(response.headers.get("content-length", 0))

    # Create a tqdm progress bar
    progress = tqdm(total=file_size, unit="iB", unit_scale=True, desc=filename)

    chunk_size = 1024  # you can change this to larger if you want

    with open(filename, "wb") as f:
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            f.write(chunk)
            progress.update(len(chunk))
    progress.close()

##### Dowloading

Run this cell once, otherwise you'll keep downloading the same files over and over...


In [11]:
extra_imdb = [
    "name.basics.tsv.gz",
    "title.akas.tsv.gz",
    "title.basics.tsv.gz",
    "title.crew.tsv.gz",
    # "title.episode.tsv.gz", # we have only movie data
    "title.principals.tsv.gz",
    "title.ratings.tsv.gz",
]

# RUN THIS ONCE!
# Download the files
for ds in extra_imdb:
    # Create an instance of the IMDB class with the desired endpoint
    download_url = create_url(ds)

    filepath = f"../../data/extra/{ds}"  # Local fp
    # Use the function to download the file
    download_file(download_url, filepath)

Downloading file: ../../data/extra/name.basics.tsv.gz


../../data/extra/name.basics.tsv.gz: 100%|██████████| 263M/263M [00:26<00:00, 9.82MiB/s] 


Downloading file: ../../data/extra/title.akas.tsv.gz


../../data/extra/title.akas.tsv.gz: 100%|██████████| 330M/330M [00:32<00:00, 10.0MiB/s] 


Downloading file: ../../data/extra/title.basics.tsv.gz


../../data/extra/title.basics.tsv.gz: 100%|██████████| 186M/186M [00:18<00:00, 9.86MiB/s] 


Downloading file: ../../data/extra/title.crew.tsv.gz


../../data/extra/title.crew.tsv.gz: 100%|██████████| 70.7M/70.7M [00:07<00:00, 9.68MiB/s]


Downloading file: ../../data/extra/title.principals.tsv.gz


../../data/extra/title.principals.tsv.gz: 100%|██████████| 471M/471M [00:50<00:00, 9.25MiB/s] 


Downloading file: ../../data/extra/title.ratings.tsv.gz


../../data/extra/title.ratings.tsv.gz: 100%|██████████| 7.11M/7.11M [00:00<00:00, 8.41MiB/s]


## Reading with Spark

RUN THIS ONCE!


In [12]:
with DuckDBContext(duckdb_database) as ctx:
    train_ids = ctx.conn.execute("SELECT tconst FROM imdb_train").fetchdf()
    train_ids_spark = spark.createDataFrame(train_ids)

    for ds in extra_imdb:
        table_name = f"extra.{get_table_name(ds)}".replace(".", "_")

        # Load a small subset of the data to infer the schema
        subset = spark.read.csv(
            f"../../data/extra/{ds}",
            header=True,
            sep="\t",
            nullValue="\\N",
            inferSchema=True,
        ).limit(1000)

        # Extract the schema from the subset
        schema = subset.schema

        # Load all TSV.GZ files in the data directory into a dataframe with the inferred schema
        spark_df = spark.read.csv(
            f"../../data/extra/{ds}",
            header=True,
            sep="\t",
            nullValue="\\N",
            schema=schema,
        )
        spark_df.show(5)

        spark_df_columns = spark_df.columns

        if "titleId" in spark_df_columns:
            filtered_spark_df = spark_df.join(
                train_ids_spark, train_ids_spark.tconst == spark_df.titleId, "inner"
            )
            filtered_spark_df = filtered_spark_df.drop("titleId")
        elif "knownForTitles" in spark_df_columns:
            # Split the knownForTitles column into multiple rows
            spark_df = spark_df.withColumn(
                "knownForTitles", explode(split(spark_df["knownForTitles"], ","))
            )

            # Select the values that are in both train_ids_spark and spark_df
            filtered_spark_df = spark_df.join(
                train_ids_spark,
                spark_df.knownForTitles == train_ids_spark.tconst,
                "inner",
            )
        elif "tconst" in spark_df_columns:
            filtered_spark_df = spark_df.join(train_ids_spark, "tconst", "inner")

        ctx.save_to_duckdb(filtered_spark_df, table_name)

                                                                                

+---------+---------------+---------+---------+--------------------+--------------------+
|   nconst|    primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+---------------+---------+---------+--------------------+--------------------+
|nm0000001|   Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0072308,tt00531...|
|nm0000002|  Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0038355,tt01170...|
|nm0000003|Brigitte Bardot|     1934|     NULL|actress,soundtrac...|tt0056404,tt00573...|
|nm0000004|   John Belushi|     1949|     1982|actor,soundtrack,...|tt0072562,tt00779...|
|nm0000005| Ingmar Bergman|     1918|     2007|writer,director,a...|tt0083922,tt00694...|
+---------+---------------+---------+---------+--------------------+--------------------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_name_basics WITH 555345 ROWS!


                                                                                

+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
|  titleId|ordering|               title|region|language|      types|   attributes|isOriginalTitle|
+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
|tt0000001|       1|          Карменсіта|    UA|    NULL|imdbDisplay|         NULL|              0|
|tt0000001|       2|          Carmencita|    DE|    NULL|       NULL|literal title|              0|
|tt0000001|       3|Carmencita - span...|    HU|    NULL|imdbDisplay|         NULL|              0|
|tt0000001|       4|          Καρμενσίτα|    GR|    NULL|imdbDisplay|         NULL|              0|
|tt0000001|       5|          Карменсита|    RU|    NULL|imdbDisplay|         NULL|              0|
+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_akas WITH 190567 ROWS!


                                                                                

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   NULL|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   NULL|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   NULL|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   NULL|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|   NULL|             1|        Comedy

                                                                                

CREATED TABLE: extra_title_basics WITH 7958 ROWS!


                                                                                

+---------+---------+-------+
|   tconst|directors|writers|
+---------+---------+-------+
|tt0000001|nm0005690|   NULL|
|tt0000002|nm0721526|   NULL|
|tt0000003|nm0721526|   NULL|
|tt0000004|nm0721526|   NULL|
|tt0000005|nm0005690|   NULL|
+---------+---------+-------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_crew WITH 7958 ROWS!


                                                                                

+---------+--------+---------+---------------+--------------------+----------+
|   tconst|ordering|   nconst|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                NULL|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                NULL|      NULL|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|      NULL|
|tt0000002|       1|nm0721526|       director|                NULL|      NULL|
|tt0000002|       2|nm1335271|       composer|                NULL|      NULL|
+---------+--------+---------+---------------+--------------------+----------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_principals WITH 77941 ROWS!


                                                                                

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    2035|
|tt0000002|          5.7|     272|
|tt0000003|          6.5|    1982|
|tt0000004|          5.4|     178|
|tt0000005|          6.2|    2739|
+---------+-------------+--------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_ratings WITH 7958 ROWS!


### Kaggle Data


In [13]:
# Define a UDF to convert strings to lists
def parse_list(s):
    return s.strip("[]").split(", ")

In [14]:
# Letterboxd Movie Ratings Data
od.download(
    "https://www.kaggle.com/datasets/samlearner/letterboxd-movie-ratings-data/download?datasetVersionNumber=6",
    data_dir="../../data/extra",
)
# Oscar Award Data
od.download(
    "https://www.kaggle.com/datasets/unanimad/the-oscar-award",
    data_dir="../../data/extra",
)

Skipping, found downloaded files in "../../data/extra/letterboxd-movie-ratings-data" (use force=True to force download)
Skipping, found downloaded files in "../../data/extra/the-oscar-award" (use force=True to force download)


In [15]:
# Loop over all CSV files
for file_path in csv_files:
    # Check if 'users_export.csv' or 'ratings_export.csv' is part of the file name
    if "users_export.csv" in file_path or "ratings_export.csv" in file_path:
        # If the file exists, remove it
        if os.path.exists(file_path):
            os.remove(file_path)
            print(f"{file_path} removed successfully.")
        else:
            print(f"{file_path} does not exist.")
    else:
        df = (
            spark.read.csv(
                file_path,
                header=True,
                inferSchema=True,
            )
            .limit(5)
            .show(5)
        )

24/03/17 16:59:45 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-8.csv


+---+---------+--------------------+------------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|     originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+------------------+---------+-------+--------------+--------+-----+
| 29|tt0015224|           Peter Pan|              NULL|     1924|     \N|           105|  1042.0| true|
| 35|tt0015864|       The Gold Rush|              NULL|     1925|     \N|            95|107475.0| true|
| 37|tt0016029|  The Little Colonel|              NULL|     1935|     \N|            81|  1646.0| true|
| 82|tt0021309|The Story of the Fox|Le roman de Renard|     1937|     \N|            63|    NULL| true|
| 93|tt0022395|       The Skin Game|              NULL|     1931|     \N|            85|    NULL|false|
+---+---------+--------------------+------------------+---------+-------+--------------+--------+-----+



24/03/17 16:59:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-2.csv


+---+---------+------------------+-------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|      primaryTitle|originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+------------------+-------------+---------+-------+--------------+--------+-----+
|  6|tt0011607|The Parson's Widow|   Prästänkan|     1920|     \N|            94|  1264.0| true|
| 17|tt0014358|       The Pilgrim|         NULL|     1923|     \N|            47|  4891.0| true|
| 20|tt0014611|        Why Worry?|         NULL|     1923|     \N|            63|  1739.0| true|
| 44|tt0016847|             Faust|         NULL|     1926|     \N|           107| 14809.0| true|
| 63|tt0018773|        Thé Circús|         NULL|     1928|     \N|            72| 32601.0| true|
+---+---------+------------------+-------------+---------+-------+--------------+--------+-----+



24/03/17 16:59:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-7.csv


+---+---------+--------------------+-------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+-------------+---------+-------+--------------+--------+-----+
|  2|tt0009369|              Mickey|       Mickey|     1918|     \N|            93|  1119.0|false|
| 15|tt0014142|The Hunchback of ...|         NULL|       \N|   1923|           133|  5288.0| true|
| 21|tt0014945|            Girl Shy|     Girl Shy|     1924|     \N|            87|  3327.0| true|
| 45|tt0017048|   A Page of Madness|         NULL|     1926|     \N|            70|  3357.0| true|
| 48|tt0017350|  The Scarlet Letter|         NULL|     1926|     \N|           115|  1768.0| true|
+---+---------+--------------------+-------------+---------+-------+--------------+--------+-----+



24/03/17 16:59:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-5.csv


+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|       originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|  8|tt0012349|             The Kid|                NULL|     1921|     \N|            68|121452.0| true|
| 30|tt0015361|              Strike|                NULL|     1925|     \N|            82|  7695.0| true|
| 36|tt0015881|               Greed|                NULL|     1924|     \N|           140|  9649.0| true|
| 53|tt0018066|Thé Énd ớf St. Pé...|Konets Sankt-Pete...|     1927|     \N|            85|    NULL| true|
| 70|tt0019412|              Speedy|                NULL|     1928|     \N|            85|  3613.0| true|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+



24/03/17 16:59:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-3.csv


+---+---------+--------------------+-----------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|    originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+-----------------+---------+-------+--------------+--------+-----+
|  5|tt0011439|   The Mark of Zorro|The Mark of Zorro|     1920|     \N|            79|  2439.0| true|
| 10|tt0012532|Ớrpháns ớf thé Stớrm|             NULL|     1921|     \N|           150|    NULL| true|
| 13|tt0013933|  The Faithful Heart|     Coeur fidèle|     1923|     \N|            87|  1252.0| true|
| 31|tt0015400| The Thief of Bagdad|             NULL|     1924|     \N|           155|  6001.0| true|
| 33|tt0015842|  The Joyless Street|             NULL|     1925|     \N|           125|  1554.0| true|
+---+---------+--------------------+-----------------+---------+-------+--------------+--------+-----+



24/03/17 16:59:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-4.csv


+---+---------+--------------------+---------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|  originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+---------------+---------+-------+--------------+--------+-----+
| 14|tt0014109|The Saga of Gösta...|           NULL|     1924|     \N|           183|  1231.0| true|
| 24|tt0015064|      The Last Laugh|Der letzte Mann|     1924|     \N|            77|    NULL| true|
| 32|tt0015841|        The Freshman|   The Freshman|     1925|     \N|            77|  5374.0| true|
| 47|tt0017271|          By the Law|           NULL|       \N|   1926|            80|  1057.0| true|
| 56|tt0018451|The Student Princ...|           NULL|     1927|     \N|           106|  1459.0| true|
+---+---------+--------------------+---------------+---------+-------+--------------+--------+-----+



24/03/17 16:59:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-1.csv
24/03/17 16:59:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: 
CSV file: file:///workspaces/Big-Data/data/train-6.csv


+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|       originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|  4|tt0010600|            The Doll|           Die Puppe|     1919|     \N|            66|  1898.0| true|
|  7|tt0011841|       Way Down East|       Way Down East|     1920|     \N|           145|  5376.0| true|
|  9|tt0012494|             Déstiny|        Der müde Tod|     1921|     \N|            97|  5842.0| true|
| 25|tt0015163|       The Navigator|       The Navigator|     1924|     \N|            59|  9652.0| true|
| 38|tt0016220|The Phantom of th...|The Phantom of th...|     1925|     \N|            93| 17887.0| true|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+

+---+---------+---------------+--------------

In [16]:
# Define a UDF to convert strings to lists
def parse_list(s):
    return s.strip("[]").split(", ")


parse_list_udf = udf(parse_list, ArrayType(StringType()))


def process_column(df, column_name):
    # Remove the extra double quotes
    df = df.withColumn(column_name, regexp_replace(df[column_name], '"', ""))
    print(f"Size after removing double quotes: {df.count()} rows")

    # Filter the DataFrame to only include rows where the column is not null
    df = df.filter(col(column_name).isNotNull())
    print(f"Size after filtering nulls: {df.count()} rows")

    # Convert the column to a list
    df = df.withColumn(column_name, parse_list_udf(df[column_name]))
    print(f"Size after converting to list: {df.count()} rows")

    # Check if there are any arrays with multiple values
    multi_value_rows = df.filter(size(df[column_name]) > 1)
    print(
        f"Number of rows with multiple values in {column_name}: {multi_value_rows.count()}"
    )

    # Explode the array into new rows
    df = df.withColumn(column_name[:-1], explode(df[column_name]))
    print(f"Size after exploding list: {df.count()} rows")

    # Drop the original column
    df = df.drop(column_name)
    print(f"Size after dropping original column: {df.count()} rows")

    return df

In [17]:
# Initialize the DuckDBContext
with DuckDBContext(duckdb_database) as ctx:
    train_ids = ctx.conn.execute("SELECT tconst FROM imdb_train").fetchdf()
    train_ids_spark = spark.createDataFrame(train_ids)

    spark_df = spark.read.csv(
        "../../data/extra/the-oscar-award/the_oscar_award.csv",
        header=True,
        inferSchema=True,
    )
    # Save the DataFrame to DuckDB
    ctx.save_to_duckdb(spark_df, "the_oscar_award")

    # Read Movie Data & drop unnecessary columns
    spark_df = spark.read.csv(
        "../../data/extra/letterboxd-movie-ratings-data/movie_data.csv",
        header=True,
        inferSchema=True,
    ).drop("image_url", "imdb_link", "overview", "tmdb_id", "tmdb_link")

    # Filter on our Train IDs
    filtered_spark_df = spark_df.join(
        train_ids_spark, spark_df.imdb_id == train_ids_spark.tconst, "inner"
    ).drop("imdb_id")
    filtered_spark_df.show(5)

    # Process the genres, production_countries, and spoken_languages columns
    for column in ["genres", "production_countries", "spoken_languages"]:
        filtered_spark_df = process_column(filtered_spark_df, column)

    filtered_spark_df.show(5)

    # Save the DataFrame to DuckDB
    ctx.save_to_duckdb(filtered_spark_df, "letterboxd_movie_ratings_data")

CREATED TABLE: the_oscar_award WITH 10889 ROWS!


                                                                                

+--------------------+-------------------+------------------+------------------+-----------------+----------+--------------------+------------+-------+----------------+------------+----------+-------------+---------+
|                 _id|             genres|          movie_id|       movie_title|original_language|popularity|production_countries|release_date|runtime|spoken_languages|vote_average|vote_count|year_released|   tconst|
+--------------------+-------------------+------------------+------------------+-----------------+----------+--------------------+------------+-------+----------------+------------+----------+-------------+---------+
|5fc86a3d6758f6963...|      "[""Drama""]"|the-trump-prophecy|The Trump Prophecy|               en|     2.025|"[""United States...|  2018-10-02|    120| "[""English""]"|           4|         7|         2018|tt8235296|
|5fc8708a6758f6963...|     "[""Comedy""]"|     nothing-funny|     Nothing Funny|               pl|     4.192|      "[""Poland""]"|  

                                                                                

Size after removing double quotes: 2268 rows


                                                                                

Size after filtering nulls: 2268 rows


                                                                                

Size after converting to list: 2268 rows


                                                                                

Number of rows with multiple values in genres: 0


                                                                                

Size after exploding list: 2268 rows


                                                                                

Size after dropping original column: 2268 rows


                                                                                

Size after removing double quotes: 2268 rows


                                                                                

Size after filtering nulls: 2258 rows


                                                                                

Size after converting to list: 2258 rows


                                                                                

Number of rows with multiple values in production_countries: 0


                                                                                

Size after exploding list: 2258 rows


                                                                                

Size after dropping original column: 2258 rows


                                                                                

Size after removing double quotes: 2258 rows


                                                                                

Size after filtering nulls: 2258 rows


                                                                                

Size after converting to list: 2258 rows


                                                                                

Number of rows with multiple values in spoken_languages: 0


                                                                                

Size after exploding list: 2258 rows


                                                                                

Size after dropping original column: 2258 rows


                                                                                

+--------------------+------------------+------------------+-----------------+----------+------------+-------+------------+----------+-------------+---------+-----------+--------------------+---------------+
|                 _id|          movie_id|       movie_title|original_language|popularity|release_date|runtime|vote_average|vote_count|year_released|   tconst|      genre| production_countrie|spoken_language|
+--------------------+------------------+------------------+-----------------+----------+------------+-------+------------+----------+-------------+---------+-----------+--------------------+---------------+
|5fc86a3d6758f6963...|the-trump-prophecy|The Trump Prophecy|               en|     2.025|  2018-10-02|    120|           4|         7|         2018|tt8235296|      Drama|United States of ...|        English|
|5fc8708a6758f6963...|     nothing-funny|     Nothing Funny|               pl|     4.192|  1996-02-02|     95|         7.2|        39|         1996|tt0113971|     Comed

                                                                                

CREATED TABLE: letterboxd_movie_ratings_data WITH 2258 ROWS!
