# Ingest Data Warehouse

### Import


In [12]:
import glob
from typing import Optional
from lib.duckdbcontext import DuckDBContext
import polars as pl
import pyarrow as pa
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split
from tqdm import tqdm
import urllib.request
import duckdb
import json

In [13]:
# Record the installed PySpark and DuckDB versions alongside the notebook outputs
print(pyspark.__version__)
print(duckdb.__version__)

3.5.1
0.9.2


### Config


In [14]:
# Database path handed to DuckDBContext(...) in the load cells further down
duckdb_database = "../orchestration/db/bigdata.duckdb"

#### Helpers


In [15]:
def get_table_name(file_name: str) -> str:
    """
    Derive Table Name

    :param str file_name: file name, e.g. "title.basics.tsv.gz"
    :return str: the name with up to two trailing dot-separated suffixes removed
    """
    # Split from the right at most twice and keep only the leading part
    stem, *_suffixes = file_name.rsplit(".", maxsplit=2)
    return stem


def create_url(endpoint: str) -> str:
    """
    Build the Download Url

    :param str endpoint: download endpoint appended to the dataset host
    :return str: full url
    """
    url_template = "https://datasets.imdbws.com/{}"
    return url_template.format(endpoint)


def download_file(url, filename):
    """
    Download File

    Streams the response body to ``filename`` in fixed-size chunks while
    updating a tqdm progress bar.

    :param str url: address to download from
    :param str filename: local path to write to; also used as the progress bar label
    :return None:
    """
    print(f"Downloading file: {filename}")

    chunk_size = 1024  # bytes per read; you can change this to larger if you want

    # Context managers make sure the HTTP response, the output file and the
    # progress bar are all closed, even when the transfer fails part-way.
    with urllib.request.urlopen(url) as response:
        # Total size for the progress bar; 0 when no content-length header is sent
        file_size = int(response.headers.get("content-length", 0))

        with open(filename, "wb") as f, tqdm(
            total=file_size, unit="iB", unit_scale=True, desc=filename
        ) as progress:
            while True:
                chunk = response.read(chunk_size)
                if not chunk:
                    # An empty read marks the end of the response body
                    break
                f.write(chunk)
                progress.update(len(chunk))

#### Setting up Cluster Connection


In [16]:
# Alternative (disabled): connect to an existing Spark cluster instead of a local session
# spark = (
#     SparkSession.builder.master("spark://spark:7077")
#     .appName("Spark-ETL")
#     .config("spark.sql.debug.maxToStringFields", 1000)
#     .getOrCreate()
# )

# Connect to a local Spark session; getOrCreate() reuses an already-running session if there is one
spark = (
    SparkSession.builder.master("local")
    .appName("Spark-ETL")
    # .config("spark.sql.debug.maxToStringFields", 1000)
    .getOrCreate()
)

# Add to Data Warehouse

## Initial Data

#### Train


In [17]:
# List the CSV files that match the pattern (printed for reference only;
# the Spark reader below is given the same glob pattern directly)
csv_files = glob.glob("../../data/train-*.csv")
print(csv_files)

# Load all matching CSV files in the data directory into one dataframe
# Treat the literal \N as a null value
# Use the first row as the header and infer the column types from the data
train_spark_df = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .csv("../../data/train-*.csv", nullValue="\\N")
)
# Drop the "_c0" column (presumably an unnamed index column in the CSVs — TODO confirm)
train_spark_df = train_spark_df.drop("_c0")
# Show summary statistics for the columns
train_spark_df.describe().show()
# Show the first 5 rows of the dataframe
train_spark_df.show(5)

['../../data/train-8.csv', '../../data/train-2.csv', '../../data/train-7.csv', '../../data/train-5.csv', '../../data/train-3.csv', '../../data/train-4.csv', '../../data/train-1.csv', '../../data/train-6.csv']


+-------+---------+--------------------+------------------+------------------+------------------+------------------+------------------+
|summary|   tconst|        primaryTitle|     originalTitle|         startYear|           endYear|    runtimeMinutes|          numVotes|
+-------+---------+--------------------+------------------+------------------+------------------+------------------+------------------+
|  count|     7959|                7959|              3971|              7173|               786|              7946|              7169|
|   mean|     NULL|   1231.388888888889|            1128.0|1997.9960964728843|1998.7633587786258|105.68713818273345| 29520.51081043381|
| stddev|     NULL|   954.6947857755001|1038.0438333712118| 21.99534723241901|   21.895931761063| 25.39634772412447|114449.99384975343|
|    min|tt0009369|"Drágớn Báll Z: R...|     'A' gai wak 2|              1918|              1921|                45|            1001.0|
|    max|tt9911196|             Ớútcást|        

#### Directing


In [18]:
# Using Polars to retrieve the directing data
# Load and parse the JSON file
with open("../../data/directing.json") as f:
    data = json.load(f)

# Build one single-column frame per top-level key ("movie" and "director").
# NOTE(review): presumably each key maps a row index to one value, which is why
# the frame is transposed into a single column — confirm against the file.
movies_polars_df = pl.from_dict(data["movie"]).transpose().rename({"column_0": "movie"})
directors_polars_df = (
    pl.from_dict(data["director"]).transpose().rename({"column_0": "director"})
)
# Put the two columns side by side; rows are paired purely by position
directing_polars_df = pl.concat(
    [
        movies_polars_df,
        directors_polars_df,
    ],
    how="horizontal",
)
# Display the first 5 rows as the cell output
directing_polars_df.head(5)

movie,director
str,str
"""tt0003740""","""nm0665163"""
"""tt0008663""","""nm0803705"""
"""tt0009369""","""nm0428059"""
"""tt0009369""","""nm0949648"""
"""tt0010307""","""nm0304098"""


#### Writing


In [19]:
# Load the writing records (expected: a JSON array of {"movie", "writer"} objects)
with open("../../data/writing.json", encoding="utf-8") as f:
    data = json.load(f)

# spark.read.json expects an RDD of JSON *strings*. Handing it Python dicts only
# works because their repr happens to look like single-quoted JSON; that breaks
# on None values or strings containing quotes. Serialise each record explicitly.
writing_json = spark.sparkContext.parallelize(
    [json.dumps(record) for record in data]
)
writing_spark_df = spark.read.json(writing_json)
# Preview the first 5 rows
writing_spark_df.show(5)

+---------+---------+
|    movie|   writer|
+---------+---------+
|tt0003740|nm0195339|
|tt0003740|nm0515385|
|tt0003740|nm0665163|
|tt0003740|nm0758215|
|tt0008663|nm0406585|
+---------+---------+
only showing top 5 rows



### Load into DuckDB Database


In [20]:
# Use DuckDBContext to store the Spark and Polars dataframes as DuckDB tables.
# save_to_duckdb / show_n come from lib.duckdbcontext (not shown in this notebook);
# each save is followed by a 5-row preview of the table that was just written.
with DuckDBContext(duckdb_database) as ctx:
    # Training data (Spark dataframe)
    ctx.save_to_duckdb(train_spark_df, "imdb_train")
    ctx.show_n("imdb_train", 5)

    # Movie/director pairs (Polars dataframe)
    ctx.save_to_duckdb(directing_polars_df, "imdb_directors")
    ctx.show_n("imdb_directors", 5)

    # Movie/writer pairs (Spark dataframe)
    ctx.save_to_duckdb(writing_spark_df, "imdb_writing")
    ctx.show_n("imdb_writing", 5)

CREATED TABLE: imdb_train WITH 7959 ROWS!
shape: (5, 8)
┌───────────┬───────────────┬──────────────┬───────────┬─────────┬──────────────┬──────────┬───────┐
│ tconst    ┆ primaryTitle  ┆ originalTitl ┆ startYear ┆ endYear ┆ runtimeMinut ┆ numVotes ┆ label │
│ ---       ┆ ---           ┆ e            ┆ ---       ┆ ---     ┆ es           ┆ ---      ┆ ---   │
│ str       ┆ str           ┆ ---          ┆ i32       ┆ i32     ┆ ---          ┆ f64      ┆ bool  │
│           ┆               ┆ str          ┆           ┆         ┆ i32          ┆          ┆       │
╞═══════════╪═══════════════╪══════════════╪═══════════╪═════════╪══════════════╪══════════╪═══════╡
│ tt0014109 ┆ The Saga of   ┆ null         ┆ 1924      ┆ null    ┆ 183          ┆ 1231.0   ┆ true  │
│           ┆ Gösta Berling ┆              ┆           ┆         ┆              ┆          ┆       │
│ tt0015064 ┆ The Last      ┆ Der letzte   ┆ 1924      ┆ null    ┆ 77           ┆ null     ┆ true  │
│           ┆ Laugh         ┆ Mann 

## Extra Data


### Downloading

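The download loop in the next cell is commented out on purpose: uncomment and run it once to fetch the files, then comment it out again; otherwise every run downloads the same files again.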


In [21]:
# IMDb dataset files to ingest; each name is appended to the base URL by create_url
extra_imdb = [
    "name.basics.tsv.gz",
    "title.akas.tsv.gz",
    "title.basics.tsv.gz",
    "title.crew.tsv.gz",
    # "title.episode.tsv.gz", # we have only movie data
    "title.principals.tsv.gz",
    "title.ratings.tsv.gz",
]

# RUN THIS ONCE! (kept commented out so re-running the notebook does not re-download)
# # Download the files
# for ds in extra_imdb:
#     # Build the download URL for this dataset file
#     download_url = create_url(ds)

#     filepath = f"../../data/extra/{ds}"  # Local fp
#     # Use the function to download the file
#     download_file(download_url, filepath)

## Reading with Spark


In [23]:
with DuckDBContext(duckdb_database) as ctx:
    # IDs of the titles in the training table; every extra dataset is reduced
    # to rows that reference one of these titles before it is stored.
    train_ids = ctx.conn.execute("SELECT tconst FROM imdb_train").fetchdf()
    train_ids_spark = spark.createDataFrame(train_ids)

    for ds in extra_imdb:
        # e.g. "title.basics.tsv.gz" -> "extra_title_basics"
        table_name = f"extra.{get_table_name(ds)}".replace(".", "_")
        source_path = f"../../data/extra/{ds}"

        # Read the gzipped TSV with an inferred schema. Inference scans the file
        # when csv() is called, so reading a ".limit(1000)" subset first does not
        # make it cheaper; a single read yields the same schema. (To really
        # infer from a sample, the reader's samplingRatio option would be needed.)
        spark_df = spark.read.csv(
            source_path,
            header=True,
            sep="\t",
            nullValue="\\N",
            inferSchema=True,
        )
        spark_df.show(5)

        spark_df_columns = spark_df.columns

        if "titleId" in spark_df_columns:
            # Title id is called "titleId" here; join on it and keep only "tconst"
            filtered_spark_df = spark_df.join(
                train_ids_spark, train_ids_spark.tconst == spark_df.titleId, "inner"
            )
            filtered_spark_df = filtered_spark_df.drop("titleId")
        elif "knownForTitles" in spark_df_columns:
            # Split the knownForTitles column into multiple rows
            spark_df = spark_df.withColumn(
                "knownForTitles", explode(split(spark_df["knownForTitles"], ","))
            )

            # Select the values that are in both train_ids_spark and spark_df
            filtered_spark_df = spark_df.join(
                train_ids_spark,
                spark_df.knownForTitles == train_ids_spark.tconst,
                "inner",
            )
        elif "tconst" in spark_df_columns:
            filtered_spark_df = spark_df.join(train_ids_spark, "tconst", "inner")
        else:
            # Without this guard an unknown layout would either raise NameError or
            # silently store the previous dataset's rows under this table name.
            raise ValueError(
                f"Cannot filter {ds}: expected one of the columns "
                f"'titleId', 'knownForTitles' or 'tconst', got {spark_df_columns}"
            )

        ctx.save_to_duckdb(filtered_spark_df, table_name)

                                                                                

+---------+---------------+---------+---------+--------------------+--------------------+
|   nconst|    primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+---------------+---------+---------+--------------------+--------------------+
|nm0000001|   Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0053137,tt00723...|
|nm0000002|  Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0038355,tt00373...|
|nm0000003|Brigitte Bardot|     1934|     NULL|actress,soundtrac...|tt0054452,tt00573...|
|nm0000004|   John Belushi|     1949|     1982|actor,soundtrack,...|tt0078723,tt00779...|
|nm0000005| Ingmar Bergman|     1918|     2007|writer,director,a...|tt0050986,tt00509...|
+---------+---------------+---------+---------+--------------------+--------------------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_name_basics WITH 555345 ROWS!


                                                                                

+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
|  titleId|ordering|               title|region|language|      types|   attributes|isOriginalTitle|
+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
|tt0000001|       1|          Карменсіта|    UA|    NULL|imdbDisplay|         NULL|              0|
|tt0000001|       2|          Carmencita|    DE|    NULL|       NULL|literal title|              0|
|tt0000001|       3|Carmencita - span...|    HU|    NULL|imdbDisplay|         NULL|              0|
|tt0000001|       4|          Καρμενσίτα|    GR|    NULL|imdbDisplay|         NULL|              0|
|tt0000001|       5|          Карменсита|    RU|    NULL|imdbDisplay|         NULL|              0|
+---------+--------+--------------------+------+--------+-----------+-------------+---------------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_akas WITH 190567 ROWS!


                                                                                

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   NULL|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   NULL|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   NULL|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   NULL|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|   NULL|             1|        Comedy

                                                                                

CREATED TABLE: extra_title_basics WITH 7958 ROWS!


                                                                                

+---------+---------+-------+
|   tconst|directors|writers|
+---------+---------+-------+
|tt0000001|nm0005690|   NULL|
|tt0000002|nm0721526|   NULL|
|tt0000003|nm0721526|   NULL|
|tt0000004|nm0721526|   NULL|
|tt0000005|nm0005690|   NULL|
+---------+---------+-------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_crew WITH 7958 ROWS!


                                                                                

+---------+--------+---------+---------------+--------------------+----------+
|   tconst|ordering|   nconst|       category|                 job|characters|
+---------+--------+---------+---------------+--------------------+----------+
|tt0000001|       1|nm1588970|           self|                NULL|  ["Self"]|
|tt0000001|       2|nm0005690|       director|                NULL|      NULL|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|      NULL|
|tt0000002|       1|nm0721526|       director|                NULL|      NULL|
|tt0000002|       2|nm1335271|       composer|                NULL|      NULL|
+---------+--------+---------+---------------+--------------------+----------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_principals WITH 77941 ROWS!


                                                                                

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    2034|
|tt0000002|          5.7|     272|
|tt0000003|          6.5|    1981|
|tt0000004|          5.4|     178|
|tt0000005|          6.2|    2739|
+---------+-------------+--------+
only showing top 5 rows



                                                                                

CREATED TABLE: extra_title_ratings WITH 7958 ROWS!
