## Test Notebook!


In [None]:
# Using the `!` shell escape you can also add Poetry packages from inside the Jupyter Notebook
# !poetry add package_name

#### Importing


In [14]:
import os
import glob
import polars as pl  # DataFrame library with a Rust core; faster than pandas
import pandas as pd  # Good ol' pandas
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

In [9]:
# Create the notebook's Spark session, or reuse one that is already running
# in this kernel (getOrCreate). The "local[*]" master runs Spark inside this
# process rather than on a cluster.
spark = (
    SparkSession.builder
    .appName("Local PySpark Session")
    .master("local[*]")
    .getOrCreate()
)

In [17]:
# Single glob pattern shared by the file listing and the Spark reader, so the
# files that are printed are always the files that are loaded.
train_csv_pattern = "../../data/train-*.csv"

# glob.glob() returns matches in arbitrary order; sort them so the listing is
# stable from run to run.
csv_files = sorted(glob.glob(train_csv_pattern))

# Print the files that match the pattern
for csv_path in csv_files:
    print(csv_path)


# Load all matching CSV files into one Spark DataFrame.
# header=True takes the column names from the first line of each file and
# nullValue="\\N" makes the literal \N marker load as NULL.
df = spark.read.csv(train_csv_pattern, header=True, nullValue="\\N")


# Preview the first rows of the DataFrame
df.show()

../../data/train-8.csv
../../data/train-2.csv
../../data/train-7.csv
../../data/train-5.csv
../../data/train-3.csv
../../data/train-4.csv
../../data/train-1.csv
../../data/train-6.csv
+---+---------+--------------------+----------------+---------+-------+--------------+--------+-----+
|_c0|   tconst|        primaryTitle|   originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
+---+---------+--------------------+----------------+---------+-------+--------------+--------+-----+
| 14|tt0014109|The Saga of Gösta...|            NULL|     1924|   NULL|           183|  1231.0| True|
| 24|tt0015064|      The Last Laugh| Der letzte Mann|     1924|   NULL|            77|    NULL| True|
| 32|tt0015841|        The Freshman|    The Freshman|     1925|   NULL|            77|  5374.0| True|
| 47|tt0017271|          By the Law|            NULL|     NULL|   1926|            80|  1057.0| True|
| 56|tt0018451|The Student Princ...|            NULL|     1927|   NULL|           106|  1459.0| True|


24/03/13 19:20:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: \N, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
 Schema: _c0, tconst, primaryTitle, originalTitle, startYear, endYear, runtimeMinutes, numVotes, label
Expected: _c0 but found: \N
CSV file: file:///workspaces/Big-Data/data/train-4.csv


In [11]:
# Print the first rows of the Spark DataFrame `df`.
# NOTE(review): the saved output of this cell shows generic _c0.._c8 columns
# with the header row as data, which does not match the load cell above
# (In [17], header=True). It appears to come from an earlier run (In [11])
# with different reader options — re-run the notebook top to bottom to refresh.
df.show()

+----+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
| _c0|      _c1|                 _c2|                 _c3|      _c4|    _c5|           _c6|     _c7|  _c8|
+----+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|NULL|   tconst|        primaryTitle|       originalTitle|startYear|endYear|runtimeMinutes|numVotes|label|
|  11|tt0013257|               Häxan|               Häxan|     1922|     \N|            91| 13679.0| True|
|  12|tt0013556|          Robin Hood|                NULL|     1922|     \N|           143|  2178.0| True|
|  16|tt0014341|     Our Hospitality|     Our Hospitality|     1923|     \N|            65| 10911.0| True|
|  19|tt0014538|          Three Ages|                NULL|     1923|     \N|            63|  4312.0| True|
|  50|tt0017925|         The General|                NULL|     1926|     \N|            67| 87784.0| True|
|  54|tt0018192|            Napoleon|

#### Using Polars and/or Pandas

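A lot of data wrangling is necessary because the data is dirty! For example, missing values are written as `\N`, and some rows carry their year in `endYear` while `startYear` is empty.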


In [None]:
# With Polars you can read multiple files at once using a wildcard

# Specify the columns to keep: the eight named columns of the training CSVs.
# The unnamed leading index column (shown as _c0 by Spark) is left out.
columns_to_keep = [
    "tconst",
    "primaryTitle",
    "originalTitle",
    "startYear",
    "endYear",
    "runtimeMinutes",
    "numVotes",
    "label",
]

# Load all CSV files in the data directory into a dataframe
# Specify '\\N' as a null value and only keep the specified columns
# (`columns=` is the pl.read_csv keyword for selecting columns by name).
# NOTE(review): the wildcard matches every CSV in the directory, not only
# train-*.csv — confirm that all of them contain the columns listed above.
# NOTE(review): this rebinds `df`, which earlier cells use for the Spark
# DataFrame.
df = pl.read_csv("../../data/*.csv", null_values=["\\N"], columns=columns_to_keep)

# Print the dataframe
print(df)