In [None]:
!pip install --upgrade duckdb pandas

Collecting duckdb
  Downloading duckdb-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: duckdb
  Attempting uninstall: duckdb
    Found existing installation: duckdb 0.10.1
    Uninstalling duckdb-0.10.1:
      Successfully uninstalled duckdb-0.10.1
Successfully installed duckdb-0.10.2


In [None]:
# Install PySpark, which this notebook uses to read from and write to DuckDB over JDBC.
!pip install pyspark



In [None]:
# Print the metadata of the installed duckdb package. The reported version
# should be 0.10.1 so that the database can be loaded without any problem.
!pip show duckdb

Name: duckdb
Version: 0.10.2
Summary: DuckDB in-process database
Home-page: https://www.duckdb.org
Author: 
Author-email: 
License: MIT
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: malloy


In [None]:
# Install exactly duckdb 0.10.1, replacing whichever version is currently installed.
!pip install -U duckdb==0.10.1

Collecting duckdb==0.10.1
  Using cached duckdb-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.1 MB)
Installing collected packages: duckdb
  Attempting uninstall: duckdb
    Found existing installation: duckdb 0.10.2
    Uninstalling duckdb-0.10.2:
      Successfully uninstalled duckdb-0.10.2
Successfully installed duckdb-0.10.1


In [None]:
# Download the DuckDB JDBC driver (version 0.10.1) from the Maven repository
# and save it in the working directory as duckdb.jar.
!wget -O "duckdb.jar" "https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar"

--2024-04-24 11:13:41--  https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64009472 (61M) [application/java-archive]
Saving to: ‘duckdb.jar’


2024-04-24 11:13:41 (153 MB/s) - ‘duckdb.jar’ saved [64009472/64009472]



In [16]:
import duckdb

# Open the DuckDB database file in read-only mode.
conn = duckdb.connect(database='database.duckdb', read_only=True)
try:
    # PRAGMA table_info returns one row per column of the 'flights' table.
    query = "PRAGMA table_info('flights')"
    table_info = conn.execute(query).fetchall()
finally:
    # Always close the connection, even when the query raises, so the
    # database file is not left open by this process.
    conn.close()

# Print the name (index 1) and data type (index 2) of every column.
for column in table_info:
    print(f"Column Name: {column[1]}, Data Type: {column[2]}")


Column Name: airport_acronym, Data Type: VARCHAR
Column Name: longitude, Data Type: DOUBLE
Column Name: latitude, Data Type: DOUBLE
Column Name: city, Data Type: VARCHAR
Column Name: flight_number_default, Data Type: VARCHAR
Column Name: estimated_arrival_time, Data Type: VARCHAR
Column Name: estimated_departure_time, Data Type: VARCHAR
Column Name: real_arrival_time, Data Type: VARCHAR
Column Name: real_departure_time, Data Type: VARCHAR
Column Name: scheduled_arrival_time, Data Type: VARCHAR
Column Name: scheduled_departure_time, Data Type: VARCHAR
Column Name: owner_name, Data Type: VARCHAR
Column Name: owner_logo, Data Type: VARCHAR
Column Name: destination_gate, Data Type: VARCHAR
Column Name: destination_terminal, Data Type: VARCHAR
Column Name: destination_baggage, Data Type: VARCHAR
Column Name: origin_airport, Data Type: VARCHAR
Column Name: origin_city, Data Type: VARCHAR
Column Name: origin_latitude, Data Type: VARCHAR
Column Name: origin_longitude, Data Type: VARCHAR
Column

In [17]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session that ships the DuckDB JDBC driver jar.
spark = (
    SparkSession.builder
    .config("spark.jars", "duckdb.jar")
    .getOrCreate()
)

# Read every row of the 'flights' table from the DuckDB file through JDBC.
flights_reader = spark.read.format("jdbc")
flights_reader = flights_reader.option("url", "jdbc:duckdb:database.duckdb")
flights_reader = flights_reader.option("driver", "org.duckdb.DuckDBDriver")
flights_reader = flights_reader.option("query", "SELECT * FROM flights")
DF = flights_reader.load()

DF.show()

+---------------+---------+---------+---------+---------------------+----------------------+------------------------+-----------------+-------------------+----------------------+------------------------+--------------------+--------------------+----------------+--------------------+-------------------+--------------+-----------+---------------+----------------+-----------+---------------+------------------+
|airport_acronym|longitude| latitude|     city|flight_number_default|estimated_arrival_time|estimated_departure_time|real_arrival_time|real_departure_time|scheduled_arrival_time|scheduled_departure_time|          owner_name|          owner_logo|destination_gate|destination_terminal|destination_baggage|origin_airport|origin_city|origin_latitude|origin_longitude|origin_gate|origin_terminal|flight_status_live|
+---------------+---------+---------+---------+---------------------+----------------------+------------------------+-----------------+-------------------+----------------------+

## Preprocessing

In [107]:
from pyspark.sql import functions as F
from pyspark.sql.types import DateType

#### Missings

In [19]:
# Build one aggregate per column that counts the rows where that column is NULL.
null_count_exprs = []
for column_name in DF.columns:
    only_when_null = F.when(F.col(column_name).isNull(), column_name)
    null_count_exprs.append(F.count(only_when_null).alias(column_name))

null_counts = DF.select(null_count_exprs)
null_counts.show()

+---------------+---------+--------+----+---------------------+----------------------+------------------------+-----------------+-------------------+----------------------+------------------------+----------+----------+----------------+--------------------+-------------------+--------------+-----------+---------------+----------------+-----------+---------------+------------------+
|airport_acronym|longitude|latitude|city|flight_number_default|estimated_arrival_time|estimated_departure_time|real_arrival_time|real_departure_time|scheduled_arrival_time|scheduled_departure_time|owner_name|owner_logo|destination_gate|destination_terminal|destination_baggage|origin_airport|origin_city|origin_latitude|origin_longitude|origin_gate|origin_terminal|flight_status_live|
+---------------+---------+--------+----+---------------------+----------------------+------------------------+-----------------+-------------------+----------------------+------------------------+----------+----------+-----------

Hi ha massa missings en les columnes d'arribada i sortida dels vols. Això es deu al fet que molts vols encara no han sortit o encara no han aterrat. Per a una anàlisi de seguiment dinàmic dels vols, aquesta base de dades no seria útil, sinó que es faria directament a través de l'API. Com que no podem fer l'anàlisi dinàmica, tenim pensat utilitzar només aquells vols que han aterrat, així que orientarem el preprocessing de la base de dades al nostre Data Analysis Pipeline. La idea que tenim per a utilitzar la base de dades de vols és intentar predir si un vol tindrà retard o no. Per això utilitzarem les variables rellevants d'aquesta base de dades. Com que hi ha molt poques dades de real_arrival_time, traurem la variable de delay amb la real_departure_time o l'estimated_arrival_time si la té.

First of all, if the real arrival time of a flight is later than its scheduled arrival time, then the flight has experienced a delay.

In [113]:
# Derive "delay_in_seconds" as (observed time - scheduled time).
# Start from an all-NULL column, typed as double so its type is explicit.
# The result is bound to DF_Delay, the name the loop below and every later
# cell read; binding it to a differently-cased name raises NameError on a
# fresh kernel.
DF_Delay = DF.withColumn("delay_in_seconds", F.lit(None).cast("double"))

# Each iteration overwrites the previous value when both of its timestamps are
# present, so the last matching pair wins. Priority, highest first:
# real arrival, estimated arrival, real departure, estimated departure.
# NOTE(review): the *_time columns are VARCHAR in the source table and are
# presumably Unix timestamps in seconds; the subtraction relies on Spark
# casting them to numbers implicitly -- confirm.
for event in ("departure", "arrival"):
    for source in ("estimated", "real"):
        observed = F.col(f"{source}_{event}_time")
        scheduled = F.col(f"scheduled_{event}_time")
        DF_Delay = DF_Delay.withColumn(
            "delay_in_seconds",
            F.when(
                observed.isNotNull() & scheduled.isNotNull(),
                observed - scheduled,
            ).otherwise(F.col("delay_in_seconds")),
        )


In [114]:
# Count the rows whose delay is strictly positive and report the total.
delayed_flights = DF_Delay.filter(F.col("delay_in_seconds") > 0).count()
print(f"There are {delayed_flights} flights with delay")

There are 865 flights with delay


In [115]:
# Discard the flights for which no delay value could be computed.
DF_Delay = DF_Delay.na.drop(subset=["delay_in_seconds"])

In [116]:
# Number of rows currently held in DF_Delay.
DF_Delay.count()

1680

In [117]:
# Binary label: 1 when the delay is known and strictly positive, 0 otherwise.
delay = F.col("delay_in_seconds")
is_delayed = delay.isNotNull() & (delay > 0)
has_delay_flag = F.when(is_delayed, 1).otherwise(0)

DF_Delay = DF_Delay.withColumn("has_delay", has_delay_flag)

DF_Delay.show()

+---------------+---------+---------+---------+---------------------+----------------------+------------------------+-----------------+-------------------+----------------------+------------------------+--------------------+--------------------+----------------+--------------------+-------------------+--------------+-----------+---------------+----------------+-----------+---------------+------------------+----------------+---------+
|airport_acronym|longitude| latitude|     city|flight_number_default|estimated_arrival_time|estimated_departure_time|real_arrival_time|real_departure_time|scheduled_arrival_time|scheduled_departure_time|          owner_name|          owner_logo|destination_gate|destination_terminal|destination_baggage|origin_airport|origin_city|origin_latitude|origin_longitude|origin_gate|origin_terminal|flight_status_live|delay_in_seconds|has_delay|
+---------------+---------+---------+---------+---------------------+----------------------+------------------------+-------

Now we will drop the arrival and departure timestamps and keep only the date of the flight, so that we can join it with the Airbnb database.

In [122]:
# Format the scheduled arrival timestamp as yyyy-MM-dd and store it as a date.
arrival_day_text = F.from_unixtime(F.col("scheduled_arrival_time"), "yyyy-MM-dd")
DF_Date = DF_Delay.withColumn("arrival_date", arrival_day_text.cast(DateType()))

We will keep the departure time because it can affect the delay: the last flights of the day are often delayed, whereas the first flight of the day rarely is.

In [125]:
# Keep only the time of day (HH:mm:ss) of the scheduled departure timestamp.
departure_clock = F.from_unixtime(F.col("scheduled_departure_time"), "HH:mm:ss")
DF_Time = DF_Date.withColumn("departure_time", departure_clock)

Finally, we will drop the columns that are no longer useful.

In [126]:
# Preview DF_Time, which now carries the arrival_date and departure_time columns.
DF_Time.show()

+---------------+---------+---------+---------+---------------------+----------------------+------------------------+-----------------+-------------------+----------------------+------------------------+--------------------+--------------------+----------------+--------------------+-------------------+--------------+-----------+---------------+----------------+-----------+---------------+------------------+----------------+---------+------------+--------------+
|airport_acronym|longitude| latitude|     city|flight_number_default|estimated_arrival_time|estimated_departure_time|real_arrival_time|real_departure_time|scheduled_arrival_time|scheduled_departure_time|          owner_name|          owner_logo|destination_gate|destination_terminal|destination_baggage|origin_airport|origin_city|origin_latitude|origin_longitude|origin_gate|origin_terminal|flight_status_live|delay_in_seconds|has_delay|arrival_date|departure_time|
+---------------+---------+---------+---------+---------------------

In [139]:
# Columns to remove: the six raw timestamp columns, followed by the
# gate/terminal/baggage details, the live status and the numeric delay.
timestamp_columns = [
    f"{kind}_{event}_time"
    for kind in ("estimated", "real", "scheduled")
    for event in ("arrival", "departure")
]
other_columns = [
    'destination_gate',
    'destination_terminal',
    'destination_baggage',
    'origin_gate',
    'origin_terminal',
    'flight_status_live',
    'delay_in_seconds',
]

# Remove any accidental leading or trailing whitespace from the names.
columns_to_drop = list(map(str.strip, timestamp_columns + other_columns))

# Produce the reduced frame without those columns.
DF_cols = DF_Time.drop(*columns_to_drop)

Replace NULL values with 'Unknown'

In [142]:
# Build one aggregate per remaining column that counts its NULL rows.
remaining_null_exprs = []
for kept_column in DF_cols.columns:
    only_when_null = F.when(F.col(kept_column).isNull(), kept_column)
    remaining_null_exprs.append(F.count(only_when_null).alias(kept_column))

null_counts = DF_cols.select(remaining_null_exprs)
null_counts.show()

+---------------+---------+--------+----+---------------------+----------+----------+--------------+-----------+---------------+----------------+---------+------------+--------------+
|airport_acronym|longitude|latitude|city|flight_number_default|owner_name|owner_logo|origin_airport|origin_city|origin_latitude|origin_longitude|has_delay|arrival_date|departure_time|
+---------------+---------+--------+----+---------------------+----------+----------+--------------+-----------+---------------+----------------+---------+------------+--------------+
|              0|        0|       0|   0|                  144|       182|       453|             0|          0|              0|               0|        0|           0|             0|
+---------------+---------+--------+----+---------------------+----------+----------+--------------+-----------+---------------+----------------+---------+------------+--------------+



In [140]:
# Replace NULLs with the placeholder text; Spark applies a string fill value
# only to string-typed columns.
DF_Final = DF_cols.na.fill('Unknown')

In [141]:
# Preview the final frame.
DF_Final.show()

+---------------+---------+---------+---------+---------------------+--------------------+--------------------+--------------+-----------+---------------+----------------+---------+------------+--------------+
|airport_acronym|longitude| latitude|     city|flight_number_default|          owner_name|          owner_logo|origin_airport|origin_city|origin_latitude|origin_longitude|has_delay|arrival_date|departure_time|
+---------------+---------+---------+---------+---------------------+--------------------+--------------------+--------------+-----------+---------------+----------------+---------+------------+--------------+
|           EHAM| 4.763889|52.308609|Amsterdam|                OS373|   Austrian Airlines|https://images.fl...|           VIE|     Vienna|      48.110271|       16.569719|        0|  2024-03-21|      13:50:00|
|           EHAM| 4.763889|52.308609|Amsterdam|               KL1920|      KLM Cityhopper|https://images.fl...|           GDN|     Gdansk|      54.376484|      

In [143]:
# Open and immediately close a connection so that the target DuckDB file
# exists before Spark writes to it.
conn = duckdb.connect("quality_database.duckdb")
conn.close()

# Persist the cleaned flights as table "flights" through the DuckDB JDBC driver.
# mode("overwrite") replaces an existing "flights" table; with Spark's default
# save mode the write raises an error when the table already exists, which
# makes the notebook fail on every run after the first one.
(
    DF_Final.write
    .format("jdbc")
    .mode("overwrite")
    .option("url", "jdbc:duckdb:quality_database.duckdb")
    .option("dbtable", "flights")
    .option("driver", "org.duckdb.DuckDBDriver")
    .save()
)