In [1]:
## Optional
# Format output of Jupyter Notebook: stop <pre> blocks from wrapping so wide
# DataFrame.show() tables stay on one line per row.
# HTML and display are imported from IPython.display, the public display API,
# instead of relying on the internal IPython.core.display module and on the
# implicitly injected `display` global.
from IPython.display import HTML, display
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# Hide python warnings (applies to every cell executed after this one)
import warnings
warnings.filterwarnings('ignore')

In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import isnull, when, count, col, regexp_extract

# Create the SparkSession for this notebook, or reuse one that already exists
session_builder = SparkSession.builder.appName('EDA')
spark = session_builder.getOrCreate()

# Path of the raw flights CSV used by the cells below
filename = '/home/iceberg/data/flights.csv'

# Ingest raw csv data

In [37]:
# Sanity check: show rows whose flight_number is not made up purely of digits.
# regexp_extract returns '' when the pattern does not match, so comparing with ''
# selects the non-numeric values. (Low-value check, kept for reference.)
raw_data = spark.read.option('header', True).csv(filename)
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
raw_data.where(regexp_extract('flight_number', r'^\d+$', 0) == '').show()

[Stage 34:>                                                         (0 + 7) / 7]

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

                                                                                

In [24]:
# TODO: Fix nullable columns - fields declared nullable=False below do not end up
# non-nullable on the loaded DataFrame.
# NOTE(review): presumably the CSV reader relaxes every field to nullable=True;
# confirm with flights.printSchema().

# Column specification in file order: (name, data type, nullable)
flights_columns = [
    ("year", IntegerType(), False),
    ("month", IntegerType(), False),
    ("day", IntegerType(), False),
    ("day_of_week", IntegerType(), False),
    ("airline", StringType(), False),
    ("flight_number", IntegerType(), False),
    ("tail_number", StringType(), True),
    ("origin_airport", StringType(), False),
    ("destination_airport", StringType(), False),
    ("scheduled_departure", IntegerType(), False),
    ("departure_time", IntegerType(), True),
    ("departure_delay", IntegerType(), True),
    ("taxi_out", IntegerType(), True),
    ("wheels_off", IntegerType(), True),
    ("scheduled_time", IntegerType(), True),
    ("elapsed_time", IntegerType(), True),
    ("air_time", IntegerType(), True),
    ("distance", IntegerType(), False),
    ("wheels_on", IntegerType(), True),
    ("taxi_in", IntegerType(), True),
    ("scheduled_arrival", IntegerType(), False),
    ("arrival_time", IntegerType(), True),
    ("arrival_delay", IntegerType(), True),
    ("diverted", IntegerType(), False),
    ("cancelled", IntegerType(), False),
    ("cancellation_reason", StringType(), True),
    ("air_system_delay", IntegerType(), True),
    ("security_delay", IntegerType(), True),
    ("airline_delay", IntegerType(), True),
    ("late_aircraft_delay", IntegerType(), True),
    ("weather_delay", IntegerType(), True),
]

# Define schema
flights_schema = StructType([
    StructField(field_name, field_type, is_nullable)
    for field_name, field_type, is_nullable in flights_columns
])

# Read flights.csv with the explicit schema.
# - The schema is passed once (the original supplied it both via .schema() and schema=).
# - `filename` (defined in the SparkSession cell) is reused instead of repeating the path.
# - header=True treats the first line as a header row; enforceSchema=True applies
#   flights_schema as given rather than validating it against the CSV header names.
flights = spark.read.csv(
    filename,
    schema=flights_schema,
    enforceSchema=True,
    header=True,
)

# Preview one row and the resulting column types
flights.show(1)
flights.printSchema()

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|year|month|day|day_of_week|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

# Explore data!

## Total number of flight records: 5,819,079

In [32]:
# Total number of rows in the flights DataFrame
flights.count()

                                                                                

5819079

## Total number of cancelled flights: 89,884; total number of diverted flights: 15,187
Cancelled and diverted flights are expected to cause nulls in other columns.

In [37]:
# Number of rows flagged as cancelled
cancelled_flights = flights.where('cancelled = 1')
cancelled_flights.count()

                                                                                

89884

In [35]:
# Number of rows flagged as diverted
diverted_flights = flights.where('diverted = 1')
diverted_flights.count()

                                                                                

15187

## Get Null counts for each column

In [31]:
# Null count per column, computed in one aggregation: `when` yields a non-null value
# only for rows where the column is null, and `count` counts those non-null values.
# Iterates flights.columns (the original referenced `data`, which this notebook never defines).
flights.select([count(when(isnull(c), c)).alias(c) for c in flights.columns]).show()



+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|YEAR|MONTH|DAY|DAY_OF_WEEK|AIRLINE|FLIGHT_NUMBER|TAIL_NUMBER|ORIGIN_AIRPORT|DESTINATION_AIRPORT|SCHEDULED_DEPARTURE|DEPARTURE_TIME|DEPARTURE_DELAY|TAXI_OUT|WHEELS_OFF|SCHEDULED_TIME|ELAPSED_TIME|AIR_TIME|DISTANCE|WHEELS_ON|TAXI_IN|SCHEDULED_ARRIVAL|ARRIVAL_TIME|ARRIVAL_DELAY|DIVERTED|CANCELLED|CANCELLATION_REASON|AIR_SYSTEM_DELAY|SECURITY_DELAY|AIRLINE_DELAY|LATE_AIRCRAFT_DELAY|WEATHER_DELAY|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

                                                                                

## Investigate some causes for nulls in each column

In [48]:
# Investigate how many of each column's nulls come from cancelled flights.
#
# Local names deliberately avoid `col` and `count`: rebinding them at cell level would
# shadow the pyspark.sql.functions imports and break other cells on re-run.
# Null counts are gathered with one aggregation per DataFrame instead of one Spark job
# per column.

# Null count of every column over all flights (single pass)
total_null_counts = (
    flights
    .select([count(when(isnull(name), 1)).alias(name) for name in flights.columns])
    .first()
    .asDict()
)

# List of columns with null values
null_columns = [name for name in flights.columns if total_null_counts[name] > 0]

# Null count of those columns restricted to cancelled flights (single pass);
# skipped when no column has nulls, since there would be nothing to select.
cancelled_null_counts = {}
if null_columns:
    cancelled_null_counts = (
        flights
        .where("cancelled = 1")
        .select([count(when(isnull(name), 1)).alias(name) for name in null_columns])
        .first()
        .asDict()
    )

# Report, per column, how many nulls belong to cancelled flights
for column_name in null_columns:
    print(f"Investigating nulls in column: {column_name}")
    cancelled_nulls = cancelled_null_counts[column_name]
    print(f"Nulls in {column_name} where flight is cancelled: {cancelled_nulls}\n")

                                                                                

Investigating nulls in column: tail_number


                                                                                

Nulls in tail_number where flight is cancelled: 14721

Investigating nulls in column: departure_time


                                                                                

Nulls in departure_time where flight is cancelled: 86153

Investigating nulls in column: departure_delay


                                                                                

Nulls in departure_delay where flight is cancelled: 86153

Investigating nulls in column: taxi_out


                                                                                

Nulls in taxi_out where flight is cancelled: 89047

Investigating nulls in column: wheels_off


                                                                                

Nulls in wheels_off where flight is cancelled: 89047

Investigating nulls in column: scheduled_time


                                                                                

Nulls in scheduled_time where flight is cancelled: 5

Investigating nulls in column: elapsed_time


                                                                                

Nulls in elapsed_time where flight is cancelled: 89884

Investigating nulls in column: air_time


                                                                                

Nulls in air_time where flight is cancelled: 89884

Investigating nulls in column: wheels_on


                                                                                

Nulls in wheels_on where flight is cancelled: 89884

Investigating nulls in column: taxi_in


                                                                                

Nulls in taxi_in where flight is cancelled: 89884

Investigating nulls in column: arrival_time


                                                                                

Nulls in arrival_time where flight is cancelled: 89884

Investigating nulls in column: arrival_delay


                                                                                

Nulls in arrival_delay where flight is cancelled: 89884

Investigating nulls in column: cancellation_reason


                                                                                

Nulls in cancellation_reason where flight is cancelled: 0

Investigating nulls in column: air_system_delay


                                                                                

Nulls in air_system_delay where flight is cancelled: 89884

Investigating nulls in column: security_delay


                                                                                

Nulls in security_delay where flight is cancelled: 89884

Investigating nulls in column: airline_delay


                                                                                

Nulls in airline_delay where flight is cancelled: 89884

Investigating nulls in column: late_aircraft_delay


                                                                                

Nulls in late_aircraft_delay where flight is cancelled: 89884

Investigating nulls in column: weather_delay


[Stage 290:>                                                      (0 + 12) / 12]

Nulls in weather_delay where flight is cancelled: 89884



                                                                                

#### Tail_number - all Nulls are from cancelled flights
However, not every cancelled flight has a null tail_number: only 14,721 of the 89,884 cancelled flights do. The null cases might be flights that were cancelled well in advance.
**TODO: Check cancellation reasons**

In [40]:
# Cancelled flights that have no tail number recorded
cancelled_without_tail_number = flights.where('tail_number IS NULL AND cancelled = 1')
cancelled_without_tail_number.count()

                                                                                

14721

In [44]:
# Cancelled flights that have no departure time recorded
cancelled_without_departure_time = flights.where('departure_time IS NULL and cancelled = 1')
cancelled_without_departure_time.count()

                                                                                

86153

#### Scheduled_time - 6 nulls: 5 of the 6 flights were cancelled, 1 flight was diverted
Open question: is scheduled_time the estimated time in the air?

In [28]:
flights.where('scheduled_time IS NULL').show()

[Stage 19:>                                                         (0 + 7) / 7]

+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+--------+---------+-------+-----------------+------------+-------------+--------+---------+-------------------+----------------+--------------+-------------+-------------------+-------------+
|year|month|day|day_of_week|airline|flight_number|tail_number|origin_airport|destination_airport|scheduled_departure|departure_time|departure_delay|taxi_out|wheels_off|scheduled_time|elapsed_time|air_time|distance|wheels_on|taxi_in|scheduled_arrival|arrival_time|arrival_delay|diverted|cancelled|cancellation_reason|air_system_delay|security_delay|airline_delay|late_aircraft_delay|weather_delay|
+----+-----+---+-----------+-------+-------------+-----------+--------------+-------------------+-------------------+--------------+---------------+--------+----------+--------------+------------+--------+-

                                                                                