In [1]:
### IMPORT DATA ###

# Code from https://community.amstat.org/dataexpo/home

# Data dictionary reference https://www.transtats.bts.gov/Fields.asp?gnoyr_VQ=FGJ

import os

import urllib.request

from concurrent.futures import ThreadPoolExecutor


# Root URL of the year-partitioned flight data; relative paths are appended to it.
base_url = "https://blobs.duckdb.org/flight-data-partitioned/"

# One relative parquet path per year, 1987 through 2024 inclusive.
files = list(map("Year={}/data_0.parquet".format, range(1987, 2025)))


def download_file(f, dest_dir="Data", overwrite=False):
    """Download one file from ``base_url`` and store it under ``dest_dir``.

    Parameters
    ----------
    f : str
        Path of the file relative to ``base_url`` (for example
        ``"Year=2024/data_0.parquet"``). The same relative path is used
        below ``dest_dir`` for the local copy.
    dest_dir : str, default "Data"
        Local root directory for downloads. The default matches the
        ``Data/Year=.../data_0.parquet`` path this notebook loads from.
    overwrite : bool, default False
        When False, a file that already exists locally is left untouched and
        nothing is downloaded. Pass True to force a fresh download (e.g. if
        an older run left a truncated file behind).

    Raises
    ------
    urllib.error.URLError
        If the request fails (``HTTPError`` for non-success status codes).
    OSError
        If the local file or its directories cannot be written.
    """
    import shutil  # local import so this function does not depend on another cell

    target = os.path.join(dest_dir, f)

    # Re-running the notebook should not re-fetch data that is already on disk.
    if not overwrite and os.path.exists(target):
        return

    # os.makedirs("") raises, so only create a directory when there is one.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Explicit User-Agent; presumably the host rejects urllib's default one.
    req = urllib.request.Request(base_url + f, headers={'User-Agent': 'Mozilla/5.0'})

    # Write to a sibling temp file and rename it into place only after the
    # transfer finished, so an interrupted download never leaves a truncated
    # parquet file at the final path.
    partial = target + ".part"
    try:
        with urllib.request.urlopen(req) as response, open(partial, 'wb') as out_file:
            # Stream in chunks instead of holding the whole file in memory.
            shutil.copyfileobj(response, out_file)
        os.replace(partial, target)
    finally:
        # Still present only if the download or the rename failed.
        if os.path.exists(partial):
            os.remove(partial)


# Fetch all files concurrently on a thread pool.
with ThreadPoolExecutor() as executor:
    # Executor.map only re-raises a worker's exception when its result is
    # retrieved from the returned iterator. Draining the iterator makes a
    # failed download fail this cell instead of passing silently.
    for _ in executor.map(download_file, files):
        pass

In [2]:
# Imports
import polars as pl

In [9]:
# Read the 2024 partition from disk into a polars DataFrame
df_2024 = pl.read_parquet(source='Data/Year=2024/data_0.parquet')

In [10]:
# Preview the first three rows of the 2024 data
print(df_2024.head(n=3))

shape: (3, 110)
┌──────┬─────────┬───────┬────────────┬───┬───────────────┬──────────────┬─────────────┬───────────┐
│ Year ┆ Quarter ┆ Month ┆ DayofMonth ┆ … ┆ Div5LongestGT ┆ Div5WheelsOf ┆ Div5TailNum ┆ column109 │
│ ---  ┆ ---     ┆ ---   ┆ ---        ┆   ┆ ime           ┆ f            ┆ ---         ┆ ---       │
│ i64  ┆ i64     ┆ i64   ┆ i64        ┆   ┆ ---           ┆ ---          ┆ str         ┆ str       │
│      ┆         ┆       ┆            ┆   ┆ str           ┆ str          ┆             ┆           │
╞══════╪═════════╪═══════╪════════════╪═══╪═══════════════╪══════════════╪═════════════╪═══════════╡
│ 2024 ┆ 1       ┆ 1     ┆ 8          ┆ … ┆ null          ┆ null         ┆ null        ┆ null      │
│ 2024 ┆ 1       ┆ 1     ┆ 9          ┆ … ┆ null          ┆ null         ┆ null        ┆ null      │
│ 2024 ┆ 1       ┆ 1     ┆ 10         ┆ … ┆ null          ┆ null         ┆ null        ┆ null      │
└──────┴─────────┴───────┴────────────┴───┴───────────────┴──────────────┴─

In [11]:
# Summary statistics for every column of the 2024 data
print(str(df_2024.describe()))

shape: (9, 111)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ Year      ┆ Quarter   ┆ Month     ┆ … ┆ Div5Longe ┆ Div5Wheel ┆ Div5TailN ┆ column10 │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ stGTime   ┆ sOff      ┆ um        ┆ 9        │
│ str       ┆ f64       ┆ f64       ┆ f64       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆ str       ┆ str       ┆ str       ┆ str      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 3.461319e ┆ 3.461319e ┆ 3.461319e ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0        │
│           ┆ 6         ┆ 6         ┆ 6         ┆   ┆           ┆           ┆           ┆          │
│ null_coun ┆ 0.0       ┆ 0.0       ┆ 0.0       ┆ … ┆ 3461319   ┆ 3461319   ┆ 3461319   ┆ 3461319  │
│ t         ┆           ┆           ┆           ┆   ┆           ┆          

In [13]:
# Column names and dtypes of the 2024 data
print(str(df_2024.schema))

Schema([('Year', Int64), ('Quarter', Int64), ('Month', Int64), ('DayofMonth', Int64), ('DayOfWeek', Int64), ('FlightDate', Date), ('Reporting_Airline', String), ('DOT_ID_Reporting_Airline', Int64), ('IATA_CODE_Reporting_Airline', String), ('Tail_Number', String), ('Flight_Number_Reporting_Airline', Int64), ('OriginAirportID', Int64), ('OriginAirportSeqID', Int64), ('OriginCityMarketID', Int64), ('Origin', String), ('OriginCityName', String), ('OriginState', String), ('OriginStateFips', String), ('OriginStateName', String), ('OriginWac', Int64), ('DestAirportID', Int64), ('DestAirportSeqID', Int64), ('DestCityMarketID', Int64), ('Dest', String), ('DestCityName', String), ('DestState', String), ('DestStateFips', String), ('DestStateName', String), ('DestWac', Int64), ('CRSDepTime', String), ('DepTime', String), ('DepDelay', Float64), ('DepDelayMinutes', Float64), ('DepDel15', Float64), ('DepartureDelayGroups', Int64), ('DepTimeBlk', String), ('TaxiOut', String), ('WheelsOff', String), ('

In [None]:
### Question 1 ------ Looking at fuel price by region and its correlation with x

# The original selection ended with an empty-string column name (''), which is
# not a real column and makes select() fail. It is dropped here.
# TODO: add the column(s) standing in for "x" once they are decided.
# NOTE(review): no fuel-price column appears in the visible part of the printed
# schema -- confirm whether an external price dataset has to be joined in.
sub_set = df_2024.select(['Reporting_Airline', 'Cancelled'])

In [None]:
### Question 2 ------ Can flights be grouped into clusters based on delay causes

In [None]:
### Question 3 ------- Are there seasonal trends to flight cancellations (Seasonal decomposition)

In [None]:
### Question 4 ------- What factors are most predictive for flight diversions?