In [1]:
import plotly.express as px
import plotly.graph_objects as go
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime as dt
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
import seaborn as sns
import sqlite3

# warnings
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

# Hide warning temporarily
pd.options.mode.chained_assignment = None

In [2]:
# Load the flights sample; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/flights_sample_2m.csv")

In [3]:
# Ordered weekday dtype so sorting and grouping follow Monday -> Sunday
# instead of alphabetical order.
dow_category = CategoricalDtype(
    categories=[
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    ],
    ordered=True,
)

# Parse the flight date, then derive the calendar columns from it.
# `df` is extended in place; the later to_sql cell writes these columns too.
df["FL_DATE"] = pd.to_datetime(df["FL_DATE"])
df["YEAR"] = df["FL_DATE"].dt.year
df["MONTH"] = df["FL_DATE"].dt.month
df["QUARTER"] = df["FL_DATE"].dt.quarter  # 1..4, same as ((MONTH - 1) // 3) + 1
# day_name() yields English names by default, matching the category labels above.
df["DAY_OF_WEEK"] = df["FL_DATE"].dt.day_name().astype(dow_category)

# Bucket DISTANCE into three bands. pd.cut intervals are right-inclusive:
# (-1, 500] -> Short-haul, (500, 1500] -> Medium-haul, (1500, inf) -> Long-haul.
df["DISTANCE_CAT"] = pd.cut(
    x=df["DISTANCE"],
    bins=[-1, 500, 1500, np.inf],
    labels=["Short-haul", "Medium-haul", "Long-haul"],
)

In [4]:
# Open the SQLite database file; sqlite3 creates it automatically if it does not exist yet.
conn = sqlite3.connect("flights.db")

# Write the DataFrame into the "flights" table, replacing the table if it is already there.
# The cell displays the value returned by to_sql.
df.to_sql(name="flights", con=conn, if_exists="replace", index=False)

2000000

In [5]:
# Headline counts for the whole table, computed in ONE scan of `flights`
# (previously five sub-selects, each scanning the table, cross-joined together).
#
# - COUNT(CASE WHEN cond THEN 1 END) counts only rows where `cond` is true and
#   returns 0 (not NULL) when nothing matches.
# - Rows whose ARR_DELAY is NULL fall into neither the delayed nor the on-time
#   bucket, exactly as with the original WHERE filters.
# - The format string is single-quoted: in SQL, double quotes denote identifiers,
#   and SQLite only accepts "%,d" as a string through a legacy compatibility quirk.
# - printf('%,d', ...) returns text with thousands separators, so the result
#   columns are strings, not integers.
query_1 = """
SELECT
    printf('%,d', COUNT(*))                                    AS TOTAL_FLIGHTS,
    printf('%,d', COUNT(CASE WHEN ARR_DELAY > 15  THEN 1 END)) AS N_DELAYED_FLIGHTS,
    printf('%,d', COUNT(CASE WHEN CANCELLED = 1   THEN 1 END)) AS N_CANCELLED_FLIGHTS,
    printf('%,d', COUNT(CASE WHEN DIVERTED = 1    THEN 1 END)) AS N_DIVERTED_FLIGHTS,
    printf('%,d', COUNT(CASE WHEN ARR_DELAY <= 15 THEN 1 END)) AS N_ONTIME_FLIGHTS
FROM flights;
"""
result_1 = pd.read_sql(query_1, conn)
result_1

Unnamed: 0,TOTAL_FLIGHTS,N_DELAYED_FLIGHTS,N_CANCELLED_FLIGHTS,N_DIVERTED_FLIGHTS,N_ONTIME_FLIGHTS
0,2000000,343875,52522,4709,1598894


In [6]:
# Alphabetical list of the distinct AIRLINE values stored in the flights table.
query_2 = """
SELECT DISTINCT AIRLINE
FROM flights
ORDER BY AIRLINE;
"""
result_2 = pd.read_sql(sql=query_2, con=conn)
result_2

Unnamed: 0,AIRLINE
0,Alaska Airlines Inc.
1,Allegiant Air
2,American Airlines Inc.
3,Delta Air Lines Inc.
4,Endeavor Air Inc.
5,Envoy Air
6,ExpressJet Airlines LLC d/b/a aha!
7,Frontier Airlines Inc.
8,Hawaiian Airlines Inc.
9,Horizon Air


In [7]:
# Number of flights per year.
# - The printf format string is single-quoted; double quotes denote identifiers
#   in SQL and are accepted as strings by SQLite only as a legacy quirk.
# - ORDER BY makes the row order explicit; GROUP BY alone does not guarantee it.
# - TOTAL_FLIGHTS is text (thousands-separated), not an integer.
query_3 = """
SELECT YEAR, printf('%,d', COUNT(*)) AS TOTAL_FLIGHTS
FROM flights
GROUP BY YEAR
ORDER BY YEAR;
"""
result_3 = pd.read_sql(query_3, conn)
result_3

Unnamed: 0,YEAR,TOTAL_FLIGHTS
0,2019,505345
1,2020,318768
2,2021,407824
3,2022,457760
4,2023,310303


In [8]:
# Number of flights per (year, month).
# - The printf format string is single-quoted; double quotes denote identifiers
#   in SQL and are accepted as strings by SQLite only as a legacy quirk.
# - ORDER BY makes the chronological row order explicit; GROUP BY alone does
#   not guarantee it.
# - TOTAL_FLIGHTS is text (thousands-separated), not an integer.
query_4 = """
SELECT YEAR, MONTH, printf('%,d', COUNT(*)) AS TOTAL_FLIGHTS
FROM flights
GROUP BY YEAR, MONTH
ORDER BY YEAR, MONTH;
"""
result_4 = pd.read_sql(query_4, conn)
result_4

Unnamed: 0,YEAR,MONTH,TOTAL_FLIGHTS
0,2019,1,39568
1,2019,2,35912
2,2019,3,43208
3,2019,4,41136
4,2019,5,43293
5,2019,6,43584
6,2019,7,44850
7,2019,8,45015
8,2019,9,41281
9,2019,10,43438


In [9]:
# Yearly performance summary for a single airline.
# Every rate is a percentage of ALL of that airline's flights in the year
# (COUNT(*) is the denominator), rounded to 2 decimals. Rows with a NULL
# ARR_DELAY count toward neither Delay_Rate nor OnTime_Rate.
# The airline is a bound parameter (:airline) rather than a literal spliced
# into the SQL, so names containing quotes cannot break the query.
query_5 = """
SELECT
    YEAR,
    COUNT(*) AS Total_Flights,

    -- Delayed rate: ARR_DELAY > 15
    ROUND(100.0 * SUM(CASE WHEN ARR_DELAY > 15 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Delay_Rate,

    -- On-time rate: ARR_DELAY <= 15
    ROUND(100.0 * SUM(CASE WHEN ARR_DELAY <= 15 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS OnTime_Rate,

    -- Cancel rate
    ROUND(100.0 * SUM(CASE WHEN CANCELLED = 1 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Cancel_Rate,

    -- Divert rate
    ROUND(100.0 * SUM(CASE WHEN DIVERTED = 1 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Divert_Rate

FROM flights
WHERE AIRLINE = :airline
GROUP BY YEAR
ORDER BY YEAR;
"""
# Change the airline name here to report on a different carrier.
result_5 = pd.read_sql(query_5, conn, params={"airline": "Envoy Air"})
result_5

Unnamed: 0,YEAR,Total_Flights,Delay_Rate,OnTime_Rate,Cancel_Rate,Divert_Rate
0,2019,22111,19.26,76.88,3.56,0.29
1,2020,14435,11.74,82.58,5.54,0.15
2,2021,17665,14.78,82.56,2.39,0.27
3,2022,16823,16.38,80.95,2.41,0.26
4,2023,10111,18.33,79.98,1.44,0.25


In [10]:
# Monthly performance summary for a single airline.
# Every rate is a percentage of ALL of that airline's flights in the month
# (COUNT(*) is the denominator), rounded to 2 decimals. Rows with a NULL
# ARR_DELAY count toward neither Delay_Rate nor OnTime_Rate.
# The airline is a bound parameter (:airline) rather than a literal spliced
# into the SQL, so names containing quotes cannot break the query.
query_6 = """
SELECT
    YEAR, MONTH,
    COUNT(*) AS Total_Flights,

    -- Delayed rate: ARR_DELAY > 15
    ROUND(100.0 * SUM(CASE WHEN ARR_DELAY > 15 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Delay_Rate,

    -- On-time rate: ARR_DELAY <= 15
    ROUND(100.0 * SUM(CASE WHEN ARR_DELAY <= 15 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS OnTime_Rate,

    -- Cancel rate
    ROUND(100.0 * SUM(CASE WHEN CANCELLED = 1 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Cancel_Rate,

    -- Divert rate
    ROUND(100.0 * SUM(CASE WHEN DIVERTED = 1 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Divert_Rate

FROM flights
WHERE AIRLINE = :airline
GROUP BY YEAR, MONTH
ORDER BY YEAR, MONTH;
"""
# Change the airline name here to report on a different carrier.
result_6 = pd.read_sql(query_6, conn, params={"airline": "Envoy Air"})
result_6

Unnamed: 0,YEAR,MONTH,Total_Flights,Delay_Rate,OnTime_Rate,Cancel_Rate,Divert_Rate
0,2019,1,1751,20.1,71.9,7.94,0.06
1,2019,2,1531,26.71,66.82,6.07,0.39
2,2019,3,1785,15.13,83.25,1.51,0.11
3,2019,4,1761,15.45,80.24,4.03,0.28
4,2019,5,1933,22.61,73.98,3.26,0.16
5,2019,6,1923,25.43,69.58,4.42,0.57
6,2019,7,1999,19.61,75.24,4.85,0.3
7,2019,8,2106,19.47,76.73,3.28,0.52
8,2019,9,1877,16.68,79.97,3.14,0.21
9,2019,10,1875,17.65,80.64,1.49,0.21


In [11]:
# Quarterly performance summary for a single airline.
# Every rate is a percentage of ALL of that airline's flights in the quarter
# (COUNT(*) is the denominator), rounded to 2 decimals. Rows with a NULL
# ARR_DELAY count toward neither Delay_Rate nor OnTime_Rate.
# The airline is a bound parameter (:airline) rather than a literal spliced
# into the SQL, so names containing quotes cannot break the query.
query_7 = """
SELECT
    YEAR, QUARTER,
    COUNT(*) AS Total_Flights,

    -- Delayed rate: ARR_DELAY > 15
    ROUND(100.0 * SUM(CASE WHEN ARR_DELAY > 15 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Delay_Rate,

    -- On-time rate: ARR_DELAY <= 15
    ROUND(100.0 * SUM(CASE WHEN ARR_DELAY <= 15 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS OnTime_Rate,

    -- Cancel rate
    ROUND(100.0 * SUM(CASE WHEN CANCELLED = 1 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Cancel_Rate,

    -- Divert rate
    ROUND(100.0 * SUM(CASE WHEN DIVERTED = 1 THEN 1 ELSE 0 END)
          / COUNT(*), 2) AS Divert_Rate

FROM flights
WHERE AIRLINE = :airline
GROUP BY YEAR, QUARTER
ORDER BY YEAR, QUARTER;
"""
# Change the airline name here to report on a different carrier.
result_7 = pd.read_sql(query_7, conn, params={"airline": "Envoy Air"})
result_7

Unnamed: 0,YEAR,QUARTER,Total_Flights,Delay_Rate,OnTime_Rate,Cancel_Rate,Divert_Rate
0,2019,1,5067,20.35,74.36,5.11,0.18
1,2019,2,5617,21.33,74.43,3.9,0.34
2,2019,3,5982,18.64,77.25,3.76,0.35
3,2019,4,5445,16.8,81.36,1.56,0.28
4,2020,1,5280,14.22,79.03,6.59,0.15
5,2020,2,2477,5.29,78.0,16.63,0.08
6,2020,3,3368,10.63,88.69,0.5,0.18
7,2020,4,3310,13.72,85.47,0.66,0.15
8,2021,1,3759,13.78,80.85,5.11,0.27
9,2021,2,4439,16.63,81.28,1.69,0.41
