In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

# Locations of the cleaned input tables, relative to this notebook.
data1 = "../data/clean/flight.csv"
data2 = "../data/clean/city.csv"
data3 = "../data/clean/airlines.csv"

# The flight CSV is read with header=None, so its column labels are
# attached by hand from this list right after loading.
flight_column_names = [
    'FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM',
    'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID', 'CRS_DEP_TIME',
    'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY', 'ARR_DELAY_NEW',
    'CANCELLED', 'DISTANCE',
]

flight = pd.read_csv(data1, header=None)
flight.columns = flight_column_names

# city and airlines are read with pandas' default header handling.
city = pd.read_csv(data2)
airlines = pd.read_csv(data3)

In [None]:
# Defining function for categorization (ON-TIME, DELAY, CANCELLED)
def delay_cat(delay):
    """Translate a numeric delay value into a status label.

    Parameters
    ----------
    delay : scalar
        Delay value for one flight (presumably minutes -- confirm against
        the data dictionary). Missing values (NaN/None) are accepted.

    Returns
    -------
    str
        'Cancelled' when the value is missing, 'On-time' for values <= 0,
        'Small Delay' for values <= 15, 'Medium Delay' for values <= 45,
        and 'Large Delay' for anything above 45.
    """
    # A missing delay value is labelled as a cancelled flight.
    if pd.isnull(delay):
        return 'Cancelled'

    # Inclusive upper bounds, checked in ascending order; first match wins.
    thresholds = (
        (0, 'On-time'),
        (15, 'Small Delay'),
        (45, 'Medium Delay'),
    )
    for upper_bound, label in thresholds:
        if delay <= upper_bound:
            return label

    return 'Large Delay'
    
# Label every flight twice: once from its departure delay and once from
# its arrival delay, using the same delay_cat rules for both.
for source_column, label_column in (
    ("DEP_DELAY", "dep_delay_cat"),
    ("ARR_DELAY", "arr_delay_cat"),
):
    flight[label_column] = flight[source_column].apply(delay_cat)

In [None]:
# Checking categorization
# NOTE: a notebook cell only renders its final expression, and this cell
# ends with an assignment, so the four spot-checks below are evaluated but
# not displayed. Move one to the end of its own cell to inspect it.
flight.loc[flight["dep_delay_cat"] == "Cancelled", ["dep_delay_cat", "DEP_DELAY", "CANCELLED"]].head()
flight.loc[flight["arr_delay_cat"] == "Cancelled", ["arr_delay_cat", "ARR_DELAY", "CANCELLED"]].head()
flight[["dep_delay_cat", "DEP_DELAY", "CANCELLED"]].head(10)
flight[["arr_delay_cat", "ARR_DELAY", "CANCELLED"]].head(10)

# Constant column of ones that later cells count per group.
# Single-label assignment is the standard way to create a new column;
# list-key assignment (flight[["num_flight"]] = 1) to a column that does
# not exist yet can raise KeyError on older pandas releases.
flight["num_flight"] = 1

In [None]:
# Flight status dataset
# Departure-side columns only. The explicit .copy() makes flight_status an
# independent frame, so adding a column below does not raise
# SettingWithCopyWarning and can never write through to `flight`.
flight_status = flight[['FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM',
       'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID', 'CRS_DEP_TIME',
       'DEP_TIME', 'DEP_DELAY','CANCELLED', 'DISTANCE', 'dep_delay_cat',
       'num_flight']].copy()

# Collapse the three delay sizes into a single 'Delay' status; 'Cancelled'
# and 'On-time' map to themselves.
status_flight = {
    'Cancelled': 'Cancelled',
    'On-time': 'On-time',
    'Small Delay': 'Delay',
    'Medium Delay': 'Delay',
    'Large Delay': 'Delay',
}
flight_status['flight_departure_status'] = flight_status['dep_delay_cat'].map(status_flight)

In [None]:
# Visualization Number of Flight by Carrier and Flight Status

# Rows per (carrier, departure status) pair.
carrier_status_counts = (
    flight_status
    .groupby(['OP_UNIQUE_CARRIER', 'flight_departure_status'], as_index=False)['num_flight']
    .count()
)
# Duplicate the carrier code under the key used to join the airlines table.
carrier_status_counts["iata"] = carrier_status_counts["OP_UNIQUE_CARRIER"]

# Inner join: carriers without a matching 'iata' row in airlines are dropped.
status = airlines.merge(carrier_status_counts, on='iata', how='inner')

# Share of each status within its carrier's total flight count.
airline_totals = status.groupby('OP_UNIQUE_CARRIER')['num_flight'].transform('sum')
status['Total per Airlines'] = airline_totals
status["% Total"] = (status["num_flight"] / airline_totals) * 100

# Stacked bars, one per airline name, ordered by total bar height.
fig = px.bar(
    status,
    x="name",
    y="num_flight",
    color="flight_departure_status",
    title="Number of flights by carrier",
)
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})

# Saved as interactive HTML (opened right away) and as a static PNG.
fig.write_html('../figures/number_of_flights_by_carrier.html', auto_open=True)
fig.write_image("../figures/number_of_flights_by_carrier.png")



In [None]:
# Five carriers with the most flights. Sort first and truncate second:
# calling head(5) before sort_values only reorders the first five carriers
# in group order instead of returning the five largest.
(
    flight_status
    .groupby(['OP_UNIQUE_CARRIER'], as_index=False)
    .count()
    .sort_values(by="num_flight", ascending=False)
    .head(5)
)

In [None]:
# Most delayed airlines
# Keep only the 'Delay' rows, then show the five airlines whose share of
# delayed departures ('% Total') is highest.
(
    status
    .loc[
        status["flight_departure_status"] == "Delay",
        ['name', 'iata', 'flight_departure_status', '% Total'],
    ]
    .sort_values(by=['% Total'], ascending=False)
    .head(5)
)

In [None]:
# Top 5 Airlines Analysis
# Select the five carriers with the largest 'Total per Airlines' explicitly.
# Slicing the sorted table with head(15) only works when every airline has
# exactly three status rows; an airline with no cancellations (two rows)
# would shift the cut and pull in part of a sixth airline.
top5_carriers = (
    status[['iata', 'Total per Airlines']]
    .drop_duplicates()
    .nlargest(5, 'Total per Airlines')['iata']
)
top5 = (
    status
    .loc[
        status['iata'].isin(top5_carriers),
        ['name', 'iata', 'flight_departure_status', '% Total', "Total per Airlines"],
    ]
    .sort_values(by=['Total per Airlines', 'flight_departure_status'], ascending=False)
)

# Delay share for each of the selected airlines.
top5.loc[top5["flight_departure_status"] == "Delay", ["name", "iata", "flight_departure_status", "% Total"]]

In [None]:
# Flight Delayed Dataframe
# Departure categories that count as a delay.
delay = ["Small Delay", "Medium Delay", "Large Delay"]

# Delayed departures only, restricted to the departure-side columns.
departure_delay = flight.loc[
    flight['dep_delay_cat'].isin(delay),
    ['FL_DATE', 'OP_UNIQUE_CARRIER', 'OP_CARRIER_FL_NUM',
     'ORIGIN_CITY_MARKET_ID', 'DEST_CITY_MARKET_ID', 'CRS_DEP_TIME',
     'DEP_TIME', 'DEP_DELAY', 'DISTANCE', 'dep_delay_cat', 'num_flight'],
]

# Delayed flights per (carrier, delay size).
delay_category_counts = (
    departure_delay
    .groupby(['OP_UNIQUE_CARRIER', 'dep_delay_cat'], as_index=False)['num_flight']
    .count()
)
# Here 'Total per Airlines' is the carrier's number of *delayed* flights,
# so the percentage is each delay size's share of that carrier's delays.
delayed_totals = delay_category_counts.groupby('OP_UNIQUE_CARRIER')['num_flight'].transform('sum')
delay_category_counts = delay_category_counts.assign(**{
    'Total per Airlines': delayed_totals,
    "Percentage of Delay Categorization": (delay_category_counts["num_flight"] / delayed_totals) * 100,
    "iata": delay_category_counts["OP_UNIQUE_CARRIER"],
})

# Inner join: carriers without a matching 'iata' row in airlines are dropped.
delay_carrier = airlines.merge(delay_category_counts, on='iata', how='inner')

# Stacked bars of the delay-size mix per airline name.
fig = px.bar(
    delay_carrier,
    x="name",
    y="Percentage of Delay Categorization",
    color="dep_delay_cat",
    title="Delay Categorization by Airlines",
)
fig.update_layout(barmode='stack', xaxis={'categoryorder': 'total descending'})

# Saved as interactive HTML (opened right away) and as a static PNG.
fig.write_html('../figures/delay_categorization_by_carrier.html', auto_open=True)
fig.write_image("../figures/delay_categorization_by_carrier.png")

In [None]:
# Delayed Analysis of Leading Airlines
# Select the five carriers with the largest 'Total per Airlines' explicitly.
# Slicing the sorted table with head(15) only works when every airline has
# all three delay categories; a carrier missing one category would shift
# the cut and pull in part of a sixth airline.
# NOTE(review): in delay_carrier this total counts delayed flights, so
# "leading" here means most delayed departures, not most flights overall --
# confirm this is the intended ranking.
leading_carriers = (
    delay_carrier[['iata', 'Total per Airlines']]
    .drop_duplicates()
    .nlargest(5, 'Total per Airlines')['iata']
)
delay_top5 = (
    delay_carrier
    .loc[
        delay_carrier['iata'].isin(leading_carriers),
        ['name', 'iata', 'dep_delay_cat', 'Percentage of Delay Categorization', "Total per Airlines"],
    ]
    .sort_values(by=['Total per Airlines', 'Percentage of Delay Categorization'], ascending=False)
)

# Stacked bars of the delay-size mix for the selected airlines; the larger
# font and the tall PNG dimensions are set explicitly for the export.
fig = px.bar(delay_top5, x="name", y="Percentage of Delay Categorization", color="dep_delay_cat", title="Delay Categorization by Leading Airlines")
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'}, font=dict(size=25))
fig.write_html('../figures/delay_categorization_by_carrier5.html', auto_open=True)
fig.write_image("../figures/delay_categorization_by_carrier5.png", width=800, height=1000)

In [None]:
# Small Delay
# .copy() makes the filtered rows an independent frame, so adding a column
# to it does not raise SettingWithCopyWarning.
small_delay = delay_carrier[delay_carrier["dep_delay_cat"]=="Small Delay"].copy()
small_delay["% of Small Delay"] = small_delay["Percentage of Delay Categorization"]
# Five airlines with the highest share of small delays among their delayed departures.
small_delay[["name","iata","dep_delay_cat","% of Small Delay"]].sort_values(by="% of Small Delay", ascending=False).head(5)

In [None]:
# Large Delay
# .copy() makes the filtered rows an independent frame, so adding a column
# to it does not raise SettingWithCopyWarning.
large_delay = delay_carrier[delay_carrier["dep_delay_cat"]=="Large Delay"].copy()
large_delay["% of Large Delay"] = large_delay["Percentage of Delay Categorization"]
# Five airlines with the highest share of large delays among their delayed departures.
large_delay[["name","iata","dep_delay_cat","% of Large Delay"]].sort_values(by="% of Large Delay", ascending=False).head(5)

In [None]:
# Medium Delay
# .copy() makes the filtered rows an independent frame, so adding a column
# to it does not raise SettingWithCopyWarning.
medium_delay = delay_carrier[delay_carrier["dep_delay_cat"]=="Medium Delay"].copy()
medium_delay["% of Medium Delay"] = medium_delay["Percentage of Delay Categorization"]
# Five airlines with the highest share of medium delays among their delayed departures.
medium_delay[["name","iata","dep_delay_cat","% of Medium Delay"]].sort_values(by="% of Medium Delay", ascending=False).head(5)