### Multiclass Classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
flight_data = pd.read_csv("/teamspace/studios/this_studio/Flight_delay_analysis/data/preprocessed_flight_data_for_binary_classification.csv")
flight_data.head(10)

Unnamed: 0,airline,airline_dot,airline_code,dot_code,fl_number,origin,origin_city,dest,dest_city,crs_dep_time,...,delay_due_carrier,delay_due_weather,delay_due_nas,delay_due_security,delay_due_late_aircraft,day,month,day_of_week,year,target
0,1.286792,1.286792,0.884229,0.001871,-0.543443,-0.52848,-0.554247,-0.606505,0.605839,-0.354125,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,9,1,2,2019,False
1,-1.101954,-1.101954,-0.874146,-0.493776,-0.779814,0.609352,0.415701,1.353425,1.346862,1.631967,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,19,11,5,2022,False
2,1.286792,1.286792,0.884229,0.001871,-1.174718,-0.842364,-0.830017,0.608652,0.415833,-0.767809,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,22,7,4,2022,True
3,-1.101954,-1.101954,-0.874146,-0.493776,-0.123929,0.609352,0.415701,1.373025,1.204357,0.580264,...,-0.138889,-0.051551,1.456688,-0.017174,-0.178016,6,3,0,2023,True
4,1.116167,1.116167,0.180879,1.165449,-1.204479,0.373938,0.739017,-0.831897,-0.885707,1.055691,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,23,2,6,2020,False
5,0.945543,0.945543,1.060067,-1.546032,-1.056819,-0.901218,-0.896582,0.706648,0.710342,-0.652554,...,4.305829,-0.051551,-0.157574,-0.017174,-0.178016,31,7,2,2019,True
6,-1.272579,-1.272579,-1.401658,-0.454018,-0.216073,-0.871791,1.689947,-1.33168,-1.284719,-0.652554,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,11,6,6,2023,False
7,0.604293,0.604293,1.411742,1.260868,1.117445,-0.17536,-0.202403,-0.871096,1.688872,0.65024,...,-0.138889,-0.051551,1.389428,-0.017174,-0.178016,8,7,0,2019,True
8,1.116167,1.116167,0.180879,1.165449,-1.099744,-0.106698,-0.211912,0.157868,0.244828,-1.640454,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,12,2,6,2023,False
9,-1.613828,-1.613828,-1.22582,-0.122703,-1.309787,1.354827,1.347612,-0.586906,-0.629199,1.642257,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,22,8,5,2020,True


In [3]:
flight_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3476398 entries, 0 to 3476397
Data columns (total 35 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   airline                  float64
 1   airline_dot              float64
 2   airline_code             float64
 3   dot_code                 float64
 4   fl_number                float64
 5   origin                   float64
 6   origin_city              float64
 7   dest                     float64
 8   dest_city                float64
 9   crs_dep_time             float64
 10  dep_time                 float64
 11  dep_delay                float64
 12  taxi_out                 float64
 13  wheels_off               float64
 14  wheels_on                float64
 15  taxi_in                  float64
 16  crs_arr_time             float64
 17  arr_time                 float64
 18  arr_delay                float64
 19  cancelled                bool   
 20  diverted                 bool   
 21  crs_elap

In [4]:
flight_data.isna().sum()

airline                    0
airline_dot                0
airline_code               0
dot_code                   0
fl_number                  0
origin                     0
origin_city                0
dest                       0
dest_city                  0
crs_dep_time               0
dep_time                   0
dep_delay                  0
taxi_out                   0
wheels_off                 0
wheels_on                  0
taxi_in                    0
crs_arr_time               0
arr_time                   0
arr_delay                  0
cancelled                  0
diverted                   0
crs_elapsed_time           0
elapsed_time               0
air_time                   0
distance                   0
delay_due_carrier          0
delay_due_weather          0
delay_due_nas              0
delay_due_security         0
delay_due_late_aircraft    0
day                        0
month                      0
day_of_week                0
year                       0
target        

In [5]:
# Drop or keep the original target column based on necessity
flight_data = flight_data.drop(columns=['target'])

In [6]:
flight_data.columns

Index(['airline', 'airline_dot', 'airline_code', 'dot_code', 'fl_number',
       'origin', 'origin_city', 'dest', 'dest_city', 'crs_dep_time',
       'dep_time', 'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on',
       'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'diverted', 'crs_elapsed_time', 'elapsed_time', 'air_time', 'distance',
       'delay_due_carrier', 'delay_due_weather', 'delay_due_nas',
       'delay_due_security', 'delay_due_late_aircraft', 'day', 'month',
       'day_of_week', 'year'],
      dtype='object')

In [7]:
# Define conditions for each delay type
conditions = [
    (flight_data['delay_due_carrier'] > 0),
    (flight_data['delay_due_weather'] > 0),
    (flight_data['delay_due_nas'] > 0),
    (flight_data['delay_due_security'] > 0),
    (flight_data['delay_due_late_aircraft'] > 0),
]

# Assign class labels for each condition
classes = ['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

# Create the new target column
flight_data['delay_category'] = np.select(conditions, classes, default='no_delay')

In [8]:
flight_data.sample(20)

Unnamed: 0,airline,airline_dot,airline_code,dot_code,fl_number,origin,origin_city,dest,dest_city,crs_dep_time,...,delay_due_carrier,delay_due_weather,delay_due_nas,delay_due_security,delay_due_late_aircraft,day,month,day_of_week,year,delay_category
3073684,0.945543,0.945543,1.060067,-1.546032,0.776016,1.304149,1.141212,0.37346,0.738843,0.04378,...,-0.019667,-0.051551,-0.157574,-0.017174,-0.074041,29,8,6,2021,no_delay
2910722,0.945543,0.945543,1.060067,-1.546032,0.068373,0.844765,0.919694,1.490621,1.128355,1.652548,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,17,7,5,2021,no_delay
2637392,1.286792,1.286792,0.884229,0.001871,-0.129652,1.492152,1.128899,-0.841697,-0.828705,-1.08476,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,18,7,0,2022,no_delay
2923324,0.945543,0.945543,1.060067,-1.546032,-1.183875,0.668205,0.681962,0.157868,0.244828,-0.796623,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,28,3,0,2022,no_delay
3118266,-1.101954,-1.101954,-0.874146,-0.493776,-0.62164,-1.58784,-1.495668,-1.08139,-0.987983,1.655217,...,-0.061264,-0.051551,-0.157574,-0.017174,0.136294,17,7,2,2019,late_aircraft_delay
3134257,1.286792,1.286792,0.884229,0.001871,-1.139368,1.124122,1.007308,0.050072,0.596339,0.048405,...,-0.138889,-0.051551,2.96802,-0.017174,-0.178016,8,8,2,2021,nas_delay
3380786,0.945543,0.945543,1.060067,-1.546032,-0.474226,-0.901218,-0.896582,0.547617,0.527072,-0.475264,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,12,7,0,2021,no_delay
1365849,-1.101954,-1.101954,-0.874146,-0.493776,-0.627003,-1.58784,-1.495668,0.040272,-0.106683,0.236557,...,5.030854,-0.051551,-0.157574,-0.017174,-0.178016,6,10,6,2019,carrier_delay
729994,-1.613828,-1.613828,-1.22582,-0.122703,-1.146102,1.51177,1.252519,1.353425,1.346862,0.798426,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,31,8,5,2019,no_delay
1532972,1.286792,1.286792,0.884229,0.001871,-0.356293,1.374445,1.204973,-1.33168,-1.284719,1.909814,...,-0.138889,-0.051551,-0.157574,-0.017174,-0.178016,12,3,5,2022,no_delay


In [10]:
flight_data.delay_category.value_counts()

delay_category
no_delay               2728958
carrier_delay           368739
nas_delay               199553
late_aircraft_delay     134507
weather_delay            42448
security_delay            2193
Name: count, dtype: int64

### Data Preparation

In [None]:
# handle class imbalance 


In [None]:
# split in train and test set


### Model Building and Evaluation

#### 1. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)


#### 2. Gradient Boost Classifier

#### 3. Neural Network Model