# Delayed Flights: Machine Learning Model

## Initial Processing of Data

In [1]:
# Importing libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Load the data (this is a cleaned already formatted dataset)
delay_df = pd.read_csv("s3://final-project77/merged_delay.csv")
delay_df.head()

Unnamed: 0,flight_id,mkt_carrier_fl_num,month,day_of_month,day_of_week,crs_elapsed_time,distance,dep_del15,dep_time_blk2,arr_time_blk2,...,mkt_unique_carrier_aa,mkt_unique_carrier_as,mkt_unique_carrier_b6,mkt_unique_carrier_dl,mkt_unique_carrier_f9,mkt_unique_carrier_g4,mkt_unique_carrier_ha,mkt_unique_carrier_nk,mkt_unique_carrier_ua,mkt_unique_carrier_wn
0,355,4404,1,1,3,185,1183,0.0,10,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4086,424,1,31,5,65,162,0.0,12,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4521,952,1,31,5,110,533,0.0,22,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,5597,1678,1,31,5,95,488,0.0,9,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,6336,40,1,6,1,75,293,0.0,18,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# Get a list of columns to find outcome column (dep_del15)
delay_df.columns

Index(['flight_id', 'mkt_carrier_fl_num', 'month', 'day_of_month',
       'day_of_week', 'crs_elapsed_time', 'distance', 'dep_del15',
       'dep_time_blk2', 'arr_time_blk2', 'origin_state_nm_california',
       'origin_state_nm_colorado', 'origin_state_nm_florida',
       'origin_state_nm_georgia', 'origin_state_nm_illinois',
       'origin_state_nm_new_york', 'origin_state_nm_north_carolina',
       'origin_state_nm_other', 'origin_state_nm_texas',
       'origin_state_nm_virginia', 'dest_state_nm_california',
       'dest_state_nm_colorado', 'dest_state_nm_florida',
       'dest_state_nm_georgia', 'dest_state_nm_illinois',
       'dest_state_nm_new_york', 'dest_state_nm_north_carolina',
       'dest_state_nm_other', 'dest_state_nm_texas', 'dest_state_nm_virginia',
       'mkt_unique_carrier_aa', 'mkt_unique_carrier_as',
       'mkt_unique_carrier_b6', 'mkt_unique_carrier_dl',
       'mkt_unique_carrier_f9', 'mkt_unique_carrier_g4',
       'mkt_unique_carrier_ha', 'mkt_unique_carrier

In [4]:
# Split our preprocessed data into our features and target arrays
# Create our features
X = pd.get_dummies(delay_df.drop('dep_del15', axis=1))

# Create our target
y = delay_df['dep_del15']

# View X
X

Unnamed: 0,flight_id,mkt_carrier_fl_num,month,day_of_month,day_of_week,crs_elapsed_time,distance,dep_time_blk2,arr_time_blk2,origin_state_nm_california,...,mkt_unique_carrier_aa,mkt_unique_carrier_as,mkt_unique_carrier_b6,mkt_unique_carrier_dl,mkt_unique_carrier_f9,mkt_unique_carrier_g4,mkt_unique_carrier_ha,mkt_unique_carrier_nk,mkt_unique_carrier_ua,mkt_unique_carrier_wn
0,355,4404,1,1,3,185,1183,10,12,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4086,424,1,31,5,65,162,12,12,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4521,952,1,31,5,110,533,22,23,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,5597,1678,1,31,5,95,488,9,11,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,6336,40,1,6,1,75,293,18,19,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745842,418031,992,1,25,6,155,836,17,19,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2745843,418791,1411,1,1,3,271,2182,6,13,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2745844,419615,889,1,29,3,91,404,20,22,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2745845,419882,1177,1,29,3,96,356,12,14,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Check the balance of our target values
y.value_counts()

0.0    2476064
1.0     269783
Name: dep_del15, dtype: int64

In [6]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

In [7]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Sampling Methods

### Random Oversampling

In [8]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

In [9]:
# Train the Logistic Regression model using the resampled data
regression = LogisticRegression(solver='lbfgs', random_state=1)
regression.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [10]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = regression.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.6236814855547791

In [11]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[372906, 246110],
       [ 23947,  43499]])

In [12]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.60      0.64      0.73      0.62      0.39    619016
        1.0       0.15      0.64      0.60      0.24      0.62      0.39     67446

avg / total       0.86      0.61      0.64      0.69      0.62      0.39    686462



## Ensemble Learners

### Balanced Random Forest Classifier

In [13]:
# Fit the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

random_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
random_model = random_model.fit(X_train_scaled, y_train)
random_model

BalancedRandomForestClassifier(random_state=1)

In [14]:
# Calculated the balanced accuracy score
predictions = random_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)

0.6786805530916125

In [15]:
# Display the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm

array([[414610, 204406],
       [ 21072,  46374]])

In [16]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.95      0.67      0.69      0.79      0.68      0.46    619016
        1.0       0.18      0.69      0.67      0.29      0.68      0.46     67446

avg / total       0.88      0.67      0.69      0.74      0.68      0.46    686462



In [17]:
# List the features sorted in descending order by feature importance
for i in sorted(zip(random_model.feature_importances_, X.columns), reverse=True):
    print(f'{i[1]}: ({i[0]})')

flight_id: (0.18786884011507093)
mkt_carrier_fl_num: (0.1179159280076778)
distance: (0.10436102311163822)
crs_elapsed_time: (0.10346067170133778)
day_of_month: (0.10093299604803244)
dep_time_blk2: (0.06644528643203351)
arr_time_blk2: (0.0631497898858914)
day_of_week: (0.05728136154862848)
month: (0.04189596575915691)
dest_state_nm_other: (0.012649405418333075)
origin_state_nm_other: (0.012405116171648457)
origin_state_nm_texas: (0.00713309239607813)
dest_state_nm_texas: (0.007121075016048541)
dest_state_nm_california: (0.006555045668983274)
mkt_unique_carrier_aa: (0.006483027003274002)
mkt_unique_carrier_ua: (0.006362670873451038)
origin_state_nm_california: (0.006220631371078016)
dest_state_nm_florida: (0.0061249869156995455)
mkt_unique_carrier_wn: (0.005927397962113821)
origin_state_nm_florida: (0.005823408459907334)
mkt_unique_carrier_dl: (0.005598103948987234)
dest_state_nm_illinois: (0.0055361080909257475)
origin_state_nm_illinois: (0.005442815328636988)
dest_state_nm_new_york: (0

### Easy Ensemble AdaBoost Classifier

In [None]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier 
eec = EasyEnsembleClassifier(n_estimators=128, random_state=1)
eec.fit(X_train_scaled, y_train)

In [None]:
# Calculated the balanced accuracy score
predictions = eec.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)