# Delayed Flights: Machine Learning Model

## Initial Processing of Data

In [1]:
# Importing libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [2]:
# Read the merged delay CSV (already cleaned and formatted) from S3 and
# preview its first rows
merged_delay_uri = "s3://final-project77/merged_delay.csv"
delay_df = pd.read_csv(merged_delay_uri)
delay_df.head()

Unnamed: 0,flight_id,mkt_carrier_fl_num,month,day_of_month,day_of_week,crs_elapsed_time,distance,dep_del15,dep_time_blk2,arr_time_blk2,...,mkt_unique_carrier_aa,mkt_unique_carrier_as,mkt_unique_carrier_b6,mkt_unique_carrier_dl,mkt_unique_carrier_f9,mkt_unique_carrier_g4,mkt_unique_carrier_ha,mkt_unique_carrier_nk,mkt_unique_carrier_ua,mkt_unique_carrier_wn
0,355,4404,1,1,3,185,1183,0.0,10,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,4086,424,1,31,5,65,162,0.0,12,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,4521,952,1,31,5,110,533,0.0,22,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,5597,1678,1,31,5,95,488,0.0,9,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,6336,40,1,6,1,75,293,0.0,18,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# List every column name in delay_df so the outcome column (dep_del15) can be
# located among the feature columns
delay_df.columns

Index(['flight_id', 'mkt_carrier_fl_num', 'month', 'day_of_month',
       'day_of_week', 'crs_elapsed_time', 'distance', 'dep_del15',
       'dep_time_blk2', 'arr_time_blk2', 'origin_state_nm_california',
       'origin_state_nm_colorado', 'origin_state_nm_florida',
       'origin_state_nm_georgia', 'origin_state_nm_illinois',
       'origin_state_nm_new_york', 'origin_state_nm_north_carolina',
       'origin_state_nm_other', 'origin_state_nm_texas',
       'origin_state_nm_virginia', 'dest_state_nm_california',
       'dest_state_nm_colorado', 'dest_state_nm_florida',
       'dest_state_nm_georgia', 'dest_state_nm_illinois',
       'dest_state_nm_new_york', 'dest_state_nm_north_carolina',
       'dest_state_nm_other', 'dest_state_nm_texas', 'dest_state_nm_virginia',
       'mkt_unique_carrier_aa', 'mkt_unique_carrier_as',
       'mkt_unique_carrier_b6', 'mkt_unique_carrier_dl',
       'mkt_unique_carrier_f9', 'mkt_unique_carrier_g4',
       'mkt_unique_carrier_ha', 'mkt_unique_carrier

In [5]:
# Drop the flight_id and mkt_carrier_fl_num columns so they are not carried
# into the feature matrix built from delay_df in the next step.
# `columns=` is used instead of the positional axis argument `1`: pandas
# deprecated positional arguments other than `labels` for DataFrame.drop in
# 1.3 and raises a TypeError for them from 2.0 onward.
delay_df = delay_df.drop(columns=['flight_id', 'mkt_carrier_fl_num'])
delay_df

  


Unnamed: 0,month,day_of_month,day_of_week,crs_elapsed_time,distance,dep_del15,dep_time_blk2,arr_time_blk2,origin_state_nm_california,origin_state_nm_colorado,...,mkt_unique_carrier_aa,mkt_unique_carrier_as,mkt_unique_carrier_b6,mkt_unique_carrier_dl,mkt_unique_carrier_f9,mkt_unique_carrier_g4,mkt_unique_carrier_ha,mkt_unique_carrier_nk,mkt_unique_carrier_ua,mkt_unique_carrier_wn
0,1,1,3,185,1183,0.0,10,12,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,31,5,65,162,0.0,12,12,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,31,5,110,533,0.0,22,23,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,31,5,95,488,0.0,9,11,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,6,1,75,293,0.0,18,19,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745842,1,25,6,155,836,0.0,17,19,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2745843,1,1,3,271,2182,0.0,6,13,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2745844,1,29,3,91,404,0.0,20,22,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2745845,1,29,3,96,356,0.0,12,14,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Separate the preprocessed frame into predictors and label.
# Features: every column except the dep_del15 outcome, passed through
# pd.get_dummies
X = delay_df.drop(columns='dep_del15').pipe(pd.get_dummies)

# Target: the dep_del15 outcome column
y = delay_df.loc[:, 'dep_del15']

# Show the feature frame
X

Unnamed: 0,month,day_of_month,day_of_week,crs_elapsed_time,distance,dep_time_blk2,arr_time_blk2,origin_state_nm_california,origin_state_nm_colorado,origin_state_nm_florida,...,mkt_unique_carrier_aa,mkt_unique_carrier_as,mkt_unique_carrier_b6,mkt_unique_carrier_dl,mkt_unique_carrier_f9,mkt_unique_carrier_g4,mkt_unique_carrier_ha,mkt_unique_carrier_nk,mkt_unique_carrier_ua,mkt_unique_carrier_wn
0,1,1,3,185,1183,10,12,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,31,5,65,162,12,12,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,31,5,110,533,22,23,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,31,5,95,488,9,11,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,6,1,75,293,18,19,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2745842,1,25,6,155,836,17,19,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2745843,1,1,3,271,2182,6,13,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2745844,1,29,3,91,404,20,22,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2745845,1,29,3,96,356,12,14,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Check the balance of the target: count how many rows fall into each
# dep_del15 class
y.value_counts()

0.0    2476064
1.0     269783
Name: dep_del15, dtype: int64

In [8]:
# Hold out a test set from the features and target. The split is stratified
# on y and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [9]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Sampling Methods

### Random Oversampling

In [10]:
# Resample the training data with the RandomOverSampler.
# Only the scaled training split (X_train_scaled, y_train) is passed to the
# sampler; the test split is not touched in this cell.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)  # fixed seed for a repeatable resample
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)

In [11]:
# Train the Logistic Regression model (lbfgs solver, fixed random_state) on
# the resampled training data produced by the oversampler
regression = LogisticRegression(solver='lbfgs', random_state=1)
regression.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [12]:
# Calculate the balanced accuracy score of the logistic regression:
# predict on the scaled test features and compare against y_test
from sklearn.metrics import balanced_accuracy_score
y_pred = regression.predict(X_test_scaled)
balanced_accuracy_score(y_test, y_pred)

0.6234693485364753

In [13]:
# Display the confusion matrix for the logistic regression, computed from the
# true test labels (y_test) and the predictions (y_pred)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[360143, 258873],
       [ 22585,  44861]])

In [14]:
# Print the imbalanced classification report for the logistic regression
# predictions (y_pred) against the true test labels (y_test)
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.58      0.67      0.72      0.62      0.38    619016
        1.0       0.15      0.67      0.58      0.24      0.62      0.39     67446

avg / total       0.86      0.59      0.66      0.67      0.62      0.38    686462



## Ensemble Learners

### Balanced Random Forest Classifier

In [15]:
# Build a BalancedRandomForestClassifier with 100 estimators and a fixed
# random_state, then train it on the scaled training split
from imblearn.ensemble import BalancedRandomForestClassifier

random_model = BalancedRandomForestClassifier(random_state=1, n_estimators=100)
random_model.fit(X_train_scaled, y_train)  # fit() returns the estimator itself
random_model

BalancedRandomForestClassifier(random_state=1)

In [16]:
# Calculate the balanced accuracy score of the balanced random forest:
# predict on the scaled test features and compare against y_test
predictions = random_model.predict(X_test_scaled)
balanced_accuracy_score(y_test, predictions)

0.6678115542756844

In [17]:
# Display the confusion matrix for the balanced random forest, computed from
# the true test labels (y_test) and its predictions.
# Note: this rebinds `cm`, replacing the logistic regression matrix.
cm = confusion_matrix(y_test, predictions)
cm

array([[411323, 207693],
       [ 22180,  45266]])

In [18]:
# Print the imbalanced classification report for the balanced random forest
# predictions against the true test labels (y_test)
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.95      0.66      0.67      0.78      0.67      0.45    619016
        1.0       0.18      0.67      0.66      0.28      0.67      0.45     67446

avg / total       0.87      0.67      0.67      0.73      0.67      0.45    686462



In [19]:
# Pair each importance score from the fitted forest with its feature column
# name, then print the pairs from highest to lowest importance
ranked_features = sorted(
    zip(random_model.feature_importances_, X.columns),
    reverse=True,
)
for score, column in ranked_features:
    print(f'{column}: ({score})')

distance: (0.16826688215687238)
crs_elapsed_time: (0.16587064900378862)
day_of_month: (0.14695849568367042)
month: (0.09577402852531805)
dep_time_blk2: (0.08515227493645122)
arr_time_blk2: (0.08226359226418292)
day_of_week: (0.07619023635007074)
dest_state_nm_other: (0.014738098896758464)
origin_state_nm_other: (0.013658988292825767)
dest_state_nm_texas: (0.008012871332356779)
origin_state_nm_texas: (0.007628031560903765)
mkt_unique_carrier_aa: (0.0075126728199633895)
mkt_unique_carrier_ua: (0.007426785596927201)
dest_state_nm_california: (0.007420427361456515)
dest_state_nm_florida: (0.007119154317251181)
mkt_unique_carrier_wn: (0.00695249874816417)
origin_state_nm_florida: (0.006479104083664283)
dest_state_nm_illinois: (0.006366114268435157)
origin_state_nm_california: (0.006254067244409304)
origin_state_nm_illinois: (0.006198871558038988)
mkt_unique_carrier_dl: (0.006096464449846365)
dest_state_nm_new_york: (0.0054754717380634185)
origin_state_nm_new_york: (0.005233435089577714)
des