In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
#  Import and read the flights_data.csv.
flights_df = pd.read_csv("flights_data_df.csv")
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,Cancelled,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,114.0,,,,,,
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,117.0,,,,,,
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,121.0,,,,,,
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,121.0,,,,,,
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,135.0,,,,,,


In [5]:
# Creating 'Is_Delayed' column that will tell us if a flight is delayed or not. 
flights_df["Is_Delayed"] = None
flights_df.loc[flights_df["DepDelayMinutes"] == 0, "Is_Delayed"] = "1"
flights_df.loc[flights_df["DepDelayMinutes"] > 0, "Is_Delayed"] = "0"

In [6]:
# cutting out a segment for the model
flights_df = flights_df[0:1000000]

In [7]:
flights_df.dtypes[flights_df.dtypes == 'O']

Airline             object
Origin              object
OriginCityName      object
Dest                object
DestCityName        object
CancellationCode    object
Is_Delayed          object
dtype: object

In [8]:
#Drop unnecessary columns
#flights_df_segment = flights_data_segment.drop(['Origin', 'OriginCityName', 'DestCityName', 'Dest', 'Unnamed: 0', 'Cancelled', 'CarrierDelay', 'WeatherDelay', "NASDelay", 'SecurityDelay', 'LateAircraftDelay', 'FirstDepTime', 'CancellationCode'], axis=1)
flights_df = flights_df.drop(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'CRSArrTime', 'ArrTime', 'CRSElapsedTime', 'ActualElapsedTime', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Origin', 'OriginCityName', 'DestCityName', 'Unnamed: 0', 'Cancelled', 'FirstDepTime', 'CancellationCode'], axis=1)
flights_df

Unnamed: 0,Airline,Dest,Is_Delayed
0,Delta Air Lines Inc.,BOS,0
1,Delta Air Lines Inc.,BOS,1
2,Delta Air Lines Inc.,BOS,0
3,Delta Air Lines Inc.,BOS,1
4,Delta Air Lines Inc.,BOS,1
...,...,...,...
999995,Delta Air Lines Inc.,IND,1
999996,Delta Air Lines Inc.,IND,1
999997,Delta Air Lines Inc.,IND,1
999998,Delta Air Lines Inc.,IND,1


In [9]:
flights_df.fillna(0, inplace=True)

In [10]:
data_type= flights_df.dtypes
data_type

Airline       object
Dest          object
Is_Delayed    object
dtype: object

In [11]:
for col in ['Is_Delayed']:
    flights_df[col] = flights_df[col].astype('int')

In [12]:
df_binary_encoded = pd.get_dummies(flights_df, columns=["Airline", "Dest"])

# Create our features
X = df_binary_encoded.drop(columns="Is_Delayed", axis=1) # YOUR CODE HERE


# Create our target
y = flights_df["Is_Delayed"] # YOUR CODE HERE

In [13]:
from sklearn.model_selection import train_test_split
# YOUR CODE HERE
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)
Counter(y_train)

Counter({1: 578846, 0: 171154})

In [14]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({1: 578846, 0: 578846})

In [15]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [16]:
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5499359163591901

In [17]:
# Display the confusion matrix
# YOUR CODE HERE
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[ 24974,  32077],
       [ 65193, 127756]], dtype=int64)

In [18]:
# Print the imbalanced classification report
# YOUR CODE HERE
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.28      0.44      0.66      0.34      0.54      0.28     57051
          1       0.80      0.66      0.44      0.72      0.54      0.30    192949

avg / total       0.68      0.61      0.49      0.64      0.54      0.29    250000



In [19]:
# Resample the training data with SMOTE
# YOUR CODE HERE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)
Counter(y_resampled)

Counter({1: 578846, 0: 578846})

In [20]:
# Train the Logistic Regression model using the resampled data
# YOUR CODE HERE
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

y_pred = model.predict(X_test)

In [21]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
balanced_accuracy_score(y_test, y_pred)

0.5497508782665556