In [1]:
 # Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [2]:
#  Import and read the flights_data.csv.
flights_df = pd.read_csv("flights_data_df.csv")
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,Cancelled,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,114.0,,,,,,
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,117.0,,,,,,
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,121.0,,,,,,
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,121.0,,,,,,
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,135.0,,,,,,


### Data Preprocessing

In [3]:
# Creating 'Is_Delayed' column that will tell us if a flight is delayed or not. 
flights_df["Is_Delayed"] = None
flights_df.loc[flights_df["DepDelayMinutes"] == 0, "Is_Delayed"] = "0"
flights_df.loc[flights_df["DepDelayMinutes"] > 0, "Is_Delayed"] = "1"


In [4]:
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,Is_Delayed
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,114.0,,,,,,,1
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,117.0,,,,,,,0
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,121.0,,,,,,,1
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,117.0,121.0,,,,,,,0
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,117.0,135.0,,,,,,,0


In [5]:
# Converting the categorical variables into indicator variables. 
flights = pd.get_dummies(flights_df, columns=['Airline'])

In [6]:
flights.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Origin,OriginCityName,Dest,DestCityName,CRSDepTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,0,2021,1,2,6,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
1,1,2021,1,3,7,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
2,2,2021,1,4,1,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
3,3,2021,1,7,4,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0
4,4,2021,1,8,5,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0


In [7]:
# cutting out a segment for the model
flights_data_segment = flights[0:1000000]

In [8]:
flights_data_segment.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Origin,OriginCityName,Dest,DestCityName,CRSDepTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,0,2021,1,2,6,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
1,1,2021,1,3,7,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
2,2,2021,1,4,1,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
3,3,2021,1,7,4,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0
4,4,2021,1,8,5,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0


In [9]:
#Drop unnecessary columns
flights_df_segment = flights_data_segment.drop(['Origin', 'OriginCityName', 'DestCityName', 'Dest', 'Unnamed: 0', 'Cancelled', 'CancellationCode'], axis=1)

In [27]:
# Drop NaN values
flights_df_segment.dropna(inplace = True)

In [28]:
# Change dtype
for col in ['DepTime', 'DepDelayMinutes', 
            'ArrTime', 'CRSElapsedTime', 'ActualElapsedTime', 'CarrierDelay',
            'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', "FirstDepTime", "Is_Delayed"]:
    flights_df_segment[col] = flights_df_segment[col].astype('int')

In [29]:
flights_df_segment.dtypes

Year                              int64
Month                             int64
DayofMonth                        int64
DayOfWeek                         int64
CRSDepTime                        int64
DepTime                           int32
DepDelayMinutes                   int32
CRSArrTime                        int64
ArrTime                           int32
CRSElapsedTime                    int32
ActualElapsedTime                 int32
CarrierDelay                      int32
WeatherDelay                      int32
NASDelay                          int32
SecurityDelay                     int32
LateAircraftDelay                 int32
FirstDepTime                      int32
Is_Delayed                        int32
Airline_Alaska Airlines Inc.      uint8
Airline_Allegiant Air             uint8
Airline_American Airlines Inc.    uint8
Airline_Delta Air Lines Inc.      uint8
Airline_Frontier Airlines Inc.    uint8
Airline_Hawaiian Airlines Inc.    uint8
Airline_JetBlue Airways           uint8


In [30]:
flights_df_segment.count()

Year                              996003
Month                             996003
DayofMonth                        996003
DayOfWeek                         996003
CRSDepTime                        996003
DepTime                           996003
DepDelayMinutes                   996003
CRSArrTime                        996003
ArrTime                           996003
CRSElapsedTime                    996003
ActualElapsedTime                 996003
CarrierDelay                      996003
WeatherDelay                      996003
NASDelay                          996003
SecurityDelay                     996003
LateAircraftDelay                 996003
FirstDepTime                      996003
Is_Delayed                        996003
Airline_Alaska Airlines Inc.      996003
Airline_Allegiant Air             996003
Airline_American Airlines Inc.    996003
Airline_Delta Air Lines Inc.      996003
Airline_Frontier Airlines Inc.    996003
Airline_Hawaiian Airlines Inc.    996003
Airline_JetBlue 

### Splitting Data

In [31]:
# Create our features
X = flights_df_segment.drop("Is_Delayed", axis=1)
X = pd.get_dummies(X)

# Create our target
y = flights_df_segment['Is_Delayed']

In [32]:
flights_df_segment['Is_Delayed'].value_counts()

0    771795
1    224208
Name: Is_Delayed, dtype: int64

In [33]:
# Splitting data for training and testing (splitting in the ratio 70:30)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)



In [34]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Applying Decision Tree Classifier on Training Data

In [35]:

# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)


EasyEnsembleClassifier(n_estimators=100, random_state=1)

### Making Predictions and Checking Accuracy

In [None]:
pred_prob = clf.predict_proba(X_test)
auc_score = roc_auc_score(y_test, pred_prob[:,1])
auc_score

In [37]:

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Display the confusion matrix
y_pred = eec.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)


array([[192813,      0],
       [     0,  56188]], dtype=int64)

In [38]:

# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)


1.0