In [1]:
 # Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd


In [2]:
#  Import and read the flights_data.csv.
flights_df = pd.read_csv("flights_data_df.csv")
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,Cancelled,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,114.0,,,,,,
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,117.0,,,,,,
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,121.0,,,,,,
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,121.0,,,,,,
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,135.0,,,,,,


### Data Preprocessing

In [4]:
#Drop unnecessary columns
flights = flights_df.drop(['Origin', 'OriginCityName', 'DestCityName', 'Dest', 'Unnamed: 0', 'Cancelled', 'CancellationCode'], axis=1)

In [5]:
# Creating 'Is_Delayed' column that will tell us if a flight is delayed or not. 
flights["Is_Delayed"] = None
flights.loc[flights_df["DepDelayMinutes"] == 0, "Is_Delayed"] = "0"
flights.loc[flights_df["DepDelayMinutes"] > 0, "Is_Delayed"] = "1"


In [6]:
flights.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,CRSDepTime,DepTime,DepDelayMinutes,CRSArrTime,ArrTime,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,Is_Delayed
0,2021,1,2,6,Delta Air Lines Inc.,730,733.0,3.0,939,927.0,129.0,114.0,,,,,,,1
1,2021,1,3,7,Delta Air Lines Inc.,730,727.0,0.0,939,924.0,129.0,117.0,,,,,,,0
2,2021,1,4,1,Delta Air Lines Inc.,730,737.0,7.0,939,938.0,129.0,121.0,,,,,,,1
3,2021,1,7,4,Delta Air Lines Inc.,1715,1710.0,0.0,1912,1911.0,117.0,121.0,,,,,,,0
4,2021,1,8,5,Delta Air Lines Inc.,1715,1711.0,0.0,1912,1926.0,117.0,135.0,,,,,,,0


In [7]:
# Converting the categorical variables into indicator variables. 
flights_data_df = pd.get_dummies(flights, columns=['Airline'])

In [8]:
flights_data_df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,DepTime,DepDelayMinutes,CRSArrTime,ArrTime,CRSElapsedTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,2021,1,2,6,730,733.0,3.0,939,927.0,129.0,...,0,0,0,1,0,0,0,0,0,0
1,2021,1,3,7,730,727.0,0.0,939,924.0,129.0,...,0,0,0,1,0,0,0,0,0,0
2,2021,1,4,1,730,737.0,7.0,939,938.0,129.0,...,0,0,0,1,0,0,0,0,0,0
3,2021,1,7,4,1715,1710.0,0.0,1912,1911.0,117.0,...,0,0,0,1,0,0,0,0,0,0
4,2021,1,8,5,1715,1711.0,0.0,1912,1926.0,117.0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
# cutting out a segment for the model
flights_data_segment = flights[0:9000000]

In [10]:
flights_data_segment.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,CRSDepTime,DepTime,DepDelayMinutes,CRSArrTime,ArrTime,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,Is_Delayed
0,2021,1,2,6,Delta Air Lines Inc.,730,733.0,3.0,939,927.0,129.0,114.0,,,,,,,1
1,2021,1,3,7,Delta Air Lines Inc.,730,727.0,0.0,939,924.0,129.0,117.0,,,,,,,0
2,2021,1,4,1,Delta Air Lines Inc.,730,737.0,7.0,939,938.0,129.0,121.0,,,,,,,1
3,2021,1,7,4,Delta Air Lines Inc.,1715,1710.0,0.0,1912,1911.0,117.0,121.0,,,,,,,0
4,2021,1,8,5,Delta Air Lines Inc.,1715,1711.0,0.0,1912,1926.0,117.0,135.0,,,,,,,0


In [12]:
# Drop NaN values
flights_data_segment.dropna(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [13]:
# Change dtype
for col in ['DepTime', 'DepDelayMinutes', 
            'ArrTime', 'CRSElapsedTime', 'ActualElapsedTime', 'CarrierDelay',
            'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', "FirstDepTime", "Is_Delayed"]:
    flights_data_segment[col] = flights_data_segment[col].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [51]:
flights_df_segment.dtypes

Year                              int64
Month                             int64
DayofMonth                        int64
DayOfWeek                         int64
CRSDepTime                        int64
DepTime                           int32
DepDelayMinutes                   int32
CRSArrTime                        int64
ArrTime                           int32
CRSElapsedTime                    int32
ActualElapsedTime                 int32
CarrierDelay                      int32
WeatherDelay                      int32
NASDelay                          int32
SecurityDelay                     int32
LateAircraftDelay                 int32
FirstDepTime                      int32
Is_Delayed                        int32
Airline_Alaska Airlines Inc.      uint8
Airline_Allegiant Air             uint8
Airline_American Airlines Inc.    uint8
Airline_Delta Air Lines Inc.      uint8
Airline_Frontier Airlines Inc.    uint8
Airline_Hawaiian Airlines Inc.    uint8
Airline_JetBlue Airways           uint8


In [14]:
flights_data_segment.count()

Year                 50218
Month                50218
DayofMonth           50218
DayOfWeek            50218
Airline              50218
CRSDepTime           50218
DepTime              50218
DepDelayMinutes      50218
CRSArrTime           50218
ArrTime              50218
CRSElapsedTime       50218
ActualElapsedTime    50218
CarrierDelay         50218
WeatherDelay         50218
NASDelay             50218
SecurityDelay        50218
LateAircraftDelay    50218
FirstDepTime         50218
Is_Delayed           50218
dtype: int64

### Splitting Data

In [15]:
# Create our features
X = flights_data_segment.drop("Is_Delayed", axis=1)
X = pd.get_dummies(X)

# Create our target
y = flights_data_segment['Is_Delayed']

In [16]:
flights_data_segment['Is_Delayed'].value_counts()

1    50198
0       20
Name: Is_Delayed, dtype: int64

In [24]:
# Splitting data for training and testing (splitting in the ratio 70:30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state=1)



In [25]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Applying EasyEnsembleClassifier on Training Data

In [26]:

# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train_scaled, y_train)


EasyEnsembleClassifier(n_estimators=100, random_state=1)

### Making Predictions and Checking Accuracy

In [27]:

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Display the confusion matrix
y_pred = eec.predict(X_test_scaled)
confusion_matrix(y_test, y_pred)


array([[    0,     0],
       [  119, 14947]], dtype=int64)

In [28]:

# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)




0.9921014204168326