In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

In [2]:
#  Import and read the flights_data.csv.
flights_df = pd.read_csv("flights_data_df.csv")
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,Cancelled,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,114.0,,,,,,
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,117.0,,,,,,
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,129.0,121.0,,,,,,
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,121.0,,,,,,
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,0.0,,117.0,135.0,,,,,,


# Data Preprocesing

In [3]:
# Creating 'Is_Delayed' column that will tell us if a flight is delayed or not. 
flights_df["Is_Delayed"] = None
flights_df.loc[flights_df["DepDelayMinutes"] == 0, "Is_Delayed"] = "1"
flights_df.loc[flights_df["DepDelayMinutes"] > 0, "Is_Delayed"] = "0"

In [4]:
flights_df.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Airline,Origin,OriginCityName,Dest,DestCityName,...,CancellationCode,CRSElapsedTime,ActualElapsedTime,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,FirstDepTime,Is_Delayed
0,0,2021,1,2,6,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,114.0,,,,,,,0
1,1,2021,1,3,7,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,117.0,,,,,,,1
2,2,2021,1,4,1,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,129.0,121.0,,,,,,,0
3,3,2021,1,7,4,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,117.0,121.0,,,,,,,1
4,4,2021,1,8,5,Delta Air Lines Inc.,CVG,"Cincinnati, OH",BOS,"Boston, MA",...,,117.0,135.0,,,,,,,1


In [5]:
# Converting the categorical variables into indicator variables. 
flights = pd.get_dummies(flights_df, columns=['Airline'])

In [6]:
flights.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Origin,OriginCityName,Dest,DestCityName,CRSDepTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,0,2021,1,2,6,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
1,1,2021,1,3,7,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
2,2,2021,1,4,1,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
3,3,2021,1,7,4,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0
4,4,2021,1,8,5,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0


In [7]:
# cutting out a segment for the model
flights_data_segment = flights[0:1000000]

In [8]:
flights_data_segment.head()

Unnamed: 0.1,Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,Origin,OriginCityName,Dest,DestCityName,CRSDepTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,0,2021,1,2,6,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
1,1,2021,1,3,7,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
2,2,2021,1,4,1,CVG,"Cincinnati, OH",BOS,"Boston, MA",730,...,0,0,0,1,0,0,0,0,0,0
3,3,2021,1,7,4,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0
4,4,2021,1,8,5,CVG,"Cincinnati, OH",BOS,"Boston, MA",1715,...,0,0,0,1,0,0,0,0,0,0


In [9]:
#Drop unnecessary columns
#flights_df_segment = flights_data_segment.drop(['Origin', 'OriginCityName', 'DestCityName', 'Dest', 'Unnamed: 0', 'Cancelled', 'CarrierDelay', 'WeatherDelay', "NASDelay", 'SecurityDelay', 'LateAircraftDelay', 'FirstDepTime', 'CancellationCode'], axis=1)
flights_df_segment = flights_data_segment.drop(['Origin', 'OriginCityName', 'DestCityName', 'Dest', 'Unnamed: 0', 'Cancelled', 'FirstDepTime', 'CancellationCode'], axis=1)

In [10]:
# replacing all NaN values with the mean of the attribute in which they are present
flights_df_segment=flights_df_segment.fillna(flights_df_segment.mean())

  


In [11]:
flights_df_segment.fillna(0, inplace=True)

In [12]:
flights_df_segment.count()

Year                              1000000
Month                             1000000
DayofMonth                        1000000
DayOfWeek                         1000000
CRSDepTime                        1000000
DepTime                           1000000
DepDelayMinutes                   1000000
CRSArrTime                        1000000
ArrTime                           1000000
CRSElapsedTime                    1000000
ActualElapsedTime                 1000000
CarrierDelay                      1000000
WeatherDelay                      1000000
NASDelay                          1000000
SecurityDelay                     1000000
LateAircraftDelay                 1000000
Is_Delayed                        1000000
Airline_Alaska Airlines Inc.      1000000
Airline_Allegiant Air             1000000
Airline_American Airlines Inc.    1000000
Airline_Delta Air Lines Inc.      1000000
Airline_Frontier Airlines Inc.    1000000
Airline_Hawaiian Airlines Inc.    1000000
Airline_JetBlue Airways           

In [13]:
flights_df_segment

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,CRSDepTime,DepTime,DepDelayMinutes,CRSArrTime,ArrTime,CRSElapsedTime,...,Airline_Alaska Airlines Inc.,Airline_Allegiant Air,Airline_American Airlines Inc.,Airline_Delta Air Lines Inc.,Airline_Frontier Airlines Inc.,Airline_Hawaiian Airlines Inc.,Airline_JetBlue Airways,Airline_Southwest Airlines Co.,Airline_Spirit Air Lines,Airline_United Air Lines Inc.
0,2021,1,2,6,730,733.0,3.0,939,927.0,129.0,...,0,0,0,1,0,0,0,0,0,0
1,2021,1,3,7,730,727.0,0.0,939,924.0,129.0,...,0,0,0,1,0,0,0,0,0,0
2,2021,1,4,1,730,737.0,7.0,939,938.0,129.0,...,0,0,0,1,0,0,0,0,0,0
3,2021,1,7,4,1715,1710.0,0.0,1912,1911.0,117.0,...,0,0,0,1,0,0,0,0,0,0
4,2021,1,8,5,1715,1711.0,0.0,1912,1926.0,117.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,2021,10,13,3,845,839.0,0.0,1119,1101.0,154.0,...,0,0,0,1,0,0,0,0,0,0
999996,2021,10,14,4,845,833.0,0.0,1119,1049.0,154.0,...,0,0,0,1,0,0,0,0,0,0
999997,2021,10,15,5,845,842.0,0.0,1119,1110.0,154.0,...,0,0,0,1,0,0,0,0,0,0
999998,2021,10,16,6,845,845.0,0.0,1119,1126.0,154.0,...,0,0,0,1,0,0,0,0,0,0


In [14]:
for col in ['DepTime', 'DepDelayMinutes', 'ArrTime', 'CRSElapsedTime', 'ActualElapsedTime', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', "Is_Delayed"]:
    flights_df_segment[col] = flights_df_segment[col].astype('int')

In [15]:
data_type= flights_df_segment.dtypes
data_type

Year                              int64
Month                             int64
DayofMonth                        int64
DayOfWeek                         int64
CRSDepTime                        int64
DepTime                           int32
DepDelayMinutes                   int32
CRSArrTime                        int64
ArrTime                           int32
CRSElapsedTime                    int32
ActualElapsedTime                 int32
CarrierDelay                      int32
WeatherDelay                      int32
NASDelay                          int32
SecurityDelay                     int32
LateAircraftDelay                 int32
Is_Delayed                        int32
Airline_Alaska Airlines Inc.      uint8
Airline_Allegiant Air             uint8
Airline_American Airlines Inc.    uint8
Airline_Delta Air Lines Inc.      uint8
Airline_Frontier Airlines Inc.    uint8
Airline_Hawaiian Airlines Inc.    uint8
Airline_JetBlue Airways           uint8
Airline_Southwest Airlines Co.    uint8


# Sppliting Data

In [16]:
# Features and target arrays
y = flights_df_segment.WeatherDelay
X = flights_df_segment.drop(columns=["WeatherDelay"])

In [17]:
from collections import Counter

In [18]:
# Splitting data for training and testing (splitting in the ratio 70:30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [19]:
scaled_features = StandardScaler().fit_transform(X_train, X_test)

# Applying Decision Tree Classifier on Training Data

# Making Predictions and Checking Accuracy

In [20]:
# Resample the training data with the BalancedRandomForestClassifier

from sklearn.preprocessing import StandardScaler
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=1)
rf_model = rf_model.fit(X_train_scaled, y_train)
predictions = rf_model.predict(X_test_scaled)
rf_model

RandomForestClassifier(random_state=1)

In [21]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [22]:
acc_score = balanced_accuracy_score(y_test, predictions)
acc_score



0.018490377954383568

In [23]:
predictions = rf_model.predict(X_test_scaled)
confusion_matrix(y_test,predictions)

array([[27855,     0,     0, ...,     0,     0,     0],
       [   21,     0,     0, ...,     0,     0,     0],
       [   17,     0,     0, ...,     0,     0,     0],
       ...,
       [    0,     0,     0, ...,     0,     0,     0],
       [    0,     0,     0, ...,     1,     0,     0],
       [    0,     0,     0, ...,     0,     0,     0]], dtype=int64)

In [24]:
print(classification_report_imbalanced(y_test, predictions))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.96      1.00      1.00      0.98      1.00      1.00     27855
          1       0.00      0.00      1.00      0.00      0.00      0.00        21
          2       0.00      0.00      1.00      0.00      0.00      0.00        17
          3       0.00      0.00      1.00      0.00      0.00      0.00        17
          4       0.00      0.00      1.00      0.00      0.00      0.00        21
          5       0.00      0.00      1.00      0.00      0.00      0.00        25
          6       0.00      0.00      1.00      0.00      0.00      0.00        23
          7       0.00      0.00      1.00      0.00      0.00      0.00        36
          8       1.00      1.00      1.00      1.00      1.00      1.00    269158
          9       0.00      0.00      1.00      0.00      0.00      0.00        22
         10       0.00      0.00      1.00      0.00      0.00      0.00        31
   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
