In [1]:
import pandas as pd
import pickle

In [2]:
# Load data
clean = pd.read_csv('REDdelay_clean.csv')
clean.head()

Unnamed: 0,DEP_DEL15,MONTH,DAY_OF_WEEK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,AIRPORT_FLIGHTS_MONTH,AIRLINE_FLIGHTS_MONTH,AIRLINE_AIRPORT_FLIGHTS_MONTH,...,CARRIER_NAME_Mesa_Airlines_Inc,"CARRIER_NAME_Midwest_Airline,_Inc",CARRIER_NAME_SkyWest_Airlines_Inc,CARRIER_NAME_Southwest_Airlines_Co,CARRIER_NAME_Spirit_Air_Lines,CARRIER_NAME_United_Air_Lines_Inc,DEPARTING_AIRPORT_Chicago_OHare_International,DEPARTING_AIRPORT_Houston_Intercontinental,DEPARTING_AIRPORT_Los_Angeles_International,principal_component
0,0,1,3,3,1,52,76,23400,62105,3973,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
1,1,1,3,3,1,55,50,23400,62105,3973,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
2,1,1,3,2,1,49,50,23400,62105,3973,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
3,1,1,3,2,1,48,50,23400,62105,3973,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
4,1,1,3,6,1,55,76,23400,62105,3973,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387


## REDdelay_clean.csv is 210 MB

In [3]:
# Define features set
X = clean.copy()
X = X.drop("DEP_DEL15", axis=1)
X.head()

Unnamed: 0,MONTH,DAY_OF_WEEK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,AIRPORT_FLIGHTS_MONTH,AIRLINE_FLIGHTS_MONTH,AIRLINE_AIRPORT_FLIGHTS_MONTH,AVG_MONTHLY_PASS_AIRPORT,...,CARRIER_NAME_Mesa_Airlines_Inc,"CARRIER_NAME_Midwest_Airline,_Inc",CARRIER_NAME_SkyWest_Airlines_Inc,CARRIER_NAME_Southwest_Airlines_Co,CARRIER_NAME_Spirit_Air_Lines,CARRIER_NAME_United_Air_Lines_Inc,DEPARTING_AIRPORT_Chicago_OHare_International,DEPARTING_AIRPORT_Houston_Intercontinental,DEPARTING_AIRPORT_Los_Angeles_International,principal_component
0,1,3,3,1,52,76,23400,62105,3973,3103410,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
1,1,3,3,1,55,50,23400,62105,3973,3103410,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
2,1,3,2,1,49,50,23400,62105,3973,3103410,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
3,1,3,2,1,48,50,23400,62105,3973,3103410,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387
4,1,3,6,1,55,76,23400,62105,3973,3103410,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-11.91387


In [4]:
# Define target
y = clean["DEP_DEL15"].values

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Splitting into Train and Test sets (70% - 30% respectively)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 20)

In [6]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [7]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

In [8]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Choose best learning rate

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

In [10]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators = 20,
                                            learning_rate = learning_rate,
                                            max_features = 5,
                                            max_depth = 3,
                                            random_state = 20)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.794
Accuracy score (validation): 0.794

Learning rate:  0.1
Accuracy score (training): 0.794
Accuracy score (validation): 0.794

Learning rate:  0.25
Accuracy score (training): 0.796
Accuracy score (validation): 0.796

Learning rate:  0.5
Accuracy score (training): 0.797
Accuracy score (validation): 0.797

Learning rate:  0.75
Accuracy score (training): 0.797
Accuracy score (validation): 0.798

Learning rate:  1
Accuracy score (training): 0.798
Accuracy score (validation): 0.798



## Creating Gradient Boosting Classifier using best learning rate = 1

In [11]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators = 20,
                                        learning_rate = 1,
                                        max_features = 5,
                                        max_depth = 3,
                                        random_state = 20)

In [12]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(learning_rate=1, max_features=5, n_estimators=20,
                           random_state=20)

In [13]:
# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).sample(20)

Unnamed: 0,Prediction,Actual
17648,0,0
160862,0,0
33937,0,0
208823,0,0
132662,0,0
67910,0,0
15579,0,0
148090,0,1
130770,0,0
160406,0,0


## Model evaluation

In [14]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [15]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7983104589320773


In [16]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index = ["Actual 0", "Actual 1"],
    columns = ["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,169988,2214
Actual 1,41549,3231


In [17]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.80      0.99      0.89    172202
           1       0.59      0.07      0.13     44780

    accuracy                           0.80    216982
   macro avg       0.70      0.53      0.51    216982
weighted avg       0.76      0.80      0.73    216982



## Random Oversampling

In [18]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

In [19]:
ros = RandomOverSampler(random_state = 20)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

Counter(y_resampled)

Counter({0: 401968, 1: 401968})

In [20]:
# Splitting into Train and Test sets (70% - 30% respectively)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                    test_size = 0.3, random_state = 20)

In [21]:
# X_train.to_csv('X_train_data.csv', index = False) # This file is 210 MB

In [22]:
# Fitting Standard Scaler
X_scaler = scaler.fit(X_train)

In [23]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
# Create a classifier object
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators = 30,
                                            learning_rate = learning_rate,
                                            max_features = 15,
                                            max_depth = 5,
                                            random_state = 20)

    # Fit the model
    classifier.fit(X_train_scaled, y_train)
    print("Learning rate: ", learning_rate)

    # Score the model
    print("Accuracy score (training): {0:.3f}".format(
        classifier.score(
            X_train_scaled,
            y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        classifier.score(
            X_test_scaled,
            y_test)))
    print()

Learning rate:  0.05
Accuracy score (training): 0.616
Accuracy score (validation): 0.616

Learning rate:  0.1
Accuracy score (training): 0.626
Accuracy score (validation): 0.625

Learning rate:  0.25
Accuracy score (training): 0.640
Accuracy score (validation): 0.639

Learning rate:  0.5
Accuracy score (training): 0.649
Accuracy score (validation): 0.647

Learning rate:  0.75
Accuracy score (training): 0.652
Accuracy score (validation): 0.649

Learning rate:  1
Accuracy score (training): 0.653
Accuracy score (validation): 0.649

Learning rate:  1.25
Accuracy score (training): 0.653
Accuracy score (validation): 0.649



In [25]:
# Choose a learning rate and create classifier
classifier = GradientBoostingClassifier(n_estimators = 30,
                                        learning_rate = 1,
                                        max_features = 15,
                                        max_depth = 5,
                                        random_state = 20)

In [26]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

GradientBoostingClassifier(learning_rate=1, max_depth=5, max_features=15,
                           n_estimators=30, random_state=20)

In [27]:
# Make Prediction
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).sample(20)

Unnamed: 0,Prediction,Actual
154039,1,0
200084,1,1
124026,0,0
144490,1,1
7426,1,1
60666,1,1
24280,1,1
91657,0,0
175676,0,0
240043,0,1


In [28]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.6486248916788636


In [29]:
# Generate the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index = ["Actual 0", "Actual 1"],
    columns = ["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,81319,39271
Actual 1,45474,75117


In [30]:
# Generate classification report
print("Classification Report")
print(classification_report(y_test, predictions))

Classification Report
              precision    recall  f1-score   support

           0       0.64      0.67      0.66    120590
           1       0.66      0.62      0.64    120591

    accuracy                           0.65    241181
   macro avg       0.65      0.65      0.65    241181
weighted avg       0.65      0.65      0.65    241181



## Saving the model

In [31]:
# Save the model to disk
filename = 'gb_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [32]:
# Load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [37]:
import joblib as jb

In [39]:
filename = 'gb_model.pkl'
pickle.dump(classifier, open(filename, 'wb'))

In [42]:
# Create an iterator object with write permission - model.pkl
with open('model.pkl', 'wb') as files:
    pickle.dump(classifier, files)

## Making predictions with the loaded model

In [33]:
# Make Prediction
predictions = loaded_model.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).sample(20)

Unnamed: 0,Prediction,Actual
14551,1,0
106863,0,0
240357,0,0
130105,1,1
76508,0,0
43788,0,0
129326,1,1
78231,1,1
140335,0,1
117597,0,0
