# Flight Delay Prediction Part 3 - Classification

We will predict whether a flight's arrival was delayed or not (the `ARR_DEL15` label).

In [None]:
# Mount Google Drive at /content/gdrive so the /content/gdrive/... paths used by the
# later cells resolve. The google.colab module is only available inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

## Classification

In [None]:
# Importing requirements
from google.colab import files
import pandas as pd
import numpy as np
import imblearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Read the RESULT.csv table from the mounted Drive folder into a DataFrame
result_csv = "/content/gdrive/My Drive/Flight Delay/Data/RESULT.csv"
df = pd.read_csv(result_csv)

In [None]:
# Show every column label present in the loaded table
df.columns

In [None]:
# Target vector: the ARR_DEL15 column, i.e. the delayed / not-delayed arrival flag
# that the classifiers below are trained to predict.
labels = np.asarray(df["ARR_DEL15"])

# Columns removed before building the feature matrix (the target itself, the other
# delay columns, and the date/time/index columns).
dropped_columns = [
    "DEP_DELAY", "DEP_DELAY_NEW", "DEP_DEL15",
    "ARR_DELAY", "ARR_DELAY_NEW", "ARR_DEL15",
    "HOUR", "date", "hour", "airport", "index",
    "DEP_TIME", "ARR_TIME", "FL_DATE",
]
df_select = df.drop(columns=dropped_columns)

# Integer encoding: one shared LabelEncoder is refit on each column in turn and the
# column is overwritten with its integer codes.
lenc = LabelEncoder()
for column in (
    "DEST",
    "ORIGIN",
    "OP_UNIQUE_CARRIER",
    "OP_CARRIER_FL_NUM",
    "winddir16Point",
    "weatherCode",
):
    df_select[column] = lenc.fit_transform(df_select[column])

# Feature matrix handed to the classifiers: every remaining column of df_select
features = np.asarray(df_select)

In [None]:
# Persist the encoded feature table to Drive. ARR_DEL15 was dropped from df_select
# above, so this file holds the features only, not the target.
# NOTE(review): to_csv is called with its defaults, which also write the DataFrame's
# row index as an unnamed leading column — confirm downstream readers expect that.
df_select.to_csv("/content/gdrive/My Drive/Flight Delay/Data/FINALTABLE.csv")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)

### Random Forest

In [None]:
# Default Classifier
# Random forest with default hyper-parameters, trained on the original
# (un-resampled) training split. The estimator is seeded, like the train/test split
# and the samplers in this notebook, so the reported numbers are repeatable.
cl = RandomForestClassifier(random_state=42)
cl.fit(features_train, labels_train)
# score() reports mean accuracy on the held-out split
acc_test = cl.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

In [None]:
# Before SMOTE Oversampling
# Support-weighted precision / recall / F1 of the baseline forest `cl` on the test split
pred = cl.predict(features_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Per-class breakdown for the predictions currently held in `pred`
report = classification_report(labels_test, pred)
print(report)

In [None]:
# SMOTE Oversampling
# Only the training split is resampled; the test split is left untouched.
sos = SMOTE(random_state=42)
X_res, y_res = sos.fit_resample(features_train, labels_train)

# Seed the forest as well so this cell's metrics are repeatable from run to run
clf = RandomForestClassifier(random_state=42)
clf.fit(X_res, y_res)
acc_test = clf.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

# Support-weighted summary metrics followed by the per-class report
pred = clf.predict(features_test)
precision = precision_score(labels_test, pred, average="weighted")
recall = recall_score(labels_test, pred, average="weighted")
f1 = f1_score(labels_test, pred, average="weighted")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

### Naive Bayes

In [None]:
# Default Classifier
# Gaussian Naive Bayes fitted on the original training split (fit() returns the estimator)
gnb = GaussianNB().fit(features_train, labels_train)
acc_test = gnb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

In [None]:
# Before application of sampling techniques
# Support-weighted precision / recall / F1 of the baseline `gnb` on the test split
pred = gnb.predict(features_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Per-class breakdown for the predictions currently held in `pred`
report = classification_report(labels_test, pred)
print(report)

In [None]:
# Random Undersampling
# Resample the training split with RandomUnderSampler, then refit Gaussian Naive Bayes on it
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(features_train, labels_train)
gnb = GaussianNB().fit(X_res, y_res)

# Evaluate on the untouched test split
pred = gnb.predict(features_test)
acc_test = gnb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

In [None]:
# Random Oversampling
# Resample the training split with RandomOverSampler, then refit Gaussian Naive Bayes on it
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(features_train, labels_train)
gnb = GaussianNB().fit(X_res, y_res)

# Evaluate on the untouched test split
pred = gnb.predict(features_test)
acc_test = gnb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

In [None]:
# SMOTE Oversampling
# Resample the training split with SMOTE, then refit Gaussian Naive Bayes on it
sos = SMOTE(random_state=42)
X_res, y_res = sos.fit_resample(features_train, labels_train)
gnb = GaussianNB().fit(X_res, y_res)

# Evaluate on the untouched test split
pred = gnb.predict(features_test)
acc_test = gnb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

### AdaBoost

In [None]:
# Default Classifier
# AdaBoost with default hyper-parameters on the original training split. The
# estimator is seeded, like the train/test split and the samplers in this notebook,
# so the reported numbers are repeatable.
adb = AdaBoostClassifier(random_state=42)
adb.fit(features_train, labels_train)
# score() reports mean accuracy on the held-out split
acc_test = adb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

In [None]:
# Before SMOTE Oversampling
# Support-weighted precision / recall / F1 of the baseline `adb` on the test split
pred = adb.predict(features_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# SMOTE Oversampling
# Build the SMOTE-balanced training set inside this cell instead of reusing whatever
# X_res / y_res an earlier cell left behind: the undersampling and random-oversampling
# cells assign the same names, so the result would otherwise depend on execution order.
sos = SMOTE(random_state=42)
X_res, y_res = sos.fit_resample(features_train, labels_train)

# NOTE(review): this run uses n_estimators=100 while the baseline AdaBoost cell uses the
# default, so the before/after comparison changes two things at once — confirm intended.
adb = AdaBoostClassifier(n_estimators=100, random_state=42)
adb.fit(X_res, y_res)
acc_test = adb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

# Support-weighted summary metrics followed by the per-class report
pred = adb.predict(features_test)
precision = precision_score(labels_test, pred, average="weighted")
recall = recall_score(labels_test, pred, average="weighted")
f1 = f1_score(labels_test, pred, average="weighted")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

### Extra Trees

In [None]:
# With SMOTE Oversampling
from sklearn.ensemble import ExtraTreesClassifier

# Build the SMOTE-balanced training set inside this cell instead of reusing whatever
# X_res / y_res an earlier cell left behind: the undersampling and random-oversampling
# cells assign the same names, so the result would otherwise depend on execution order.
sos = SMOTE(random_state=42)
X_res, y_res = sos.fit_resample(features_train, labels_train)

# Seeded so this cell's metrics are repeatable from run to run
etc = ExtraTreesClassifier(random_state=42)
etc.fit(X_res, y_res)
acc_test = etc.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

# Support-weighted summary metrics followed by the per-class report
pred = etc.predict(features_test)
precision = precision_score(labels_test, pred, average="weighted")
recall = recall_score(labels_test, pred, average="weighted")
f1 = f1_score(labels_test, pred, average="weighted")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

### Gradient Boosting

In [None]:
# With SMOTE Oversampling
from sklearn.ensemble import GradientBoostingClassifier

# Build the SMOTE-balanced training set inside this cell instead of reusing whatever
# X_res / y_res an earlier cell left behind: the undersampling and random-oversampling
# cells assign the same names, so the result would otherwise depend on execution order.
sos = SMOTE(random_state=42)
X_res, y_res = sos.fit_resample(features_train, labels_train)

# Seeded so this cell's metrics are repeatable from run to run
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_res, y_res)
acc_test = gb.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

# Support-weighted summary metrics followed by the per-class report
pred = gb.predict(features_test)
precision = precision_score(labels_test, pred, average="weighted")
recall = recall_score(labels_test, pred, average="weighted")
f1 = f1_score(labels_test, pred, average="weighted")
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print(classification_report(labels_test, pred))

### Logistic Regression

In [None]:
# Default Classifier
# Logistic regression with default hyper-parameters on the original training split
# (fit() returns the estimator).
# NOTE(review): the features are passed in unscaled and the default iteration limit is
# used — check the cell output for a ConvergenceWarning.
lr = LogisticRegression().fit(features_train, labels_train)
acc_test = lr.score(features_test, labels_test)
print("Test Accuracy:", acc_test)

In [None]:
# Without sampling
# Support-weighted precision / recall / F1 of `lr` on the test split
pred = lr.predict(features_test)
precision, recall, f1 = (
    scorer(labels_test, pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

In [None]:
# Per-class breakdown for the predictions currently held in `pred`
report = classification_report(labels_test, pred)
print(report)