In [None]:
# !pip install --user -r requirements.txt

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import lightgbm as lgbm

In [None]:
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, RocCurveDisplay, auc, roc_curve)
from scikitplot.metrics import plot_roc

# Feature Extraction

In [None]:
# Load the airline records and add hour-scaled copies of the Length and Time
# columns (each divided by 60; presumably the originals are in minutes — confirm).
df = pd.read_csv('Dataset/Airlines.csv').assign(
    Length_by_hours=lambda frame: frame['Length'] / 60,
    Time_by_hour=lambda frame: frame['Time'] / 60,
)

In [None]:
#0 --> Morning | 1 --> Afternoon | 2 --> Evening | 3 --> Night
# Bucket the departure hour into a period of day with one vectorised pass:
# [5, 12) -> 0, [12, 17) -> 1, [17, 21) -> 2, anything else -> 3.
# This replaces a per-row Python loop that indexed df[col][i] by label, which is
# slow on large frames and only correct when the index is the default 0..n-1 range.
departure_hour = df['Time_by_hour']
departure_period = np.select(
    [
        (departure_hour >= 5) & (departure_hour < 12),
        (departure_hour >= 12) & (departure_hour < 17),
        (departure_hour >= 17) & (departure_hour < 21),
    ],
    [0, 1, 2],
    default=3,  # also catches missing values, exactly like the old else-branch
).astype('int64')

df['Departure_period'] = departure_period

In [None]:
# 0 --> not holiday | 1 --> holiday
# A row is flagged when DayOfWeek is 6 or 7. The membership test is vectorised,
# replacing a per-row loop that relied on label indexing (df[col][i]) and a
# default 0..n-1 index.
holiday = df['DayOfWeek'].isin([6, 7]).astype('int64')

df['Holiday'] = holiday

In [None]:
# Arrival hour = departure hour + flight length in hours, wrapped once past
# midnight (24 is subtracted when the sum reaches 24 or more).
# Vectorised replacement for a per-row loop; Series.where keeps sums below 24
# and substitutes (sum - 24) everywhere else, so missing values stay missing.
arrival_time = df['Time_by_hour'] + df['Length_by_hours']
arrival_time = arrival_time.where(arrival_time < 24, arrival_time - 24)

df['Arrival_Time'] = arrival_time

In [None]:
# Bucket the arrival hour into a period of day with one vectorised pass:
# [5, 12) -> 0, [12, 17) -> 1, [17, 21) -> 2, anything else -> 3.
# This replaces a per-row Python loop that indexed df[col][i] by label, which is
# slow on large frames and only correct when the index is the default 0..n-1 range.
arrival_hour = df['Arrival_Time']
arrival_period = np.select(
    [
        (arrival_hour >= 5) & (arrival_hour < 12),
        (arrival_hour >= 12) & (arrival_hour < 17),
        (arrival_hour >= 17) & (arrival_hour < 21),
    ],
    [0, 1, 2],
    default=3,  # also catches missing values, exactly like the old else-branch
).astype('int64')

df['Arrival_period'] = arrival_period

In [None]:
# Remove the original Length and Time columns; their hour-scaled copies
# (Length_by_hours, Time_by_hour) remain in the frame.
df = df.drop(columns=['Length', 'Time'])

In [None]:
# Remove the id column so it is not part of the feature matrix built later.
df = df.drop(columns=['id'])

In [None]:
# One LabelEncoder instance is reused for AirportFrom, Airline and AirportTo.
# Each fit_transform call re-fits it, so afterwards `le` only remembers the
# classes of the last column encoded; the per-column dicts keep the mappings.
le = preprocessing.LabelEncoder()

In [None]:
# Integer-encode AirportFrom and keep an {original label: code} lookup.
# dict(zip(...)) builds the same mapping as an explicit loop, but without
# leaking loop variables into the notebook namespace: the previous
# `for x, y in ...` form rebound the global `y`, which is also the name of the
# target vector, so re-running this cell after the split silently clobbered it.
airportfrom_codes = le.fit_transform(df['AirportFrom'])
airportfrom = dict(zip(df['AirportFrom'], airportfrom_codes))
df['AirportFrom'] = airportfrom_codes

In [None]:
# Integer-encode Airline and keep an {original label: code} lookup.
# dict(zip(...)) builds the same mapping as an explicit loop, but without
# leaking loop variables into the notebook namespace: the previous
# `for x, y in ...` form rebound the global `y`, which is also the name of the
# target vector, so re-running this cell after the split silently clobbered it.
airline_codes = le.fit_transform(df['Airline'])
airlines = dict(zip(df['Airline'], airline_codes))
df['Airline'] = airline_codes

In [None]:
# Integer-encode AirportTo and keep an {original label: code} lookup.
# dict(zip(...)) builds the same mapping as an explicit loop, but without
# leaking loop variables into the notebook namespace: the previous
# `for x, y in ...` form rebound the global `y`, which is also the name of the
# target vector, so re-running this cell after the split silently clobbered it.
airportto_codes = le.fit_transform(df['AirportTo'])
airportto = dict(zip(df['AirportTo'], airportto_codes))
df['AirportTo'] = airportto_codes

In [None]:
# Separate the Delay target from the remaining columns, which form the features.
y = df['Delay']
X = df.drop(columns=['Delay'])

In [None]:
# First split: 80% of the rows for training, 20% held out as the test set.
# Stratifying on y keeps the Delay class ratio the same in both parts; the fixed
# random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    stratify=y,
    random_state=66,
)

In [None]:
# Second split: carve a validation set out of the current training rows
# (80% stay in training, 20% become validation), again stratified on the labels.
# X_train / y_train are rebound to the smaller training portion.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    shuffle=True,
    stratify=y_train,
    random_state=66,
)

In [None]:
def save_model(model, path):
    """Serialize ``model`` to ``path`` using joblib.

    Args:
        model: Object to persist (any object joblib can dump).
        path: Destination file path; it is opened in binary write mode, so an
            existing file is overwritten.
    """
    # The context manager closes (and therefore flushes) the handle even when
    # joblib.dump raises; previously the handle returned by open() was never closed.
    with open(path, 'wb') as model_file:
        joblib.dump(model, model_file)

In [None]:
def score_model(yt, yp):
    """Compute accuracy, precision and recall, each rounded to three decimals.

    Args:
        yt: Ground-truth labels, passed as the first argument to each metric.
        yp: Predicted labels, passed as the second argument to each metric.

    Returns:
        A tuple ``(accuracy, precision, recall)``.
    """
    metric_functions = (accuracy_score, precision_score, recall_score)
    return tuple(round(metric(yt, yp), 3) for metric in metric_functions)

def output_result(accuracy, precision, recall):
    """Print the three evaluation metrics, one per line.

    Args:
        accuracy: Value shown after "Accuracy of the model".
        precision: Value shown after "Precision Score of the model".
        recall: Value shown after "Recall Score of the model".
    """
    report_lines = (
        f'Accuracy of the model: {accuracy}',
        f'Precision Score of the model: {precision}',
        f'Recall Score of the model: {recall}',
    )
    print('\n'.join(report_lines))

    
def plot_conf_matrix(yt, yp):
    """Show an annotated heatmap of the confusion matrix for the given labels.

    Args:
        yt: Ground-truth labels.
        yp: Predicted labels.
    """
    # normalize='true' is forwarded to confusion_matrix unchanged.
    normalized_matrix = confusion_matrix(yt, yp, normalize='true')
    # NOTE(review): seaborn documents `center` as a number; True is passed here
    # and kept as-is to preserve the current rendering — confirm the intent.
    heatmap_options = {'annot': True, 'center': True}
    sns.heatmap(normalized_matrix, **heatmap_options)
    plt.show()
    
def print_score(name, model, X, y, Xv, yv):
    """Print ``model.score`` for the training data and for the validation data.

    Args:
        name: Label printed at the start of each line.
        model: Object exposing ``score(features, target)``.
        X: Training features.
        y: Training targets.
        Xv: Validation features.
        yv: Validation targets.
    """
    datasets = (
        (' Train Score is : ', X, y),
        (' Validation Score is : ', Xv, yv),
    )
    for caption, features, target in datasets:
        print(name, caption, model.score(features, target))
    
def print_roc(y_pred, y_test, estimator_name='example estimator'):
    """Plot a ROC curve and show it, with the AUC passed to the display.

    The two data arguments are forwarded to ``roc_curve`` positionally and in
    order. ``roc_curve`` expects ``(y_true, y_score)``, so despite the parameter
    names the FIRST argument must hold the ground-truth labels and the SECOND
    the scores or predictions. The names are kept for backward compatibility.

    Args:
        y_pred: Forwarded as the first argument of ``roc_curve`` (ground truth).
        y_test: Forwarded as the second argument of ``roc_curve`` (scores).
        estimator_name: Label handed to ``RocCurveDisplay``. The default is the
            value that used to be hard-coded, so existing calls are unchanged.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_pred, y_test)
    roc_auc = auc(fpr, tpr)
    display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                              estimator_name=estimator_name)
    display.plot()
    plt.show()
    
def compute_weights(y):
    """Return the share of labels 0 and 1 in ``y`` as a ``{label: fraction}`` dict.

    NOTE(review): each weight is proportional to the class frequency, so the
    more common class receives the larger weight. If the goal is to balance the
    classes, the weights would need to be inverted — confirm the intent.

    Args:
        y: pandas Series of labels; only the values 0 and 1 are counted.

    Returns:
        ``{0: n_zero / (n_zero + n_one), 1: n_one / (n_zero + n_one)}``. A class
        absent from ``y`` gets weight 0.0 instead of raising ``KeyError``.

    Raises:
        ValueError: If ``y`` contains neither label, since the fractions would
            be undefined.
    """
    counts = y.value_counts()  # computed once instead of once per class
    zero = counts.get(0, 0)
    one = counts.get(1, 0)
    total = zero + one
    if total == 0:
        raise ValueError('compute_weights needs at least one sample labelled 0 or 1')
    return {0: zero / total, 1: one / total}

# Train Random Forest

In [None]:
# Random-forest hyperparameters consumed by the training cell, which wraps each
# one in int() before use.
# NOTE(review): the int() casts suggest these values may be injected as strings
# by pipeline/tuning tooling — keep this cell to plain assignments; confirm.
max_depth=20
n_estimators=50
min_samples_split=2

In [None]:
# Single-point "grid": every hyperparameter list holds exactly one value, so
# GridSearchCV fits that one configuration with 4-fold cross-validation.
# The int() casts accept hyperparameters supplied as strings.
parameters = {'n_estimators': [int(n_estimators)],
              'max_depth': [int(max_depth)],
              'min_samples_split': [int(min_samples_split)]}

# Class weights are derived from the training labels only. They were previously
# computed from the full `y`, which also contains the validation and test labels.
RandomForestClassifierModel = RandomForestClassifier(random_state=66, class_weight=compute_weights(y_train))

rf_grid = GridSearchCV(RandomForestClassifierModel, parameters, cv=4)
rf_grid.fit(X_train, y_train)

print_score('RandomForestClassifierModel', rf_grid, X_train, y_train, X_val, y_val)

# Test Random Forest

In [None]:
# Evaluate the fitted search object on the validation split and persist it.
y_pred_RF = rf_grid.predict(X_val)
save_model(rf_grid, "Katib-RandomForest.sav")

rf_accuracy_score, rf_precision_score, rf_recall_score = score_model(y_val, y_pred_RF)
output_result(rf_accuracy_score, rf_precision_score, rf_recall_score)
plot_conf_matrix(y_val, y_pred_RF)

# A ROC curve needs continuous scores: hard 0/1 predictions collapse it to a
# single operating point. Column 1 of predict_proba is the probability of the
# second entry of classes_ (label 1 when the target is coded 0/1).
y_score_RF = rf_grid.predict_proba(X_val)[:, 1]
print_roc(y_val, y_score_RF)

# Pipeline Metrics

In [None]:
print(rf_accuracy_score)
print(rf_precision_score)
print(rf_recall_score)