In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import confusion_matrix
import keras
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Preparing data

In [None]:
# Read the airlines CSV into `df` and show the frame as the cell output.
csv_path = '../input/airlines-dataset-to-predict-a-delay/Airlines.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Drop the 'id' column. Rebinding `df` with errors='ignore' keeps this cell
# safe to re-run: the in-place drop raised KeyError on a second execution
# because the column was already gone.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Print the value frequencies of every column, each followed by a banner line.
cols = df.columns
for col_name in cols:
    counts = df[col_name].value_counts()
    print(counts, '\n')
    print('********************************************\n')

In [None]:
# Use a dedicated encoder per categorical domain. Refitting a single
# LabelEncoder for each column discarded the earlier mappings (only the last
# fit stayed available for inverse_transform) and gave the same airport
# unrelated integer codes in AirportFrom and AirportTo.
airline_encoder = preprocessing.LabelEncoder()
airport_encoder = preprocessing.LabelEncoder()

df['Airline'] = airline_encoder.fit_transform(df['Airline'])

# Fit once on the union of both airport columns so a given airport maps to
# the same code whether it appears as origin or destination.
airport_encoder.fit(pd.concat([df['AirportFrom'], df['AirportTo']]))
df['AirportFrom'] = airport_encoder.transform(df['AirportFrom'])
df['AirportTo'] = airport_encoder.transform(df['AirportTo'])

# Keep the original name bound for any later cell that still refers to `le`.
le = airport_encoder

In [None]:
# Display the frame to inspect the label-encoded columns.
df

# Visualizing some features of the data

In [None]:
# Bar chart of how many rows fall into each Delay class. The column is passed
# by keyword because, since seaborn 0.12, the only positional argument of
# countplot is `data`; a positional Series is no longer interpreted as `x`.
sns.countplot(data=df, x='Delay')

In [None]:
# Histogram of the Time column.
sns.histplot(data=df, x='Time')

In [None]:
# Histogram of the Length column.
sns.histplot(data=df, x='Length')

In [None]:
# Rescale Length and Time by 1/60 into new columns
# (presumably minutes -> hours, going by the new column names).
df['Length_by_hours'] = df['Length'].div(60)
df['Time_by_hour'] = df['Time'].div(60)

In [None]:
# Remove the original Length/Time columns now that the rescaled versions
# exist. errors='ignore' plus rebinding makes the cell idempotent; the
# in-place drop raised KeyError if the cell was executed a second time.
df = df.drop(columns=['Length', 'Time'], errors='ignore')

In [None]:
# Histogram of the rescaled Time_by_hour column.
sns.histplot(data=df, x='Time_by_hour')

In [None]:
# Histogram of the rescaled Length_by_hours column.
sns.histplot(data=df, x='Length_by_hours')

# Feature extraction

In [None]:
# 0 --> Morning | 1 --> Afternoon | 2 --> Evening | 3 --> Night
# Bucket the departure hour into four periods:
#   [5, 12) -> 0, [12, 17) -> 1, [17, 21) -> 2, everything else -> 3.
# Vectorised with np.select instead of a Python loop over `df[col][i]`, which
# was slow on a large frame and relied on the index being exactly 0..n-1.
dep_hour = df['Time_by_hour']
departure_period = np.select(
    [
        (dep_hour >= 5) & (dep_hour < 12),
        (dep_hour >= 12) & (dep_hour < 17),
        (dep_hour >= 17) & (dep_hour < 21),
    ],
    [0, 1, 2],
    default=3,  # also covers NaN, matching the original else-branch
)

df['Departure_period'] = departure_period

In [None]:
# Count how many rows received each Departure_period code.
df['Departure_period'].value_counts()

In [None]:
# 0 --> not holiday | 1 --> holiday
# Flag rows whose DayOfWeek is 6 or 7. A vectorised membership test replaces
# the row-by-row loop, which was slow and depended on a 0..n-1 index.
holiday = df['DayOfWeek'].isin([6, 7]).astype('int64')

df['Holiday'] = holiday

In [None]:
# Count how many rows carry each Holiday flag value.
df['Holiday'].value_counts()

In [None]:
# Arrival hour-of-day: departure hour plus duration, wrapped into [0, 24).
# The modulo gives the same result as the original "subtract 24 when the sum
# is >= 24" loop for sums below 48, and still wraps correctly beyond that.
# It also removes the slow per-row `df[col][i]` lookups.
arrival_time = (df['Time_by_hour'] + df['Length_by_hours']) % 24

df['Arrival_Time'] = arrival_time

In [None]:
# Bucket the arrival hour into four period codes:
#   [5, 12) -> 0, [12, 17) -> 1, [17, 21) -> 2, everything else -> 3.
# Vectorised with np.select instead of a Python loop over `df[col][i]`, which
# was slow on a large frame and relied on the index being exactly 0..n-1.
arr_hour = df['Arrival_Time']
arrival_period = np.select(
    [
        (arr_hour >= 5) & (arr_hour < 12),
        (arr_hour >= 12) & (arr_hour < 17),
        (arr_hour >= 17) & (arr_hour < 21),
    ],
    [0, 1, 2],
    default=3,  # also covers NaN, matching the original else-branch
)

df['Arrival_period'] = arrival_period

In [None]:
# Display the current state of the frame.
df

# Correlation among features

In [None]:
# Pairwise correlation of the frame's columns, drawn as an annotated heatmap.
cor = df.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(cor, annot=True, fmt='.2f', cmap=plt.cm.Reds, ax=ax)
plt.show()

In [None]:
# Separate the target column (Delay) from the remaining feature columns.
y = df['Delay']
X = df.drop(columns=['Delay'])

In [None]:
# 90/10 shuffled split. A fixed random_state makes the split reproducible,
# so the scores reported below no longer change on every run; previously only
# the models were seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, shuffle=True, random_state=42
)

# Some ML Models

In [None]:
# Fit an entropy-criterion decision tree (max depth 16) and report its
# score on the training and test sets.
DecisionTreeClassifierModel = DecisionTreeClassifier(
    random_state=40,
    max_depth=16,
    criterion='entropy',
)
DecisionTreeClassifierModel.fit(X_train, y_train)

dt_train_score = DecisionTreeClassifierModel.score(X_train, y_train)
dt_test_score = DecisionTreeClassifierModel.score(X_test, y_test)
print('DecisionTreeClassifierModel Train Score is : ' , dt_train_score)
print('DecisionTreeClassifierModel Test Score is : ' , dt_test_score)

In [None]:
# Predict on the held-out set and summarise the errors in a confusion matrix.
y_pred_DT = DecisionTreeClassifierModel.predict(X_test)

CM_DT = confusion_matrix(y_test, y_pred_DT)
# `center` takes the numeric value at which to centre a diverging colormap,
# so center=True was read as 1 and distorted the colours. Annotating each
# cell with its integer count is the likely intent.
ax = sns.heatmap(CM_DT, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

print('Confusion Matrix is\n', CM_DT)

In [None]:
# Fit a 400-tree entropy-criterion random forest (max depth 18) and report
# its score on the training and test sets. n_jobs=-1 builds the trees on all
# available cores; with a fixed random_state the fitted forest is the same
# as in a single-core fit, only faster.
RandomForestClassifierModel = RandomForestClassifier(
    criterion='entropy',
    max_depth=18,
    n_estimators=400,
    random_state=44,
    n_jobs=-1,
)
RandomForestClassifierModel.fit(X_train, y_train)

print('RandomForestClassifierModel Train Score is : ' , RandomForestClassifierModel.score(X_train, y_train))
print('RandomForestClassifierModel Test Score is : ' , RandomForestClassifierModel.score(X_test, y_test))

In [None]:
# Predict on the held-out set and summarise the errors in a confusion matrix.
y_pred_RF = RandomForestClassifierModel.predict(X_test)
CM_RF = confusion_matrix(y_test, y_pred_RF)

# `center` takes the numeric value at which to centre a diverging colormap,
# so center=True was read as 1 and distorted the colours. Annotating each
# cell with its integer count is the likely intent.
ax = sns.heatmap(CM_RF, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

print('Confusion Matrix is\n', CM_RF)

In [None]:
# Fit a gradient-boosting classifier (300 estimators, max depth 8,
# learning rate 0.25) and report its score on the training and test sets.
GBCModel = GradientBoostingClassifier(
    learning_rate=0.25,
    max_depth=8,
    n_estimators=300,
    random_state=44,
)
GBCModel.fit(X_train, y_train)

gbc_train_score = GBCModel.score(X_train, y_train)
gbc_test_score = GBCModel.score(X_test, y_test)
print('GBCModel Train Score is : ' , gbc_train_score)
print('GBCModel Test Score is : ' , gbc_test_score)

In [None]:
# Predict on the held-out set and summarise the errors in a confusion matrix.
y_pred_GB = GBCModel.predict(X_test)
CM_GB = confusion_matrix(y_test, y_pred_GB)

# `center` takes the numeric value at which to centre a diverging colormap,
# so center=True was read as 1 and distorted the colours. Annotating each
# cell with its integer count is the likely intent.
ax = sns.heatmap(CM_GB, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

print('Confusion Matrix is\n', CM_GB)

In [None]:
# Hard-voting ensemble over the gradient-boosting, random-forest and
# decision-tree models; fit it and report train/test scores.
ensemble_members = [
    ('GBCModel', GBCModel),
    ('RFCModel', RandomForestClassifierModel),
    ('TDCModel', DecisionTreeClassifierModel),
]
VotingClassifierModel = VotingClassifier(estimators=ensemble_members, voting='hard')
VotingClassifierModel.fit(X_train, y_train)

voting_train_score = VotingClassifierModel.score(X_train, y_train)
voting_test_score = VotingClassifierModel.score(X_test, y_test)
print('VotingClassifierModel Train Score is : ' , voting_train_score)
print('VotingClassifierModel Test Score is : ' , voting_test_score)

In [None]:
# Predict on the held-out set and summarise the errors in a confusion matrix.
y_pred_V = VotingClassifierModel.predict(X_test)
CM_V = confusion_matrix(y_test, y_pred_V)
# `center` takes the numeric value at which to centre a diverging colormap,
# so center=True was read as 1 and distorted the colours. Annotating each
# cell with its integer count is the likely intent.
ax = sns.heatmap(CM_V, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predictions on columns.
ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()
print('Confusion Matrix is\n', CM_V)