<a href="https://colab.research.google.com/github/MikelBros/flight_delay_predictor/blob/master/flight_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries and data loading
---
This code is partially based on:
https://github.com/codeclassifiers/MachineHack-2019-Flight-Price-Prediction-Hackathon/blob/master/Hackathon_dataset.ipynb

The data was downloaded from: http://stat-computing.org/dataexpo/2009/the-data.html



In [0]:
from sklearn import metrics
import io
import os
import random

import numpy as np
import scipy as sp
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

from google.colab import files
from google.colab import drive

from scipy.stats import skew
from scipy.special import boxcox1p
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


# Mount google drive so that the data can be loaded from it
drive.mount('/content/gdrive', force_remount=True)
path = '/content/gdrive/My Drive/2007.csv'

if os.path.isfile(path):
  data = pd.read_csv(path)
  print ("Size of loaded data: " + str(data.shape))
else:
  print('csv file not found')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
Size of loaded data: (7453215, 29)


# Preprocessing



In [0]:
print ("Size of loaded data: " + str(data.shape))
preprocessed_data = data
print ("Size of loaded data: " + str(preprocessed_data.shape))
# are there null values in the dataset?
print(preprocessed_data.isnull().values.any())
# where
print(preprocessed_data.isnull().sum())
# Drop a posteriori knowledge. Using knowledge oobtined after knowing
# that a flight has been delayed is cheating!!
preprocessed_data = preprocessed_data.drop('CancellationCode', axis=1)
preprocessed_data = preprocessed_data.drop('DepTime', axis=1)
preprocessed_data = preprocessed_data.drop('ArrTime', axis=1)
preprocessed_data = preprocessed_data.drop('TaxiOut', axis=1)
preprocessed_data = preprocessed_data.drop('TaxiIn', axis=1)
# scheduled departure column is called CRSDepTime
preprocessed_data = preprocessed_data.drop('Diverted', axis=1) 
preprocessed_data = preprocessed_data.drop('Cancelled', axis=1) 
preprocessed_data = preprocessed_data.drop('DepDelay', axis=1) 
preprocessed_data = preprocessed_data.drop('CarrierDelay', axis=1)
preprocessed_data = preprocessed_data.drop('WeatherDelay', axis=1)
preprocessed_data = preprocessed_data.drop('NASDelay', axis=1)
preprocessed_data = preprocessed_data.drop('SecurityDelay', axis=1)
preprocessed_data = preprocessed_data.drop('LateAircraftDelay', axis=1)
preprocessed_data = preprocessed_data.drop('ActualElapsedTime', axis=1)
preprocessed_data = preprocessed_data.drop('AirTime', axis=1)
# drop null column values
print ("Size of loaded data: " + str(preprocessed_data.shape))
preprocessed_data.dropna(inplace=True)

print ("Size of loaded data: " + str(preprocessed_data.shape))

# Select duplicate rows except first occurrence based on all columns
duplicateRowsDF = preprocessed_data[preprocessed_data.duplicated()]
print("Total Duplicate Rows:")
print(duplicateRowsDF.shape)
# remove duplicate rows in training dataset
preprocessed_data.drop_duplicates(keep='first', inplace=True)

print ("Size of data after removing duplicated entries: " + str(preprocessed_data.shape))

Size of loaded data: (7453215, 29)
Size of loaded data: (7453215, 29)
True
Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               160748
CRSDepTime                 0
ArrTime               177927
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                   22
ActualElapsedTime     177927
CRSElapsedTime           994
AirTime               177927
ArrDelay              177927
DepDelay              160748
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     7292466
Diverted                   0
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
dtype: int64
Size of loaded data: (7453215, 14)
Size of loaded data: (7275288, 14)
T

# Feature engineering

In [0]:
#preprocessed_data.rename(columns={'DayofMonth': 'Day'}, inplace=True)
#preprocessed_data['DATE'] = pd.to_datetime(preprocessed_data[['Year', 'Month', 'Day']])
#preprocessed_data = preprocessed_data.drop('Year', axis=1)
#preprocessed_data = preprocessed_data.drop('Month', axis=1)
#preprocessed_data = preprocessed_data.drop('Day', axis=1)

#print ("Size of loaded data: " + str(preprocessed_data.shape))

Size of loaded data: (7275259, 12)


# Data transformaton

In [0]:
#Separate categorical and numerical columns in dataframe
data_categorical = preprocessed_data.select_dtypes(exclude=['int', 'float'])
data_numerical = preprocessed_data.select_dtypes(include=['int', 'float'])

#Label encode and hot encode categorical columns
le = LabelEncoder()
data_categorical = data_categorical.apply(LabelEncoder().fit_transform)

# Find skewed numerical features
skewed_feats = data_numerical.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness.head(10)
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

# Box cox tranform the skewed numerical features
skewed_features = skewness.index
lam = 0.5
for feat in skewed_features:
  # Keep the target untouched, we want to log-tranform it
  if feat == 'ArrDelay':
    print('I am passing')
  else:
    data_numerical[feat] = boxcox1p(data_numerical[feat], lam)


Skew in numerical features: 

There are 7 skewed numerical features to Box Cox transform
I am passing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [0]:


print(data_numerical.isnull().values.any())

# Merge categorical and numerical columns back into data
preprocessed_data = pd.concat([data_categorical, data_numerical], axis=1)

print(preprocessed_data.head())
print("data size after data processing: " + str(preprocessed_data.shape))
print(preprocessed_data.isnull().values.any())
print(preprocessed_data.isnull().sum())

preprocessed_data.dropna(inplace=True)



# log transform the target (first translate to get positive values)
y_data = np.log(preprocessed_data["ArrDelay"] + 1 - data["ArrDelay"].min())
print((y_data.min()))
# drop it from the input data
preprocessed_data = preprocessed_data.drop(["ArrDelay"], axis=1)

x_data = preprocessed_data
print ("Size of loaded data: " + str(preprocessed_data.shape))

False
   UniqueCarrier  TailNum  Origin  ...  CRSElapsedTime  ArrDelay   Distance
0             17     1547     268  ...       15.435596       1.0  37.496835
1             17     1662     268  ...       17.078784       8.0  41.817805
2             17     3787     268  ...       17.078784      34.0  41.817805
3             17     1623     268  ...       17.078784      26.0  41.817805
4             17     2407     268  ...       17.078784      -3.0  41.817805

[5 rows x 12 columns]
data size after data processing: (7275259, 12)
False
UniqueCarrier     0
TailNum           0
Origin            0
Dest              0
DATE              0
DayOfWeek         0
CRSDepTime        0
CRSArrTime        0
FlightNum         0
CRSElapsedTime    0
ArrDelay          0
Distance          0
dtype: int64
0.0
Size of loaded data: (7275259, 11)


# Linear regression

In [0]:
# We have created a function to print accuracy metrics which can be used
# to get accuracy metrics of all models in upcoming steps
def print_accuracy_report(y_test, y_pred, X_test, model):
    print('R Squared(Accuracy)', metrics.r2_score(y_test, y_pred))
    print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
    print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
    print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    print('Root Mean Squared Log Error', np.sqrt(metrics.mean_squared_log_error(y_test, y_pred)))


# we have created a function to generate linear regression model
# which can then be called again after feature selection or other steps
from sklearn.linear_model import LinearRegression


def LinearRegressionModel(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    #for i, values in enumerate(y_test):
    #    print(str(y_pred[i]), str(y_test[i]))
    print(print_accuracy_report(y_test, y_pred, X_test, regressor))
    return regressor

linearModel = LinearRegressionModel(x_data, y_data)

R Squared(Accuracy) 0.025959526034579516
Mean Absolute Error: 0.06430969783106819
Mean Squared Error: 0.010047198857869291
Root Mean Squared Error: 0.10023571647805632
Root Mean Squared Log Error 0.014485118319833797
None


# Gradient boosting

In [0]:
from sklearn.ensemble import GradientBoostingRegressor 


def GradientBoostingRegressorModel(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.1)
    rf = GradientBoostingRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(print_accuracy_report(y_test, y_pred, X_test, rf))
    return rf
    
gradientBoostingRegressorModel = GradientBoostingRegressorModel(x_data, y_data)

R Squared(Accuracy) 0.0720204230934468
Mean Absolute Error: 0.062365917884732586
Mean Squared Error: 0.009572082058627624
Root Mean Squared Error: 0.09783701783388343
Root Mean Squared Log Error 0.014130943736311767
None


In [0]:
#similarly define a function for random forest regressor
from sklearn.ensemble import RandomForestRegressor


def RandomForestRegressorModel(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.3)
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print(print_accuracy_report(y_test, y_pred, X_test, rf))
    return rf

randomForestModel = RandomForestRegressorModel(x_data, y_data)



In [0]:
def plotFeatureImportances(model):
    #first print all features importances in descending order
    feature_importances = pd.DataFrame(model.feature_importances_,
                                   index = x_data.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
    print(feature_importances)
    # Next plot feature importances to get idea about where the curve breaks
    # in the graph i.e. select top appropriate features
    features = x_data.columns.tolist()
    importances = model.feature_importances_
    indices = np.argsort(importances)
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importance')
    plt.show()

plotFeatureImportances(randomForestModel)
plotFeatureImportances(gradientBoostingRegressorModel)
plotFeatureImportances(extraTreeRegressorModel)
