# Flight Delay Prediction
## Regression Engine to predict Arrival Delay

This notebook trains and compares regression models that predict the arrival delay (in minutes) of flights that arrived late.

In [1]:
# Pre-requisites
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Store the classifier models to save time
import joblib

# Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Classifiers from scikit-learn
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Performance metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Read the flight-and-weather CSV; index_col=0 makes the first CSV column the
# row index instead of a data column.
df = pd.read_csv("Data/flight_and_weather.csv", index_col=0)
# Show the frame's dimensions, then its per-column dtype summary.
print(f"\nShape: {df.shape}", end="\n\n")
df.info()

  mask |= (ar1 == a)



Shape: (1851436, 33)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851436 entries, 0 to 1851435
Data columns (total 33 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   FlightDate       object 
 5   OriginAirportID  int64  
 6   Origin           object 
 7   DestAirportID    int64  
 8   Dest             object 
 9   CRSDepTime       int64  
 10  DepTime          float64
 11  DepDelayMinutes  float64
 12  DepDel15         float64
 13  CRSArrTime       int64  
 14  ArrTime          float64
 15  ArrDelayMinutes  float64
 16  ArrDel15         float64
 17  Time_new         int64  
 18  windspeedKmph    int64  
 19  winddirDegree    int64  
 20  weatherCode      int64  
 21  precipMM         float64
 22  visibility       int64  
 23  pressure         int64  
 24  cloudcover       int64  
 25  DewPointF        int64  
 26  WindGustKmph     in

### Eliminating Redundancy
|Column|Reason for elimination|
|:-|:-|
|FlightDate| The columns Year, Month and DayofMonth give this information in separate columns|
|OriginAirportID| It gives the same information as Origin|
|DestAirportID| It gives the same information as Dest|
|**CRSArrTime, ArrTime and ArrDel15**| They **leak information about target ArrDelayMinutes**|
|Time_new|It is a duplicate of time|
|date |It is a duplicate of FlightDate|
|airport|It is a duplicate of Origin|

In [3]:
# Dropping columns with redundant or duplicate data.
# The result is reassigned rather than dropped with inplace=True, so the step
# is a plain expression producing the reduced frame.
df = df.drop(columns=["FlightDate",
                      "OriginAirportID",
                      "DestAirportID",
                      "CRSArrTime",
                      "ArrTime",
                      "ArrDel15",
                      "Time_new",
                      "date",
                      "airport"])
print(f"\nShape: {df.shape}", end="\n\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" line to the output.
df.info()
# df.to_csv("./Data/flight_and_weather_without_redundant_info.csv")


Shape: (1851436, 24)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1851436 entries, 0 to 1851435
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   Origin           object 
 5   Dest             object 
 6   CRSDepTime       int64  
 7   DepTime          float64
 8   DepDelayMinutes  float64
 9   DepDel15         float64
 10  ArrDelayMinutes  float64
 11  windspeedKmph    int64  
 12  winddirDegree    int64  
 13  weatherCode      int64  
 14  precipMM         float64
 15  visibility       int64  
 16  pressure         int64  
 17  cloudcover       int64  
 18  DewPointF        int64  
 19  WindGustKmph     int64  
 20  tempF            int64  
 21  WindChillF       int64  
 22  humidity         int64  
 23  time             int64  
dtypes: float64(5), int64(17), object(2)
memory usage: 353.1+ MB
None


In [4]:
# Encode the airport codes as integers. Each column gets its own encoder so
# both fitted mappings stay available afterwards (e.g. for inverse_transform);
# refitting one shared encoder on "Dest" would discard the "Origin" mapping.
# The encoded values are the same as with a single refitted encoder.
origin_encoder = LabelEncoder()
dest_encoder = LabelEncoder()
df["Origin"] = origin_encoder.fit_transform(df["Origin"])
df["Dest"] = dest_encoder.fit_transform(df["Dest"])
# Backward compatibility: this name used to end the cell fitted on "Dest".
labelEncoder = dest_encoder

# Only need the observations where the flight is delayed.
# Filtering and re-indexing happen in one chained expression, which avoids
# calling an in-place method on a filtered slice of the frame.
df = df[df["ArrDelayMinutes"] > 0].reset_index(drop=True)
# print(df.columns)
# print(df.shape)

# Every column except the target is a feature; the target becomes a NumPy array.
features = df.loc[:, df.columns != "ArrDelayMinutes"]
labels = np.asarray(df["ArrDelayMinutes"])
# print(features.shape)
# print(labels.shape)
# df.to_csv("Data/flight_and_weather_encoded_regression.csv")

In [5]:
# Report the size of the filtered dataset, then hold out 20% of the rows for
# evaluation. The fixed random_state makes the partition repeatable.
print(f"\nDataset shape: {df.shape}")
(
    features_train,
    features_test,
    labels_train,
    labels_test,
) = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=42,
)
print(f"features_train shape: {features_train.shape} | features_test shape: {features_test.shape}")
print(f"labels_train shape: {labels_train.shape} | labels_test shape: {labels_test.shape}")
# The combined frame is not referenced again below, so its name is dropped.
del df


Dataset shape: (700439, 24)
features_train shape: (560351, 23) | features_test shape: (140088, 23)
labels_train shape: (560351,) | labels_test shape: (140088,)


In [6]:
# Running results table; print_metrics returns a new frame with one more row
# for each regressor that is evaluated.
perf_df = pd.DataFrame(columns=["Regressors", "MSE", "RMSE", "MAE", "R2"])


def print_metrics(labels_test, model_pred, regressor_name, perf_df):
    """Print regression metrics and return ``perf_df`` extended by one row.

    Parameters
    ----------
    labels_test : array-like
        Ground-truth target values.
    model_pred : array-like
        Predictions to score against ``labels_test``.
    regressor_name : str
        Label stored in the "Regressors" column of the new row.
    perf_df : pandas.DataFrame
        Results collected so far. It is not modified in place.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the previous rows plus one row with the keys
        "Regressors", "MSE", "RMSE", "MAE" and "R2", indexed 0..n-1.
    """
    mse = mean_squared_error(labels_test, model_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(labels_test, model_pred)
    r2 = r2_score(labels_test, model_pred)

    print(f"MSE      : {mse}", end="\n\n")
    print(f"RMSE     : {rmse}", end="\n\n")
    print(f"MAE      : {mae}", end="\n\n")
    print(f"R2 Score : {r2}", end="\n\n")

    # DataFrame.append was removed in pandas 2.0, so the row is built as a
    # one-row frame and combined with pd.concat instead.
    new_row = pd.DataFrame([{"Regressors": regressor_name,
                             "MSE": mse,
                             "RMSE": rmse,
                             "MAE": mae,
                             "R2": r2}])
    if perf_df.empty:
        # Concatenating onto an empty frame triggers a deprecation warning in
        # recent pandas; start from the new row and keep the caller's columns
        # (and their order) instead.
        columns = perf_df.columns.union(new_row.columns, sort=False)
        return new_row.reindex(columns=columns)
    return pd.concat([perf_df, new_row], ignore_index=True)

## Training Different Regression Models

### Linear Regression

In [7]:
# Reuse the persisted model when it exists; otherwise train it once and save
# it, so the notebook also runs when the artifact is missing.
# NOTE: the file is named "LogisticRegression.joblib" although the model is a
# LinearRegression; the name is kept so an existing artifact still loads.
model_file = "./Regressors/LogisticRegression.joblib"
if Path(model_file).exists():
    model = joblib.load(model_file)
else:
    model = LinearRegression(n_jobs=-1)
    model.fit(features_train, labels_train)
    Path(model_file).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_file)

model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "LinearRegression", perf_df)
# Drop the model and its predictions before the next regressor is evaluated.
del model
del model_pred

MSE      : 246.11533169589046

RMSE     : 15.688063350710006

MAE      : 10.633112849932374

R2 Score : 0.9312750617189874



### Decision Tree Regressor

In [8]:
# Reuse the persisted model when it exists; otherwise train it once and save
# it, so the notebook also runs when the artifact is missing.
model_file = "./Regressors/DecisionTreeRegressor.joblib"
if Path(model_file).exists():
    model = joblib.load(model_file)
else:
    # Fixed seed so a retrained tree gives repeatable metrics.
    model = DecisionTreeRegressor(random_state=42)
    model.fit(features_train, labels_train)
    Path(model_file).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_file)

model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "DecisionTreeRegressor", perf_df)
# Drop the model and its predictions before the next regressor is evaluated.
del model
del model_pred

MSE      : 472.70934341271203

RMSE     : 21.741879942008513

MAE      : 14.630232425332649

R2 Score : 0.8680012324829942



### Gradient Boosting Regressor

In [9]:
# Reuse the persisted model when it exists; otherwise train it once and save
# it, so the notebook also runs when the artifact is missing.
model_file = "./Regressors/GradientBoostingRegressor.joblib"
if Path(model_file).exists():
    model = joblib.load(model_file)
else:
    # Fixed seed so a retrained ensemble gives repeatable metrics.
    model = GradientBoostingRegressor(random_state=42)
    model.fit(features_train, labels_train)
    Path(model_file).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_file)

model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "GradientBoostingRegressor", perf_df)
# Drop the model and its predictions before the next regressor is evaluated.
del model
del model_pred

MSE      : 234.03676085446193

RMSE     : 15.298260059708161

MAE      : 10.367285945784936

R2 Score : 0.9346478667770067



### Random Forest

In [10]:
# Reuse the persisted model when it exists; otherwise train it once and save
# it, so the notebook also runs when the artifact is missing.
model_file = "./Regressors/RandomForestRegressor.joblib"
if Path(model_file).exists():
    model = joblib.load(model_file)
else:
    # Fixed seed so a retrained forest gives repeatable metrics.
    model = RandomForestRegressor(n_jobs=-1, random_state=42)
    model.fit(features_train, labels_train)
    Path(model_file).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_file)

model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "RandomForestRegressor", perf_df)
# Drop the model and its predictions before the next regressor is evaluated.
del model
del model_pred

MSE      : 228.859378607919

RMSE     : 15.128098975347795

MAE      : 10.419297918179321

R2 Score : 0.9360935925385799



### Extra Trees Regressor

In [11]:
# Extra-trees training is randomized, so the seed is fixed to make the
# reported metrics repeatable from one run of the notebook to the next.
model = ExtraTreesRegressor(n_jobs=-1, random_state=42)
model.fit(features_train, labels_train)
# Persistence is switched off for this model; uncomment the two lines below to
# save the fitted model to disk and load it back instead of retraining.
# joblib.dump(model, "./Regressors/ExtraTreesRegressor.joblib")
# model = joblib.load("./Regressors/ExtraTreesRegressor.joblib")
model_pred = model.predict(features_test)
perf_df = print_metrics(labels_test, model_pred, "ExtraTreesRegressor", perf_df)

MSE      : 232.2070527990739

RMSE     : 15.238341537026722

MAE      : 10.500640740106219

R2 Score : 0.9351587921724802



## Performance Summary

In [12]:
# Set name of the regressors as index labels
perf_df.set_index("Regressors", inplace=True)
perf_df

Unnamed: 0_level_0,MSE,RMSE,MAE,R2
Regressors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LinearRegression,246.115332,15.688063,10.633113,0.931275
DecisionTreeRegressor,472.709343,21.74188,14.630232,0.868001
GradientBoostingRegressor,234.036761,15.29826,10.367286,0.934648
RandomForestRegressor,228.859379,15.128099,10.419298,0.936094
ExtraTreesRegressor,232.207053,15.238342,10.500641,0.935159
