<a href="https://colab.research.google.com/github/noahruiz416/Airline_On_Time_AZ_Market/blob/main/Airline_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# In this notebook we will build a regression model based on the findings in the EDA. Additionally, rudimentary feature engineering will be used to find the best features to feed into the regression model.

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive 
from sklearn.linear_model import LinearRegression
from scipy import stats
# Mount Google Drive at /content/gdrive so files stored in Drive can be read from this runtime.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the aggregated airline CSV from the mounted Drive folder and replace
# every missing value with 0 in a single chained expression.
csv_path = "gdrive/My Drive/bq-results-20220226-154935-8t4st3sl5exn/ad.csv"
airline_aggr_data = pd.read_csv(csv_path).fillna(value = 0)

In [None]:
#calculating variance inflation factors (VIF) to detect multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):
    """Return the variance inflation factor (VIF) of every column in ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric predictor columns. The values are handed to statsmodels
        unchanged; no intercept/constant column is added here.
        NOTE(review): confirm whether a constant column should be included
        before interpreting the absolute VIF values.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with two columns: ``"variables"``
        (the column name) and ``"VIF"`` (its variance inflation factor).
    """
    scores = [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ]
    return pd.DataFrame({"variables": X.columns, "VIF": scores})

## Here we use the VIF to test for collinearity between variables. After doing so, we create interaction terms for the variables that show high collinearity.

In [None]:
# Compute the VIF of each candidate predictor (delay causes, distance and
# taxi times) to see which ones are strongly collinear with the others.
vif_columns = [
    'DepDelay',
    'CarrierDelay',
    'WeatherDelay',
    'NASDelay',
    'SecurityDelay',
    'LateAircraftDelay',
    'Distance',
    'TaxiOut',
    'TaxiIn',
]
x = airline_aggr_data[vif_columns]
calc_vif(x)

Unnamed: 0,variables,VIF
0,DepDelay,13.668774
1,CarrierDelay,6.616666
2,WeatherDelay,1.360376
3,NASDelay,1.985785
4,SecurityDelay,1.030361
5,LateAircraftDelay,5.433587
6,Distance,2.770194
7,TaxiOut,2.705729
8,TaxiIn,2.084327


In [None]:
# Interaction terms: element-wise products of the three predictors with the
# highest VIFs (DepDelay, CarrierDelay, LateAircraftDelay), added as new columns.
interaction_pairs = {
    'interaction_dep_carrier': ('DepDelay', 'CarrierDelay'),
    'interaction_late': ('DepDelay', 'LateAircraftDelay'),
    'interaction_late_carrier': ('CarrierDelay', 'LateAircraftDelay'),
}
for new_column, (left, right) in interaction_pairs.items():
    airline_aggr_data[new_column] = airline_aggr_data[left] * airline_aggr_data[right]

## In this section, we iterate through multiple models in order to find the best-fitting model for arrival delay.

In [None]:
from sklearn.model_selection import train_test_split
#complete model with interaction terms
# Predictors: the three high-VIF delay columns plus the two DepDelay interactions.
X = airline_aggr_data[['DepDelay', 'CarrierDelay', 'LateAircraftDelay', 'interaction_dep_carrier', 'interaction_late']]
y = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [None]:
# Fit a linear regression on the training split, then print its R^2 on the
# held-out split.
regr = LinearRegression().fit(X_train, y_train)
test_r2 = regr.score(X_test, y_test)
print(test_r2)

0.9164873757936949


In [None]:
#reduced model
# Same three high-VIF delay columns as the complete model, without interaction terms.
X = airline_aggr_data[['DepDelay', 'CarrierDelay', 'LateAircraftDelay']]
y = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

regr = LinearRegression()
regr.fit(X_train, y_train)
# R^2 on the held-out split
print(regr.score(X_test, y_test))

0.9116172582683575


In [None]:
#model ignoring any interaction between terms, and instead focusing on terms with low collinearity, with the exception of departure delay
X = airline_aggr_data[['DepDelay', 'SecurityDelay', 'WeatherDelay', 'NASDelay']]
y = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

regr = LinearRegression()
regr.fit(X_train, y_train)
# R^2 on the held-out split
print(regr.score(X_test, y_test))

0.9304725338077754


In [None]:
#Model with low collinearity + interaction between all variables
# All six delay columns plus all three pairwise interaction terms.
X = airline_aggr_data[['DepDelay', 'CarrierDelay', 'SecurityDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay', 'interaction_dep_carrier', 'interaction_late', 'interaction_late_carrier']]
y = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

regr = LinearRegression()
regr.fit(X_train, y_train)
# R^2 on the held-out split
print(regr.score(X_test, y_test))

0.980274965673917


In [None]:
#Low Collinearity + Interaction between departure delay and carrier delay, and interaction between departure delay and late aircraft delay
# Only the two DepDelay interactions are included here. 'interaction_late_carrier'
# is deliberately left out so this model differs from the all-interactions model.
X = airline_aggr_data[['DepDelay', 'CarrierDelay', 'SecurityDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay', 'interaction_dep_carrier', 'interaction_late']]
y = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

regr = LinearRegression()
regr.fit(X_train, y_train)
# R^2 on the held-out split
print(regr.score(X_test, y_test))

0.9769256086932188


## In this section we use several metrics other than the regression score (R²) to test the effectiveness of our best models

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

In [None]:
#best model with interaction
# All six delay columns plus all three pairwise interaction terms.
X = airline_aggr_data[['DepDelay', 'CarrierDelay', 'SecurityDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay', 'interaction_dep_carrier', 'interaction_late', 'interaction_late_carrier']]
y = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

regr = LinearRegression()
regr.fit(X_train, y_train)
# R^2 on the held-out split
print(regr.score(X_test, y_test))

0.9750977213649205


In [None]:
#testing complete model
# Evaluate on the held-out split only: regr was fitted on X_train, so
# predicting on all of X would mix training rows into the metrics and
# overstate how well the model generalises.
predicted_y_values = regr.predict(X_test)
actual_y_values = y_test

In [None]:
# R^2 between the observed arrival delays and the model's predictions
r2_score(y_true=actual_y_values, y_pred=predicted_y_values)

0.9779008188661079

In [None]:
# Explained-variance score of the predictions against the observed arrival delays
explained_variance_score(y_true=actual_y_values, y_pred=predicted_y_values)

0.9779008195375242

In [None]:
# Largest absolute difference between an observed and a predicted arrival delay
max_error(y_true=actual_y_values, y_pred=predicted_y_values)

985.0011001036534

In [None]:
# Mean absolute error of the predictions against the observed arrival delays
mean_absolute_error(y_true=actual_y_values, y_pred=predicted_y_values)

3.440484730100887

In [None]:
# Mean squared error of the predictions against the observed arrival delays
mean_squared_error(y_true=actual_y_values, y_pred=predicted_y_values)

35.45310456427907

In [None]:
#best model without interaction, 'reduced' model
# All six delay columns, no interaction terms.
X1 = airline_aggr_data[['DepDelay', 'CarrierDelay', 'SecurityDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay']]
y1 = airline_aggr_data['ArrDelay']

# Hold out 10% of the rows for scoring. A fixed random_state makes the split,
# and therefore the reported score, reproducible across runs.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.10, random_state=42)

regr1 = LinearRegression()
regr1.fit(X1_train, y1_train)
# R^2 on the held-out split
print(regr1.score(X1_test, y1_test))

0.9698040450004816


In [None]:
#testing reduced model
# Evaluate on the held-out split only: regr1 was fitted on X1_train, so
# predicting on all of X1 would mix training rows into the metrics and
# overstate how well the model generalises.
predicted_y_values = regr1.predict(X1_test)
actual_y_values = y1_test

In [None]:
# R^2 between the observed arrival delays and the reduced model's predictions
r2_score(y_true=actual_y_values, y_pred=predicted_y_values)

0.9771657989671639

In [None]:
# Explained-variance score of the reduced model's predictions against the observed arrival delays
explained_variance_score(y_true=actual_y_values, y_pred=predicted_y_values)

0.9771657997291884

In [None]:
# Largest absolute difference between an observed and a predicted arrival delay (reduced model)
max_error(y_true=actual_y_values, y_pred=predicted_y_values)

998.3560843369692

In [None]:
# Mean absolute error of the reduced model's predictions against the observed arrival delays
mean_absolute_error(y_true=actual_y_values, y_pred=predicted_y_values)

3.517015802814346