In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
dades = pd.read_csv("../Sprint 11/NouFitxer.csv")

Crea almenys tres models de regressió diferents per intentar predir el millor possible l’endarreriment dels vols (ArrDelay) de DelayedFlights.csv.

In [3]:
dades.columns

# We will use 3 regression models to predict Arrival Delay

# Mulitple Linear Regression
# Neural Networks
# Decision Tree (?)

# Our model incorporates 3 independent variables: Departure Delay, Average Speed and Day of Week


Index(['Unnamed: 0', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime',
       'CRSDepTime', 'ArrTime', 'CRSArrTime', 'UniqueCarrier',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'Cancelled',
       'CancellationCode', 'Diverted', 'CarrierDelay', 'WeatherDelay',
       'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'AvSpeed'],
      dtype='object')

In [4]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

In [5]:
# Rescale variables into a 0-1 scale: MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaled_df = scaler.fit_transform(dades[["ArrDelay", "DepDelay", "AvSpeed", "DayOfWeek"]])

In [6]:
scaled_df = pd.DataFrame(scaled_df, columns = ["ArrDelay", "DepDelay", "AvSpeed", "DayOfWeek"])

In [7]:
# It was tried to remove outliers since in some cases it could be quite extreme, but they have been included in the
# end since it improved performance significantly

_________________________________________________________________________________________________________
#########################################################################################################
_________________________________________________________________________________________________________

In [8]:
# First model: Multiple Linear Regression

y = scaled_df["ArrDelay"]
X = scaled_df[["DepDelay", "AvSpeed", "DayOfWeek"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 23)


In [9]:
# We add a constant because statsmodels does not include it in their lineal regression models:

X_train = sm.add_constant(X_train) 
X_test = sm.add_constant(X_test) 


In [10]:
# We fit the model to the training data, then we predict the y with the test X and compare it to the actual y

model = sm.OLS(y_train, X_train).fit()

predictions = model.predict(X_test)

model.summary()

# All independent variables determine Arrival Delay in a statistically significant manner; Average Speed has a 
# strong negative effect, whereas Departure Delay has a strong positive effect

# R^2 is very good, but it is very bad at detecting overfitting; there is no difference between R^2 and Adj. R^2 so
# we should not be too worried about having too many independent variables

# When excluding outliers, this analysis was not as accurate (0,7 R^2 instead of 0,9)

# 1s execution time

0,1,2,3
Dep. Variable:,ArrDelay,R-squared:,0.907
Model:,OLS,Adj. R-squared:,0.907
Method:,Least Squares,F-statistic:,3024000.0
Date:,"Mon, 05 Jul 2021",Prob (F-statistic):,0.0
Time:,22:38:02,Log-Likelihood:,3238200.0
No. Observations:,935616,AIC:,-6476000.0
Df Residuals:,935612,BIC:,-6476000.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0040,3.39e-05,118.229,0.000,0.004,0.004
DepDelay,0.9723,0.000,3010.630,0.000,0.972,0.973
AvSpeed,-0.7852,0.004,-179.191,0.000,-0.794,-0.777
DayOfWeek,-3.998e-05,2.36e-05,-1.690,0.091,-8.63e-05,6.38e-06

0,1,2,3
Omnibus:,949177.131,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,781371798.792
Skew:,4.122,Prob(JB):,0.0
Kurtosis:,144.334,Cond. No.,629.0


In [11]:
# Root Mean Square Error (RMSE)

rmse = np.sqrt(model.mse_total)

print(rmse)

# 2% of the variance is lost with this model (since data is standardised MSE is in fact on a scale to 0 to 1)

0.024844660926039014


_________________________________________________________________________________________________________
#########################################################################################################
_________________________________________________________________________________________________________

In [12]:
# Second model: Neural Networks

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

from sklearn.neural_network import MLPRegressor

neural = MLPRegressor(random_state=0)

In [13]:
# Fitting model:
neural.fit(X_train, y_train)

# 2min 30s execute time

MLPRegressor(random_state=0)

In [14]:
# Predicting Y values:
Y_preds = neural.predict(X_test)

In [15]:
# R^2 score

print(neural.score(X_train, y_train))
print(neural.score(X_test, y_test))

0.9124515439222259
0.9130351095064904


In [16]:
# RMSE score

from sklearn.metrics import mean_squared_error as mse

print(np.sqrt(mse(y_test, Y_preds)))

# 0,7% of the variance is not explained by the neural network model, which is even better than the 
# linear regression model

0.007292404003420018


_________________________________________________________________________________________________________
#########################################################################################################
_________________________________________________________________________________________________________

In [17]:
# Third model: Random Tree

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 12)

from sklearn.ensemble import RandomForestRegressor


In [18]:
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)

rf.fit(X_train, y_train)

# Execution time: 8 minutes

RandomForestRegressor(random_state=42)

In [19]:
Y_pred = rf.predict(X_test)

# Execution time: 1 minute

In [20]:
rf.feature_importances_

# 92% of the prediction power comes from Departure Delay, and just 6% from Average Speed and 1% Weekday

array([0.9222219 , 0.06651678, 0.01126131])

In [21]:
# R^2 score, random forest:
 
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

# Execution time: 2 min 55sec

0.9820290148170939
0.8974523702508646


In [22]:
# RMSE score, random forest:

print(np.sqrt(mse(y_test, Y_preds)))

# 0,3% of the variance is not explained by the random tree model

0.034389114509021816


_________________________________________________________________________________________________________


Compara’ls en base al MSE i al R2 :

In [23]:
# R^2 scores:
#   Linear Regression: 0.907
#   Neural Network Regression: 0.913
#   Random Tree Regression: 0.898

# Even though Neural Networks provides a slightly better result, all three models are very satisfactory in terms 
# of model performance as measured by R^2

In [24]:
# RMSE scores:
#   Linear Regression: 2,48%
#   Neural Network Regression: 0,73%
#   Random Tree Regression: 3,44%

# Note: even if RMSE is an absolute value, all our variables are standardised in the range of 0 to 1, so absolute 
# variance is easily translated into %

# All models are quite good, but Neural Networks provide the best performance in terms of RMSE

In [28]:
# This exercise continues, due to size concerns, at S12 T01 P2