In [None]:
import pandas as pd
import numpy as np
import pickle
# import scipy.stats as st
# import plotly.express as px

#stats
import statsmodels.api as sm

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import TransformedTargetRegressor

# For Linear Regression

## Vanilla Baseline Model:
- No feature scaling
- No removal of correlated features

### All features:

In [None]:
# Prepend an intercept column: statsmodels' OLS does not add one by itself.
X_all = sm.add_constant(X)

# Baseline ordinary least squares on every feature (no scaling, no filtering).
model = sm.OLS(endog=y, exog=X_all).fit()
# Optional persistence of the fitted model (disabled):
# pickle.dump(model, open('Data/Pickles/vanilla_ls_allfeat_noscalling.sav', 'wb'))
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              arr_delay   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     34.45
Date:                Wed, 22 Sep 2021   Prob (F-statistic):           4.28e-87
Time:                        17:25:55   Log-Likelihood:            -2.6493e+05
No. Observations:               50000   AIC:                         5.299e+05
Df Residuals:                   49986   BIC:                         5.300e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     9.55

In [None]:
# Model Evaluation: in-sample metrics for the all-features OLS model.
y_pred = model.predict(X_all)
n = y.shape[0]
# Number of predictors, excluding the intercept column added by sm.add_constant;
# counting it would put one degree of freedom too many in the denominator below.
p = X_all.shape[1] - 1
print('R2 (adj) =', 1-(1-r2_score(y, y_pred))*(n-1)/(n-p-1))
# mean_squared_error returns the MSE; take the square root to report the RMSE.
print('RMSE =', np.sqrt(mean_squared_error(y, y_pred)))
print('MAE =', mean_absolute_error(y, y_pred))


R2 (adj) = 0.008603122510426053
RMSE = 2343.579124388721
MAE = 20.949733276135145


### Removing features with p-value > 0.05 (the null hypothesis that the coefficient equals 0 cannot be rejected):

In [None]:
# Drop the features whose coefficients were not significant (p-value > 0.05)
# in the all-features model. The intercept column of X_all is kept.
# NOTE(review): the original list named 'orig_airp_pss_ind' twice; the duplicate
# is removed here. Confirm that no other column (e.g. an '..._fl_ind' one) was
# meant in its place.
X_pval_filter = X_all.drop(columns=['Season', 'Weekday', 'dest_airp_fl_ind',
       'orig_airp_pss_ind', 'dest_airp_pss_ind'])

# Refit ordinary least squares on the reduced feature set.
model = sm.OLS(y, X_pval_filter).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              arr_delay   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     55.12
Date:                Wed, 22 Sep 2021   Prob (F-statistic):           8.18e-90
Time:                        17:38:08   Log-Likelihood:            -2.6494e+05
No. Observations:               50000   AIC:                         5.299e+05
Df Residuals:                   49991   BIC:                         5.300e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     8.98

In [None]:
# Model Evaluation: in-sample metrics for the p-value-filtered OLS model.
y_pred = model.predict(X_pval_filter)
n = y.shape[0]
# Number of predictors, excluding the intercept column inherited from X_all;
# counting it would put one degree of freedom too many in the denominator below.
p = X_pval_filter.shape[1] - 1
print('R2 (adj) =', 1-(1-r2_score(y, y_pred))*(n-1)/(n-p-1))
# mean_squared_error returns the MSE; take the square root to report the RMSE.
print('RMSE =', np.sqrt(mean_squared_error(y, y_pred)))
print('MAE =', mean_absolute_error(y, y_pred))


R2 (adj) = 0.008565253114060645
RMSE = 2343.9030816633085
MAE = 20.955272884479278


## Vanilla model with scaling (features and/or target)

### Keeping all Features:

#### With original features:

In [None]:
# LinearRegression and TransformedTargetRegressor are already imported in the
# imports cell at the top of the notebook, so they are not re-imported here.
# TODO: try RandomForestRegressor as the `regressor` argument below.

# Linear regression trained on log(target); predictions are mapped back with exp.
ttr_lr = TransformedTargetRegressor(regressor=LinearRegression(), func=np.log, inverse_func=np.exp)
# Shift EVERY target value by 1 so that zero delays stay inside the domain of
# np.log (log(0) is -inf). The model is therefore fitted on the y + 1 scale.
y_plus1 = y + 1
# np.log is only defined for strictly positive values; fail early and clearly
# instead of letting NaN / -inf targets surface as an obscure error during fit.
if np.any(np.asarray(y_plus1) <= 0):
    raise ValueError('y + 1 must be strictly positive to apply the log transform')
# fitting
ttr_lr.fit(X_all, y_plus1)

TransformedTargetRegressor(regressor=LinearRegression())

In [None]:
# Model Evaluation: in-sample metrics for the log-target linear regression.
# ttr_lr was fitted on y_plus1 (= y + 1), so its predictions are on that shifted
# scale; subtract 1 to compare them against the original target y.
y_pred = ttr_lr.predict(X_all) - 1
n = y.shape[0]
# Number of predictors, excluding the intercept column added by sm.add_constant;
# counting it would put one degree of freedom too many in the denominator below.
p = X_all.shape[1] - 1
print('R2 (adj) =', 1-(1-r2_score(y, y_pred))*(n-1)/(n-p-1))
# mean_squared_error returns the MSE; take the square root to report the RMSE.
print('RMSE =', np.sqrt(mean_squared_error(y, y_pred)))
print('MAE =', mean_absolute_error(y, y_pred))


R2 (adj) = 0.008603122510426053
RMSE = 2343.579124388721
MAE = 20.949733276135145


#### Standardizing features:

In [None]:
from sklearn.preprocessing import StandardScaler

# Features to standardize (zero mean, unit variance).
stand_feat = ['crs_elapsed_time', 'distance']
# Column 0 / 1 of the result correspond to stand_feat[0] / stand_feat[1].
standardized_feat = StandardScaler().fit_transform(X_all[stand_feat])

# New frame in which the standardized columns REPLACE the originals. Keeping both
# would leave each standardized column as an exact affine copy of its original
# (perfectly collinear features); the original cell only had the drop commented out.
X_stscaled = X_all.drop(columns=stand_feat)
X_stscaled['crs_elap_t_std'] = standardized_feat[:, 0]
X_stscaled['dist_std'] = standardized_feat[:, 1]

In [None]:
# Log-target linear regression on the standardized feature frame: the target is
# passed through np.log before fitting and predictions are mapped back with np.exp.
ttr_lr_Xstd = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log,
    inverse_func=np.exp,
).fit(X_stscaled, y_plus1)  # fit() returns the fitted estimator itself
ttr_lr_Xstd

TransformedTargetRegressor(func=<ufunc 'log'>, inverse_func=<ufunc 'exp'>,
                           regressor=LinearRegression())

## Feature selection: correlation and low variance

## Removing redundant correlated features:

In [None]:
# Drop redundant (correlated) features. The intercept column of X_all is kept.
# NOTE(review): this list is identical to the one used for the p-value filter and
# originally named 'orig_airp_pss_ind' twice (duplicate removed here). Confirm that
# it reflects the correlation analysis and is not a copy-paste leftover.
X_correl_filter = X_all.drop(columns=['Season', 'Weekday', 'dest_airp_fl_ind',
       'orig_airp_pss_ind', 'dest_airp_pss_ind'])

# Fit ordinary least squares on the correlation-filtered features. The original
# cell fitted on X_pval_filter, leaving X_correl_filter unused.
model = sm.OLS(y, X_correl_filter).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              arr_delay   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     55.12
Date:                Wed, 22 Sep 2021   Prob (F-statistic):           8.18e-90
Time:                        17:38:08   Log-Likelihood:            -2.6494e+05
No. Observations:               50000   AIC:                         5.299e+05
Df Residuals:                   49991   BIC:                         5.300e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     8.98

In [None]:

# Model Evaluation: in-sample metrics for the correlation-filtered OLS model.
# Uses X_correl_filter (this section's design matrix), not X_pval_filter.
y_pred = model.predict(X_correl_filter)
n = y.shape[0]
# Number of predictors, excluding the intercept column inherited from X_all;
# counting it would put one degree of freedom too many in the denominator below.
p = X_correl_filter.shape[1] - 1
print('R2 (adj) =', 1-(1-r2_score(y, y_pred))*(n-1)/(n-p-1))
# mean_squared_error returns the MSE; take the square root to report the RMSE.
print('RMSE =', np.sqrt(mean_squared_error(y, y_pred)))
print('MAE =', mean_absolute_error(y, y_pred))


NameError: name 'model' is not defined