In [1]:
# Jupyter setup to expand cell display to 100% width on your screen (optional)
# Import relevant modules and setup for calling glmnet
%reset -f
%matplotlib inline

from sqlalchemy import create_engine
import sys
import pandas as pd
import numpy as np
import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.impute import KNNImputer

import statsmodels.api as sm
import statsmodels.genmod as genmod
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
import sklearn.model_selection as ms

import warnings
warnings.filterwarnings('ignore')

## Load TSR and attributes data from DB

In [2]:
localhost = {'user': 'postgres', 'password': 'postgres', 'host': 'localhost', 'port': 5432, 'db': 'fiadb'}
params = 'postgresql://{0}:{1}@{2}:{3}/{4}'
engine = create_engine(params.format(localhost['user'], localhost['password'], localhost['host'], localhost['port'], localhost['db']))
# geom_sql = """select distinct grid_id, grid_geom from fs_fiadb.pergrid"""
pergrid_base = """select distinct * from predictor.pergrid_base"""
pergrid_base_df = pd.read_sql(pergrid_base, engine)

In [3]:
pergrid_base_df.columns

Index(['grid_id', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt', 'shg',
       'mtwq', 'wkb_geometry', 'tsr', 'wa'],
      dtype='object')

## Process TSR predictor and outcome varaibles

### only use lasso-selected variables 

In [4]:
y = pergrid_base_df['tsr']
grid_id = pergrid_base_df['grid_id']
lasso_var= ['aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt', 'shg',
       'mtwq', 'wa']
pergrid_base_selected_df = pergrid_base_df[lasso_var]

### fill in no-data grid with value from neighboring grids

In [5]:
imputer = KNNImputer(n_neighbors=5)
pergrid_base_selected_filled = imputer.fit_transform(pergrid_base_selected_df)
pergrid_base_df = pd.DataFrame(pergrid_base_selected_filled)

In [6]:
pergrid_base_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,6000.0,0.3675,21.191667,-2523666.0,7.7058,681.0,20.036236,87.0,16.299375,1826.768199,50.411522,16.0,2.833333,0.604166,444.860212,2.27,11.597701,5.8,28.7925,4.1166
1,65535.0,0.3829,20.233612,-2537902.0,14.1498,699.0,20.298634,92.0,16.340292,1811.927757,49.797996,14.0,2.0,0.620833,444.259982,2.1,5.98289,5.0,28.717458,11.8152
2,65535.0,1.30362,0.164286,-524.7088,0.0711,1148.5,0.309987,131.5,21.304167,1918.111111,0.713689,1.0,1.833333,0.241667,3.698817,6.524,0.011696,2.0,28.626191,302.9049
3,5135.0,0.2898,23.950167,-2941595.0,3.0042,565.0,20.193576,76.0,15.517042,1972.128352,43.00118,108.0,2.333333,0.391667,492.281141,2.12,61.886973,5.4,29.458834,1.2528
4,5645.0,0.3017,23.526261,-2959517.0,10.035,580.0,20.710174,80.0,15.754,1928.968811,44.328019,34.0,3.416667,0.620833,489.587254,2.12,32.270955,5.0,29.300416,4.248


In [16]:
from sklearn.preprocessing import MinMaxScaler

ss = MinMaxScaler()
X_std = ss.fit_transform(pergrid_base_df)

Xstd=pd.DataFrame(data=X_std[0:,0:],
                index=pergrid_base_df.index,
                columns=lasso_var)

In [8]:
x_train, x_test, y_train, y_test = train_test_split(Xstd, y, test_size=0.2, random_state=12345)

## Calculate VIF

In [19]:
# if the VIF is between 5-10, multicolinearity is likely present and you should consider dropping the variable.
pd.Series([variance_inflation_factor(Xstd.values, i) 
               for i in range(Xstd.shape[1])], 
              index=Xstd.columns)

aet        2.099093
ai        48.545420
art     1543.590152
ewd      240.599854
fa         5.583221
map       92.286433
mat      425.471442
mpdq      47.314070
mtcq     655.984657
pet      182.075913
psn       21.420803
ra        39.271648
rmap       5.040411
rmat      28.442990
tsn     1254.755912
mfdf      87.922182
alt       28.171383
shg        4.822418
mtwq     517.123907
wa         2.254314
dtype: float64

## Build GLM Model

In [21]:
poisson_model = sm.GLM(y_train, x_train, family=sm.families.Poisson())

In [22]:
poisson_results = poisson_model.fit()

In [23]:
print(poisson_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    tsr   No. Observations:                12248
Model:                            GLM   Df Residuals:                    12228
Model Family:                 Poisson   Df Model:                           19
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -38804.
Date:                Fri, 07 Aug 2020   Deviance:                       26275.
Time:                        06:54:51   Pearson chi2:                 2.58e+04
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
aet           -0.0549      0.015     -3.591      0.0

In [24]:
# null hypothesis: predictors have no effect; A low p-value (< 0.05) indicates that the null hypothesis can be rejected
print("P values of predictors: ")
print(poisson_results.pvalues.sort_values(ascending=True))

P values of predictors: 
fa       0.000000e+00
map     6.392115e-199
ai      3.307000e-165
ewd     2.070648e-156
alt     4.834023e-148
art     2.898304e-137
rmat    6.382324e-122
psn      2.045880e-61
pet      1.817505e-56
tsn      3.088192e-44
ra       5.692087e-34
shg      1.426522e-17
mtwq     2.476689e-17
mfdf     1.828794e-13
rmap     4.082301e-08
mpdq     2.050089e-04
aet      3.298309e-04
mat      2.233036e-02
wa       4.495572e-02
mtcq     5.517467e-01
dtype: float64


In [25]:
print("Predictors which are not statistically significant:")
print(poisson_results.pvalues[poisson_results.pvalues > 0.05])

Predictors which are not statistically significant:
mtcq    0.551747
dtype: float64


## Build Cross-validation GLM Model

In [26]:
class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels GLM w/ Possion """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X, sm.families.Poisson())
        self.results_ = self.model_.fit()
    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

In [27]:
wrapped_possion_glm = SMWrapper(sm.GLM)

In [31]:
wrapped_possion_glm.fit(Xstd,y)

## Model Evaluation

In [32]:
import statistics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [33]:
r2_cross_val = cross_val_score(wrapped_possion_glm, Xstd, y, scoring='r2', cv = ms.StratifiedKFold(shuffle = True))
print("Cross-validated R2: ",)
statistics.mean(r2_cross_val.tolist())

Cross-validated R2: 


0.8058423037888429

In [34]:
mae_cross_val = cross_val_score(wrapped_possion_glm, Xstd, y, scoring='neg_mean_absolute_error', cv = ms.StratifiedKFold(shuffle = True))
print("Cross-validated MAE: ",)
statistics.mean(mae_cross_val.tolist())* -1

Cross-validated MAE: 


4.311706953897733

In [35]:
y_train_pred = wrapped_possion_glm.predict(x_train)
print('train MAE', mean_absolute_error(y_train_pred, y_train))
print('train r2', r2_score(y_train, y_train_pred))

train MAE 4.315572123086133
train r2 0.8052040946243777


In [36]:
y_test_pred = wrapped_possion_glm.predict(x_test)
print('test MAE', mean_absolute_error(y_test_pred, y_test))
print('test r2', r2_score(y_test, y_test_pred))

test MAE 4.258236870898238
test r2 0.8116288786404352


In [37]:
glm_y_test = pd.DataFrame(
    {'tsr': y_test,
     'tsr_predicted': y_test_pred})
glm_y_test.to_sql(name='glm_y_test', con=engine, schema='predictor', if_exists='replace', index=False)

In [38]:
glm_y_test.shape

(3062, 2)

## Model Prediction

In [39]:
Y_pred = wrapped_possion_glm.predict(Xstd)

In [41]:
pergrid_all_predicted = pd.DataFrame(
    {'grid_id': grid_id,
     'tsr': y,
     'tsr_predicted': Y_pred.tolist()})

In [42]:
pergrid_all_predicted.head()

Unnamed: 0,grid_id,tsr,tsr_predicted
0,110,4.0,6.236435
1,111,2.0,5.930308
2,195,5.0,4.747933
3,337,1.0,5.229746
4,338,1.0,5.67263


In [43]:
pergrid_all_predicted.to_sql(name='lasso_glm', con=engine, schema='predictor', if_exists='replace', index=False)

In [44]:
update_geom = """
alter table predictor.lasso_glm add column if not exists wkb_geometry geometry(Polygon,4269);
update predictor.lasso_glm A SET wkb_geometry = B.wkb_geometry
FROM predictor.pergrid_base B
WHERE A.grid_id = B.grid_id
"""

In [45]:
connection = engine.connect()
connection.execute(update_geom)

<sqlalchemy.engine.result.ResultProxy at 0x7fe7cee43150>

In [46]:
update_residual = """
alter table predictor.lasso_glm add column residual double precision;
update predictor.lasso_glm set residual = (tsr_predicted-tsr);
"""

In [47]:
connection = engine.connect()
connection.execute(update_residual)

<sqlalchemy.engine.result.ResultProxy at 0x7fe7d419c5d0>