In [1]:
# Jupyter setup to expand cell display to 100% width on your screen (optional)
# Import relevant modules and setup for calling glmnet
%reset -f
%matplotlib inline

from sqlalchemy import create_engine
import sys
import pandas as pd
import numpy as np
import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm
import statsmodels.genmod as genmod
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
import sklearn.model_selection as ms

## Load TSR and attributes data from DB

In [2]:
localhost = {'user': 'postgres', 'password': 'postgres', 'host': 'localhost', 'port': 5432, 'db': 'fiadb'}
params = 'postgresql://{0}:{1}@{2}:{3}/{4}'
engine = create_engine(params.format(localhost['user'], localhost['password'], localhost['host'], localhost['port'], localhost['db']))
# geom_sql = """select distinct grid_id, grid_geom from fs_fiadb.pergrid"""
pergrid_base = """select distinct * from predictor.pergrid_base"""
pergrid_base_df = pd.read_sql(pergrid_base, engine)
print(pergrid_base_df.shape)
pergrid_base_df_na_remove = pergrid_base_df.dropna()
print(pergrid_base_df_na_remove.shape)

(15310, 22)
(13503, 22)


## Process TSR predictor and outcome varaibles

In [3]:
pergrid_base_encoded=pd.get_dummies(pergrid_base_df_na_remove, columns=["hydrogroup"])

In [4]:
print(pergrid_base_encoded.columns)

Index(['grid_id', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt',
       'mtwq', 'wkb_geometry', 'tsr', 'hydrogroup_A', 'hydrogroup_A/D',
       'hydrogroup_B', 'hydrogroup_B/D', 'hydrogroup_C', 'hydrogroup_C/D',
       'hydrogroup_D'],
      dtype='object')


In [5]:
lasso_var= ["mpdq", "fa", "tsn", "pet", "alt", "ra", "map", "aet", "ewd"]
X = pergrid_base_encoded[lasso_var]

In [6]:
# standarize predictors
ss = StandardScaler(with_mean=False, with_std=False)
X_std = ss.fit_transform(X.values)

Xstd=pd.DataFrame(data=X_std[0:,0:],
                index=X.index,
                columns=lasso_var)

In [7]:
y_true = pergrid_base_encoded['tsr']

In [8]:
# X_train, X_test, y_train, y_test = train_test_split(Xstd, y_true, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(Xstd, y_true, test_size=0.2, random_state=0)

## Calculate VIF

In [10]:
# if the VIF is between 5-10, multicolinearity is likely present and you should consider dropping the variable.
pd.Series([variance_inflation_factor(Xstd.values, i) 
               for i in range(Xstd.shape[1])], 
              index=Xstd.columns)

mpdq    13.259813
fa       5.230895
tsn      8.439359
pet     59.975969
alt      7.237232
ra       5.591052
map     23.217014
aet      1.662599
ewd     20.953972
dtype: float64

## Build GLM Model

In [11]:
poisson_model = sm.GLM(y_train, X_train, family=sm.families.Poisson())

In [12]:
poisson_results = poisson_model.fit()

In [13]:
print(poisson_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    tsr   No. Observations:                10802
Model:                            GLM   Df Residuals:                    10793
Model Family:                 Poisson   Df Model:                            8
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -36245.
Date:                Wed, 08 Apr 2020   Deviance:                       26619.
Time:                        19:45:31   Pearson chi2:                 2.56e+04
No. Iterations:                     5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
mpdq           0.0033   5.13e-05     64.457      0.0

## Build Cross-validation GLM Model

In [14]:
class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels GLM w/ Possion """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X, sm.families.Poisson())
        self.results_ = self.model_.fit()
    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

In [15]:
wrapped_possion_glm = SMWrapper(sm.GLM)

In [16]:
wrapped_possion_glm.fit(Xstd,y_true)

  return ptp(axis=axis, out=out, **kwargs)


## Model Evaluation

In [17]:
print(cross_val_score(wrapped_possion_glm, Xstd, y_true, scoring='r2', cv = ms.StratifiedKFold(shuffle = True)))

[0.75484457 0.76394775 0.75918327 0.75580873 0.75762677]


  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


In [18]:
print("Cross-validated R2: ",)
sum([0.76476378,0.75316275,0.75810444,0.7704703,0.74380079])/5

Cross-validated R2: 


0.758060412

In [19]:
print(cross_val_score(wrapped_possion_glm, Xstd, y_true, scoring='neg_mean_absolute_error', cv = ms.StratifiedKFold(shuffle = True)))

[-4.95497311 -4.76473108 -4.80091369 -4.85612298 -4.90726798]


  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


In [20]:
print("Cross-validated MAE: ",)
abs(sum([-4.81841455, -5.03859822, -4.85686593, -4.7553791,-4.8159155])/5)

Cross-validated MAE: 


4.85703466

In [21]:
# null hypothesis: predictors have no effect; A low p-value (< 0.05) indicates that the null hypothesis can be rejected
print("P values of predictors: ")
print(poisson_results.pvalues.sort_values(ascending=True))

P values of predictors: 
mpdq    0.000000e+00
fa      0.000000e+00
tsn     0.000000e+00
pet     0.000000e+00
alt     0.000000e+00
ewd     0.000000e+00
map     4.175534e-92
aet     8.142814e-03
ra      6.728446e-01
dtype: float64


In [22]:
print("Predictors which are not statistically significant:")
print(poisson_results.pvalues[poisson_results.pvalues > 0.05])

Predictors which are not statistically significant:
ra    0.672845
dtype: float64


## Model Prediction

In [23]:
Y_pred = wrapped_possion_glm.predict(Xstd)

  return ptp(axis=axis, out=out, **kwargs)


In [24]:
pergrid_all_predicted = pd.DataFrame(
    {'grid_id': pergrid_base_encoded['grid_id'].to_list(),
     'tsr': pergrid_base_encoded['tsr'].to_list(),
     'tsr_predicted': Y_pred.tolist()})

In [25]:
pergrid_all_predicted.head()

Unnamed: 0,grid_id,tsr,tsr_predicted
0,111,2.0,5.717853
1,338,1.0,4.879671
2,339,2.0,5.139307
3,340,2.0,5.477383
4,342,2.0,5.591175


In [26]:
pergrid_all_predicted.to_sql(name='m1_base_lasso_glm', con=engine, schema='predictor', if_exists='replace', index=False)

  "Did not recognize type '%s' of column '%s'" % (attype, name)


In [27]:
update_geom = """
alter table predictor.m1_base_lasso_glm add column if not exists wkb_geometry geometry(Polygon,4269);
update predictor.m1_base_lasso_glm A SET wkb_geometry = B.wkb_geometry
FROM predictor.pergrid_base B
WHERE A.grid_id = B.grid_id
"""

In [28]:
connection = engine.connect()
connection.execute(update_geom)

<sqlalchemy.engine.result.ResultProxy at 0x130093c88>

In [29]:
update_residual = """
alter table predictor.m1_base_lasso_glm add column residual double precision;
update predictor.m1_base_lasso_glm set residual = (tsr_predicted-tsr);
"""

In [30]:
connection = engine.connect()
connection.execute(update_residual)

<sqlalchemy.engine.result.ResultProxy at 0x130093198>