In [1]:
# Jupyter setup to expand cell display to 100% width on your screen (optional)
# Import relevant modules and setup for calling glmnet
%reset -f
%matplotlib inline

from sqlalchemy import create_engine
import sys
import pandas as pd
import numpy as np
import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.impute import KNNImputer

import statsmodels.api as sm
import statsmodels.genmod as genmod
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_absolute_error

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import cross_val_score
import sklearn.model_selection as ms

import statistics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Load TSR and attributes data from DB

In [2]:
localhost = {'user': 'postgres', 'password': 'postgres', 'host': 'localhost', 'port': 5432, 'db': 'fiadb'}
params = 'postgresql://{0}:{1}@{2}:{3}/{4}'
engine = create_engine(params.format(localhost['user'], localhost['password'], localhost['host'], localhost['port'], localhost['db']))
# geom_sql = """select distinct grid_id, grid_geom from fs_fiadb.pergrid"""
pergrid_base = """select distinct * from predictor.pergrid_base"""
pergrid_base_df = pd.read_sql(pergrid_base, engine)

In [3]:
pergrid_base_df.columns

Index(['grid_id', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt', 'shg',
       'mtwq', 'wkb_geometry', 'tsr', 'wa', 'ha', 'wkt', 'lat', 'lon'],
      dtype='object')

In [4]:
y_true = pergrid_base_df['tsr']
grid_id = pergrid_base_df['grid_id']

In [5]:
var= ['grid_id', 'tsr', 'lat', 'lon', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt','shg', 'mtwq', 'wa']
pred_var= ['aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt','shg', 'mtwq', 'wa']

In [6]:
pergrid_base_df = pergrid_base_df[var]

### calculate cartesian coordinates 

In [7]:
# Converting lat/long to cartesian
import numpy as np

def get_cartesian(lat=None,lon=None):
    lat, lon = np.deg2rad(lat), np.deg2rad(lon)
    R = 6371 # radius of the earth
    x = R * np.cos(lat) * np.cos(lon)
    y = R * np.cos(lat) * np.sin(lon)
    return x,y

In [8]:
pergrid_base_df['x'] = 0.0
pergrid_base_df['y'] = 0.0

In [9]:
for i in pergrid_base_df.index:
    lat = pergrid_base_df.at[i, 'lat']
    lon = pergrid_base_df.at[i, 'lon']
    x, y = get_cartesian(lat,lon)
    
    pergrid_base_df.at[i, 'x'] = x
    pergrid_base_df.at[i, 'y'] = y

In [10]:
pergrid_base_df.head()

Unnamed: 0,grid_id,tsr,lat,lon,aet,ai,art,ewd,fa,map,...,rmap,rmat,tsn,mfdf,alt,shg,mtwq,wa,x,y
0,110,4.0,25.933,-97.631,6000.0,0.367,21.192,-2523666.2,7.706,681.0,...,2.833,0.604,444.86,2.27,11.598,,28.793,4.117,-760.845,-5678.742
1,111,2.0,25.936,-97.433,65535.0,0.383,20.234,-2537902.123,14.15,699.0,...,2.0,0.621,444.26,2.1,5.983,5.0,28.717,11.815,-741.219,-5681.193
2,195,5.0,24.834,-80.943,65535.0,,0.164,-524.709,0.071,1148.5,...,1.833,0.242,3.699,,0.012,,28.626,302.905,910.198,-5709.793
3,337,1.0,26.099,-98.427,5135.0,0.29,23.95,-2941595.415,3.004,565.0,...,2.333,0.392,492.281,2.12,61.887,,29.459,1.253,-838.474,-5659.588
4,338,1.0,26.104,-98.229,5645.0,0.302,23.526,-2959517.411,10.035,580.0,...,3.417,0.621,489.587,2.12,32.271,5.0,29.3,4.248,-818.867,-5662.238


In [11]:
var_coord = ['grid_id', 'tsr', 'lat', 'lon', 'x', 'y', 'aet', 'ai', 'art', 'ewd', 'fa', 'map',
       'mat', 'mpdq', 'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn',
       'mfdf', 'alt', 'shg', 'mtwq', 'wa']

### fill NA and split data

In [12]:
imputer = KNNImputer(n_neighbors=5)
pergrid_base_df_filled = imputer.fit_transform(pergrid_base_df)
pergrid_base_df = pd.DataFrame(pergrid_base_df_filled)

# standarize predictors
ss = StandardScaler(with_mean=False, with_std=False)
pergrid_base_df_std = ss.fit_transform(pergrid_base_df)

pergrid_base_df_std=pd.DataFrame(data=pergrid_base_df_std[0:,0:],
                index=pergrid_base_df.index,
                columns=var_coord)

In [14]:
pergrid_base_df_std.head(2)

Unnamed: 0,grid_id,tsr,lat,lon,x,y,aet,ai,art,ewd,...,psn,ra,rmap,rmat,tsn,mfdf,alt,shg,mtwq,wa
0,110.0,4.0,25.933,-97.631,6000.0,0.367,21.192,-2523666.2,7.706,681.0,...,2.833,0.604,444.86,2.27,11.598,4.6,28.793,4.117,-760.845,-5678.742
1,111.0,2.0,25.936,-97.433,65535.0,0.383,20.234,-2537902.123,14.15,699.0,...,2.0,0.621,444.26,2.1,5.983,5.0,28.717,11.815,-741.219,-5681.193


In [15]:
pergrid_base_df_test = pergrid_base_df_std.sample(frac=0.2, replace=False, random_state=1)
pergrid_base_df_train = pergrid_base_df_std.drop(pergrid_base_df_test.index)

## Build GLM Model

In [16]:
poisson_model = sm.GLM(pergrid_base_df_std['tsr'], pergrid_base_df_std[pred_var],family=sm.families.Poisson())

In [17]:
poisson_results = poisson_model.fit()

In [18]:
print(poisson_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    tsr   No. Observations:                15310
Model:                            GLM   Df Residuals:                    15290
Model Family:                 Poisson   Df Model:                           19
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -47486.
Date:                Sat, 23 Jan 2021   Deviance:                       30795.
Time:                        15:00:19   Pearson chi2:                 2.98e+04
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
aet            0.0939      0.004     26.475      0.0

In [19]:
params_df = poisson_results.params
params_df = params_df.to_frame()

In [20]:
params_df['var'] = params_df.index

In [21]:
params_df.columns = ['coefficent','var']

In [22]:
params_df['coefficent_abs'] = abs(params_df['coefficent'])
params_df.sort_values(by='coefficent_abs', ascending=False)

Unnamed: 0,coefficent,var,coefficent_abs
ra,0.111,ra,0.111
aet,0.094,aet,0.094
alt,-0.078,alt,0.078
fa,0.057,fa,0.057
mat,0.035,mat,0.035
mtcq,-0.01,mtcq,0.01
rmat,0.01,rmat,0.01
mfdf,-0.006,mfdf,0.006
art,0.003,art,0.003
rmap,-0.001,rmap,0.001


In [23]:
# # null hypothesis: predictors have no effect; A low p-value (< 0.05) indicates that the null hypothesis can be rejected
# print("P values of predictors: ")
# print(poisson_results.pvalues.sort_values(ascending=True))

In [24]:
print("Predictors which are not statistically significant:")
print(poisson_results.pvalues[poisson_results.pvalues > 0.05])

Predictors which are not statistically significant:
Series([], dtype: float64)


### Evaluation

In [28]:
perc_dev_explained_all = (1 - (poisson_results.deviance/poisson_results.null_deviance))*100
print("R-squared (% Deviance explained): {}".format(perc_dev_explained_all))

R-squared (% Deviance explained): 80.8821377845158


## Build Cross-validation GLM Model

In [29]:
class SMWrapper(BaseEstimator, RegressorMixin):
    """ A universal sklearn-style wrapper for statsmodels GLM w/ Possion """
    def __init__(self, model_class, fit_intercept=True):
        self.model_class = model_class
        self.fit_intercept = fit_intercept
    def fit(self, X, y):
        if self.fit_intercept:
            X = sm.add_constant(X)
        self.model_ = self.model_class(y, X, sm.families.Poisson())
        self.results_ = self.model_.fit()
    def predict(self, X):
        if self.fit_intercept:
            X = sm.add_constant(X)
        return self.results_.predict(X)

In [30]:
wrapped_possion_glm = SMWrapper(sm.GLM)

In [31]:
wrapped_possion_glm.fit(pergrid_base_df_std[pred_var],pergrid_base_df_std['tsr'])

In [32]:
r2_cross_val = cross_val_score(wrapped_possion_glm, pergrid_base_df_std[pred_var], pergrid_base_df_std['tsr'], scoring='r2', cv = ms.StratifiedKFold(n_splits=10, shuffle = True))
print("Cross-validated R2: ",)
statistics.mean(r2_cross_val.tolist())

Cross-validated R2: 


0.8135840860442178

### calculate residual

In [33]:
test_df = pergrid_base_df_test[['grid_id', 'tsr', 'lat', 'lon', 'x', 'y']]
test_df['predicted_tsr'] = wrapped_possion_glm.predict(pergrid_base_df_test[pred_var])
test_residual =  test_df['predicted_tsr'] - test_df['tsr']
test_df['residual'] = test_residual.tolist()

In [38]:
train_df = pergrid_base_df_train[['grid_id', 'tsr', 'lat', 'lon', 'x', 'y']]
train_df['predicted_tsr'] = wrapped_possion_glm.predict(pergrid_base_df_train[pred_var])
train_residual =  train_df['predicted_tsr'] - train_df['tsr']
train_df['residual'] = train_residual.tolist()

### kriging

In [40]:
import numpy as np
import pykrige.kriging_tools as kt
from pykrige.ok import OrdinaryKriging
import matplotlib.pyplot as plt

In [42]:
OK = OrdinaryKriging(
    x=train_df['x'], #'lon'
    y=train_df['y'], # 'lat'
    z=train_df['residual'],
    coordinates_type='euclidean', #'geographic'
    variogram_model="linear",
    verbose=False,
    enable_statistics=False
)

In [43]:
predict_value_list, ss = OK.execute("points", test_df['x'], test_df['y'])

In [44]:
test_df['residual_kriging'] = predict_value_list

In [45]:
test_df['predicted_tsr_kriging'] = test_df['predicted_tsr'] + test_df['residual_kriging']

## Model Prediction

In [46]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [48]:
test_df.head()

Unnamed: 0,grid_id,tsr,lat,lon,x,y,predicted_tsr,residual,residual_kriging,predicted_tsr_kriging
2158,8613.0,2.0,32.078,-107.147,892.0,0.104,1.545,-0.455,0.393,1.938
2123,8489.0,23.0,31.846,-84.349,5835.0,0.787,22.67,-0.33,-0.053,22.617
7905,17690.0,18.0,39.542,-92.299,6204.0,0.758,18.858,0.858,0.029,18.887
10586,21879.0,19.0,42.309,-84.556,6103.0,0.757,17.176,-1.824,0.025,17.201
6617,15787.0,13.0,37.837,-105.017,2770.0,0.316,8.897,-4.103,0.235,9.131


In [49]:
print('test MAE', mean_absolute_error(test_df['predicted_tsr'], test_df['tsr']))
print('test r2', r2_score(test_df['tsr'], test_df['predicted_tsr']))

test MAE 4.169709495258144
test r2 0.8138277897282293


In [50]:
print('test MAE', mean_absolute_error(test_df['predicted_tsr_kriging'], test_df['tsr']))
print('test r2', r2_score(test_df['tsr'], test_df['predicted_tsr_kriging']))

test MAE 4.317353164768281
test r2 0.7985889798781045


In [27]:
pergrid_all_predicted = pd.DataFrame(
    {'grid_id': grid_id,
     'tsr': y,
     'tsr_predicted': Y_pred.tolist()})

In [28]:
pergrid_all_predicted.head()

Unnamed: 0,grid_id,tsr,tsr_predicted
0,110,4.0,6.236435
1,111,2.0,5.930308
2,195,5.0,4.747933
3,337,1.0,5.229746
4,338,1.0,5.67263


In [29]:
pergrid_all_predicted.to_sql(name='glm', con=engine, schema='predictor', if_exists='replace', index=False)

In [30]:
update_geom = """
alter table predictor.glm add column if not exists wkb_geometry geometry(Polygon,4269);
update predictor.glm A SET wkb_geometry = B.wkb_geometry
FROM predictor.pergrid_base B
WHERE A.grid_id = B.grid_id
"""

In [31]:
connection = engine.connect()
connection.execute(update_geom)

<sqlalchemy.engine.result.ResultProxy at 0x7f86e6314850>

In [32]:
update_residual = """
alter table predictor.glm add column residual double precision;
update predictor.glm set residual = (tsr_predicted-tsr);
"""

In [60]:
connection = engine.connect()
connection.execute(update_residual)

<sqlalchemy.engine.result.ResultProxy at 0x7fd544ea50d0>