In [1]:
# Jupyter setup to expand cell display to 100% width on your screen (optional)
# Import relevant modules and setup for calling glmnet
%reset -f
%matplotlib inline

from sqlalchemy import create_engine
import sys
import pandas as pd
import numpy as np
import scipy, importlib, pprint, matplotlib.pyplot as plt, warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

import statsmodels.api as sm
import statsmodels.genmod as genmod
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

### Residual Estimate - Kriging
import pykrige.kriging_tools as kt
from pykrige.ok import OrdinaryKriging

In [2]:
# Machine learning 
from sklearn.ensemble import RandomForestRegressor

### load data

In [3]:
localhost = {'user': 'postgres', 'password': 'postgres', 'host': 'localhost', 'port': 5432, 'db': 'fiadb'}
params = 'postgresql://{0}:{1}@{2}:{3}/{4}'
engine = create_engine(params.format(localhost['user'], localhost['password'], localhost['host'], localhost['port'], localhost['db']))
pergrid_base = """select distinct * from predictor.pergrid_base"""
pergrid_base_df = pd.read_sql(pergrid_base, engine)

In [4]:
pergrid_base_df.columns

Index(['grid_id', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt', 'shg',
       'mtwq', 'wkb_geometry', 'tsr', 'wa', 'ha', 'wkt', 'lat', 'lon'],
      dtype='object')

In [5]:
y_true = pergrid_base_df['tsr']
grid_id = pergrid_base_df['grid_id']

In [6]:
var= ['grid_id', 'tsr', 'lat', 'lon', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt','shg', 'mtwq', 'wa']
pred_var= ['aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt','shg', 'mtwq', 'wa']

In [7]:
pergrid_base_df = pergrid_base_df[var]

In [8]:
pergrid_base_df.head(2)

Unnamed: 0,grid_id,tsr,lat,lon,aet,ai,art,ewd,fa,map,...,psn,ra,rmap,rmat,tsn,mfdf,alt,shg,mtwq,wa
0,110,4.0,25.932871,-97.631112,6000.0,0.3675,21.191667,-2523666.0,7.7058,681.0,...,50.411522,16.0,2.833333,0.604166,444.860212,2.27,11.597701,,28.7925,4.1166
1,111,2.0,25.935809,-97.433331,65535.0,0.3829,20.233612,-2537902.0,14.1498,699.0,...,49.797996,14.0,2.0,0.620833,444.259982,2.1,5.98289,5.0,28.717458,11.8152


### calculate cartesian coordinates 

In [9]:
# Converting lat/long to cartesian
import numpy as np

def get_cartesian(lat=None,lon=None):
    lat, lon = np.deg2rad(lat), np.deg2rad(lon)
    R = 6371 # radius of the earth
    x = R * np.cos(lat) * np.cos(lon)
    y = R * np.cos(lat) * np.sin(lon)
    return x,y

In [10]:
pergrid_base_df['x'] = 0.0
pergrid_base_df['y'] = 0.0

In [11]:
for i in pergrid_base_df.index:
    lat = pergrid_base_df.at[i, 'lat']
    lon = pergrid_base_df.at[i, 'lon']
    x, y = get_cartesian(lat,lon)
    
    pergrid_base_df.at[i, 'x'] = x
    pergrid_base_df.at[i, 'y'] = y

In [12]:
pergrid_base_df.head()

Unnamed: 0,grid_id,tsr,lat,lon,aet,ai,art,ewd,fa,map,...,rmap,rmat,tsn,mfdf,alt,shg,mtwq,wa,x,y
0,110,4.0,25.932871,-97.631112,6000.0,0.3675,21.191667,-2523666.0,7.7058,681.0,...,2.833333,0.604166,444.860212,2.27,11.597701,,28.7925,4.1166,-760.844723,-5678.742385
1,111,2.0,25.935809,-97.433331,65535.0,0.3829,20.233612,-2537902.0,14.1498,699.0,...,2.0,0.620833,444.259982,2.1,5.98289,5.0,28.717458,11.8152,-741.219177,-5681.193259
2,195,5.0,24.83353,-80.942689,65535.0,,0.164286,-524.7088,0.0711,1148.5,...,1.833333,0.241667,3.698817,,0.011696,,28.626191,302.9049,910.19797,-5709.793273
3,337,1.0,26.099436,-98.427128,5135.0,0.2898,23.950167,-2941595.0,3.0042,565.0,...,2.333333,0.391667,492.281141,2.12,61.886973,,29.458834,1.2528,-838.473507,-5659.588039
4,338,1.0,26.103897,-98.229,5645.0,0.3017,23.526261,-2959517.0,10.035,580.0,...,3.416667,0.620833,489.587254,2.12,32.270955,5.0,29.300416,4.248,-818.866554,-5662.237626


In [13]:
var_coord = ['grid_id', 'tsr', 'lat', 'lon', 'x', 'y', 'aet', 'ai', 'art', 'ewd', 'fa', 'map',
       'mat', 'mpdq', 'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn',
       'mfdf', 'alt', 'shg', 'mtwq', 'wa']

### fill NA and split data

In [14]:
imputer = KNNImputer(n_neighbors=5)
pergrid_base_df_filled = imputer.fit_transform(pergrid_base_df)
pergrid_base_df = pd.DataFrame(pergrid_base_df_filled)

# standarize predictors
ss = StandardScaler(with_mean=False, with_std=False)
pergrid_base_df_std = ss.fit_transform(pergrid_base_df)

pergrid_base_df_std=pd.DataFrame(data=pergrid_base_df_std[0:,0:],
                index=pergrid_base_df.index,
                columns=var_coord)

In [15]:
pergrid_base_df_std.head(2)

Unnamed: 0,grid_id,tsr,lat,lon,x,y,aet,ai,art,ewd,...,psn,ra,rmap,rmat,tsn,mfdf,alt,shg,mtwq,wa
0,110.0,4.0,25.932871,-97.631112,6000.0,0.3675,21.191667,-2523666.0,7.7058,681.0,...,2.833333,0.604166,444.860212,2.27,11.597701,4.6,28.7925,4.1166,-760.844723,-5678.742385
1,111.0,2.0,25.935809,-97.433331,65535.0,0.3829,20.233612,-2537902.0,14.1498,699.0,...,2.0,0.620833,444.259982,2.1,5.98289,5.0,28.717458,11.8152,-741.219177,-5681.193259


In [19]:
cleaned_data = pergrid_base_df_std[var]
cleaned_data.to_csv('cleaned.csv')

In [16]:
pergrid_base_df_test = pergrid_base_df_std.sample(frac=0.2, replace=False, random_state=1)
pergrid_base_df_train = pergrid_base_df_std.drop(pergrid_base_df_test.index)

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(pergrid_base_df_std, y_true, test_size=0.2, random_state=0)

In [18]:
pergrid_base_df_train.head(2)

Unnamed: 0,grid_id,tsr,lat,lon,x,y,aet,ai,art,ewd,...,psn,ra,rmap,rmat,tsn,mfdf,alt,shg,mtwq,wa
0,110.0,4.0,25.932871,-97.631112,6000.0,0.3675,21.191667,-2523666.0,7.7058,681.0,...,2.833333,0.604166,444.860212,2.27,11.597701,4.6,28.7925,4.1166,-760.844723,-5678.742385
1,111.0,2.0,25.935809,-97.433331,65535.0,0.3829,20.233612,-2537902.0,14.1498,699.0,...,2.0,0.620833,444.259982,2.1,5.98289,5.0,28.717458,11.8152,-741.219177,-5681.193259


### test on a 10-trees forest

In [19]:
print("Training Random Forest Regressor...")
rf = RandomForestRegressor(n_estimators = 10, random_state = 42, bootstrap=True)
rf.fit(pergrid_base_df_train[pred_var], pergrid_base_df_train['tsr'])

Training Random Forest Regressor...


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [20]:
y_pred = rf.predict(pergrid_base_df_test[pred_var])

In [21]:
test_score_cv=rf.score(pergrid_base_df_test[pred_var],pergrid_base_df_test['tsr'])
print("Test score: ", test_score_cv)

Test score:  0.8944871492446951


### tuning forest structures and hyper-paramters

In [22]:
# Grid and Random Search best hyperparameters
import time
import scipy.stats as st
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

In [23]:
# Specify parameters and distributions to sample from
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 20)]
# max number of features considered for splitting a node
max_features = ['auto', 'sqrt']
# max number of levels in each decision tree
max_depth = [int(x) for x in np.linspace(30, 80, num = 10)]
max_depth.append(None)
# min number of data points placed in a node before the node is split
min_samples_split = [5, 10]
# min number of data points allowed in a leaf node
min_samples_leaf = [5, 10]
# Method of selecting samples for training each tree
criterion = ["mse"] #mse

In [24]:
param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Run Randomized Search
n_iter_search = 100 # Number of parameter settings that are sampled 
rfr = RandomForestRegressor()
random_search = RandomizedSearchCV(estimator = rfr, 
                                   param_distributions=param_dist, 
                                   scoring='neg_root_mean_squared_error',
                                   n_iter=n_iter_search,
                                   n_jobs = -1, # using all processors
                                   cv = 10)

start = time.time()
random_search.fit(pergrid_base_df_train[pred_var], pergrid_base_df_train['tsr'])
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time.time() - start), n_iter_search))

RandomizedSearchCV took 4250.71 seconds for 100 candidates parameter settings.


In [25]:
random_search.best_params_

{'n_estimators': 345,
 'min_samples_split': 5,
 'min_samples_leaf': 5,
 'max_features': 'sqrt',
 'max_depth': 57}

### calculate residual

In [26]:
test_df = pergrid_base_df_test[['grid_id', 'tsr', 'lat', 'lon', 'x', 'y']]

test_df['predicted_tsr'] = random_search.predict(pergrid_base_df_test[pred_var])
test_residual =  test_df['predicted_tsr'] - test_df['tsr']

test_df['residual'] = test_residual.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
train_residual = random_search.predict(pergrid_base_df_train[pred_var]) - pergrid_base_df_train['tsr']
train_df = pergrid_base_df_train[['grid_id', 'tsr', 'lat', 'lon', 'x', 'y']]
train_df['residual'] = train_residual.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
test_df.head(2)

Unnamed: 0,grid_id,tsr,lat,lon,x,y,predicted_tsr,residual
2158,8613.0,2.0,32.07781,-107.14711,892.0,0.1041,1.500633,-0.499367
2123,8489.0,23.0,31.845669,-84.348885,5835.0,0.7869,26.718313,3.718313


In [29]:
import numpy as np
import pykrige.kriging_tools as kt
from pykrige.ok import OrdinaryKriging
import matplotlib.pyplot as plt

In [30]:
OK = OrdinaryKriging(
    x=train_df['x'], #'lon'
    y=train_df['y'], # 'lat'
    z=train_df['residual'],
    coordinates_type='euclidean', #'geographic'
    variogram_model="linear",
    verbose=False,
    enable_statistics=False
)

In [31]:
OK.print_statistics()

Q1 = None
Q2 = None
cR = None


In [32]:
OK.get_epsilon_residuals()

In [33]:
# predict_value_list, ss = OK.execute("points", test_df['lon'], test_df['lat'])
predict_value_list, ss = OK.execute("points", test_df['x'], test_df['y'])

In [34]:
test_df['residual_pred'] = predict_value_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [35]:
test_df['predicted_tsr_kriging'] = test_df['predicted_tsr'] + test_df['residual_pred']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
test_df.head()

Unnamed: 0,grid_id,tsr,lat,lon,x,y,predicted_tsr,residual,residual_pred,predicted_tsr_kriging
2158,8613.0,2.0,32.07781,-107.14711,892.0,0.1041,1.500633,-0.499367,0.098732,1.599365
2123,8489.0,23.0,31.845669,-84.348885,5835.0,0.7869,26.718313,3.718313,0.012107,26.73042
7905,17690.0,18.0,39.542235,-92.299311,6204.0,0.7579,21.526816,3.526816,0.129916,21.656733
10586,21879.0,19.0,42.30877,-84.55563,6103.0,0.7574,19.281692,0.281692,0.108548,19.39024
6617,15787.0,13.0,37.836948,-105.017268,2770.0,0.3163,9.926901,-3.073099,0.033144,9.960045


In [38]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

print('test MAE', mean_absolute_error(test_df['predicted_tsr'], test_df['tsr']))
print('test r2', r2_score(test_df['tsr'], test_df['predicted_tsr']))

test MAE 2.815842979435388
test r2 0.905314103406712


In [36]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

print('test MAE', mean_absolute_error(test_df['predicted_tsr_kriging'], test_df['tsr']))
print('test r2', r2_score(test_df['tsr'], test_df['predicted_tsr_kriging']))

test MAE 2.855160662284437
test r2 0.9025367862042929
