In [1]:
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.impute import KNNImputer

from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
localhost = {'user': 'postgres', 'password': 'postgres', 'host': 'localhost', 'port': 5432, 'db': 'fiadb'}
params = 'postgresql://{0}:{1}@{2}:{3}/{4}'
engine = create_engine(params.format(localhost['user'], localhost['password'], localhost['host'], localhost['port'], localhost['db']))
# geom_sql = """select distinct grid_id, grid_geom from fs_fiadb.pergrid"""
pergrid_base = """select distinct * from predictor.pergrid_base_eus"""
pergrid_base_df = pd.read_sql(pergrid_base, engine)

In [3]:
pergrid_base_df.columns

Index(['grid_id', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt', 'shg',
       'mtwq', 'wkb_geometry', 'tsr', 'wa'],
      dtype='object')

In [4]:
# separate predictor variables from outcome varaible
y = pergrid_base_df['tsr']
grid_id = pergrid_base_df['grid_id']
pred_var= ['aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt','shg','mtwq','wa']
pergrid_base_pred_df = pergrid_base_df[pred_var]

In [5]:
# fill NaN with values from neighbor pixels
imputer = KNNImputer(n_neighbors=5)
pergrid_base_filled = imputer.fit_transform(pergrid_base_pred_df)
pergrid_base_df = pd.DataFrame(pergrid_base_filled)
pergrid_base_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,65535.0,1.07484,0.164286,-524.708793,0.0711,1148.5,0.309987,131.5,21.304167,1918.111111,0.713689,1.0,1.833333,0.241667,3.698817,8.088,0.011696,1.2,28.626191,302.9049
1,13144.0,0.6577,8.82239,-894732.416005,0.1062,1208.0,12.47678,128.0,19.837514,1833.026596,32.850457,1.0,4.333333,0.679166,169.350913,3.52,0.774757,4.4,27.766178,149.0013
2,13867.0,0.667,7.376786,-656234.650346,0.0549,1245.0,10.635709,130.0,19.96661,1850.611285,27.834838,1.0,7.833333,0.945833,143.805179,3.26,0.628627,4.2,27.982256,164.0862
3,65535.0,0.94772,1.734753,-48762.101679,0.0342,1223.0,2.873174,130.5,20.691381,1898.352941,7.071733,2.0,10.083333,0.6375,36.816597,4.76,0.172147,2.0,28.2394,353.2644
4,65535.0,1.07484,0.631731,-6380.557906,0.0027,1183.6,1.045456,249.4,3.757151,1895.387097,2.569308,2.0,4.0,0.2125,13.451322,4.76,0.068093,1.2,22.364209,121.9797


In [6]:
from sklearn.preprocessing import MinMaxScaler

ss = MinMaxScaler()
X_std = ss.fit_transform(pergrid_base_df)

Xstd=pd.DataFrame(data=X_std[0:,0:],
                index=pergrid_base_pred_df.index,
                columns=pred_var)

In [7]:
x_train, x_test, y_train, y_test = train_test_split(Xstd, y, test_size=0.2, random_state=12345)

In [8]:
"""Lasso modeling with alpha=0.000001"""
lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(Xstd, y)

test_score=lasso00001.score(x_test,y_test)
print("Test score for alpha =0.0001: ", test_score)

Test score for alpha =0.0001:  0.6740403950040608


In [9]:
"""Lasso Cross-validation modeling with eps=0.000001"""
lassocv = LassoCV(eps=0.000001, max_iter=10e6, n_alphas=1000, cv=10, random_state=None)
lassocv.fit(x_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=10, eps=1e-06, fit_intercept=True,
        max_iter=10000000.0, n_alphas=1000, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [10]:
df = pd.DataFrame(list(zip(pred_var, lassocv.coef_.tolist())), columns =['Features', 'Lasso Coefficents'])
df.loc[df['Lasso Coefficents']==0]

Unnamed: 0,Features,Lasso Coefficents
7,mpdq,0.0


In [11]:
df_lasso = df.loc[df['Lasso Coefficents']!=0]
df_lasso_sort = df_lasso.iloc[df_lasso['Lasso Coefficents'].abs().argsort()]

In [12]:
df_lasso_sort.Features.to_list()

['aet',
 'mat',
 'shg',
 'ra',
 'mfdf',
 'wa',
 'rmat',
 'alt',
 'psn',
 'rmap',
 'mtcq',
 'mtwq',
 'map',
 'ai',
 'fa',
 'ewd',
 'pet',
 'tsn',
 'art']

In [13]:
model = LassoCV(eps=0.000001, max_iter=10e6, n_alphas=1000, cv=10, random_state=None).fit(x_test, y_test)

test_score_cv=model.score(x_test,y_test)
print("Test score for 10-fold Cross-validation Lasso: ", test_score_cv)

Test score for 10-fold Cross-validation Lasso:  0.6769795523244915


In [14]:
df_lasso

Unnamed: 0,Features,Lasso Coefficents
0,aet,-0.108983
1,ai,-22.443652
2,art,210.75363
3,ewd,25.22392
4,fa,23.414913
5,map,19.195628
6,mat,-0.627271
8,mtcq,17.603188
9,pet,-27.251972
10,psn,-9.234998


In [15]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

y_test_pred = model.predict(x_test)
print('test MAE', mean_absolute_error(y_test_pred, y_test))
print('test r2', r2_score(y_test, y_test_pred))

test MAE 5.04382679777225
test r2 0.6769795523244915


In [16]:
lasso_y_test_eus = pd.DataFrame(
    {'tsr': y_test,
     'tsr_predicted': y_test_pred})
lasso_y_test_eus.to_sql(name='lasso_y_test_eus', con=engine, schema='predictor', if_exists='replace', index=False)