In [1]:
from sqlalchemy import create_engine

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.impute import KNNImputer

from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [2]:
localhost = {'user': 'postgres', 'password': 'postgres', 'host': 'localhost', 'port': 5432, 'db': 'fiadb'}
params = 'postgresql://{0}:{1}@{2}:{3}/{4}'
engine = create_engine(params.format(localhost['user'], localhost['password'], localhost['host'], localhost['port'], localhost['db']))
# geom_sql = """select distinct grid_id, grid_geom from fs_fiadb.pergrid"""
pergrid_base = """select distinct * from predictor.pergrid_base"""
pergrid_base_df = pd.read_sql(pergrid_base, engine)

In [3]:
pergrid_base_df.columns

Index(['grid_id', 'aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt', 'shg',
       'mtwq', 'wkb_geometry', 'tsr', 'wa'],
      dtype='object')

In [11]:
# separate predictor variables from outcome varaible
y = pergrid_base_df['tsr']
grid_id = pergrid_base_df['grid_id']
pred_var= ['aet', 'ai', 'art', 'ewd', 'fa', 'map', 'mat', 'mpdq',
       'mtcq', 'pet', 'psn', 'ra', 'rmap', 'rmat', 'tsn', 'mfdf', 'alt','shg','mtwq','wa']
pergrid_base_pred_df = pergrid_base_df[pred_var]

In [12]:
# fill NaN with values from neighbor pixels
imputer = KNNImputer(n_neighbors=5)
pergrid_base_filled = imputer.fit_transform(pergrid_base_pred_df)
pergrid_base_df = pd.DataFrame(pergrid_base_filled)
pergrid_base_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,6000.0,0.3675,21.191667,-2523666.0,7.7058,681.0,20.036236,87.0,16.299375,1826.768199,50.411522,16.0,2.833333,0.604166,444.860212,2.27,11.597701,5.8,28.7925,4.1166
1,65535.0,0.3829,20.233612,-2537902.0,14.1498,699.0,20.298634,92.0,16.340292,1811.927757,49.797996,14.0,2.0,0.620833,444.259982,2.1,5.98289,5.0,28.717458,11.8152
2,65535.0,1.30362,0.164286,-524.7088,0.0711,1148.5,0.309987,131.5,21.304167,1918.111111,0.713689,1.0,1.833333,0.241667,3.698817,6.524,0.011696,2.0,28.626191,302.9049
3,5135.0,0.2898,23.950167,-2941595.0,3.0042,565.0,20.193576,76.0,15.517042,1972.128352,43.00118,108.0,2.333333,0.391667,492.281141,2.12,61.886973,5.4,29.458834,1.2528
4,5645.0,0.3017,23.526261,-2959517.0,10.035,580.0,20.710174,80.0,15.754,1928.968811,44.328019,34.0,3.416667,0.620833,489.587254,2.12,32.270955,5.0,29.300416,4.248


In [14]:
from sklearn.preprocessing import MinMaxScaler

ss = MinMaxScaler()
X_std = ss.fit_transform(pergrid_base_df)

Xstd=pd.DataFrame(data=X_std[0:,0:],
                index=pergrid_base_pred_df.index,
                columns=pred_var)

In [15]:
x_train, x_test, y_train, y_test = train_test_split(Xstd, y, test_size=0.2, random_state=12345)

In [17]:
"""Lasso modeling with alpha=0.000001"""
lasso00001 = Lasso(alpha=0.0001, max_iter=10e5)
lasso00001.fit(Xstd, y)

test_score=lasso00001.score(x_test,y_test)
print("Test score for alpha =0.0001: ", test_score)

Test score for alpha =0.0001:  0.7961979946045569


In [19]:
"""Lasso Cross-validation modeling with eps=0.000001"""
lassocv = LassoCV(eps=0.000001, max_iter=10e6, n_alphas=1000, cv=10, random_state=None)
lassocv.fit(Xstd, y)

LassoCV(alphas=None, copy_X=True, cv=10, eps=1e-06, fit_intercept=True,
        max_iter=10000000.0, n_alphas=1000, n_jobs=None, normalize=False,
        positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [20]:
df = pd.DataFrame(list(zip(var, lassocv.coef_.tolist())), columns =['Features', 'Lasso Coefficents'])
df.loc[df['Lasso Coefficents']==0]

Unnamed: 0,Features,Lasso Coefficents
5,map,-0.0
11,ra,0.0
18,mtwq,-0.0
19,wa,-0.0


In [21]:
df_lasso = df.loc[df['Lasso Coefficents']!=0]
df_lasso_sort = df_lasso.iloc[df_lasso['Lasso Coefficents'].abs().argsort()]

In [26]:
df_lasso_sort.Features.to_list()

['aet',
 'mfdf',
 'psn',
 'shg',
 'rmap',
 'ai',
 'rmat',
 'pet',
 'mat',
 'mtcq',
 'fa',
 'ewd',
 'alt',
 'mpdq',
 'tsn',
 'art']

In [24]:
model = LassoCV(eps=0.000001, max_iter=10e6, n_alphas=1000, cv=10, random_state=None).fit(Xstd, y)

test_score_cv=model.score(x_test,y_test)
print("Test score for 10-fold Cross-validation Lasso: ", test_score_cv)

Test score for 10-fold Cross-validation Lasso:  0.7956391337093669
