# Correlations between model performance and geospatial features

The performance of the CNN can be affected by several factors. Here, the most important ones are evaluated to gain insight into which additional input variables could be fed to the model to achieve higher accuracy. 

The influence of these variables differs from well to well: some can modify the groundwater levels in some wells but not in others. Here, the influence is evaluated for all well locations together.

In [91]:
#Import libraries
%matplotlib notebook
import importlib

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Root folder of the project; the data and figure folders below are derived from it
patht="D:/Erasmus/Thesis/"
pathdat=f"{patht}/data/"

# Sub-folders of the data folder
pathshp=f"{pathdat}/SHP/SHP2/"
pathrast=f"{pathdat}/Raster/"
pathpick=f"{pathdat}/Pickle/"

# Figures folder
pathfig=f"{patht}/Figures/"

# Folder that receives the list of selected wells (well_list60.txt)
path=r"D:\Erasmus\Thesis\Code\08072022\GW_forecasting_ML\projections/"

In [92]:
#Load the per-well spatial information (results of the Accuracy_plot script)
#NOTE(review): this absolute path is not built from the path variables of the configuration cell -- confirm it is the intended file
gw_sel_int=gpd.read_file("D:/Data/students/mariana/data/SHP/gw_sel_int.shp")

#Combined score per simulation: sum of r2 and NSE
gw_sel_int['r2+NSE']=gw_sel_int['r2'].add(gw_sel_int['NSE'])
gw_sel_int['r2_2+NSE_2']=gw_sel_int['r2_2'].add(gw_sel_int['NSE_2'])

Create a list of wells with high model performance to use for projections

In [93]:
#Keep the wells whose P-and-T-only simulation scores above 0.7 r2 and above 0.6 NSE,
#then write their IDs (as strings) to well_list60.txt
dfsel=gw_sel_int[['MEST_ID', 'r2','NSE','r2_2','NSE_2']]
dfsel60=dfsel[dfsel['r2_2'].gt(0.7) & dfsel['NSE_2'].gt(0.6)]
dfsel60c=dfsel60['MEST_ID'].astype(str)
dfsel60c.to_csv(path+'well_list60.txt', index=False)

## Make correlations

Pearson, Spearman and Kendall's tau correlation coefficients are computed. Only statistically significant coefficients (90% confidence level) are considered. 

Small p-values correspond to strong evidence against the null hypothesis. In this case, a significance level of 0.1 is used: if the p-value does not exceed 0.1, the correlation between both variables is considered statistically significant. The smaller the p-value, the less plausible the null hypothesis is, which supports the alternative hypothesis (for the linear regression: 'the slope of the regression line is nonzero').

In [151]:
#Columns to perform the correlations
lcol=[ "FOK",'ho4_30m_sd',  'ho4_30m_ds', 'ho4_30m_lp','dist_pww',
      'dist_coast', 'imperv_buf', 'recharge_b', 'prop_211',
       'prop_231', 'prop_312', 'prop_112', 'prop_311','geo_1011', 
         'geo_115','geo_114','geo_111', 'geo_1012',
      'geo_1021','lai01_buff', 'lai06_buff', 'lai0106_bu']

#Performance indicators r2, NSE (P,T,RH) and r2_2, NSE_2 (P,T), plus their sums
perfor=['r2','NSE', 'r2_2', 'NSE_2','r2+NSE','r2_2+NSE_2']

#Significance level: a coefficient is kept only when its p-value is <= ALPHA
ALPHA=0.1

#Number of rows kept, after an ascending sort, for the two distance variables.
#NOTE(review): the cut-offs were described as "below 9 km" (dist_pww) and "25 km" (dist_coast),
#but what the code does is keep the first 90 / 50 rows -- confirm these counts match the distances.
ROW_LIMITS={'dist_pww':90, 'dist_coast':50}


def _keep_if_significant(value, p_value, alpha=ALPHA, ndigits=2):
    """Return `value` rounded to `ndigits` when `p_value <= alpha`, otherwise NaN.

    A NaN p-value compares as not significant, so the result is NaN as well.
    """
    return round(value, ndigits) if p_value <= alpha else np.nan


#Create dataframes to store the results: one row per variable, one column per indicator.
#The zero columns are placeholders that the loop below overwrites.
lzero=np.zeros(len(lcol))
d = {'variable': lcol, 'r2': lzero,'NSE':lzero, 'r2_2':lzero, 'NSE_2':lzero}
dfrval = pd.DataFrame(data=d)        #Pearson r (significant only)
dfslope= pd.DataFrame(data=d)        #slope of the linear regression
pval=pd.DataFrame(data=d)            #Pearson p-value (significant only)
spearm =pd.DataFrame(data=d)         #Spearman coefficient (significant only)
pvalspearm= pd.DataFrame(data=d)     #Spearman p-value (all)
kendalltau= pd.DataFrame(data=d)     #Kendall tau (significant only)
pvalkendallt=pd.DataFrame(data=d)    #Kendall p-value (all)


for p in perfor:
    lslope, lrvalue, pvalue, spearv, pvalspearv, kendv, pvalkend = ([] for _ in range(7))
    for col in lcol:

        #Sort ascending and drop the wells without a value for this variable
        sortdf=gw_sel_int.sort_values(by=col).dropna(subset=[col])
        #Distance variables: keep only the first rows (smallest distances)
        if col in ROW_LIMITS:
            sortdf=sortdf[:ROW_LIMITS[col]]

        #Linear regression / Pearson correlation
        slope, intercept, r_value, p_value, std_err = stats.linregress(sortdf[col], sortdf[p])
        #Add only the correlations that are statistically significant
        lrvalue.append(_keep_if_significant(r_value, p_value))
        #Non-significant p-values are masked with NaN; a 0 would read as "highly significant"
        pvalue.append(_keep_if_significant(p_value, p_value))
        lslope.append(round(slope,2))

        #Spearman's rank correlation coefficient
        spear, pvalspear=stats.spearmanr(sortdf[col], sortdf[p])
        spearv.append(_keep_if_significant(spear, pvalspear))
        pvalspearv.append(pvalspear)

        #Kendall's tau
        kendall, pvalkendall=stats.kendalltau(sortdf[col], sortdf[p])
        kendv.append(_keep_if_significant(kendall, pvalkendall))
        pvalkend.append(pvalkendall)

    #Store one column per performance indicator
    dfslope[p]=lslope
    dfrval[p]=lrvalue
    pval[p]=pvalue

    spearm[p]=spearv
    pvalspearm[p]=pvalspearv

    kendalltau[p]=kendv
    pvalkendallt[p]=pvalkend
      

In [152]:
#Select the correlation type: dfrval is the Pearson correlation, spearm is Spearman's rank
#correlation coefficient and kendalltau is Kendall's tau
corrsel=spearm.copy()
#p-values of the selected correlation type; keep this in step with corrsel
#(pval for dfrval, pvalspearm for spearm, pvalkendallt for kendalltau)
pvalsel=pvalspearm.copy()

In [159]:
#Lookup table from the column codes to the labels shown in the figures
dictionary={
    "FOK":        "Filter depth",
    "dist_pww":   "Distance to water-works",
    "ho4_30m_sd": "HO4_SD",
    "ho4_30m_ds": "HO4_DSD",
    "ho4_30m_lp": "HO4_LP",
    "dist_coast": "Distance to coastline",
    "imperv_buf": "Average Imperveousness",
    "recharge_b": "Average long-term Recharge",
    #prop_* columns
    "prop_211":   " Non-irrigated arable land",
    "prop_231":   "Pastures ",
    "prop_312":   "Coniferous forest",
    "prop_112":   " AS:Discontinuous urban fabric",
    "prop_311":   "Forest: Broad-leaved forest ",
    #geo_* columns
    "geo_1011":   "Low Relief/ High SMI",
    "geo_1012":   "Low Relief/ Medium SMI",
    "geo_114":    "Sink areas/ Medium SMI",
    "geo_115":    "Sink areas/ Low SMI",
    "geo_111":    "Sink areas/ High SMI",
    "geo_1021":   "Moderate Relief/ low SMI",
    #lai* columns
    "lai01_buff": "LAI Jan",
    "lai06_buff": "LAI Jun",
    "lai0106_bu": "LAI Jun-Jan",
}

#Add the readable label to the selected coefficients, then drop the code column
corrsel["variables_d"]=corrsel["variable"].map(dictionary)
dfrvalc=corrsel.drop(columns="variable")

#Same treatment for the p-values (pvalsel must already exist and have a 'variable' column)
pvalsel["variables_d"]=pvalsel["variable"].map(dictionary)
pvalc=pvalsel.drop(columns="variable")

## Plot the correlation coefficients

In [161]:
#Give the indicator columns their figure labels ("(RH)" marks the P,T,RH simulation)
dfrvalcc=dfrvalc.rename(columns={
    "r2": "r$^2$ (RH)",
    "NSE": "NSE (RH)",
    "r2_2": "r$^2$",
    "NSE_2": "NSE",
    "r2+NSE": "r$^2$+NSE (RH)",
    "r2_2+NSE_2": "r$^2$+NSE",
})
#Use the readable variable label as row index
dfcop=dfrvalcc.set_index('variables_d')

In [164]:
importlib.reload(plt); importlib.reload(sns) #to avoid seaborn style

#Leave out the last two columns (the r2+NSE sums) and the variables without any significant coefficient
data1=dfcop[dfcop.columns[:-2]].dropna(how='all')
#Keep only the variables with a significant coefficient for r2 (RH)
data2=data1.loc[~data1["r$^2$ (RH)"].isna()]
#'LAI Jun-Jan' is excluded from the figure; it may already have been removed by the
#filters above, so a missing label must not raise a KeyError
data=data2.drop(index='LAI Jun-Jan', errors='ignore')

plt.figure(figsize=(8,5))
#Diverging palette centred on 0 so that the sign of the coefficient is visible
cmap = sns.diverging_palette(20, 220, as_cmap=True)
sns.heatmap(data, annot=True,cmap=cmap, center=0,  annot_kws={"size":8})
plt.ylabel(' ')
plt.tight_layout()

<IPython.core.display.Javascript object>

## Scatter plots

In [335]:
#First 100 wells after sorting by FOK and then imperv_buf
subset = gw_sel_int.sort_values(by=['FOK','imperv_buf']).head(100)

#FOK against imperv_buf, coloured by the summed score r2+NSE
plt.figure(figsize=(10,8))
sns.scatterplot(data=subset, x='FOK', y='imperv_buf',
                hue='r2+NSE', palette="coolwarm_r", alpha=0.9)
plt.grid(True, alpha=0.2)

<IPython.core.display.Javascript object>

In [233]:
# Subset data: the 100 wells with the smallest dist_pww
subset = gw_sel_int.sort_values(by='dist_pww').head(100)

# dist_pww against FOK, coloured by the summed score r2+NSE
plt.figure(figsize=(10,8))
sns.scatterplot(data=subset, x='dist_pww', y='FOK',
                hue='r2+NSE', palette="coolwarm_r", alpha=0.9)
plt.grid(True, alpha=0.2)

<IPython.core.display.Javascript object>

In [158]:
fig = plt.figure()
ax = plt.axes(projection='3d')

# Wells ordered by increasing dist_pww
gw_sel_int2=gw_sel_int.sort_values(by='dist_pww')

# Coordinates of the first 100 wells: x = dist_pww, y = FOK, z = r2
xline = gw_sel_int2['dist_pww'].head(100)
yline = gw_sel_int2['FOK'].head(100)
zline = gw_sel_int2['r2'].head(100)

# Points are coloured by their r2 value
ax.scatter3D(xline, yline, zline, c=zline, cmap='viridis_r')
ax.set_xlabel('Dist waterworks')
ax.set_ylabel('Filter depth')
ax.set_zlabel('r2')


<IPython.core.display.Javascript object>

Text(0.5, 0, 'r2')