# Data exploration

This notebook examines the correlations among the input features, and the correlation between each feature and $C^*$.

## Feature correlation

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from statsmodels.multivariate.pca import PCA
# If the notebook was started from inside the "ml_observation_operator" folder,
# move the working directory one level up (presumably the repository root, so
# that the relative paths used elsewhere resolve -- TODO confirm).
pad = Path.cwd()
started_in_package_dir = pad.name == "ml_observation_operator"
if started_in_package_dir:
    pad_correct = pad.parent
    os.chdir(pad_correct)

In [None]:
# Folder holding the pickled feature / target tables.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
ML_data_pad = Path("data/Zwalm_data/ML_data")

# Targets and C*
y_train = pd.read_pickle(ML_data_pad / "y_train.pkl")
y_test = pd.read_pickle(ML_data_pad / "y_test.pkl")
y_full = pd.read_pickle(ML_data_pad / "y_full.pkl")
Cstar = pd.read_pickle(ML_data_pad / "Cstar.pkl")

# Full feature set
X_full_all = pd.read_pickle(ML_data_pad / "X_full_all.pkl")
X_train = pd.read_pickle(ML_data_pad / "X_train.pkl")
X_test = pd.read_pickle(ML_data_pad / "X_test.pkl")
X_full = pd.read_pickle(ML_data_pad / "X_full.pkl")

# Small feature set
X_train_small = pd.read_pickle(ML_data_pad / "X_train_small.pkl")
X_test_small = pd.read_pickle(ML_data_pad / "X_test_small.pkl")
X_full_small = pd.read_pickle(ML_data_pad / "X_full_small.pkl")

# Quick look at what was loaded: head and shape of the full feature table,
# then the head of the small one.
display(X_full.head())
print(X_full.shape)
display(X_full_small.head())


In [None]:
# Pairwise Pearson correlation between all columns of the full feature table,
# shown as a colour-coded table.
features_corr = X_full.corr(method="pearson")
features_corr_styler = features_corr.style
features_corr_styler.background_gradient(cmap="coolwarm")

In [None]:
# List the columns of the correlation matrix. (The former `type(features_corr)`
# line was dropped: its value was discarded and never displayed.)
features_corr.columns

In [None]:
# Heat map of the feature-feature Pearson correlations (full feature set).
fig, ax = plt.subplots()
im = ax.imshow(features_corr.values, cmap='coolwarm', vmin=-1, vmax=1)
cb = fig.colorbar(im)
cb.set_label(r'$\rho$ [-]')

# Tick labels are matched to the matrix purely by position. Raw strings keep
# the LaTeX backslashes without triggering invalid-escape-sequence warnings.
# NOTE(review): the sixth label used to duplicate 'VV,agriculture'; following
# the VV/VH pattern of the other land covers it is now 'VH,agriculture' --
# verify against the column order of X_full.
column_names = [
    r'$\gamma^0_{\mathrm{VV,forest}}$',
    r'$\gamma^0_{\mathrm{VH,forest}}$',
    r'$\gamma^0_{\mathrm{VV,pasture}}$',
    r'$\gamma^0_{\mathrm{VH,pasture}}$',
    r'$\gamma^0_{\mathrm{VV,agriculture}}$',
    r'$\gamma^0_{\mathrm{VH,agriculture}}$',
    r'LAI$_{\mathrm{forest}}$',
    r'LAI$_{\mathrm{pasture}}$',
    r'LAI$_{\mathrm{agriculture}}$',
    'Ascending',  # 'Descending',
    r'$\Delta t$',
    r'DOY$_{\sin}$',
    r'DOY$_{\cos}$',
]
tick_positions = range(len(features_corr.columns))
ax.set_xticks(tick_positions, column_names, rotation=90)
ax.set_yticks(tick_positions, column_names)

Now the same plot for the small dataset.

In [None]:
# Pairwise Pearson correlation between all columns of the small feature table,
# shown as a colour-coded table.
features_corr_small = X_full_small.corr(method="pearson")
features_corr_small_styler = features_corr_small.style
features_corr_small_styler.background_gradient(cmap="coolwarm")

In [None]:
# Heat map of the feature-feature Pearson correlations (small feature set).
fig, ax = plt.subplots()
im = ax.imshow(features_corr_small.values, cmap='coolwarm', vmin=-1, vmax=1)
cb = fig.colorbar(im)
cb.set_label(r'$\rho$ [-]')

# Tick labels are matched to the matrix purely by position. Raw strings keep
# the LaTeX backslashes without triggering invalid-escape-sequence warnings;
# the resulting label text is unchanged.
column_names_small = [
    r'$\gamma^0_{\mathrm{VV}}$',
    r'$\gamma^0_{\mathrm{VH}}$',
    'LAI',
    'Ascending',  # 'Descending',
    r'$\Delta t$',
    r'DOY$_{\sin}$',
    r'DOY$_{\cos}$',
]
tick_positions_small = range(len(features_corr_small.columns))
ax.set_xticks(tick_positions_small, column_names_small, rotation=90)
ax.set_yticks(tick_positions_small, column_names_small)

Now combine the two figures

In [None]:
# Side-by-side heat maps: (a) full feature set, (b) small feature set.
# Both panels share the [-1, 1] colour range, so the single colour bar applies
# to both.
fig, axes = plt.subplots(1, 2, constrained_layout=True, figsize=(9, 4.5))

im = axes[0].imshow(features_corr.values, cmap='coolwarm', vmin=-1, vmax=1)
axes[0].set_xticks(range(len(features_corr.columns)), column_names, rotation=90)
axes[0].set_yticks(range(len(features_corr.columns)), column_names)
axes[0].set_title('(a)')

im = axes[1].imshow(features_corr_small.values, cmap='coolwarm', vmin=-1, vmax=1)
cb = fig.colorbar(im)
cb.set_label(r'$\rho$ [-]')
axes[1].set_xticks(range(len(features_corr_small.columns)), column_names_small, rotation=90)
axes[1].set_yticks(range(len(features_corr_small.columns)), column_names_small)
axes[1].set_title('(b)')

# Create the output folder if needed; exist_ok avoids the separate
# check-then-create step.
figures_pad = Path('Figures/Figures_chapter_ML_obs_op')
figures_pad.mkdir(parents=True, exist_ok=True)
fig.savefig(figures_pad/'feature_correlations.pdf', format='pdf')

Only as an experiment: PCA

In [None]:
# Exploratory PCA on the full feature table (standardize=True is passed to
# statsmodels' PCA). PCA itself is imported with the other imports in the
# first code cell of the notebook.
pca_test = PCA(X_full, standardize=True)
display(pca_test.scores)

# Scree plot restricted to the first 10 components.
fig, ax = plt.subplots()
pca_test.plot_scree(ax=ax, ncomp=10)


## Correlation between features and $C^*$

In [None]:
# Reload C* from disk (so the cell can be re-run safely) and re-index it on a
# column named 't', the key used for merging with the feature table.
Cstar = (
    pd.DataFrame(pd.read_pickle(ML_data_pad / "Cstar.pkl"))
    .reset_index()
    .rename(columns={'Time': 't'})
    .set_index('t')
)

# Attach C* to every feature row, then compute Pearson correlations separately
# for each value of the 'ascending' column.
pd_compare = X_full.merge(Cstar, how='left', on='t')
corr_matrix_pd = pd_compare.groupby('ascending').corr(method='pearson')

# Keep only the correlations with C* and show them colour-coded.
corr_matrix_pd_Cstar = pd.DataFrame(corr_matrix_pd['Cstar'])
corr_matrix_pd_Cstar.style.background_gradient(cmap='coolwarm')

In [None]:
# Same analysis for the small feature set: attach C*, correlate per value of
# the 'ascending' column, and keep only the correlations with C*.
pd_compare_small = X_full_small.merge(Cstar, how='left', on='t')
corr_matrix_pd_small = (
    pd_compare_small
    .groupby('ascending')
    .corr(method='pearson')
)
corr_matrix_pd_Cstar_small = pd.DataFrame(corr_matrix_pd_small['Cstar'])
corr_matrix_pd_Cstar_small.style.background_gradient(cmap='coolwarm')

In [None]:
# Quick look at the 'VV_past_agr' column of the small feature set.
vv_past_agr = X_full_small['VV_past_agr']
vv_past_agr.plot()

In [None]:
# Quick look at the 'VHForest' column of the full feature set.
vh_forest = X_full['VHForest']
vh_forest.plot()

Now plot these as nicer-looking correlation matrices.

In [None]:
# Evaluates to True when 'Ascending' occurs exactly once in the tick labels.
n_ascending_labels = column_names.count('Ascending')
n_ascending_labels == 1

In [None]:
# Bring the data into the right format. The per-group correlation column is
# one long vector; a column-major reshape into two columns puts the first half
# in column 0 and the second half in column 1.
# NOTE(review): this assumes exactly two 'ascending' groups of equal length.
np_corr_full = corr_matrix_pd_Cstar.values.reshape(-1, 2, order='F')
np_corr = np_corr_full[0:-1, :]  # Drop the last row with Cstar itself!
np_corr_full_small = corr_matrix_pd_Cstar_small.values.reshape(-1, 2, order='F')
np_corr_small = np_corr_full_small[0:-1, :]  # Drop the last row with Cstar itself!

# Use the largest absolute correlation over both panels as a symmetric colour
# limit, so the two panels share one colour scale.
max_1 = np.max(np.abs(np_corr))
max_2 = np.max(np.abs(np_corr_small))
max_val = np.max([max_1, max_2])
print(max_val)

# Tick labels without the 'Ascending' entry. Filtered copies are built instead
# of calling .remove() on column_names / column_names_small: mutating those
# shared lists made the earlier heat-map cells fail when re-run afterwards
# (label count no longer matching the number of ticks).
feature_labels = [label for label in column_names if label != 'Ascending']
feature_labels_small = [label for label in column_names_small if label != 'Ascending']

fig, axes = plt.subplots(2, 1, height_ratios=[0.75, 0.25])

# One row per value of 'ascending'.
# NOTE(review): presumably the first group is the descending orbit and the
# second the ascending one -- confirm against the groupby output.
row_names = ['$C^*_{desc}$', '$C^*_{asc}$']

im = axes[0].imshow(np_corr.transpose(), cmap='coolwarm', vmin=-max_val, vmax=max_val)
axes[0].set_yticks(range(len(row_names)), row_names)
axes[0].set_xticks(range(len(feature_labels)), feature_labels, rotation=90)
axes[0].set_title('(a)')

im = axes[1].imshow(np_corr_small.transpose(), cmap='coolwarm', vmin=-max_val, vmax=max_val)
axes[1].set_yticks(range(len(row_names)), row_names)
axes[1].set_xticks(range(len(feature_labels_small)), feature_labels_small, rotation=90)
axes[1].set_title('(b)')

# Reserve space on the right for a single colour bar shared by both panels.
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.10, 0.03, 0.55])
cb = fig.colorbar(im, cax=cbar_ax)
cb.set_label(r'$\rho$ [-]')
fig.savefig(figures_pad/'Cstar_features_correlation.pdf', format='pdf', bbox_inches='tight')

In [None]:
# Inspect the two-column correlation array (column-major reshape to 2 columns).
np.reshape(np_corr_full, (-1, 2), order='F')

## Compare rainfall with signals from backscatter!

Hypothesis: an increase in backscatter should be observed when a rainfall event occurs.

In [None]:
# Load the precipitation table and index it on 't' so it can be merged with
# the feature table.
preprocess_output_folder = Path('data/Zwalm_data/preprocess_output')
p_zwalm = pd.read_pickle(preprocess_output_folder / 'zwalm_p_thiessen.pkl')
p_zwalm_t = (
    p_zwalm
    .rename(columns={'Timestamp': 't'})
    .set_index('t')
)

# Daily precipitation totals.
p_zwalm_t_daily = p_zwalm_t['P_thiessen'].resample('D').sum()
display(p_zwalm_t_daily.head())
display(p_zwalm_t_daily.plot())

# Merge with the features and show how each column correlates with the daily
# precipitation.
rain_comparison = X_full.merge(p_zwalm_t_daily, how='left', on='t')
rain_corr = rain_comparison.corr()
rain_corr['P_thiessen']

Plotting rain vs backscatter values for VH pasture

In [None]:
# Scatter of daily precipitation against the 'VHPasture' column, on a
# logarithmic x-axis whose lower limit is set to 0.001.
fig, ax = plt.subplots()
rain_comparison.plot.scatter(ax=ax, x='P_thiessen', y='VHPasture')
ax.set_xlim(left=0.001)
ax.set_xscale('log')


So there seems to be some correlation!
Idea: it is mainly the deviation of one signal from its long-term mean that should be related to the deviation of the other signal from its long-term mean.

In [None]:
# Rolling mean of every feature over a 30-day window; plot one column as an
# example.
rolling_30d = X_full.rolling('30D')
X_full_rm_30D = rolling_30d.mean()
X_full_rm_30D['VVAgriculture'].plot()

In [None]:
# Deviation of each feature from its 30-day rolling mean; plot one column as
# an example.
X_full_diff_from_rm = X_full.sub(X_full_rm_30D)
X_full_diff_from_rm['VVAgriculture'].plot()

In [None]:
# Correlation of the feature deviations with the daily precipitation.
diff_compare = X_full_diff_from_rm.merge(p_zwalm_t_daily, on='t', how='left')
diff_corr = diff_compare.corr()
diff_corr['P_thiessen']

In [None]:
# Scatter of daily precipitation against the 'VVPasture' deviation, on a
# logarithmic x-axis whose lower limit is set to 1e-3.
fig, ax = plt.subplots()
diff_compare.plot.scatter(ax=ax, x='P_thiessen', y='VVPasture')
ax.set_xlim(left=1e-3)
ax.set_xscale('log')

## Compare $C^*$ deviations with backscatter signal deviations from seasonality

In [None]:
# Centred 30-day rolling mean of C*, drawn on top of the dashed original
# series.
centred_window = Cstar.rolling('30D', center=True)
Cstar_rolling_mean = centred_window.mean()

fig, ax = plt.subplots()
Cstar.plot(ax=ax, linestyle='--', label='$C^*$')
Cstar_rolling_mean.plot(ax=ax, label='rolling mean')
ax.legend()

In [None]:
# Deviation of C* from its centred rolling mean.
Cstar_diff = Cstar.sub(Cstar_rolling_mean)
Cstar_diff.plot()

In [None]:
# Features joined with the C* deviations and the daily rain, then correlated
# with the C* deviation separately for each value of the 'ascending' column.
compare_Cstar_diff = (
    X_full
    .merge(Cstar_diff, how='left', on='t')
    .merge(p_zwalm_t_daily, how='left', on='t')  # add daily rain
)
corr_diff = compare_Cstar_diff.groupby('ascending').corr()['Cstar']
corr_diff

In [None]:
# C* deviation against the 'VVAgriculture' column.
compare_Cstar_diff.plot.scatter(
    x='VVAgriculture',
    y='Cstar',
)

In [None]:
# C* deviation against daily precipitation, on a logarithmic x-axis whose
# lower limit is set to 1e-3.
ax = compare_Cstar_diff.plot.scatter(y='Cstar', x='P_thiessen')
ax.set_xlim(left=1e-3)
ax.set_xscale('log')


In [None]:
# Same comparison for the small feature set: correlation with the C*
# deviation, per value of the 'ascending' column.
compare_Cstar_diff_small = X_full_small.merge(Cstar_diff, on='t', how='left')
corr_small_by_orbit = compare_Cstar_diff_small.groupby('ascending').corr()
corr_small_by_orbit['Cstar']