In this notebook, I log-transform the depth data and the aphy observations, then standardize the training and testing data. Note that the standardization of the testing data is parameterized by the training set. Standardizing the input data is often sufficient, but sampling efficiency in Bayesian modeling can be improved if the target data is also standardized. For good measure, I will therefore standardize the target data similarly.
Target data standardization is the same as for the input data.

In [1]:
import pickle
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from seaborn import heatmap
import matplotlib.pyplot as pl

In [2]:
# Read the pickled dictionary of data sets produced upstream.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
split_sets_path = '../PickleJar/DataSets/SplitSets.pkl'
with open(split_sets_path, 'rb') as pickle_file:
    datadict = pickle.load(pickle_file)

In [3]:
# Pull the feature and target frames out of the loaded dictionary.
# These names refer to the same objects stored in datadict (no copies are made).
x_train_pca, x_test_pca = datadict['x_train_pca'], datadict['x_test_pca']
y_test, y_train = datadict['y_test'], datadict['y_train']

Log transforming:

In [14]:
# Add a 'log_depth' column (log10 of 'etopo2') to both feature frames, placed at
# the position currently occupied by 'etopo2'. The frames are modified in place.
for features in (x_train_pca, x_test_pca):
    depth_position = features.columns.get_loc('etopo2')
    features.insert(depth_position, 'log_depth', np.log10(features.etopo2))

In [15]:
# Remove the raw 'etopo2' column from both feature frames (in place), now that
# its log10 counterpart has been added.
for features in (x_train_pca, x_test_pca):
    features.drop(columns='etopo2', inplace=True)

In [6]:
# Preview the first rows of the training features after the depth transformation.
x_train_pca.head()

Unnamed: 0_level_0,oisst,log_depth,solz,pc1,pc2,pc3,pc4,pc5,pc6
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,0.78,2.518514,52.7,-0.008285,-0.00099,8.6e-05,0.00017,-0.00022,4.3e-05
3,1.79,3.341039,45.9,-0.005603,-0.00156,0.000351,0.000102,-0.000144,-2.3e-05
26,27.99,3.643749,13.3,-0.000137,-0.00688,0.001303,0.001624,0.000121,-0.000153
31,6.68,1.146128,27.4,0.000158,0.003202,-0.002319,0.000189,0.000257,-0.000177
32,10.66,0.954243,22.2,0.003382,0.003694,-0.001514,0.000991,0.000237,-4.3e-05


In [45]:
# Log10-transform the target columns. The 1e-10 offset keeps log10 finite should
# a target value be exactly zero. Columns are renamed with a 'log10_' prefix.
y_train_log = pd.DataFrame(np.log10(y_train.values + 1e-10),
                           index=y_train.index,
                           columns=[f'log10_{col}' for col in y_train.columns])
# Carry over the row labels of y_test, exactly as done for the training targets;
# without index= the frame would get a default RangeIndex and lose its row labels.
y_test_log = pd.DataFrame(np.log10(y_test.values + 1e-10),
                          index=y_test.index,
                          columns=[f'log10_{col}' for col in y_test.columns])

In [46]:
# Preview the first rows of the log10-transformed training targets.
y_train_log.head()

Unnamed: 0_level_0,log10_aphy411,log10_aphy443,log10_aphy489,log10_aphy510,log10_aphy555,log10_aphy670
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,-1.617983,-1.511731,-1.686766,-1.899285,-2.414539,-1.982967
3,-1.724458,-1.641494,-1.834459,-2.02826,-2.508638,-2.029188
26,-2.701147,-2.59176,-2.742321,-2.987163,-3.508638,-3.251812
31,-0.321901,-0.26759,-0.544318,-0.68338,-1.201073,-0.450384
32,-1.134422,-1.107961,-1.342944,-1.461803,-2.047692,-1.385419


Scaling:

In [47]:
# One scaler for the input features and a separate one for the targets, so each
# keeps its own fitted parameters.
std_scaler_X, std_scaler_Y = StandardScaler(), StandardScaler()

In [48]:
# Fit the input scaler on the training features and standardize them, then wrap
# the result in a DataFrame that keeps the row index and tags columns with '_s'.
x_train_scaled_values = std_scaler_X.fit_transform(x_train_pca)
x_train_s = pd.DataFrame(
    x_train_scaled_values,
    index=x_train_pca.index,
    columns=[f'{name}_s' for name in x_train_pca.columns],
)

In [49]:
# Standardize the test features with the already-fitted input scaler (transform
# only, no refit), keeping the row index and tagging columns with '_s'.
x_test_scaled_values = std_scaler_X.transform(x_test_pca)
x_test_s = pd.DataFrame(
    x_test_scaled_values,
    index=x_test_pca.index,
    columns=[f'{name}_s' for name in x_test_pca.columns],
)

In [50]:
# Fit the target scaler on the (log-transformed) training targets only.
y_train_s = pd.DataFrame(std_scaler_Y.fit_transform(y_train_log),
                         index=y_train_log.index,
                         columns=[f'{col}_s' for col in y_train_log.columns])
# Use transform, not fit_transform, on the test targets: refitting here would
# standardize the test set with its own statistics and overwrite the
# training-set parameters held by std_scaler_Y, which is saved alongside the
# data further down.
y_test_s = pd.DataFrame(std_scaler_Y.transform(y_test_log),
                        index=y_test_log.index,
                        columns=[f'{col}_s' for col in y_test_log.columns])

In [51]:
# Register the standardized frames and the fitted scalers in the dictionary.
# Note that the existing 'y_train' / 'y_test' entries are replaced by their
# log10-transformed versions.
datadict.update({
    'x_train_s': x_train_s,
    'x_test_s': x_test_s,
    'y_train_s': y_train_s,
    'y_test_s': y_test_s,
    'y_train': y_train_log,
    'y_test': y_test_log,
    'std_scaler_X': std_scaler_X,
    'std_scaler_Y': std_scaler_Y,
})

In [52]:
# Write the updated dictionary (data frames plus fitted scalers) to a new pickle file.
output_path = '../PickleJar/DataSets/AphiTrainTestSplitDataSets_4compw_GIOP.pkl'
with open(output_path, 'wb') as pickle_file:
    pickle.dump(datadict, pickle_file)

In [54]:
import pymc3 as pm

In [None]:
# NOTE(review): bare attribute reference -- this only displays the function object
# and runs no sampling. Looks like leftover exploration; consider removing this cell.
pm.sample_posterior_predictive