In [1]:
import pickle
import warnings
# Blanket filter: suppresses every warning category for the rest of the session,
# including deprecation notices from the libraries imported below.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as pl
from matplotlib import rcParams
from seaborn import PairGrid, heatmap, kdeplot
import cmocean.cm as cmo

In [2]:
% matplotlib inline
rcParams['axes.titlesize'] = 18
rcParams['xtick.labelsize'] = 16
rcParams['ytick.labelsize'] = 16
rcParams['axes.labelsize'] = 16
rcParams['font.size'] = 16

In [3]:
# Load the input frame from a pickle stored next to the notebook.
# NOTE(review): unpickling can execute arbitrary code -- only read files that
# come from a trusted source.
df = pd.read_pickle('./pickleJar/df_5_APHY.pkl')

In [4]:
# Inspect the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 36 columns):
sin_doy         495 non-null float64
cos_doy         495 non-null float64
sin_minofday    495 non-null float64
cos_minofday    495 non-null float64
x               495 non-null float64
y               495 non-null float64
z               495 non-null float64
log10_etopo2    495 non-null float64
oisst           495 non-null float64
solz            495 non-null float64
PC1             494 non-null float64
PC2             494 non-null float64
PC3             494 non-null float64
PC4             494 non-null float64
PC5             494 non-null float64
PC6             494 non-null float64
aphy405         495 non-null float64
aphy411         495 non-null float64
aphy443         495 non-null float64
aphy455         495 non-null float64
aphy465         495 non-null float64
aphy489         495 non-null float64
aphy510         495 non-null float64
aphy520         495 non-null float64
aphy530

In [5]:
# Predictor matrix: every column from the first one up to and including 'PC6'.
X = df.loc[:, :'PC6'].values

# Column-wise z-score. The nan-aware reductions skip missing entries when
# computing the statistics; the missing entries themselves stay NaN in X_s.
_col_mean = np.nanmean(X, axis=0)
_col_std = np.nanstd(X, axis=0)
X_s = (X - _col_mean) / _col_std

In [6]:
# Names of the predictor columns (first column through 'PC6').
feat_cols = df.loc[:, :'PC6'].columns.tolist()

# Wrap the standardized matrix in a frame; each column gets an '_s' suffix to
# mark it as standardized. No index is passed, so the frame gets a default one.
_scaled_names = ['{}_s'.format(col) for col in feat_cols]
df_s = pd.DataFrame(X_s, columns=_scaled_names)

In [7]:
# Append the untransformed 'aphy*' target columns to the standardized features.
# Only the feature columns (up to 'PC6_s') are taken from df_s, so re-running
# this cell does not stack a second copy of the targets onto the frame.
# concat aligns on index labels: df_s carries a default index while df keeps its
# own, so this relies on df being indexed 0..n-1.
df_s = pd.concat(
    (df_s.loc[:, :'PC6_s'], df.filter(regex='aphy', axis=1)),
    axis=1,
)

In [8]:
# Inspect the combined frame (standardized features followed by aphy targets).
df_s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495 entries, 0 to 494
Data columns (total 36 columns):
sin_doy_s         495 non-null float64
cos_doy_s         495 non-null float64
sin_minofday_s    495 non-null float64
cos_minofday_s    495 non-null float64
x_s               495 non-null float64
y_s               495 non-null float64
z_s               495 non-null float64
log10_etopo2_s    495 non-null float64
oisst_s           495 non-null float64
solz_s            495 non-null float64
PC1_s             494 non-null float64
PC2_s             494 non-null float64
PC3_s             494 non-null float64
PC4_s             494 non-null float64
PC5_s             494 non-null float64
PC6_s             494 non-null float64
aphy405           495 non-null float64
aphy411           495 non-null float64
aphy443           495 non-null float64
aphy455           495 non-null float64
aphy465           495 non-null float64
aphy489           495 non-null float64
aphy510           495 non-null float64

In [9]:
# Summary statistics per column, transposed so that each variable is one row.
df_s.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sin_doy_s,495.0,-1.43544e-17,1.001012,-1.327679,-1.106023,-0.05802,0.992926,1.374237
cos_doy_s,495.0,-1.43544e-17,1.001012,-1.32488,-0.979925,-0.089465,0.919662,1.702949
sin_minofday_s,495.0,1.291896e-16,1.001012,-0.641262,-0.601682,-0.29222,0.097853,4.569315
cos_minofday_s,495.0,-2.3325900000000003e-17,1.001012,-1.637524,-0.733458,0.045883,0.692711,2.317581
x_s,495.0,0.0,1.001012,-3.429751,-0.143034,0.126237,0.391993,2.544533
y_s,495.0,7.177199e-18,1.001012,-0.711732,-0.663241,-0.15213,0.003582,4.984149
z_s,495.0,-2.87088e-17,1.001012,-4.423439,-0.05102,0.211793,0.649183,1.626489
log10_etopo2_s,495.0,-1.220124e-16,1.001012,-6.918917,-0.62743,0.131081,0.580287,1.46441
oisst_s,495.0,8.612639e-17,1.001012,-2.621186,-0.758434,0.076543,0.8678,1.640422
solz_s,495.0,1.43544e-17,1.001012,-2.156424,-0.733368,0.06947,0.779275,2.660603


In [10]:
# Feature expander restricted to interaction terms, without a bias column.
pf = PolynomialFeatures(
    include_bias=False,
    interaction_only=True,
)

In [11]:
# Expand the standardized predictors with their interaction terms.
# Rows containing NaN are dropped first. In the data shown above each PC column
# has one missing value (494 of 495 non-null), which used to be removed by
# hard-coding row 174; the mask finds incomplete rows without a magic index.
_complete_rows = ~np.isnan(X_s).any(axis=1)
Xsp = pf.fit_transform(X_s[_complete_rows])

In [12]:
# Target block: every column whose name contains 'aphy'.
y = df_s.filter(regex='aphy', axis=1)

# Keep only the target rows whose predictors are complete, so y_sub lines up
# row-for-row with the NaN-free predictor rows. This replaces the hard-coded
# removal of row 174 with a mask derived from the data.
_complete_rows = ~np.isnan(X_s).any(axis=1)
y_sub = y.values[_complete_rows]

In [13]:
# Names of the expanded features, derived from the standardized column names.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides. The newer
# method returns an ndarray, so it is converted to a plain list to keep the
# list concatenation below working.
_input_names = df_s.loc[:, :'PC6_s'].columns.tolist()
if hasattr(pf, 'get_feature_names_out'):
    poly_feat_nams = pf.get_feature_names_out(input_features=_input_names).tolist()
else:
    poly_feat_nams = pf.get_feature_names(input_features=_input_names)

# Column labels for the combined frame: expanded features, then the targets.
poly_df_cols = poly_feat_nams + y.columns.tolist()

In [14]:
# Put the expanded predictors and the matching targets side by side and label
# the columns with the names assembled in the previous cell.
_poly_and_targets = np.concatenate([Xsp, y_sub], axis=1)
df_sp = pd.DataFrame(_poly_and_targets, columns=poly_df_cols)

In [15]:
# Persist both frames: the standardized one and the interaction-expanded one.
for _frame, _path in (
    (df_s, './pickleJar/df_6_APHY_Standardized.pkl'),
    (df_sp, './pickleJar/df_6_APHY_Standardized_PolyFeatures.pkl'),
):
    _frame.to_pickle(_path)

In [16]:
# Target columns kept for the "SWF" subset
# (presumably the SeaWiFS bands -- TODO confirm).
aphys = [
    'aphy411',
    'aphy443',
    'aphy489',
    'aphy510',
    'aphy555',
    'aphy670',
]

# Standardized features (through 'PC6_s') followed by the selected targets.
_std_features = df_s.loc[:, :'PC6_s']
_swf_targets = df_s[aphys]
df_s_SWF = pd.concat([_std_features, _swf_targets], axis=1)

In [17]:
# Same subset for the expanded frame: all feature columns up to and including
# the 'PC5_s PC6_s' interaction term, followed by the selected targets.
_poly_features = df_sp.loc[:, :'PC5_s PC6_s']
_poly_swf_targets = df_sp[aphys]
df_sp_SWF = pd.concat([_poly_features, _poly_swf_targets], axis=1)

In [18]:
# Persist the two "SWF" subsets next to the full-band pickles.
for _frame, _path in (
    (df_s_SWF, './pickleJar/df_6_APHY_Standardized_SWF.pkl'),
    (df_sp_SWF, './pickleJar/df_6_APHY_Standardized_PolyFeatures_SWF.pkl'),
):
    _frame.to_pickle(_path)