In [1]:
import pickle
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as pl
from matplotlib import rcParams
from seaborn import PairGrid, heatmap, kdeplot
import cmocean.cm as cmo

In [2]:
% matplotlib inline
rcParams['axes.titlesize'] = 18
rcParams['xtick.labelsize'] = 16
rcParams['ytick.labelsize'] = 16
rcParams['axes.labelsize'] = 16
rcParams['font.size'] = 16

In [3]:
# Load the two development data sets: one carrying principal-component
# predictors (df_pc) and one carrying satellite predictors (df_sat).
# NOTE(review): the second file name has no '.pkl' extension -- confirm that
# this matches the file on disk.  Only unpickle files from a trusted source.
df_pc, df_sat = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./pickleJar/DevelopmentalDataSets/df_5_APHY_pc.pkl',
                        './pickleJar/DevelopmentalDataSets/df_5_APHY_sat')
)

In [4]:
# Column overview (dtypes and non-null counts) of the PC frame as loaded,
# before any row filtering.
df_pc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 36 columns):
sin_doy         495 non-null float64
cos_doy         495 non-null float64
sin_minofday    495 non-null float64
cos_minofday    495 non-null float64
x               495 non-null float64
y               495 non-null float64
z               495 non-null float64
log10_etopo2    495 non-null float64
oisst           495 non-null float64
solz            495 non-null float64
PC1             494 non-null float64
PC2             494 non-null float64
PC3             494 non-null float64
PC4             494 non-null float64
PC5             494 non-null float64
PC6             494 non-null float64
aphy405         495 non-null float64
aphy411         495 non-null float64
aphy443         495 non-null float64
aphy455         495 non-null float64
aphy465         495 non-null float64
aphy489         495 non-null float64
aphy510         495 non-null float64
aphy520         495 non-null float64
aphy530

In [5]:
# Column overview (dtypes and non-null counts) of the satellite frame as
# loaded, before any row filtering.
df_sat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 36 columns):
oisst            495 non-null float64
solz             495 non-null float64
sat_rho_rc412    495 non-null float64
sat_rho_rc443    495 non-null float64
sat_rho_rc490    495 non-null float64
sat_rho_rc510    495 non-null float64
sat_rho_rc555    495 non-null float64
sat_rho_rc670    495 non-null float64
aphy405          495 non-null float64
aphy411          495 non-null float64
aphy443          495 non-null float64
aphy455          495 non-null float64
aphy465          495 non-null float64
aphy489          495 non-null float64
aphy510          495 non-null float64
aphy520          495 non-null float64
aphy530          495 non-null float64
aphy550          495 non-null float64
aphy555          495 non-null float64
aphy560          495 non-null float64
aphy565          495 non-null float64
aphy570          495 non-null float64
aphy590          495 non-null float64
aphy619          495 

In [6]:
# Keep only the rows whose aphy value is strictly positive at all six bands
# below.  Every band has to be compared against zero explicitly: a bare
# Series inside the mask is evaluated for truthiness instead, which lets
# negative values through (and raises on recent pandas versions).
positive_aphy_bands = ['aphy411', 'aphy443', 'aphy489',
                       'aphy510', 'aphy555', 'aphy670']
df_pc = df_pc.loc[(df_pc[positive_aphy_bands] > 0).all(axis=1)]

In [7]:
# Keep only the rows where both the six aphy targets and the six satellite
# reflectance bands are strictly positive.  Each column is compared against
# zero explicitly: a bare Series inside the mask is evaluated for truthiness
# instead, which lets negative values through (and raises on recent pandas).
positive_aphy_bands = ['aphy411', 'aphy443', 'aphy489',
                       'aphy510', 'aphy555', 'aphy670']
positive_rho_bands = ['sat_rho_rc412', 'sat_rho_rc443', 'sat_rho_rc490',
                      'sat_rho_rc510', 'sat_rho_rc555', 'sat_rho_rc670']
df_sat = df_sat.loc[
    (df_sat[positive_aphy_bands + positive_rho_bands] > 0).all(axis=1)
]

In [8]:
# Spot-check the rows labelled 172 through 175 (inclusive), shown with one
# column per row so the variables read top to bottom.
df_pc.loc[172:175].transpose()

Unnamed: 0,172,174,175
sin_doy,-0.8359255,-0.835925,-0.8452491
cos_doy,-0.548843,-0.548843,-0.5343726
sin_minofday,-0.9807853,-0.854912,-0.7071068
cos_minofday,0.1950903,0.518773,-0.7071068
x,0.1036327,0.10113,0.1049779
y,-0.8824488,-0.883359,-0.8847049
z,0.4588512,0.457657,0.4541771
log10_etopo2,1.518514,1.623249,1.579784
oisst,29.9,29.9,29.9
solz,20.4,20.3,18.1


In [9]:
# Same spot-check on the satellite frame: rows labelled 172 through 175
# (inclusive), transposed for reading.
df_sat.loc[172:175].transpose()

Unnamed: 0,172,175
oisst,29.9,29.9
solz,20.4,18.1
sat_rho_rc412,0.020585,0.018079
sat_rho_rc443,0.01972,0.017372
sat_rho_rc490,0.017998,0.016011
sat_rho_rc510,0.016039,0.014212
sat_rho_rc555,0.013809,0.012417
sat_rho_rc670,0.009977,0.009102
aphy405,0.00862,0.00821
aphy411,0.00941,0.00886


In [11]:
# Discard every row that still contains a missing value in any column.
df_pc = df_pc.dropna()
df_sat = df_sat.dropna()

In [12]:
# Re-inspect the PC frame after the positivity filter and the dropna step.
df_pc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 2 to 494
Data columns (total 36 columns):
sin_doy         162 non-null float64
cos_doy         162 non-null float64
sin_minofday    162 non-null float64
cos_minofday    162 non-null float64
x               162 non-null float64
y               162 non-null float64
z               162 non-null float64
log10_etopo2    162 non-null float64
oisst           162 non-null float64
solz            162 non-null float64
PC1             162 non-null float64
PC2             162 non-null float64
PC3             162 non-null float64
PC4             162 non-null float64
PC5             162 non-null float64
PC6             162 non-null float64
aphy405         162 non-null float64
aphy411         162 non-null float64
aphy443         162 non-null float64
aphy455         162 non-null float64
aphy465         162 non-null float64
aphy489         162 non-null float64
aphy510         162 non-null float64
aphy520         162 non-null float64
aphy530

In [13]:
# Current column order of the satellite frame, as a plain list.
sat_cols = list(df_sat.columns)

In [14]:
# Move every 'aphy*' column to the end.  sorted() is stable, so the relative
# order inside the two groups (non-aphy first, aphy last) is left untouched.
sat_cols_new = sorted(sat_cols, key=lambda name: name.startswith('aphy'))

In [15]:
# Apply the new column order (predictors first, aphy targets last).
df_sat = df_sat.loc[:, sat_cols_new]

In [16]:
# Re-inspect the satellite frame after filtering, dropna and column reordering.
df_sat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 2 to 494
Data columns (total 36 columns):
oisst            162 non-null float64
solz             162 non-null float64
sat_rho_rc412    162 non-null float64
sat_rho_rc443    162 non-null float64
sat_rho_rc490    162 non-null float64
sat_rho_rc510    162 non-null float64
sat_rho_rc555    162 non-null float64
sat_rho_rc670    162 non-null float64
log10_etopo2     162 non-null float64
sin_doy          162 non-null float64
cos_doy          162 non-null float64
sin_minofday     162 non-null float64
cos_minofday     162 non-null float64
x                162 non-null float64
y                162 non-null float64
z                162 non-null float64
aphy405          162 non-null float64
aphy411          162 non-null float64
aphy443          162 non-null float64
aphy455          162 non-null float64
aphy465          162 non-null float64
aphy489          162 non-null float64
aphy510          162 non-null float64
aphy520          162 

In [17]:
# Persist the cleaned, unscaled frames as the operational data sets.
for cleaned_frame, pickle_target in (
        (df_pc, './pickleJar/OperationalDataSets/df_6_APHY_PC.pkl'),
        (df_sat, './pickleJar/OperationalDataSets/df_6_APHY_SAT.pkl'),
):
    cleaned_frame.to_pickle(pickle_target)

In [33]:
# One independent scaler per predictor set: full PC set, limited PC set and
# satellite set.
sc_pc, sc_pc_lim, sc_sat = (StandardScaler() for _ in range(3))

In [35]:
# Standardize the predictor columns.  Label slices are inclusive, so the PC
# set is every column up to and including 'PC6' and the satellite set is every
# column up to and including 'z'.
pc_predictors = df_pc.loc[:, :'PC6']
sat_predictors = df_sat.loc[:, :'z']
X_s_pc = sc_pc.fit_transform(pc_predictors.values)
X_s_sat = sc_sat.fit_transform(sat_predictors.values)

In [37]:
# Limited PC predictor set: the columns from 'log10_etopo2' through 'PC6'
# (both inclusive), i.e. without the columns that precede 'log10_etopo2'.
pc_lim_predictors = df_pc.loc[:, 'log10_etopo2':'PC6']
X_s_pc_lim = sc_pc_lim.fit_transform(pc_lim_predictors.values)

In [20]:
# Sanity check: both scaled predictor matrices should have matching shapes.
tuple(scaled.shape for scaled in (X_s_pc, X_s_sat))

((162, 16), (162, 16))

In [38]:
# Names of the predictor columns behind each scaled matrix.
feat_cols_pc_lim = list(df_pc.loc[:, 'log10_etopo2':'PC6'].columns)
feat_cols_pc = list(df_pc.loc[:, :'PC6'].columns)
feat_cols_sat = list(df_sat.loc[:, :'z'].columns)

# Wrap the scaled matrices back into DataFrames that keep the original row
# index; the '_s' suffix marks a column as standardized.
df_pc_s_lim = pd.DataFrame(
    X_s_pc_lim, columns=feat_cols_pc_lim, index=df_pc.index
).add_suffix('_s')
df_pc_s = pd.DataFrame(
    X_s_pc, columns=feat_cols_pc, index=df_pc.index
).add_suffix('_s')
df_sat_s = pd.DataFrame(
    X_s_sat, columns=feat_cols_sat, index=df_sat.index
).add_suffix('_s')



In [39]:
# First five rows of the standardized limited PC predictor frame.
df_pc_s_lim.iloc[:5]

Unnamed: 0,log10_etopo2_s,oisst_s,solz_s,PC1_s,PC2_s,PC3_s,PC4_s,PC5_s,PC6_s
2,0.534148,-2.990826,1.429637,0.837503,0.532905,0.698848,2.162606,0.257826,1.669492
3,1.349502,-2.838796,0.974309,0.930459,0.27552,0.273833,1.10379,-0.454318,1.145318
26,1.649573,1.104955,-1.208586,2.65271,-0.475101,3.021028,-0.735825,-2.286466,2.32467
31,-0.826275,-2.10273,-0.264451,-1.364768,-1.39734,0.269219,-1.193771,-1.714171,0.537851
32,-1.016488,-1.503641,-0.612642,-1.273982,-0.867069,1.733866,-0.981065,-0.554956,-0.522603


In [22]:
# First five rows of the standardized full PC predictor frame.
df_pc_s.iloc[:5]

Unnamed: 0,sin_doy_s,cos_doy_s,sin_minofday_s,cos_minofday_s,x_s,y_s,z_s,log10_etopo2_s,oisst_s,solz_s,PC1_s,PC2_s,PC3_s,PC4_s,PC5_s,PC6_s
2,0.741932,0.882889,-0.329893,-0.782974,0.5459,0.791444,-4.151014,0.534148,-2.990826,1.429637,0.837503,0.532905,0.698848,2.162606,0.257826,1.669492
3,0.217216,1.321747,0.729568,-1.637071,0.548743,0.778556,-4.143126,1.349502,-2.838796,0.974309,0.930459,0.27552,0.273833,1.10379,-0.454318,1.145318
26,-0.593027,-1.68942,-0.417725,-0.627779,0.837302,-0.221119,0.360438,1.649573,1.104955,-1.208586,2.65271,-0.475101,3.021028,-0.735825,-2.286466,2.32467
31,0.906276,-0.90085,-0.593966,0.251762,0.323005,-0.216655,0.652795,-0.826275,-2.10273,-0.264451,-1.364768,-1.39734,0.269219,-1.193771,-1.714171,0.537851
32,0.895061,-0.924717,-0.007875,-1.170834,0.330135,-0.237037,0.620325,-1.016488,-1.503641,-0.612642,-1.273982,-0.867069,1.733866,-0.981065,-0.554956,-0.522603


In [40]:
# Shapes of the three standardized predictor frames.
tuple(frame.shape for frame in (df_pc_s, df_sat_s, df_pc_s_lim))

((162, 16), (162, 16), (162, 9))

In [41]:
# Append the unscaled aphy target columns (every column whose name matches
# 'aphy') to both standardized PC frames.
# NOTE: re-running this cell appends the target columns a second time.
aphy_targets_pc = df_pc.filter(regex='aphy', axis=1)
df_pc_s = pd.concat([df_pc_s, aphy_targets_pc], axis=1)
df_pc_s_lim = pd.concat([df_pc_s_lim, aphy_targets_pc], axis=1)

In [42]:
# Confirm that the standardized predictors and the aphy targets now sit in
# one frame.
df_pc_s.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 2 to 494
Data columns (total 36 columns):
sin_doy_s         162 non-null float64
cos_doy_s         162 non-null float64
sin_minofday_s    162 non-null float64
cos_minofday_s    162 non-null float64
x_s               162 non-null float64
y_s               162 non-null float64
z_s               162 non-null float64
log10_etopo2_s    162 non-null float64
oisst_s           162 non-null float64
solz_s            162 non-null float64
PC1_s             162 non-null float64
PC2_s             162 non-null float64
PC3_s             162 non-null float64
PC4_s             162 non-null float64
PC5_s             162 non-null float64
PC6_s             162 non-null float64
aphy405           162 non-null float64
aphy411           162 non-null float64
aphy443           162 non-null float64
aphy455           162 non-null float64
aphy465           162 non-null float64
aphy489           162 non-null float64
aphy510           162 non-null float6

In [43]:
# Same check for the limited predictor set plus aphy targets.
df_pc_s_lim.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162 entries, 2 to 494
Data columns (total 29 columns):
log10_etopo2_s    162 non-null float64
oisst_s           162 non-null float64
solz_s            162 non-null float64
PC1_s             162 non-null float64
PC2_s             162 non-null float64
PC3_s             162 non-null float64
PC4_s             162 non-null float64
PC5_s             162 non-null float64
PC6_s             162 non-null float64
aphy405           162 non-null float64
aphy411           162 non-null float64
aphy443           162 non-null float64
aphy455           162 non-null float64
aphy465           162 non-null float64
aphy489           162 non-null float64
aphy510           162 non-null float64
aphy520           162 non-null float64
aphy530           162 non-null float64
aphy550           162 non-null float64
aphy555           162 non-null float64
aphy560           162 non-null float64
aphy565           162 non-null float64
aphy570           162 non-null float6

In [44]:
# Append the unscaled aphy target columns of the satellite frame to its
# standardized predictors.
# NOTE: re-running this cell appends the target columns a second time.
aphy_targets_sat = df_sat.filter(regex='aphy', axis=1)
df_sat_s = pd.concat([df_sat_s, aphy_targets_sat], axis=1)

In [45]:
# Three identically configured transformers, one per predictor set: pairwise
# interaction terms only (no powers of a single feature) and no constant
# bias column.
interaction_settings = dict(interaction_only=True, include_bias=False)
pf_pc_lim = PolynomialFeatures(**interaction_settings)
pf_pc = PolynomialFeatures(**interaction_settings)
pf_sat = PolynomialFeatures(**interaction_settings)

In [46]:
# Expand every standardized predictor matrix with its interaction terms,
# pairing each transformer with the matrix it belongs to.
Xsp_pc_lim, Xsp_pc, Xsp_sat = (
    expander.fit_transform(scaled_matrix)
    for expander, scaled_matrix in ((pf_pc_lim, X_s_pc_lim),
                                    (pf_pc, X_s_pc),
                                    (pf_sat, X_s_sat))
)

In [47]:
# Target block: every column of the PC frame whose name matches 'aphy'.
y_aphy = df_pc.filter(axis=1, regex='aphy')

In [48]:
# (rows, number of aphy target columns)
y_aphy.shape

(162, 20)

In [49]:
# Column names for the polynomial-feature frames: the names generated by each
# fitted transformer, followed by the aphy target names.
# Each name list must come from the transformer fitted on that predictor set,
# so the satellite names are taken from pf_sat (not pf_pc).
# NOTE: get_feature_names() was removed in scikit-learn 1.2; newer releases
# provide get_feature_names_out(), which returns an array rather than a list.
aphy_target_names = y_aphy.columns.tolist()

pf_nams_pc_lim = pf_pc_lim.get_feature_names(input_features=df_pc_s_lim.loc[:,:'PC6_s'].columns)
poly_df_cols_pc_lim = pf_nams_pc_lim + aphy_target_names

poly_feat_nams_pc = pf_pc.get_feature_names(input_features=df_pc_s.loc[:,:'PC6_s'].columns)
poly_df_cols_pc = poly_feat_nams_pc + aphy_target_names

poly_feat_nams_sat = pf_sat.get_feature_names(input_features=df_sat_s.loc[:,:'z_s'].columns)
poly_df_cols_sat = poly_feat_nams_sat + aphy_target_names

In [51]:
# Limited PC set: interaction-expanded predictors side by side with the aphy
# targets, labelled with the prepared column names and the original row index.
stacked_pc_lim = np.column_stack((Xsp_pc_lim, y_aphy))
df_sp_pc_lim = pd.DataFrame(
    stacked_pc_lim,
    index=df_pc_s_lim.index,
    columns=poly_df_cols_pc_lim,
)

In [53]:
# Full PC set: interaction-expanded predictors side by side with the aphy
# targets, labelled with the prepared column names and the original row index.
stacked_pc = np.column_stack((Xsp_pc, y_aphy))
df_sp_pc = pd.DataFrame(
    stacked_pc,
    index=df_pc_s.index,
    columns=poly_df_cols_pc,
)

In [54]:
# Satellite set: interaction-expanded predictors side by side with the aphy
# targets.  np.c_ stacks by position, not by index label, so the targets are
# read from df_sat itself (the frame Xsp_sat was derived from) rather than
# from the PC frame, which was filtered independently and is not guaranteed
# to hold the same rows.  The columns follow y_aphy's order so that they
# line up with poly_df_cols_sat.
y_aphy_sat = df_sat[y_aphy.columns.tolist()]
df_sp_sat = pd.DataFrame(np.c_[Xsp_sat, y_aphy_sat], columns=poly_df_cols_sat,
                         index=df_sat_s.index)

In [55]:
# Shapes of the three polynomial-feature frames.
tuple(frame.shape for frame in (df_sp_pc, df_sp_sat, df_sp_pc_lim))

((162, 156), (162, 156), (162, 65))

In [56]:
# Persist the standardized frames and their polynomial-feature counterparts.
for frame_to_save, pickle_target in (
        (df_pc_s_lim, './pickleJar/OperationalDataSets/df_6_APHY_PC_lim.pkl'),
        (df_sp_pc_lim, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PolyFeatures_PC_lim.pkl'),
        (df_pc_s, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PC.pkl'),
        (df_sp_pc, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PolyFeatures_PC.pkl'),
        (df_sat_s, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_SAT.pkl'),
        (df_sp_sat, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PolyFeatures_SAT.pkl'),
):
    frame_to_save.to_pickle(pickle_target)

In [57]:
# The six aphy target bands kept in the 'SWF' subsets.
aphys = ['aphy411', 'aphy443', 'aphy489', 'aphy510', 'aphy555', 'aphy670']


def _predictors_plus_aphys(frame, last_predictor):
    """Return the columns up to `last_predictor` (inclusive) plus the `aphys` targets."""
    return pd.concat([frame.loc[:, :last_predictor], frame[aphys]], axis=1)


df_s_SWF_pc = _predictors_plus_aphys(df_pc_s, 'PC6_s')
df_s_SWF_pc_lim = _predictors_plus_aphys(df_pc_s_lim, 'PC6_s')
df_s_SWF_sat = _predictors_plus_aphys(df_sat_s, 'z_s')

In [52]:
# First five rows of the satellite polynomial-feature frame, transposed so
# the many columns read top to bottom.
df_sp_sat.head().transpose()

Unnamed: 0,2,3,26,31,32
oisst_s,-2.990826,-2.838796,1.104955,-2.102730,-1.503641
solz_s,1.429637,0.974309,-1.208586,-0.264451,-0.612642
sat_rho_rc412_s,-0.731412,-0.388146,1.069547,-0.445053,-0.093121
sat_rho_rc443_s,-0.878914,-0.498733,0.686661,-0.331946,-0.023906
sat_rho_rc490_s,-1.034350,-0.687684,-0.188815,-0.230660,0.166008
sat_rho_rc510_s,-1.008665,-0.748061,-0.701193,0.101169,0.517129
sat_rho_rc555_s,-1.033670,-0.855011,-0.913337,0.685269,1.199755
sat_rho_rc670_s,-0.980192,-0.764523,-0.774216,0.735696,0.697833
log10_etopo2_s,0.534148,1.349502,1.649573,-0.826275,-1.016488
sin_doy_s,0.741932,0.217216,-0.593027,0.906276,0.895061


In [58]:
def _poly_predictors_plus_aphys(frame, last_interaction):
    """Return the columns up to `last_interaction` (inclusive) plus the `aphys` targets."""
    return pd.concat([frame.loc[:, :last_interaction], frame[aphys]], axis=1)


# The label passed in is the final interaction column of each polynomial
# frame, so the slice keeps every predictor and drops the remaining targets.
df_sp_SWF_pc_lim = _poly_predictors_plus_aphys(df_sp_pc_lim, 'PC5_s PC6_s')
df_sp_SWF_pc = _poly_predictors_plus_aphys(df_sp_pc, 'PC5_s PC6_s')
df_sp_SWF_sat = _poly_predictors_plus_aphys(df_sp_sat, 'y_s z_s')

In [60]:
# Persist the six-band ('SWF') subsets, standardized and polynomial-feature.
for swf_frame, pickle_target in (
        (df_s_SWF_pc_lim, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_SWF_PC_lim.pkl'),
        (df_s_SWF_pc, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_SWF_PC.pkl'),
        (df_s_SWF_sat, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_SWF_SAT.pkl'),
        (df_sp_SWF_pc_lim, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PolyFeatures_SWF_PC_lim.pkl'),
        (df_sp_SWF_pc, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PolyFeatures_SWF_PC.pkl'),
        (df_sp_SWF_sat, './pickleJar/OperationalDataSets/df_6_APHY_Standardized_PolyFeatures_SWF_SAT.pkl'),
):
    swf_frame.to_pickle(pickle_target)