In [1]:
import pickle
import warnings

warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
import pandas as pd

from pandas.plotting import scatter_matrix
import numpy as np
import matplotlib.pyplot as pl
from matplotlib import rcParams
from seaborn import PairGrid, heatmap, kdeplot
import cmocean.cm as cmo

from scripts.prep_utils import log_transform_feature

In [2]:
%matplotlib inline
rcParams['axes.titlesize'] = 18
rcParams['xtick.labelsize'] = 16
rcParams['ytick.labelsize'] = 16
rcParams['font.size'] = 16

In [3]:
# Interim inputs, both read from the project's PickleJar/Interim folder.
# NOTE: unpickling executes arbitrary code; only load files produced by
# this project's own pipeline.
pca_pickle_path = '../DataJar/PickleJar/Interim/df_pca.pkl'
aph_pickle_path = '../DataJar/PickleJar/Interim/df_4_ML4aph.pkl'

df_pca = pd.read_pickle(pca_pickle_path)
df_aph = pd.read_pickle(aph_pickle_path)

In [4]:
# Summarize df_pca: index range, column names, dtypes and non-null counts.
df_pca.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 6 columns):
PC1    493 non-null float64
PC2    493 non-null float64
PC3    493 non-null float64
PC4    493 non-null float64
PC5    493 non-null float64
PC6    493 non-null float64
dtypes: float64(6)
memory usage: 23.2 KB


In [5]:
# Summarize df_aph; the non-null counts reveal which columns have gaps.
df_aph.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 18 columns):
oisst            495 non-null float64
etopo2           495 non-null float64
solz             495 non-null float64
sat_rho_rc412    495 non-null float64
sat_rho_rc443    495 non-null float64
sat_rho_rc490    495 non-null float64
sat_rho_rc510    495 non-null float64
sat_rho_rc555    495 non-null float64
sat_rho_rc670    495 non-null float64
datetime         495 non-null datetime64[ns]
lat              495 non-null float64
lon              495 non-null float64
aphy411          165 non-null float64
aphy443          167 non-null float64
aphy489          167 non-null float64
aphy510          167 non-null float64
aphy555          165 non-null float64
aphy670          167 non-null float64
dtypes: datetime64[ns](1), float64(17)
memory usage: 73.5 KB


Discard every column whose name contains `sat`, along with `datetime`, `lat`, and `lon`:

In [6]:
# Columns to discard: every column whose name matches the regex 'sat',
# plus the timestamp and position columns.
sat_cols_2_drop = list(df_aph.filter(regex='sat').columns)
other_cols_2_drop = ['datetime', 'lat', 'lon']
cols_2_drop = sat_cols_2_drop + other_cols_2_drop

# drop() returns a new frame, so df_aph itself is left untouched.
df_aph_small = df_aph.drop(columns=cols_2_drop)

In [7]:
# Columns handed to the log transform: 'etopo2' plus every column whose
# name matches the regex 'aphy'.
cols_2_log = ['etopo2'] + df_aph_small.filter(regex='aphy').columns.tolist()

# The return value is not captured. Judging by the .info() output of the
# next cell, the helper rewrites df_aph_small in place and renames the
# columns to log10_<name>.
# NOTE(review): if so, this cell is not re-runnable without first rebuilding
# df_aph_small -- confirm against scripts.prep_utils.
log_transform_feature(df_aph_small, cols_2_log)

In [8]:
# Re-inspect df_aph_small after the log-transform call to check the
# resulting column names and non-null counts.
df_aph_small.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 9 columns):
oisst            495 non-null float64
solz             495 non-null float64
log10_etopo2     495 non-null float64
log10_aphy411    165 non-null float64
log10_aphy443    167 non-null float64
log10_aphy489    167 non-null float64
log10_aphy510    167 non-null float64
log10_aphy555    165 non-null float64
log10_aphy670    167 non-null float64
dtypes: float64(9)
memory usage: 38.7 KB


In [9]:
cols = ['log10_etopo2', 'oisst', 'solz']

# Place the selected df_aph_small columns side by side with the principal
# components. concat(axis=1) aligns rows on index labels, so labels missing
# from one frame become NaN in that frame's columns.
# NOTE(review): df_pca is indexed 0..493 while df_aph_small is indexed
# 0..494 -- confirm that equal labels refer to the same observation.
df_features = pd.concat([df_aph_small[cols], df_pca], axis=1)

In [10]:
# Append every column of df_aph_small whose name matches the regex 'aphy'
# to the right of the feature columns (rows aligned on index labels).
df_aphy = df_aph_small.filter(regex='aphy', axis=1)
df_final = pd.concat([df_features, df_aphy], axis=1)

In [11]:
# Check the assembled feature frame: column names and non-null counts.
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 9 columns):
log10_etopo2    495 non-null float64
oisst           495 non-null float64
solz            495 non-null float64
PC1             493 non-null float64
PC2             493 non-null float64
PC3             493 non-null float64
PC4             493 non-null float64
PC5             493 non-null float64
PC6             493 non-null float64
dtypes: float64(9)
memory usage: 38.7 KB


In [12]:
# Check the combined frame: column names and non-null counts per column.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 15 columns):
log10_etopo2     495 non-null float64
oisst            495 non-null float64
solz             495 non-null float64
PC1              493 non-null float64
PC2              493 non-null float64
PC3              493 non-null float64
PC4              493 non-null float64
PC5              493 non-null float64
PC6              493 non-null float64
log10_aphy411    165 non-null float64
log10_aphy443    167 non-null float64
log10_aphy489    167 non-null float64
log10_aphy510    167 non-null float64
log10_aphy555    165 non-null float64
log10_aphy670    167 non-null float64
dtypes: float64(15)
memory usage: 61.9 KB


In [13]:
# Display the rows for which PC1 is missing.
pc1_missing = df_final.PC1.isna()
df_final[pc1_missing]

Unnamed: 0,log10_etopo2,oisst,solz,PC1,PC2,PC3,PC4,PC5,PC6,log10_aphy411,log10_aphy443,log10_aphy489,log10_aphy510,log10_aphy555,log10_aphy670
174,1.623249,29.9,20.3,,,,,,,-1.96453,-1.855395,-2.080347,-2.394587,-3.186419,-2.746904
494,1.531479,29.22,37.8,,,,,,,-1.3491,-1.308821,-1.504303,-1.748581,-2.156705,-1.853841


In [14]:
# Keep only fully populated rows, modifying df_final in place.
# With how='any' a row is removed as soon as ONE column is NaN, so this
# drops not just the rows without PCs but also every row lacking any of
# the log10_aphy* values.
df_final.dropna(axis=0, how='any', inplace=True)

In [15]:
# Persist the NaN-free frame for later use.
non_standardized_pickle_path = '../DataJar/PickleJar/Interim/df_5_non_standardized.pkl'
df_final.to_pickle(non_standardized_pickle_path)