In this notebook, I use the index of a subset of this data (a subset used elsewhere in the GIOP model) to extract a test set.
The remaining rows that have valid entries make up the training set.

In [54]:
import pickle

import pandas as pd
import numpy as np

In [53]:
def print_versions(pkg):
    """Print ``<name>: <version>`` for an imported module.

    Parameters
    ----------
    pkg : module
        Any object exposing ``__name__`` and ``__version__``.

    Raises
    ------
    AttributeError
        If ``pkg`` lacks ``__name__`` or ``__version__``.
    """
    print(f'{pkg.__name__}: {pkg.__version__}')

# Best-effort report of the versions of the packages this notebook relies on.
# NOTE(review): the recorded output of this cell lists seaborn, which is not
# in the list below -- the output appears to predate this version of the cell.
for p in [pickle, pd, np]:
    try:
        print_versions(p)
    except AttributeError:
        # The module does not expose `__version__`; skip it silently.
        # Only AttributeError is caught so that real errors (and
        # KeyboardInterrupt) are no longer swallowed.
        pass

pandas: 0.24.2
numpy: 1.16.4
seaborn: 0.9.0


In [2]:
# Select the 'bmh' style sheet through `pl.style.use`.
# NOTE(review): `pl` is not imported in the visible import cell; presumably it
# is `matplotlib.pyplot` (or `pylab`) imported as `pl` in a since-removed
# cell -- confirm and add the import so this runs on a fresh kernel.
pl.style.use('bmh')

In [3]:
# Load the full data set from a pickled pandas object.
# NOTE: unpickling executes arbitrary code; only load pickle files from a
# trusted source.
df_aphy = pd.read_pickle('../PickleJar/DataSets/df_ml_aphy_swf.pkl')

In [5]:
# Load the pickled frame whose index is used below to pick out the test rows.
# NOTE: unpickling executes arbitrary code; only load pickle files from a
# trusted source.
df_giop_test = pd.read_pickle('../PickleJar/DataSets/df_new_test_set.pkl')

In [6]:
# Order the test-set frame by its index labels (rebinding the name instead of
# mutating in place).
df_giop_test = df_giop_test.sort_index()

In [7]:
# Remove rows that are exact duplicates of an earlier row (all columns equal);
# the first occurrence is kept.
df_giop_test = df_giop_test.drop_duplicates()

In [8]:
# Collect the index labels of both frames as sets, so the train/test split
# can be expressed with set arithmetic.
all_index = set(df_aphy.index.values)
test_index = set(df_giop_test.index.values)

In [9]:
# Training labels are every label of the full data set that is not a test label.
train_index = all_index - test_index

In [11]:
# Select the training rows by label.
# `.loc` is given a list rather than the set itself: set indexers are
# deprecated in pandas 1.5 and raise TypeError from pandas 2.0 on.
# `list()` keeps the set's iteration order; switch to `sorted(train_index)`
# if an index-ordered frame is preferred.
df_aphy_train = df_aphy.loc[list(train_index)]

In [12]:
# Select the test rows by label.
# `.loc` is given a list rather than the set itself: set indexers are
# deprecated in pandas 1.5 and raise TypeError from pandas 2.0 on.
# `list()` keeps the set's iteration order; switch to `sorted(test_index)`
# if an index-ordered frame is preferred.
df_aphy_test = df_aphy.loc[list(test_index)]

In [14]:
# Preview the first rows of the extracted test set.
df_aphy_test.head()

Unnamed: 0_level_0,id,datetime,lat,lon,oisst,etopo2,sola,solz,sat_rho_rc412,sat_rho_rc443,sat_rho_rc490,sat_rho_rc510,sat_rho_rc555,sat_rho_rc670,aphy411,aphy443,aphy489,aphy510,aphy555,aphy670
Row_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
129,1910,2000-03-03 16:50:00,26.129,-82.959,22.17,34.0,191.6,33.2,0.007258,0.007271,0.007041,0.005481,0.00382,0.00199,0.01827,0.01975,0.01128,0.00582,0.00148,0.00273
196,6573,2001-10-11 14:19:00,39.087,-13.419,19.97,3495.0,193.3,47.1,0.009356,0.009632,0.009137,0.007881,0.006752,0.005236,0.01238,0.01703,0.01024,0.0061,0.00178,0.00388
135,1917,2000-03-03 20:55:00,26.226,-83.188,22.77,43.0,191.2,33.3,0.006975,0.006786,0.006393,0.00514,0.003679,0.001897,0.01571,0.01748,0.0095,0.00473,0.00091,0.00219
200,6641,2002-03-11 13:41:00,-42.4345,-56.3325,14.75,3784.0,28.9,42.4,0.003014,0.003245,0.00349,0.002974,0.002267,0.000918,0.01504,0.01893,0.01355,0.00792,0.00136,0.00601
172,2041,2001-08-28 18:45:00,27.313,-83.302,29.9,33.0,211.0,20.4,0.020585,0.01972,0.017998,0.016039,0.013809,0.009977,0.00941,0.01173,0.00684,0.00327,0.00061,0.00149


In [18]:
# Sanity check: the extracted test rows, once sorted by index, must equal the
# rows of the full data set looked up directly with the test frame's index.
expected_test = df_aphy.loc[df_giop_test.index.values]
pd.testing.assert_frame_equal(df_aphy_test.sort_index(), expected_test)

In [37]:
# Training targets: drop every row with a missing value in any column, then
# keep only the columns whose name matches 'aphy'.
train_complete_rows = df_aphy_train.dropna()
df_aphy_train_y = train_complete_rows.filter(regex='aphy')

In [38]:
# Test targets: drop every row with a missing value in any column, then keep
# only the columns whose name matches 'aphy'.
test_complete_rows = df_aphy_test.dropna()
df_aphy_test_y = test_complete_rows.filter(regex='aphy')

In [32]:
# Feature columns: three ancillary columns followed by every column whose
# name matches 'sat_rho_rc'.
anc_columns = ['oisst', 'etopo2', 'solz']
ρ_rc_columns = df_aphy_train.filter(regex='sat_rho_rc').columns
all_columns = [*anc_columns, *ρ_rc_columns]

In [39]:
# Training features: same row filter as the targets (rows with any missing
# value removed), restricted to the feature columns.
train_rows_no_nan = df_aphy_train.dropna()
df_aphy_train_x = train_rows_no_nan.loc[:, all_columns]

In [40]:
# Test features: same row filter as the targets (rows with any missing value
# removed), restricted to the feature columns.
test_rows_no_nan = df_aphy_test.dropna()
df_aphy_test_x = test_rows_no_nan.loc[:, all_columns]

In [41]:
# Summarize the training targets: row count, columns, dtypes, non-null counts.
df_aphy_train_y.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 2 to 494
Data columns (total 6 columns):
aphy411    150 non-null float64
aphy443    150 non-null float64
aphy489    150 non-null float64
aphy510    150 non-null float64
aphy555    150 non-null float64
aphy670    150 non-null float64
dtypes: float64(6)
memory usage: 8.2 KB


In [42]:
# Summarize the test targets: row count, columns, dtypes, non-null counts.
df_aphy_test_y.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 14 entries, 129 to 187
Data columns (total 6 columns):
aphy411    14 non-null float64
aphy443    14 non-null float64
aphy489    14 non-null float64
aphy510    14 non-null float64
aphy555    14 non-null float64
aphy670    14 non-null float64
dtypes: float64(6)
memory usage: 784.0 bytes


In [44]:
# Summarize the training features: row count, columns, dtypes, non-null counts.
df_aphy_train_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 2 to 494
Data columns (total 9 columns):
oisst            150 non-null float64
etopo2           150 non-null float64
solz             150 non-null float64
sat_rho_rc412    150 non-null float64
sat_rho_rc443    150 non-null float64
sat_rho_rc490    150 non-null float64
sat_rho_rc510    150 non-null float64
sat_rho_rc555    150 non-null float64
sat_rho_rc670    150 non-null float64
dtypes: float64(9)
memory usage: 11.7 KB


In [43]:
# Summarize the test features: row count, columns, dtypes, non-null counts.
df_aphy_test_x.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 14 entries, 129 to 187
Data columns (total 9 columns):
oisst            14 non-null float64
etopo2           14 non-null float64
solz             14 non-null float64
sat_rho_rc412    14 non-null float64
sat_rho_rc443    14 non-null float64
sat_rho_rc490    14 non-null float64
sat_rho_rc510    14 non-null float64
sat_rho_rc555    14 non-null float64
sat_rho_rc670    14 non-null float64
dtypes: float64(9)
memory usage: 1.1 KB


In [None]:
# Bundle the four split frames under fixed keys and persist them as a single
# pickle file, using the highest pickle protocol this interpreter supports.
datadict = {
    'x_test': df_aphy_test_x,
    'y_test': df_aphy_test_y,
    'x_train': df_aphy_train_x,
    'y_train': df_aphy_train_y,
}
with open('../PickleJar/DataSets/SplitSets.pkl', 'wb') as fb:
    pickle.dump(datadict, fb, protocol=pickle.HIGHEST_PROTOCOL)