In [1]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [2]:
% matplotlib inline

### Read in Nomad data

In [3]:
nomad_path = '../Raw/nomad_seawifs_v2.a2_2008200.txt'
with open(nomad_path) as f:
    for line in f:
        if 'fields=' in line:
            break
columns = line.strip().strip('/fields=').split(',')

In [4]:
df = pd.read_csv(nomad_path, names=columns, skiprows=107)

In [5]:
df.head()

Unnamed: 0,year,month,day,hour,minute,second,lat,lon,id,oisst,...,sat_rhot555,sat_rhot670,sat_file,tdiff,solz,sola,senz,sena,cv,va
0,1997,10,11,9,32,0,39.29,25.11,4069,19.57,...,0.08636,0.05066,S1997284110316.L2_MLAC,5880,48.7,201.5,37.7,250.1,0.058,1
1,2000,2,22,17,0,0,-61.45,-62.299,1596,2.54,...,0.08607,0.04931,S2000053153433.L2_MLAC,-3180,51.2,4.6,53.7,293.6,0.084,1
2,2001,2,19,16,10,0,-61.29,-56.29,1633,0.78,...,0.06491,0.03175,S2001050135427.L2_MLAC,-6000,52.7,27.8,27.1,57.4,0.058,1
3,2002,1,22,13,45,0,-60.999,-56.498,1659,1.79,...,0.06859,0.03436,S2002022133012.L2_MLAC,1080,45.9,38.6,45.8,91.4,0.123,1
4,1997,9,27,11,29,0,24.1392,-20.9995,6083,24.67,...,0.05935,0.03068,S1997270134451.L2_MLAC,8880,28.0,203.5,35.5,243.3,0.052,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Columns: 243 entries, year to va
dtypes: float64(226), int64(15), object(2)
memory usage: 941.7+ KB


### Read in Rayleigh corrected reflectance

In [7]:
df_rc = pd.read_csv('../Raw/Rayleigh&Fresnel_corrected_Rrc.txt', sep='\t')

In [8]:
df_rc.head()

Unnamed: 0,filename,lat,lon,Rrs_412,Rrs_443,Rrs_490,Rrs_510,Rrs_555,Rrs_670
0,S1997284110316.L2_MLAC.hdf,39.29,25.11,0.012088,0.012417,0.011739,0.010579,0.00911,0.006655
1,S2000053153433.L2_MLAC.hdf,-61.45,-62.299,0.010525,0.010636,0.009614,0.007913,0.006224,0.004794
2,S2001050135427.L2_MLAC.hdf,-61.29,-56.29,0.004443,0.004387,0.00424,0.003686,0.002646,0.001177
3,S2002022133012.L2_MLAC.hdf,-60.999,-56.498,0.005869,0.005866,0.005535,0.004643,0.003326,0.001747
4,S1997270134451.L2_MLAC.hdf,24.1392,-20.9995,0.009464,0.008968,0.007719,0.005974,0.004161,0.002517


In [9]:
df_rc['filename'] = df_rc.filename.str.strip('.hdf')

In [10]:
# checking to see if both datasets are congruent

sat_files = df.sat_file.tolist()
lat1 = df.lat.tolist()
lon1 = df.lon.tolist()

filenames = df_rc.filename.tolist()
lat2 = df_rc.lat.tolist()
lon2 = df_rc.lon.tolist()

for i, (s, f, lt1, lt2, ln1, ln2) in enumerate (zip(sat_files, filenames, lat1,
                                                    lat2, lon1, lon2)):
    if s != f:
        print(f'fname#{i}: {s}<->{f}')
    if lt1 != lt2:
        print(f'lat#{i}: {lt1}<->{lt2}')
    if ln1 != ln2:
        print(f'lon#{i}: {ln1}<->{ln2}')

In [11]:
def convert_to_dt(row):
    dt_str = f'{row.year}-{row.month}-{row.day} {row.hour}:{row.minute}:{row.second}'
    return pd.to_datetime(dt_str, format='%Y-%m-%d %H:%M:%S')

# consolidate date and time columns into single datetime type column
df.insert(0, 'datetime', df.apply(convert_to_dt, axis=1))
df.drop(['year', 'month', 'day', 'hour', 'minute', 'second'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,datetime,lat,lon,id,oisst,etopo2,chl,chl_a,kd405,kd411,...,sat_rhot555,sat_rhot670,sat_file,tdiff,solz,sola,senz,sena,cv,va
0,1997-10-11 09:32:00,39.29,25.11,4069,19.57,462.0,0.091,-999.0,-999,-999.0,...,0.08636,0.05066,S1997284110316.L2_MLAC,5880,48.7,201.5,37.7,250.1,0.058,1
1,2000-02-22 17:00:00,-61.45,-62.299,1596,2.54,3549.0,0.132,0.118,-999,0.03851,...,0.08607,0.04931,S2000053153433.L2_MLAC,-3180,51.2,4.6,53.7,293.6,0.084,1
2,2001-02-19 16:10:00,-61.29,-56.29,1633,0.78,330.0,-999.0,-999.0,-999,0.08031,...,0.06491,0.03175,S2001050135427.L2_MLAC,-6000,52.7,27.8,27.1,57.4,0.058,1
3,2002-01-22 13:45:00,-60.999,-56.498,1659,1.79,2193.0,0.707,0.614,-999,0.06742,...,0.06859,0.03436,S2002022133012.L2_MLAC,1080,45.9,38.6,45.8,91.4,0.123,1
4,1997-09-27 11:29:00,24.1392,-20.9995,6083,24.67,4369.0,-999.0,0.158,-999,0.043,...,0.05935,0.03068,S1997270134451.L2_MLAC,8880,28.0,203.5,35.5,243.3,0.052,1


In [13]:
ad_cols_extract = df.filter(regex='ad[0-9]+').columns.tolist()
ag_cols_extract = df.filter(regex='ag[0-9]+').columns.tolist()
ap_cols_extract = df.filter(regex='ap[0-9]+').columns.tolist()
bb_cols_extract = df.filter(regex='bb[0-9]+').columns.tolist()
chl_cols_extract = ['chl', 'chl_a']

In [14]:
df_temp = df.filter(regex='datetime|lat|lon|id|sat_lt|sat_rhot|sat_rrs|lw|es', axis=1)

In [15]:
swf_bands = [412, 443, 490, 510, 555, 670]
nomad_swf_bands = [411, 443, 489, 510, 555, 670]
df_temp2 = pd.DataFrame(#columns=['datetime', 'lat', 'lon', 'id'] 
                        #+ ['oisst', 'wt', 'sal', 'etopo2', 'sola', 'solz']
                        #+ [f'sat_lt{band}' for band in swf_bands] 
                        #+ [f'sat_rhot{band}' for band in swf_bands]
                        #+ [f'sat_rrs{band}' for band in swf_bands] 
                        #+ [f'rrs_from_lw/es{band}' for band in nomad_swf_bands]
                        #+ ['chl', 'chl_a']
                        )
df_temp2[['datetime', 'lat', 'lon', 'id']] = df[['datetime', 'lat','lon', 'id']]
df_temp2[['oisst', 'wt', 'sal', 'etopo2', 'sola', 'solz']] = df[['oisst', 'wt', 'sal', 'etopo2', 'sola', 'solz']]
df_temp2[ad_cols_extract] = df[ad_cols_extract]
df_temp2[ag_cols_extract] = df[ag_cols_extract]
df_temp2[ap_cols_extract] = df[ap_cols_extract]
df_temp2[bb_cols_extract] = df[bb_cols_extract]
df_temp2[chl_cols_extract] = df[chl_cols_extract]

for sb, nsb in zip(swf_bands, nomad_swf_bands):
    df_temp2[f'sat_lt{sb}'] = df_temp[f'sat_lt{sb}']
    df_temp2[f'sat_rhot{sb}'] = df_temp[f'sat_rhot{sb}']
    df_temp2[f'sat_rrs{sb}'] = df_temp[f'sat_rrs{sb}']
    df_temp2[f'rrs_from_lw/es{nsb}'] = df_temp[f'lw{nsb}']/df_temp[f'es{nsb}'] 

In [16]:
df_temp2.head()

Unnamed: 0,datetime,lat,lon,id,oisst,wt,sal,etopo2,sola,solz,...,sat_rrs510,rrs_from_lw/es510,sat_lt555,sat_rhot555,sat_rrs555,rrs_from_lw/es555,sat_lt670,sat_rhot670,sat_rrs670,rrs_from_lw/es670
0,1997-10-11 09:32:00,39.29,25.11,4069,19.57,-999.0,-999.0,462.0,201.5,48.7,...,0.00345,0.002792,3.3288,0.08636,0.00194,0.00147,1.61295,0.05066,0.00027,0.000232
1,2000-02-22 17:00:00,-61.45,-62.299,1596,2.54,-999.0,-999.0,3549.0,4.6,51.2,...,0.0035,0.004369,3.20798,0.08607,0.00169,0.001759,1.5179,0.04931,0.00037,0.000128
2,2001-02-19 16:10:00,-61.29,-56.29,1633,0.78,1.19,-999.0,330.0,27.8,52.7,...,0.00279,0.00315,2.34232,0.06491,0.00171,0.001535,0.94619,0.03175,0.00023,1.0
3,2002-01-22 13:45:00,-60.999,-56.498,1659,1.79,1.61,-999.0,2193.0,38.6,45.9,...,0.00305,0.004119,2.8686,0.06859,0.00167,0.002115,1.18668,0.03436,0.00023,1.0
4,1997-09-27 11:29:00,24.1392,-20.9995,6083,24.67,-999.0,-999.0,4369.0,203.5,28.0,...,0.00389,0.00336,3.04057,0.05935,0.00184,0.001615,1.298,0.03068,0.00027,1.0


In [17]:
df_temp2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Columns: 116 entries, datetime to rrs_from_lw/es670
dtypes: datetime64[ns](1), float64(114), int64(1)
memory usage: 449.6 KB


In [18]:
df_rc.rename(columns=dict(Rrs_412='sat_rho_rc412', Rrs_443='sat_rho_rc443',
                          Rrs_490='sat_rho_rc490', Rrs_510='sat_rho_rc510',
                          Rrs_555='sat_rho_rc555', Rrs_670='sat_rho_rc670'),
           inplace=True)

In [19]:
df_2 = pd.merge(df_temp2, df_rc, left_index=True, right_index=True)
df_2.rename(columns={'lat_x': 'lat', 'lon_x': 'lon'}, inplace=True)
df_2.drop(['lat_y', 'lon_y'], axis=1, inplace=True)

In [20]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Columns: 123 entries, datetime to sat_rho_rc670
dtypes: datetime64[ns](1), float64(120), int64(1), object(1)
memory usage: 479.5+ KB


In [21]:
df.to_pickle('../PickleJar/df_0_NMD_v2_a2_2008200.pkl')
df_temp2.to_pickle('../PickleJar/df_0_NMD_SWF_v2_a2_2008200.pkl')
df_rc.to_pickle('../PickleJar/df_0_R&F_Corr.pkl')
df_2.to_pickle('../PickleJar/df_1_merged.pkl')

#### <u>Cleaning data</u>

In [22]:
df_2.replace(-999, np.NaN, inplace=True)

In [23]:
df_2.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
lat,495.0,28.887491,20.588358,-67.643000,26.936900,32.400000,42.350000,79.000000
lon,495.0,-70.959878,41.512573,-170.198000,-83.391000,-76.013200,-66.999000,171.716000
id,495.0,4439.327273,2405.666837,92.000000,2036.500000,4192.000000,6666.500000,7831.000000
oisst,495.0,19.066020,6.983298,0.780000,13.775000,19.600000,25.120000,30.510000
wt,158.0,19.741563,7.176002,1.100000,13.872500,18.990000,25.882500,36.250000
sal,96.0,31.294687,5.939344,9.580000,29.197500,33.560000,34.342500,37.010000
etopo2,495.0,768.961616,1410.753061,0.000000,20.500000,156.000000,519.000000,5529.000000
sola,495.0,188.826667,51.128164,0.000000,182.750000,191.900000,204.100000,356.700000
solz,495.0,31.291919,14.525706,0.000000,20.650000,32.300000,42.600000,69.900000
ad405,165.0,0.028610,0.078987,0.001120,0.003600,0.007310,0.017900,0.720960


In [24]:
df_2.filter(regex='sat').loc[df_2.sat_lt412==0]

Unnamed: 0,sat_lt412,sat_rhot412,sat_rrs412,sat_lt443,sat_rhot443,sat_rrs443,sat_lt490,sat_rhot490,sat_rrs490,sat_lt510,...,sat_rrs555,sat_lt670,sat_rhot670,sat_rrs670,sat_rho_rc412,sat_rho_rc443,sat_rho_rc490,sat_rho_rc510,sat_rho_rc555,sat_rho_rc670
47,0.0,0.0,0.00271,0.0,0.0,0.00336,0.0,0.0,0.00344,0.0,...,0.00148,0.0,0.0,5e-05,0.004873,0.005417,0.005583,0.004854,0.003908,0.002249
259,0.0,0.0,0.00312,0.0,0.0,0.00272,0.0,0.0,0.00315,0.0,...,0.00189,0.0,0.0,0.00033,0.004175,0.004056,0.004442,0.004098,0.00333,0.001712
431,0.0,0.0,0.00062,0.0,0.0,0.00101,0.0,0.0,0.00168,0.0,...,0.00369,0.0,0.0,0.00105,0.008051,0.008214,0.008475,0.008718,0.009881,0.00699
432,0.0,0.0,0.00083,0.0,0.0,0.00118,0.0,0.0,0.00171,0.0,...,0.00355,0.0,0.0,0.00131,0.004872,0.005095,0.005414,0.005687,0.006682,0.00415
433,0.0,0.0,0.0019,0.0,0.0,0.00252,0.0,0.0,0.00356,0.0,...,0.0037,0.0,0.0,0.00048,0.003303,0.003848,0.004685,0.00488,0.004931,0.001915


In [25]:
df_2.filter(regex='sat').loc[df_2.sat_rhot412==0]

Unnamed: 0,sat_lt412,sat_rhot412,sat_rrs412,sat_lt443,sat_rhot443,sat_rrs443,sat_lt490,sat_rhot490,sat_rrs490,sat_lt510,...,sat_rrs555,sat_lt670,sat_rhot670,sat_rrs670,sat_rho_rc412,sat_rho_rc443,sat_rho_rc490,sat_rho_rc510,sat_rho_rc555,sat_rho_rc670
47,0.0,0.0,0.00271,0.0,0.0,0.00336,0.0,0.0,0.00344,0.0,...,0.00148,0.0,0.0,5e-05,0.004873,0.005417,0.005583,0.004854,0.003908,0.002249
259,0.0,0.0,0.00312,0.0,0.0,0.00272,0.0,0.0,0.00315,0.0,...,0.00189,0.0,0.0,0.00033,0.004175,0.004056,0.004442,0.004098,0.00333,0.001712
431,0.0,0.0,0.00062,0.0,0.0,0.00101,0.0,0.0,0.00168,0.0,...,0.00369,0.0,0.0,0.00105,0.008051,0.008214,0.008475,0.008718,0.009881,0.00699
432,0.0,0.0,0.00083,0.0,0.0,0.00118,0.0,0.0,0.00171,0.0,...,0.00355,0.0,0.0,0.00131,0.004872,0.005095,0.005414,0.005687,0.006682,0.00415
433,0.0,0.0,0.0019,0.0,0.0,0.00252,0.0,0.0,0.00356,0.0,...,0.0037,0.0,0.0,0.00048,0.003303,0.003848,0.004685,0.00488,0.004931,0.001915


In [26]:
df_2.filter(regex='sat').loc[df_2.sat_rho_rc412==0]

Unnamed: 0,sat_lt412,sat_rhot412,sat_rrs412,sat_lt443,sat_rhot443,sat_rrs443,sat_lt490,sat_rhot490,sat_rrs490,sat_lt510,...,sat_rrs555,sat_lt670,sat_rhot670,sat_rrs670,sat_rho_rc412,sat_rho_rc443,sat_rho_rc490,sat_rho_rc510,sat_rho_rc555,sat_rho_rc670
174,11.50068,0.22709,0.01201,10.42751,0.18733,0.00999,8.11119,0.14105,0.00758,6.78725,...,0.00307,2.56028,0.05787,0.0006,0.0,0.0,0.0,0.0,0.0,0.0


#### <u>Subsetting data</u>

In [27]:
swf_bands = [412, 443, 490, 510, 555, 670]
nomad_swf_bands = [411, 443, 489, 510, 555, 670]
time_loc_cols_extract = ['datetime', 'lat', 'lon', 'id']
anc_cols_extract = ['oisst', 'wt', 'sal', 'etopo2', 'sola', 'solz']
sat_cols_extract = ['sat_rrs%d' %b for b in swf_bands]\
                     + ['sat_lt%d' %b for b in swf_bands]\
                     + ['sat_rhot%d' %b for b in swf_bands]\
                     + ['sat_rho_rc%d' %b for b in swf_bands]

In [28]:
ad_cols_extract = df.filter(regex='ad[0-9]+').columns.tolist()
ag_cols_extract = df.filter(regex='ag[0-9]+').columns.tolist()
ap_cols_extract = df.filter(regex='ap[0-9]+').columns.tolist()
bb_cols_extract = df.filter(regex='bb[0-9]+').columns.tolist()
chl_cols_extract = ['chl', 'chl_a']

In [29]:
df_anc = df_2[time_loc_cols_extract + anc_cols_extract]
df_sat = df_2[time_loc_cols_extract + sat_cols_extract]
df_ad = df_2[time_loc_cols_extract + ad_cols_extract]
df_ag = df_2[time_loc_cols_extract + ag_cols_extract]
df_ap = df_2[time_loc_cols_extract + ap_cols_extract]
df_bb = df_2[time_loc_cols_extract + bb_cols_extract]
df_chl = df_2[time_loc_cols_extract + chl_cols_extract]

In [30]:
df_sat.head()

Unnamed: 0,datetime,lat,lon,id,sat_rrs412,sat_rrs443,sat_rrs490,sat_rrs510,sat_rrs555,sat_rrs670,...,sat_rhot490,sat_rhot510,sat_rhot555,sat_rhot670,sat_rho_rc412,sat_rho_rc443,sat_rho_rc490,sat_rho_rc510,sat_rho_rc555,sat_rho_rc670
0,1997-10-11 09:32:00,39.29,25.11,4069,0.00552,0.00575,0.00485,0.00345,0.00194,0.00027,...,0.13704,0.11923,0.08636,0.05066,0.012088,0.012417,0.011739,0.010579,0.00911,0.006655
1,2000-02-22 17:00:00,-61.45,-62.299,1596,0.00826,0.00774,0.0061,0.0035,0.00169,0.00037,...,0.14396,0.12151,0.08607,0.04931,0.010525,0.010636,0.009614,0.007913,0.006224,0.004794
2,2001-02-19 16:10:00,-61.29,-56.29,1633,0.0043,0.00389,0.00359,0.00279,0.00171,0.00023,...,0.11013,0.0943,0.06491,0.03175,0.004443,0.004387,0.00424,0.003686,0.002646,0.001177
3,2002-01-22 13:45:00,-60.999,-56.498,1659,0.00485,0.00457,0.00419,0.00305,0.00167,0.00023,...,0.11763,0.10012,0.06859,0.03436,0.005869,0.005866,0.005535,0.004643,0.003326,0.001747
4,1997-09-27 11:29:00,24.1392,-20.9995,6083,0.0101,0.00857,0.00621,0.00389,0.00184,0.00027,...,0.104,0.08705,0.05935,0.03068,0.009464,0.008968,0.007719,0.005974,0.004161,0.002517


#### <u>Creating distinction between hplc and fluo chl</u>

In [31]:
df_chl.replace(to_replace=-999, value=np.NaN, inplace=True)

In [32]:
df_chl.rename(columns={'chl': 'chl_fluo', 'chl_a': 'chl_hplc'}, inplace=True)

In [33]:
def get_chl(row):
    if pd.isna(row.chl_hplc):
        return row.chl_fluo
    return row.chl_hplc

df_chl['chl'] = df_chl.apply(get_chl, axis=1)

In [34]:
df_chl['is_hplc'] = ~df_chl.chl_hplc.isnull()
df_chl.to_pickle('../PickleJar/df_2_chl.pkl')

#### <u> Creating phytoplankton absorption DataFrame</u>

In [35]:
a_bb_bands = list(df_ap.columns.str.extract('([0-9]+)').dropna().values.squeeze())

In [36]:
a_bb_bands

['405',
 '411',
 '443',
 '455',
 '465',
 '489',
 '510',
 '520',
 '530',
 '550',
 '555',
 '560',
 '565',
 '570',
 '590',
 '619',
 '625',
 '665',
 '670',
 '683']

In [37]:
df_aphy = pd.DataFrame(columns=time_loc_cols_extract + ['aphy%s' % b for b in a_bb_bands])
df_aphy[time_loc_cols_extract] = df_2[time_loc_cols_extract]

In [38]:
for b in a_bb_bands:
    df_aphy['aphy%s' %b] = df_ap['ap%s' % b] - df_ad['ad%s' %b]

In [39]:
df_aphy.head()

Unnamed: 0,datetime,lat,lon,id,aphy405,aphy411,aphy443,aphy455,aphy465,aphy489,...,aphy555,aphy560,aphy565,aphy570,aphy590,aphy619,aphy625,aphy665,aphy670,aphy683
0,1997-10-11 09:32:00,39.29,25.11,4069,,,,,,,...,,,,,,,,,,
1,2000-02-22 17:00:00,-61.45,-62.299,1596,,,,,,,...,,,,,,,,,,
2,2001-02-19 16:10:00,-61.29,-56.29,1633,0.02149,0.0241,0.03078,0.02838,0.02765,0.02057,...,0.00385,0.0034,0.0031,0.00291,0.00324,0.00328,0.00348,0.00774,0.0104,0.00949
3,2002-01-22 13:45:00,-60.999,-56.498,1659,0.01693,0.01886,0.02283,0.02071,0.01999,0.01464,...,0.0031,0.00275,0.00252,0.00239,0.00282,0.00301,0.00319,0.00723,0.00935,0.00798
4,1997-09-27 11:29:00,24.1392,-20.9995,6083,,,,,,,...,,,,,,,,,,


In [40]:
df_aphy.to_pickle('../PickleJar/df_2_aphy.pkl')

In [41]:
# Target: chl
df_ml_chl = pd.concat((df_anc.drop(['datetime', 'lat', 'lon', 'id'], axis=1),
                       df_sat.filter(regex='sat_rho_rc'), df_chl), axis=1)
df_ml_chl.replace(to_replace=-999, value=np.NaN, inplace=True)
df_ml_chl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 20 columns):
oisst            495 non-null float64
wt               158 non-null float64
sal              96 non-null float64
etopo2           495 non-null float64
sola             495 non-null float64
solz             495 non-null float64
sat_rho_rc412    495 non-null float64
sat_rho_rc443    495 non-null float64
sat_rho_rc490    495 non-null float64
sat_rho_rc510    495 non-null float64
sat_rho_rc555    495 non-null float64
sat_rho_rc670    495 non-null float64
datetime         495 non-null datetime64[ns]
lat              495 non-null float64
lon              495 non-null float64
id               495 non-null int64
chl_fluo         364 non-null float64
chl_hplc         158 non-null float64
chl              424 non-null float64
is_hplc          495 non-null bool
dtypes: bool(1), datetime64[ns](1), float64(17), int64(1)
memory usage: 77.8 KB


In [42]:
# dropping wt and sal for too sparse data content
df_ml_chl.drop(['wt', 'sal'], axis=1, inplace=True)

In [43]:
# Target: aphy
df_ml_aphy = pd.concat((df_anc.drop(['datetime', 'lat', 'lon'], axis=1)
                        , df_sat.filter(regex='sat_rho_rc'), df_aphy.drop(['id'], axis=1)), axis=1)
df_giop = pd.concat((df_anc.drop(['datetime', 'lat', 'lon'], axis=1)
                        , df_sat.filter(regex='sat_rrs'), df_aphy.drop(['id'], axis=1)), axis=1)
df_giop = pd.concat((df_giop, df_chl.drop(['datetime', 'lat', 'lon', 'id'], axis=1)), axis=1)
df_ml_aphy.drop(['wt', 'sal'], axis=1, inplace=True)
df_giop.drop(['wt', 'sal'], axis=1, inplace=True)

In [44]:
df_giop.head()

Unnamed: 0,id,oisst,etopo2,sola,solz,sat_rrs412,sat_rrs443,sat_rrs490,sat_rrs510,sat_rrs555,...,aphy590,aphy619,aphy625,aphy665,aphy670,aphy683,chl_fluo,chl_hplc,chl,is_hplc
0,4069,19.57,462.0,201.5,48.7,0.00552,0.00575,0.00485,0.00345,0.00194,...,,,,,,,0.091,,0.091,False
1,1596,2.54,3549.0,4.6,51.2,0.00826,0.00774,0.0061,0.0035,0.00169,...,,,,,,,0.132,0.118,0.118,True
2,1633,0.78,330.0,27.8,52.7,0.0043,0.00389,0.00359,0.00279,0.00171,...,0.00324,0.00328,0.00348,0.00774,0.0104,0.00949,,,,False
3,1659,1.79,2193.0,38.6,45.9,0.00485,0.00457,0.00419,0.00305,0.00167,...,0.00282,0.00301,0.00319,0.00723,0.00935,0.00798,0.707,0.614,0.614,True
4,6083,24.67,4369.0,203.5,28.0,0.0101,0.00857,0.00621,0.00389,0.00184,...,,,,,,,,0.158,0.158,True


In [45]:
df_ml_aphy.head()

Unnamed: 0,id,oisst,etopo2,sola,solz,sat_rho_rc412,sat_rho_rc443,sat_rho_rc490,sat_rho_rc510,sat_rho_rc555,...,aphy555,aphy560,aphy565,aphy570,aphy590,aphy619,aphy625,aphy665,aphy670,aphy683
0,4069,19.57,462.0,201.5,48.7,0.012088,0.012417,0.011739,0.010579,0.00911,...,,,,,,,,,,
1,1596,2.54,3549.0,4.6,51.2,0.010525,0.010636,0.009614,0.007913,0.006224,...,,,,,,,,,,
2,1633,0.78,330.0,27.8,52.7,0.004443,0.004387,0.00424,0.003686,0.002646,...,0.00385,0.0034,0.0031,0.00291,0.00324,0.00328,0.00348,0.00774,0.0104,0.00949
3,1659,1.79,2193.0,38.6,45.9,0.005869,0.005866,0.005535,0.004643,0.003326,...,0.0031,0.00275,0.00252,0.00239,0.00282,0.00301,0.00319,0.00723,0.00935,0.00798
4,6083,24.67,4369.0,203.5,28.0,0.009464,0.008968,0.007719,0.005974,0.004161,...,,,,,,,,,,


In [46]:
df_giop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 38 columns):
id            495 non-null int64
oisst         495 non-null float64
etopo2        495 non-null float64
sola          495 non-null float64
solz          495 non-null float64
sat_rrs412    495 non-null float64
sat_rrs443    495 non-null float64
sat_rrs490    495 non-null float64
sat_rrs510    495 non-null float64
sat_rrs555    495 non-null float64
sat_rrs670    495 non-null float64
datetime      495 non-null datetime64[ns]
lat           495 non-null float64
lon           495 non-null float64
aphy405       165 non-null float64
aphy411       165 non-null float64
aphy443       167 non-null float64
aphy455       167 non-null float64
aphy465       167 non-null float64
aphy489       167 non-null float64
aphy510       167 non-null float64
aphy520       167 non-null float64
aphy530       167 non-null float64
aphy550       166 non-null float64
aphy555       165 non-null float64
aphy560       1

In [47]:
df_ml_aphy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 34 columns):
id               495 non-null int64
oisst            495 non-null float64
etopo2           495 non-null float64
sola             495 non-null float64
solz             495 non-null float64
sat_rho_rc412    495 non-null float64
sat_rho_rc443    495 non-null float64
sat_rho_rc490    495 non-null float64
sat_rho_rc510    495 non-null float64
sat_rho_rc555    495 non-null float64
sat_rho_rc670    495 non-null float64
datetime         495 non-null datetime64[ns]
lat              495 non-null float64
lon              495 non-null float64
aphy405          165 non-null float64
aphy411          165 non-null float64
aphy443          167 non-null float64
aphy455          167 non-null float64
aphy465          167 non-null float64
aphy489          167 non-null float64
aphy510          167 non-null float64
aphy520          167 non-null float64
aphy530          167 non-null float64
aphy550         

In [48]:
df_ml_chl.to_pickle('../PickleJar/DataSets/df_3_ML4chl.pkl')
df_ml_aphy.to_pickle('../PickleJar/DataSets/df_3_ML4aphy.pkl')
df_giop.to_pickle('../PickleJar/DataSets/df_3_GIOP.pkl')

In [49]:
df_ml_chl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 18 columns):
oisst            495 non-null float64
etopo2           495 non-null float64
sola             495 non-null float64
solz             495 non-null float64
sat_rho_rc412    495 non-null float64
sat_rho_rc443    495 non-null float64
sat_rho_rc490    495 non-null float64
sat_rho_rc510    495 non-null float64
sat_rho_rc555    495 non-null float64
sat_rho_rc670    495 non-null float64
datetime         495 non-null datetime64[ns]
lat              495 non-null float64
lon              495 non-null float64
id               495 non-null int64
chl_fluo         364 non-null float64
chl_hplc         158 non-null float64
chl              424 non-null float64
is_hplc          495 non-null bool
dtypes: bool(1), datetime64[ns](1), float64(15), int64(1)
memory usage: 70.1 KB


In [50]:
df_giop_swf = df_giop[['id', 'datetime', 'lat', 'lon', 'oisst','etopo2','sola','solz'] +
                      [f'sat_rrs{b}' for b in swf_bands] + [f'aphy{b}' for b in nomad_swf_bands]]

In [51]:
df_ml_aphy_swf = df_ml_aphy[['id', 'datetime', 'lat', 'lon', 'oisst','etopo2','sola','solz'] +
                      [f'sat_rho_rc{b}' for b in swf_bands] + [f'aphy{b}' for b in nomad_swf_bands]]

In [52]:
df_ml_aphy_swf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 495 entries, 0 to 494
Data columns (total 20 columns):
id               495 non-null int64
datetime         495 non-null datetime64[ns]
lat              495 non-null float64
lon              495 non-null float64
oisst            495 non-null float64
etopo2           495 non-null float64
sola             495 non-null float64
solz             495 non-null float64
sat_rho_rc412    495 non-null float64
sat_rho_rc443    495 non-null float64
sat_rho_rc490    495 non-null float64
sat_rho_rc510    495 non-null float64
sat_rho_rc555    495 non-null float64
sat_rho_rc670    495 non-null float64
aphy411          165 non-null float64
aphy443          167 non-null float64
aphy489          167 non-null float64
aphy510          167 non-null float64
aphy555          165 non-null float64
aphy670          167 non-null float64
dtypes: datetime64[ns](1), float64(18), int64(1)
memory usage: 81.2 KB


In [53]:
df_giop_swf.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164 entries, 2 to 494
Data columns (total 20 columns):
id            164 non-null int64
datetime      164 non-null datetime64[ns]
lat           164 non-null float64
lon           164 non-null float64
oisst         164 non-null float64
etopo2        164 non-null float64
sola          164 non-null float64
solz          164 non-null float64
sat_rrs412    164 non-null float64
sat_rrs443    164 non-null float64
sat_rrs490    164 non-null float64
sat_rrs510    164 non-null float64
sat_rrs555    164 non-null float64
sat_rrs670    164 non-null float64
aphy411       164 non-null float64
aphy443       164 non-null float64
aphy489       164 non-null float64
aphy510       164 non-null float64
aphy555       164 non-null float64
aphy670       164 non-null float64
dtypes: datetime64[ns](1), float64(18), int64(1)
memory usage: 26.9 KB


In [54]:
df_ml_aphy_swf.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164 entries, 2 to 494
Data columns (total 20 columns):
id               164 non-null int64
datetime         164 non-null datetime64[ns]
lat              164 non-null float64
lon              164 non-null float64
oisst            164 non-null float64
etopo2           164 non-null float64
sola             164 non-null float64
solz             164 non-null float64
sat_rho_rc412    164 non-null float64
sat_rho_rc443    164 non-null float64
sat_rho_rc490    164 non-null float64
sat_rho_rc510    164 non-null float64
sat_rho_rc555    164 non-null float64
sat_rho_rc670    164 non-null float64
aphy411          164 non-null float64
aphy443          164 non-null float64
aphy489          164 non-null float64
aphy510          164 non-null float64
aphy555          164 non-null float64
aphy670          164 non-null float64
dtypes: datetime64[ns](1), float64(18), int64(1)
memory usage: 26.9 KB


In [55]:
df_giop_swf.describe()

Unnamed: 0,id,lat,lon,oisst,etopo2,sola,solz,sat_rrs412,sat_rrs443,sat_rrs490,sat_rrs510,sat_rrs555,sat_rrs670,aphy411,aphy443,aphy489,aphy510,aphy555,aphy670
count,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,495.0,165.0,167.0,167.0,167.0,165.0,167.0
mean,4439.327273,28.887491,-70.959878,19.06602,768.961616,188.826667,31.291919,0.004258,0.004354,0.004459,0.003785,0.003138,0.000582,0.0446,0.051823,0.032713,0.021539,0.008329,0.022059
std,2405.666837,20.588358,41.512573,6.983298,1410.753061,51.128164,14.525706,0.003353,0.002626,0.002283,0.002175,0.002596,0.000873,0.07521,0.083448,0.049682,0.037574,0.017483,0.05112
min,92.0,-67.643,-170.198,0.78,0.0,0.0,0.0,-0.00227,-0.0007,0.00057,0.00097,0.00099,-0.00037,0.00199,0.00256,0.00181,0.00103,0.0,0.00056
25%,2036.5,26.9369,-83.391,13.775,20.5,182.75,20.65,0.00188,0.00232,0.00277,0.00254,0.00173,0.00017,0.01312,0.0168,0.010035,0.005345,0.00114,0.002815
50%,4192.0,32.4,-76.0132,19.6,156.0,191.9,32.3,0.00335,0.00377,0.00409,0.00323,0.00212,0.0003,0.02259,0.02781,0.01802,0.01027,0.00328,0.00663
75%,6666.5,42.35,-66.999,25.12,519.0,204.1,42.6,0.006185,0.006135,0.00592,0.004175,0.003425,0.00054,0.0399,0.04983,0.032795,0.01986,0.00724,0.018895
max,7831.0,79.0,171.716,30.51,5529.0,356.7,69.9,0.01773,0.01234,0.01855,0.01999,0.0213,0.00753,0.56639,0.58777,0.28555,0.21519,0.13273,0.40058


In [56]:
df_giop_swf.dropna().loc[((df_giop_swf.sat_rrs412<=0) | (df_giop_swf.sat_rrs443<=0) )]

Unnamed: 0,id,datetime,lat,lon,oisst,etopo2,sola,solz,sat_rrs412,sat_rrs443,sat_rrs490,sat_rrs510,sat_rrs555,sat_rrs670,aphy411,aphy443,aphy489,aphy510,aphy555,aphy670
278,2245,2001-12-13 18:02:00,34.158,-119.947,13.21,511.0,182.7,57.4,-3e-05,0.00075,0.0015,0.0016,0.00159,0.00026,0.0399,0.05796,0.03637,0.02477,0.00905,0.04537
279,2246,2001-12-13 18:46:00,34.203,-119.925,13.21,558.0,182.7,57.4,-6e-05,0.00081,0.00166,0.00166,0.00137,0.00014,0.04183,0.05558,0.03319,0.02238,0.00863,0.03561
280,2247,2001-12-13 19:52:00,34.252,-119.904,13.21,518.0,182.7,57.5,-0.0002,0.00069,0.00164,0.00161,0.00132,8e-05,0.01903,0.03104,0.01872,0.01142,0.00312,0.02292
281,2249,2001-12-13 21:44:00,34.348,-119.863,13.21,176.0,182.8,57.6,-0.00061,0.0004,0.00147,0.00144,0.00118,-6e-05,0.02353,0.03313,0.02134,0.01282,0.00357,0.02009
282,2253,2002-01-10 19:28:00,34.203,-119.926,13.58,558.0,178.4,56.1,-0.00042,0.00037,0.00151,0.00171,0.00148,-9e-05,0.0525,0.07042,0.04877,0.02772,0.00798,0.03088
283,2254,2002-01-10 20:40:00,34.251,-119.905,13.58,520.0,178.4,56.2,-0.00095,-1e-05,0.00113,0.00147,0.00143,-0.00012,0.06089,0.08491,0.05807,0.03104,0.00724,0.03404
292,2277,2002-03-19 17:51:00,34.158,-119.947,12.24,511.0,174.1,34.7,-3e-05,0.00037,0.00108,0.00122,0.00152,0.00015,0.11101,0.13349,0.08322,0.05946,0.02347,0.0863
295,2293,2002-05-21 20:58:00,34.348,-119.863,13.35,176.0,226.8,19.2,-2e-05,0.00045,0.00105,0.00128,0.00201,0.00069,0.17733,0.20519,0.12669,0.0927,0.04037,0.13924
308,4264,2001-04-08 03:47:00,38.008,133.578,10.84,871.0,205.5,33.2,-0.00227,-0.0007,0.00057,0.00097,0.0012,0.00013,0.04993,0.06603,0.04511,0.02568,0.00658,0.02171
392,1251,1998-04-11 14:40:00,38.0,-76.267,5.12,9.0,177.9,29.6,-0.0006,0.00037,0.00165,0.00226,0.00481,0.00187,0.56639,0.58777,0.27644,0.21519,0.13273,0.40058


In [66]:
df_giop_swf.to_pickle('../PickleJar/Datasets/df_giop_swf.pkl')
df_ml_aphy_swf.to_pickle('../PickleJar/Datasets/df_ml_aphy_swf.pkl')
df_ml_chl.to_pickle('../PickleJar/DataSets/df_ml_chl_swf.pkl')

In [60]:
df_giop.index.name='Row_ID'

In [61]:
df_giop_swf.dropna().to_csv('/Users/erdemk/Desktop/giop_swf.csv')

In [62]:
df_giop_swf.dropna().info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 164 entries, 2 to 494
Data columns (total 20 columns):
id            164 non-null int64
datetime      164 non-null datetime64[ns]
lat           164 non-null float64
lon           164 non-null float64
oisst         164 non-null float64
etopo2        164 non-null float64
sola          164 non-null float64
solz          164 non-null float64
sat_rrs412    164 non-null float64
sat_rrs443    164 non-null float64
sat_rrs490    164 non-null float64
sat_rrs510    164 non-null float64
sat_rrs555    164 non-null float64
sat_rrs670    164 non-null float64
aphy411       164 non-null float64
aphy443       164 non-null float64
aphy489       164 non-null float64
aphy510       164 non-null float64
aphy555       164 non-null float64
aphy670       164 non-null float64
dtypes: datetime64[ns](1), float64(18), int64(1)
memory usage: 26.9 KB
