In [83]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [2]:
% matplotlib inline

In [3]:
with open('./data/nomad_seawifs_v2.a2_2008200.txt', 'r') as f:
    for line in f:
        if 'fields=' in line:
            break
columns = line.strip().strip('/fields=').split(',')

In [4]:
df = pd.read_csv('./data/nomad_seawifs_v2.a2_2008200.txt', names=columns, skiprows=107)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Columns: 243 entries, year to va
dtypes: float64(226), int64(15), object(2)
memory usage: 941.7+ KB


In [6]:
df_rc = pd.read_csv('./data/Rayleigh&Fresnel_corrected_Rrc.txt', sep='\t')

In [7]:
df_rc.head()

Unnamed: 0,filename,lat,lon,Rrs_412,Rrs_443,Rrs_490,Rrs_510,Rrs_555,Rrs_670
0,S1997284110316.L2_MLAC.hdf,39.29,25.11,0.012088,0.012417,0.011739,0.010579,0.00911,0.006655
1,S2000053153433.L2_MLAC.hdf,-61.45,-62.299,0.010525,0.010636,0.009614,0.007913,0.006224,0.004794
2,S2001050135427.L2_MLAC.hdf,-61.29,-56.29,0.004443,0.004387,0.00424,0.003686,0.002646,0.001177
3,S2002022133012.L2_MLAC.hdf,-60.999,-56.498,0.005869,0.005866,0.005535,0.004643,0.003326,0.001747
4,S1997270134451.L2_MLAC.hdf,24.1392,-20.9995,0.009464,0.008968,0.007719,0.005974,0.004161,0.002517


In [8]:
df_rc['filename'] = df_rc.filename.str.strip('.hdf')

In [9]:
# checking to see if both datasets are congruent

sat_files = df.sat_file.tolist()
lat1 = df.lat.tolist()
lon1 = df.lon.tolist()

filenames = df_rc.filename.tolist()
lat2 = df_rc.lat.tolist()
lon2 = df_rc.lon.tolist()

for i, (s, f, lt1, lt2, ln1, ln2) in enumerate (zip(sat_files, filenames, lat1,
                                                    lat2, lon1, lon2)):
    if s != f:
        print(f'fname#{i}: {s}<->{f}')
    if lt1 != lt2:
        print(f'lat#{i}: {lt1}<->{lt2}')
    if ln1 != ln2:
        print(f'lon#{i}: {ln1}<->{ln2}')

In [10]:
def convert_to_dt(row):
    dt_str = f'{row.year}-{row.month}-{row.day} {row.hour}:{row.minute}:{row.second}'
    return pd.to_datetime(dt_str, format='%Y-%m-%d %H:%M:%S')

# consolidate date and time columns into single datetime type column
df.insert(0, 'datetime', df.apply(convert_to_dt, axis=1))
df.drop(['year', 'month', 'day', 'hour', 'minute', 'second'], axis=1, inplace=True)

In [11]:
df_rc.rename(columns=dict(Rrs_412='sat_rrs412_rc', Rrs_443='sat_rrs443_rc',
                         Rrs_490='sat_rrs490_rc', Rrs_510='sat_rrs510_rc',
                         Rrs_555='sat_rrs555_rc', Rrs_670='sat_rrs670_rc'),
           inplace=True)

In [12]:
df_2 = pd.merge(df, df_rc, left_index=True, right_index=True)
df_2.rename(columns={'lat_x': 'lat', 'lon_x': 'lon'}, inplace=True)
df_2.drop(['lat_y', 'lon_y'], axis=1, inplace=True)

In [13]:
df.to_pickle('./pickleJar/df_0_NMD_SWF_v2_a2_2008200.pkl')
df_rc.to_pickle('./pickleJar/df_0_R&F_Corr.pkl')
df_2.to_pickle('./pickleJar/df_1_merged.pkl')

#### <u>Subsetting data</u>

In [None]:
swf_bands = [412, 443, 490, 510, 555, 670]
time_loc_cols_extract = ['datetime', 'lat', 'lon']
anc_cols_extract = ['oisst', 'wt', 'sal', 'etopo2', 'sola', 'solz']
sat_cols_extract = ['sat_rrs%d' %b for b in swf_bands]\
                     + ['sat_lt%d' %b for b in swf_bands]\
                     + ['sat_rhot%d' %b for b in swf_bands]\
                     + ['sat_rrs%d_rc' %b for b in swf_bands]

In [30]:
ad_cols_extract = df_2.filter(regex='ad[0-9]+').columns.tolist()
ag_cols_extract = df_2.filter(regex='ag[0-9]+').columns.tolist()
ap_cols_extract = df_2.filter(regex='ap[0-9]+').columns.tolist()
bb_cols_extract = df_2.filter(regex='bb[0-9]+').columns.tolist()
chl_cols_extract = ['chl', 'chl_a']

In [93]:
df_anc = df_2[time_loc_cols_extract + anc_cols_extract]
df_sat = df_2[time_loc_cols_extract + sat_cols_extract]
df_ad = df_2[time_loc_cols_extract + ad_cols_extract]
df_ag = df_2[time_loc_cols_extract + ag_cols_extract]
df_ap = df_2[time_loc_cols_extract + ap_cols_extract]
df_bb = df_2[time_loc_cols_extract + bb_cols_extract]
df_chl = df_2[time_loc_cols_extract + chl_cols_extract]

In [85]:
pkldir = './pickleJar/'
df_anc.to_pickle(pkldir + 'df_2_ancillary.pkl')
df_sat.to_pickle(pkldir + 'df_2_satellite.pkl')
df_ad.to_pickle(pkldir + 'df_2_ad.pkl')
df_ag.to_pickle(pkldir + 'df_2_ag.pkl')
df_ap.to_pickle(pkldir + 'df_2_ap.pkl')
df_bb.to_pickle(pkldir + 'df_2_bb.pkl')

#### <u>Creating distinction between hplc and fluo chl</u>

In [94]:
df_chl.replace(to_replace=-999, value=np.NaN, inplace=True)

In [95]:
df_chl.rename(columns={'chl': 'chl_fluo', 'chl_a': 'chl_hplc'}, inplace=True)

In [96]:
def get_chl(row):
    if row.chl_hplc == np.NaN:
        return row.chl_fluo
    return row.chl_hplc

df_chl['chl'] = df_chl.apply(get_chl, axis=1)

In [107]:
df_chl['is_hplc'] = ~df_chl.chl_hplc.isnull()

df_chl.to_pickle(pkldir + 'df_2_chl.pkl')

#### <u> Creating phytoplankton absorption DataFrame</u>

In [42]:
a_bb_bands = list(df_ap.columns.str.extract('([0-9]+)').dropna().values.squeeze())

In [55]:
df_aphy = pd.DataFrame(columns=time_loc_cols_extract + ['aphy%s' % b for b in a_bb_bands])
df_aphy[time_loc_cols_extract] = df_2[time_loc_cols_extract]

In [56]:
for b in a_bb_bands:
    df_aphy['aphy%s' %b] = df_ap['ap%s' % b] - df_ad['ad%s' %b]

In [108]:
df_aphy.head().T

Unnamed: 0,0,1,2,3,4
datetime,1997-10-11 09:32:00,2000-02-22 17:00:00,2001-02-19 16:10:00,2002-01-22 13:45:00,1997-09-27 11:29:00
lat,39.29,-61.45,-61.29,-60.999,24.1392
lon,25.11,-62.299,-56.29,-56.498,-20.9995
aphy405,0,0,0.02149,0.01693,0
aphy411,0,0,0.0241,0.01886,0
aphy443,0,0,0.03078,0.02283,0
aphy455,0,0,0.02838,0.02071,0
aphy465,0,0,0.02765,0.01999,0
aphy489,0,0,0.02057,0.01464,0
aphy510,0,0,0.01261,0.00937,0


In [109]:
df_aphy.to_pickle(pkldir + 'df_2_aphy.pkl')

#### <u>Creating Dataset for OO Conference 2018</u>

This data will used rayleigh Lt and 

In [None]:
pca_spnorm = PCA()
pca_spnorm_data = pca_spnorm.fit_transform(df_spnorm[rrs_cols].as_matrix())
PlotPCARes(pca_spnorm, threshold=0.99, alpha=0.9, num_pca_disp=pca_spnorm_data.shape[1])
PlotCrossCorr(pca_spnorm_data, df_spnorm)