In [1]:
import os

import numpy as np
import pandas as pd
%matplotlib inline

### Methods

The dataset used in this study was generated from the NOMAD dataset. Six Rrs bands were used, at the six wavelengths corresponding to SeaWiFS spectral coverage: 412, 443, 490, 510, 555, and 670 nm. The following data transformation steps were implemented:
* invalid data points were flagged and dropped
* for band ratio algorithms, relevant ratios were pre-computed
* for band ratio algorithms with the max(Blue) formulation, the blue band used in the ratio was identified and flagged
* for PCA-based algorithms, principal components were computed for all bands.
* chlorophyll measurement method was identified and flagged
* all predictors were standardized

In [2]:
def read_nomad_like_file_to_pd(file_name):
    """Load a NOMAD-like text file into a DataFrame.

    The header is scanned line by line. A header line containing 'fields'
    supplies the comma-separated column names (the text after the first '=').
    The line containing '/end_header' marks where the data rows start.

    Parameters
    ----------
    file_name : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        Data rows with the header-declared column names; values equal to
        -999 are read as NaN.

    Raises
    ------
    ValueError
        If the header has no usable 'fields' line or no '/end_header' line.
    """
    columns = None
    skiprows = None
    with open(file_name) as f:
        for i, line in enumerate(f):
            if '/end_header' in line:
                skiprows = i + 1
                break  # everything after this line is data, not header
            if 'fields' in line:
                _, sep, names = line.partition('=')
                if sep:
                    # strip() drops the trailing newline that would otherwise
                    # end up inside the last column name
                    columns = [name.strip() for name in names.split(',')]
    if columns is None or skiprows is None:
        raise ValueError(
            f"{file_name!r}: header must contain a 'fields=' line and an "
            "'/end_header' line"
        )
    # Read the file passed by the caller, not a notebook-level global.
    return pd.read_csv(file_name, names=columns,
                       skiprows=skiprows, na_values=-999)

In [3]:
# Location of the raw NOMAD export, relative to the notebook's working directory.
nomad_file = './data/raw/nomad_seabass_v2.a_2008200.txt'
df_raw = read_nomad_like_file_to_pd(file_name=nomad_file)
# Show every column name parsed from the file header.
list(df_raw.columns)

['year',
 'month',
 'day',
 'hour',
 'minute',
 'second',
 'lat',
 'lon',
 'id',
 'oisst',
 'etopo2',
 'chl',
 'chl_a',
 'kd405',
 'kd411',
 'kd443',
 'kd455',
 'kd465',
 'kd489',
 'kd510',
 'kd520',
 'kd530',
 'kd550',
 'kd555',
 'kd560',
 'kd565',
 'kd570',
 'kd590',
 'kd619',
 'kd625',
 'kd665',
 'kd670',
 'kd683',
 'lw405',
 'lw411',
 'lw443',
 'lw455',
 'lw465',
 'lw489',
 'lw510',
 'lw520',
 'lw530',
 'lw550',
 'lw555',
 'lw560',
 'lw565',
 'lw570',
 'lw590',
 'lw619',
 'lw625',
 'lw665',
 'lw670',
 'lw683',
 'es405',
 'es411',
 'es443',
 'es455',
 'es465',
 'es489',
 'es510',
 'es520',
 'es530',
 'es550',
 'es555',
 'es560',
 'es565',
 'es570',
 'es590',
 'es619',
 'es625',
 'es665',
 'es670',
 'es683',
 'ap405',
 'ap411',
 'ap443',
 'ap455',
 'ap465',
 'ap489',
 'ap510',
 'ap520',
 'ap530',
 'ap550',
 'ap555',
 'ap560',
 'ap565',
 'ap570',
 'ap590',
 'ap619',
 'ap625',
 'ap665',
 'ap670',
 'ap683',
 'ad405',
 'ad411',
 'ad443',
 'ad455',
 'ad465',
 'ad489',
 'ad510',
 'ad520',


In [4]:
# Band labels exactly as they appear in the raw column suffixes (e.g. lw411,
# es489); they are used below to build the lw/es/rrs column names.
bands = [411, 443, 489, 510, 555, 670]

In [5]:
# Summary statistics of the lw columns for the selected bands.
lw_columns = [f'lw{band}' for band in bands]
df_raw[lw_columns].describe()

Unnamed: 0,lw411,lw443,lw489,lw510,lw555,lw670
count,4304.0,4456.0,4431.0,3492.0,3312.0,1602.0
mean,0.385876,0.43397,0.450727,0.416742,0.330696,0.136143
std,0.418701,0.437093,0.390907,0.36877,0.4309,0.231025
min,0.001019,0.00147,0.00147,0.00125,0.0006,0.0
25%,0.099433,0.118075,0.15052,0.188368,0.120175,0.016921
50%,0.23565,0.280952,0.3474,0.347605,0.210395,0.055817
75%,0.501023,0.609602,0.657315,0.524745,0.330312,0.159858
max,3.52527,2.988,3.5927,4.3987,6.2885,3.2222


In [6]:
# Summary statistics of the es columns for the selected bands.
es_columns = [f'es{band}' for band in bands]
df_raw[es_columns].describe()

Unnamed: 0,es411,es443,es489,es510,es555,es670
count,4305.0,4459.0,4450.0,3446.0,3258.0,1616.0
mean,73.540755,85.777988,93.053646,99.686608,97.828945,96.691502
std,42.527826,49.526399,52.812348,47.437513,45.437147,37.23333
min,0.412,0.489,0.479,0.42,0.284,1.28788
25%,34.741,40.442,44.96945,61.57505,62.379225,74.092875
50%,75.08,88.2018,96.76045,105.335,104.924,102.5
75%,110.525,127.6315,138.594,139.98475,136.329,123.0
max,216.678,238.739,249.221,252.287,243.629,194.6


### $\Rightarrow$ Compute Rrs, then build the dataset for this study and store it in a dataframe (df):

In [8]:
# rrs<band> is the element-wise ratio lw<band> / es<band>, added to df_raw.
for band in bands:
    lw_col, es_col = f'lw{band}', f'es{band}'
    df_raw[f'rrs{band}'] = df_raw[lw_col].div(df_raw[es_col])

# Collect every column whose name contains 'rrs', then frame it with the
# identifier/ancillary columns in front and the chlorophyll columns behind.
rrs_columns = df_raw.filter(regex='rrs', axis=1).columns.tolist()
columns = ['id', 'etopo2', 'lat', *rrs_columns, 'chl', 'chl_a']

# Independent copy, so later edits to df leave df_raw untouched.
df = df_raw.loc[:, columns].copy()
df.head()

Unnamed: 0,id,etopo2,lat,rrs411,rrs443,rrs489,rrs510,rrs555,rrs670,chl,chl_a
0,1565,0.0,38.4279,0.001204,0.001686,0.003293,0.004036,0.007479,0.003465,38.19,
1,1566,0.0,38.368,0.001062,0.001384,0.002173,0.002499,0.004152,0.001695,35.01,
2,1567,1.0,38.3074,0.000971,0.001185,0.001843,0.002288,0.004246,0.001612,26.91,
3,1568,3.0,38.6367,0.001472,0.001741,0.002877,0.003664,0.006982,0.003234,47.96,
4,1559,1.0,38.3047,0.000905,0.001022,0.001506,0.001903,0.002801,0.001791,23.55,


In [9]:
# Summary statistics of the working subset (id, ancillary, rrs and chl columns).
df.describe()

Unnamed: 0,id,etopo2,lat,rrs411,rrs443,rrs489,rrs510,rrs555,rrs670,chl,chl_a
count,4459.0,4459.0,4459.0,4293.0,4456.0,4422.0,3435.0,3255.0,1598.0,3392.0,1381.0
mean,4377.381251,1312.346715,1.868658,0.004881,0.004652,0.00459,0.00413,0.003256,0.001557,2.703251,2.285293
std,2298.272102,1766.435289,44.765125,0.003447,0.003002,0.002768,0.00313,0.003536,0.002387,5.611762,5.752391
min,6.0,0.0,-77.0356,5.1e-05,0.00019,0.000284,0.000261,0.000183,0.0,0.012,0.017
25%,2028.5,18.0,-61.299,0.002509,0.002617,0.003051,0.002831,0.001588,0.0002,0.274,0.145
50%,5039.0,240.0,27.093,0.003984,0.003899,0.004153,0.003425,0.002071,0.000614,0.83,0.538
75%,6271.5,2789.5,34.4585,0.006301,0.006076,0.005655,0.004242,0.003141,0.002,2.24008,1.694
max,7831.0,7978.0,79.69,0.0306,0.036769,0.063814,0.07774,0.0466,0.0277,77.8648,70.2133


In [10]:
# Dtypes and non-null counts per column of the working subset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 11 columns):
id        4459 non-null int64
etopo2    4459 non-null float64
lat       4459 non-null float64
rrs411    4293 non-null float64
rrs443    4456 non-null float64
rrs489    4422 non-null float64
rrs510    3435 non-null float64
rrs555    3255 non-null float64
rrs670    1598 non-null float64
chl       3392 non-null float64
chl_a     1381 non-null float64
dtypes: float64(10), int64(1)
memory usage: 383.3 KB


### $\Rightarrow$ Add hplc flag column to identify potential quality/noise differences in chl

In [11]:
# Chlorophyll measurement method is identified and flagged.
def fill_chl(row):
    """Return row['chl_a'] when it is finite, otherwise row['chl'].

    Row-wise statement of the merge rule. The column below is computed with
    the vectorised equivalent, so this helper is kept only for compatibility.
    """
    return row['chl_a'] if np.isfinite(row['chl_a']) else row['chl']

# Vectorised form of fill_chl: keep chl_a where it is finite, else fall back
# to chl. This avoids a Python-level call per row (DataFrame.apply, axis=1).
has_chl_a = np.isfinite(df['chl_a'])
df['chlor_a'] = df['chl_a'].where(has_chl_a, df['chl'])
# True where the value came from chl_a.
# NOTE(review): presumably chl_a holds the HPLC measurements — confirm.
df['is_hplc'] = has_chl_a
# Reassign rather than mutate in place; the two source columns are now merged.
df = df.drop(columns=['chl', 'chl_a'])

In [12]:
# Summary statistics after chl and chl_a were merged into chlor_a.
df.describe()

Unnamed: 0,id,etopo2,lat,rrs411,rrs443,rrs489,rrs510,rrs555,rrs670,chlor_a
count,4459.0,4459.0,4459.0,4293.0,4456.0,4422.0,3435.0,3255.0,1598.0,4127.0
mean,4377.381251,1312.346715,1.868658,0.004881,0.004652,0.00459,0.00413,0.003256,0.001557,2.680228
std,2298.272102,1766.435289,44.765125,0.003447,0.003002,0.002768,0.00313,0.003536,0.002387,5.758436
min,6.0,0.0,-77.0356,5.1e-05,0.00019,0.000284,0.000261,0.000183,0.0,0.012
25%,2028.5,18.0,-61.299,0.002509,0.002617,0.003051,0.002831,0.001588,0.0002,0.233325
50%,5039.0,240.0,27.093,0.003984,0.003899,0.004153,0.003425,0.002071,0.000614,0.764
75%,6271.5,2789.5,34.4585,0.006301,0.006076,0.005655,0.004242,0.003141,0.002,2.15
max,7831.0,7978.0,79.69,0.0306,0.036769,0.063814,0.07774,0.0466,0.0277,77.8648


In [13]:
# First rows, now including the chlor_a and is_hplc columns.
df.head()

Unnamed: 0,id,etopo2,lat,rrs411,rrs443,rrs489,rrs510,rrs555,rrs670,chlor_a,is_hplc
0,1565,0.0,38.4279,0.001204,0.001686,0.003293,0.004036,0.007479,0.003465,38.19,False
1,1566,0.0,38.368,0.001062,0.001384,0.002173,0.002499,0.004152,0.001695,35.01,False
2,1567,1.0,38.3074,0.000971,0.001185,0.001843,0.002288,0.004246,0.001612,26.91,False
3,1568,3.0,38.6367,0.001472,0.001741,0.002877,0.003664,0.006982,0.003234,47.96,False
4,1559,1.0,38.3047,0.000905,0.001022,0.001506,0.001903,0.002801,0.001791,23.55,False


In [14]:
# Dtypes and non-null counts after the chlor_a / is_hplc columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 11 columns):
id         4459 non-null int64
etopo2     4459 non-null float64
lat        4459 non-null float64
rrs411     4293 non-null float64
rrs443     4456 non-null float64
rrs489     4422 non-null float64
rrs510     3435 non-null float64
rrs555     3255 non-null float64
rrs670     1598 non-null float64
chlor_a    4127 non-null float64
is_hplc    4459 non-null bool
dtypes: bool(1), float64(9), int64(1)
memory usage: 352.8 KB


In [15]:
# Create the output directory first so a fresh checkout survives
# Restart & Run All; to_pickle does not create missing parent directories.
os.makedirs('./pickleJar', exist_ok=True)
df.to_pickle('./pickleJar/df_main.pkl') # pickle nomad subset
df_raw.to_pickle('./pickleJar/df_raw.pkl') # pickle nomad raw data in dataframe

### End of this notebook on dataset cleanup & prep. 
### Next: BC-0-Cleanup-1 $\Rightarrow$ Subset the prepared dataset for specific studies.