In [75]:
import os

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as pl
import pandas as pd
from seaborn import PairGrid, heatmap, kdeplot
import numpy as np
from utils import *

In [2]:
% matplotlib inline

In [33]:
# Scan the data file for the header line that declares the column names
# (a line containing "fields=" followed by a comma-separated list).
columns = None
with open('./data/nomad_seawifs_v2.a2_2008200.txt', 'r') as f:
    for line in f:
        if 'fields=' in line:
            # Keep only the text after the "fields=" tag. str.strip('/fields=')
            # would remove any of the characters "/fields=" from BOTH ends of
            # the line and could therefore eat into the first or last column
            # name; splitting on the tag removes exactly the prefix.
            columns = line.strip().split('fields=', 1)[1].split(',')
            break
if columns is None:
    # Without this guard the last line read would silently be used as header.
    raise ValueError("no 'fields=' header line found in the data file")

In [34]:
# Display the list of column names parsed from the "fields=" header line.
columns

['year',
 'month',
 'day',
 'hour',
 'minute',
 'second',
 'lat',
 'lon',
 'id',
 'oisst',
 'etopo2',
 'chl',
 'chl_a',
 'kd405',
 'kd411',
 'kd443',
 'kd455',
 'kd465',
 'kd489',
 'kd510',
 'kd520',
 'kd530',
 'kd550',
 'kd555',
 'kd560',
 'kd565',
 'kd570',
 'kd590',
 'kd619',
 'kd625',
 'kd665',
 'kd670',
 'kd683',
 'lw405',
 'lw411',
 'lw443',
 'lw455',
 'lw465',
 'lw489',
 'lw510',
 'lw520',
 'lw530',
 'lw550',
 'lw555',
 'lw560',
 'lw565',
 'lw570',
 'lw590',
 'lw619',
 'lw625',
 'lw665',
 'lw670',
 'lw683',
 'es405',
 'es411',
 'es443',
 'es455',
 'es465',
 'es489',
 'es510',
 'es520',
 'es530',
 'es550',
 'es555',
 'es560',
 'es565',
 'es570',
 'es590',
 'es619',
 'es625',
 'es665',
 'es670',
 'es683',
 'ap405',
 'ap411',
 'ap443',
 'ap455',
 'ap465',
 'ap489',
 'ap510',
 'ap520',
 'ap530',
 'ap550',
 'ap555',
 'ap560',
 'ap565',
 'ap570',
 'ap590',
 'ap619',
 'ap625',
 'ap665',
 'ap670',
 'ap683',
 'ad405',
 'ad411',
 'ad443',
 'ad455',
 'ad465',
 'ad489',
 'ad510',
 'ad520',


In [39]:
# Load the data records into a DataFrame. The first 107 lines of the file are
# skipped (presumably the metadata header -- confirm against the file) and the
# column labels come from the previously parsed `columns` list.
df = pd.read_csv(
    './data/nomad_seawifs_v2.a2_2008200.txt',
    skiprows=107,
    names=columns,
)

In [35]:
# Number of column names parsed from the header line.
len(columns)

243

In [40]:
# Print a summary of the loaded frame (index range, column count, dtypes, memory).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Columns: 243 entries, year to va
dtypes: float64(226), int64(15), object(2)
memory usage: 941.7+ KB


In [44]:
# Load the tab-separated Rayleigh & Fresnel corrected table into its own frame.
df_rc = pd.read_csv(
    './data/Rayleigh&Fresnel_corrected_Rrc.txt',
    sep='\t',
)

In [45]:
# Preview the first rows of df_rc (head() shows five rows by default).
df_rc.head()

Unnamed: 0,filename,lat,lon,Rrs_412,Rrs_443,Rrs_490,Rrs_510,Rrs_555,Rrs_670
0,S1997284110316.L2_MLAC.hdf,39.29,25.11,0.012088,0.012417,0.011739,0.010579,0.00911,0.006655
1,S2000053153433.L2_MLAC.hdf,-61.45,-62.299,0.010525,0.010636,0.009614,0.007913,0.006224,0.004794
2,S2001050135427.L2_MLAC.hdf,-61.29,-56.29,0.004443,0.004387,0.00424,0.003686,0.002646,0.001177
3,S2002022133012.L2_MLAC.hdf,-60.999,-56.498,0.005869,0.005866,0.005535,0.004643,0.003326,0.001747
4,S1997270134451.L2_MLAC.hdf,24.1392,-20.9995,0.009464,0.008968,0.007719,0.005974,0.004161,0.002517


In [47]:
# Remove the trailing ".hdf" extension from each file name.
# str.strip('.hdf') treats its argument as a SET of characters and strips any
# run of '.', 'h', 'd' or 'f' from both ends of the string, so it can damage
# names that start or end with those characters. An anchored regex removes
# exactly the suffix and nothing else.
df_rc['filename'] = df_rc.filename.str.replace(r'\.hdf$', '', regex=True)

In [95]:
# Sanity check: compare df and df_rc row by row (same position in each table)
# on file name, latitude and longitude, printing every disagreement found.
# No output means the compared rows agree on all three fields.
sat_files = df.sat_file.tolist()
lat1 = df.lat.tolist()
lon1 = df.lon.tolist()

filenames = df_rc.filename.tolist()
lat2 = df_rc.lat.tolist()
lon2 = df_rc.lon.tolist()

# zip() stops at the shorter input, so rows beyond the shorter table would
# never be compared; report a row-count mismatch explicitly.
if len(sat_files) != len(filenames):
    print(f'row count: {len(sat_files)}<->{len(filenames)}')

for i, (s, f, lt1, lt2, ln1, ln2) in enumerate(zip(sat_files, filenames, lat1,
                                                   lat2, lon1, lon2)):
    if s != f:
        print(f'fname#{i}: {s}<->{f}')
    if lt1 != lt2:
        print(f'lat#{i}: {lt1}<->{lt2}')
    if ln1 != ln2:
        print(f'lon#{i}: {ln1}<->{ln2}')

In [62]:
def convert_to_dt(row):
    """Combine a row's separate date/time fields into one pandas timestamp.

    Parameters
    ----------
    row : object exposing ``year``, ``month``, ``day``, ``hour``, ``minute``
        and ``second`` attributes (e.g. a row passed by ``DataFrame.apply``
        with ``axis=1``).

    Returns
    -------
    The result of ``pd.to_datetime`` on the assembled
    ``year-month-day hour:minute:second`` string, parsed with the format
    ``%Y-%m-%d %H:%M:%S``.
    """
    parts = (row.year, row.month, row.day, row.hour, row.minute, row.second)
    stamp_text = '{}-{}-{} {}:{}:{}'.format(*parts)
    return pd.to_datetime(stamp_text, format='%Y-%m-%d %H:%M:%S')

In [106]:
# Replace the six separate date/time columns with a single 'datetime' column
# placed first in the frame. Both steps modify df in place, so this cell is
# not re-runnable without reloading df.
datetime_parts = ['year', 'month', 'day', 'hour', 'minute', 'second']
df.insert(loc=0, column='datetime', value=df.apply(convert_to_dt, axis=1))
df.drop(labels=datetime_parts, axis=1, inplace=True)

In [107]:
# Relabel the six Rrs_<band> columns of df_rc as sat_rrs<band>_rc (in place).
rc_renames = {
    f'Rrs_{band}': f'sat_rrs{band}_rc'
    for band in (412, 443, 490, 510, 555, 670)
}
df_rc.rename(columns=rc_renames, inplace=True)

In [115]:
# Join the two tables side by side on their row index. Columns present in
# both frames (lat, lon) receive pandas' default '_x' (left) / '_y' (right)
# suffixes.
df_2 = pd.merge(df, df_rc, left_index=True, right_index=True)
# DataFrame.rename returns a new frame; the result has to be assigned,
# otherwise the 'lat_x'/'lon_x' labels are left unchanged.
df_2 = df_2.rename(columns={'lat_x': 'lat', 'lon_x': 'lon'})
# Drop the duplicate coordinates that came from df_rc.
df_2 = df_2.drop(['lat_y', 'lon_y'], axis=1)

In [119]:
# Persist the three frames to the ./pickleJar directory: the NOMAD/SeaWiFS
# table, the corrected table, and the merged result (in that order).
for frame, pkl_path in (
    (df, './pickleJar/df_0_NMD_SWF_v2_a2_2008200.pkl'),
    (df_rc, './pickleJar/df_0_R&F_Corr.pkl'),
    (df_2, './pickleJar/df_1_merged.pkl'),
):
    frame.to_pickle(pkl_path)

In [120]:
# Band labels used to build the satellite column names below.
swf_bands = [412, 443, 489, 510, 555, 670]
# The corrected-reflectance columns are renamed from Rrs_490 to sat_rrs490_rc
# earlier in this notebook, so their third band is labelled 490, not 489.
# Building the '_rc' names from swf_bands would yield 'sat_rrs489_rc', which
# does not exist after that rename.
rc_bands = [490 if b == 489 else b for b in swf_bands]

# Groups of column names to extract: time/location, ancillary, satellite, bio.
time_loc_cols_extract = ['datetime', 'lat', 'lon']
anc_cols_extract = ['oisst', 'wt', 'sal', 'etopo2', 'sola', 'solz']
sat_cols_extract = (['sat_rrs%d' % b for b in swf_bands]
                    + ['sat_lt%d' % b for b in swf_bands]
                    + ['sat_rhot%d' % b for b in swf_bands]
                    + ['sat_rrs%d_rc' % b for b in rc_bands])
bio_cols_extract = ['chl', 'chl_a', 'atot', 'ap', 'ad', 'adg', 'ag', 'kd']

In [None]:
# Fit a PCA (default settings, all components kept) on the rrs_cols columns
# of df_spnorm and keep the projected data.
pca_spnorm = PCA()
# .values replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0; both return the underlying NumPy array.
pca_spnorm_data = pca_spnorm.fit_transform(df_spnorm[rrs_cols].values)
# PlotPCARes / PlotCrossCorr are not defined in this notebook (presumably
# provided by `from utils import *` -- confirm).
PlotPCARes(pca_spnorm, threshold=0.99, alpha=0.9, num_pca_disp=pca_spnorm_data.shape[1])
PlotCrossCorr(pca_spnorm_data, df_spnorm)