This notebook loads data stored in text files, formats it, and stores it in pandas dataframes. Some additional products are computed and stored in the dataframes, including the `phyto_c` anomaly and an uncertainty band. The dataframes are then pickled for plotting in a later notebook.

In [1]:
import pathlib
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from IPython.core.display import HTML, display
from matplotlib import rcParams
from scipy.stats import mode

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [10]:
def convert_secs2dt(sec):
    """
    Converts seconds elapsed since 2000-01-01 00:00:00 to a python datetime object.
    :param sec: number of seconds since 2000-01-01 00:00:00 (int, float or
        numpy scalar; negative values give dates before 2000)
    :return: naive datetime object
    """
    # Plain datetime arithmetic replaces datetime.utcfromtimestamp(), which is
    # deprecated since Python 3.12 and can fail for negative timestamps on
    # some platforms.
    # float() is needed because timedelta rejects numpy integer scalars.
    return datetime(2000, 1, 1) + timedelta(seconds=float(sec))


def get_doy(secs):
    """
    Converts seconds since 2000-01-01 to a fractional day of year.
    The result is zero-based: midnight on January 1st maps to 0.0.
    :param secs: seconds since 2000-01-01 00:00:00
    :return: fractional day of year (days elapsed since the start of the year)
    """
    # The calendar year the time stamp falls in decides which January 1st
    # to measure from.
    year = convert_secs2dt(secs).year
    epoch_minus_year_start = datetime(2000, 1, 1) - datetime(year, 1, 1)
    seconds_into_year = secs + epoch_minus_year_start.total_seconds()
    return seconds_into_year / 86400


def load_format_data(filepath, minimal=True, columns=None, quantity='chl_a'):
    """
    Loads a whitespace-delimited time series text file into a pandas dataframe,
    formats time entries, and creates a datetime index.
    :param filepath: string or pathlib object
    :param minimal: if True returns only the '<quantity>_mean' column; drops the rest.
    :param columns: list of names assigned to the file's columns; must include
        'time' (and 'mean' when minimal is True). Defaults to
        ['time', 'nbins', 'mean', 'median', 'stdv'].
    :param quantity: str, prefix used to rename the 'mean' column to
        '<quantity>_mean'
    :return: pandas datetime indexed dataframe
    """
    if columns is None:
        columns = ['time', 'nbins', 'mean', 'median', 'stdv']

    # sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
    df = pd.read_csv(filepath, sep=r'\s+', names=columns)
    # Time stamps are seconds since 2000-01-01 (the epoch convert_secs2dt
    # uses); converting the whole column at once avoids a per-row apply.
    df['datetime'] = pd.to_datetime(df['time'], unit='s', origin='2000-01-01')
    df = df.set_index('datetime')
    if minimal:
        df = df[['mean']]
    # Non-inplace rename: renaming a column slice in place can trigger
    # SettingWithCopy warnings.
    return df.rename(columns={'mean': '%s_mean' % quantity})


def regress_phyto_c(df, mean_col_label='bbp_443_giop_mean',
                   reg_slope=12128, reg_bias=0.59):
    """
    Adds a 'phyto_c' column computed as a linear function of another column:
    phyto_c = reg_slope * df[mean_col_label] + reg_bias.
    The dataframe is modified in place and also returned.
    :param df: pandas dataframe containing the mean_col_label column
    :param mean_col_label: str, label of the input column
    :param reg_slope: regression slope
    :param reg_bias: regression intercept
    :return: the same dataframe, with the 'phyto_c' column added
    """
    # NOTE(review): the default coefficients presumably come from a published
    # regression -- TODO cite the source.
    predictor = df[mean_col_label]
    df['phyto_c'] = reg_slope * predictor + reg_bias
    return df


def get_monthly_means(df, **kwargs):
    """
    Computes the annual cycle as the mean of each calendar month over a
    range of years.
    :param df:
        datetime indexed pandas dataframe
    :param kwargs:
        year_start (optional): first year of the slice; defaults to the year
            of the first index entry
        year_end (optional): last year of the slice (inclusive); defaults to
            the year of the last index entry
    :return:
        month-indexed pandas dataframe with monthly means
    """
    first_year = kwargs.pop('year_start', df.index.year[0])
    last_year = kwargs.pop('year_end', df.index.year[-1])
    # Year strings select whole years through partial date-string slicing.
    selected_years = df.loc[str(first_year): str(last_year)]
    grouped_by_month = selected_years.groupby(lambda stamp: stamp.month)
    return grouped_by_month.aggregate('mean')


def get_anomaly(df, df_ann_cycle, name='chl_a_mean', anomaly_name='anomaly'):
    """
    Computes the anomaly by subtracting, from every row, the monthly mean
    of the calendar month that row falls in. The result is written to df
    in place.
    :param df:
        datetime indexed pandas dataframe with [name] parameter column
    :param df_ann_cycle:
        month-indexed pandas dataframe containing monthly means of [name]
    :param name:
        str, label of quantity to get anomaly from
    :param anomaly_name:
        str, label of the column the anomaly is stored in
    :return:
        None
    """
    row_months = df.index.month
    for month in df_ann_cycle.index:
        in_month = row_months == month
        monthly_mean = df_ann_cycle.loc[month, name]
        df.loc[in_month, anomaly_name] = df.loc[in_month, name] - monthly_mean


def test(datadir):
    """
    Sanity-checks the time conversion helpers against a reference file.
    Loads 'ar2018.0m_AtlN55_chlor_a.txt' from datadir and verifies that its
    first and last time stamps convert to the expected datetimes and
    fractional days of year.
    :param datadir: pathlib object, directory containing the reference file
    :return: None; prints a confirmation when every check passes
    :raises AssertionError: if any check fails
    """
    testfile = datadir / 'ar2018.0m_AtlN55_chlor_a.txt'
    df_test = load_format_data(testfile, minimal=False)

    # First record. .iloc is positional; a plain [0] on a datetime-indexed
    # series relies on a deprecated positional fallback.
    t0 = df_test.time.iloc[0]
    z0 = convert_secs2dt(t0)
    zstr0 = z0.strftime('%Y%j%H%M%S')
    # A mismatch now fails the test (and reports the value) instead of being
    # printed and followed by "all tests passed".
    assert zstr0 + '000' == '2002197194740000', zstr0
    assert z0.year == 2002
    # Compare floats with a tolerance rather than exact equality.
    assert abs(get_doy(t0) - 196.82476851851851) < 1e-9

    # Last record, taken as a scalar rather than a 1-element array.
    tl = df_test.time.iloc[-1]
    zl = convert_secs2dt(tl)
    zstrl = zl.strftime('%Y%j%H%M%S')
    assert zstrl + '000' == '2016321013320000', zstrl
    assert zl.year == 2016
    assert abs(get_doy(tl) - 320.06481481481484) < 1e-9
    print("all tests passed")

In [4]:
# Global graphics setup
# Widen the notebook container to 90% of the browser window.
display(HTML("<style>.container {width: 90%}</style>"))

# One fixed color per data source.
plot_colors = {
    'swf': '#000000', 'aqua': '#348ABD',
    'viirs': '#A60628', 'mei': '#467821',
}

# Matplotlib defaults: 15pt text, inward ticks, ticks on the top axis too.
rcParams.update({
    'axes.labelsize': 15,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'font.size': 15,
    'ytick.major.size': 8,
    'ytick.minor.size': 3,
    'xtick.major.size': 8,
    'xtick.minor.size': 3,
    'xtick.top': True,
    'ytick.direction': 'in',
    'xtick.direction': 'in',
})

In [5]:
# Paths
# Everything lives under one time series directory in the user's home.
home = pathlib.Path.home()
dataMain = home.joinpath('DEV-ALL/State_of_the_Climate/soc2018/TIMESERIES/')

# One statistics directory per sensor.
aquaIOPdir = dataMain.joinpath('ar2018.0IOPm_ar2018.0IOPm/stats')
swfIOPdir = dataMain.joinpath('sr2018.0IOPm_sr2018.0IOPm/stats')
viirsIOPdir = dataMain.joinpath('vr2018.0IOPm_vr2018.0IOPm/stats')

# bbp_443_giop time series file for each sensor.
fp_bbp_443_giop_aqua = aquaIOPdir.joinpath('ar2018.0IOPm_eqsst_bbp_443_giop.txt')
fp_bbp_443_giop_swf = swfIOPdir.joinpath('sr2018.0IOPm_eqsst_bbp_443_giop.txt')
fp_bbp_443_giop_viirs = viirsIOPdir.joinpath('vr2018.0IOPm_eqsst_bbp_443_giop.txt')
#test(test_aquadir)

In [14]:
# Load the bbp_443_giop mean time series of each sensor (SWF, Aqua, VIIRS).
df_swf_bbp, df_aqua_bbp, df_viirs_bbp = (
    load_format_data(sensor_file, quantity='bbp_443_giop')
    for sensor_file in (fp_bbp_443_giop_swf,
                        fp_bbp_443_giop_aqua,
                        fp_bbp_443_giop_viirs)
)

In [15]:
# Add the phyto_c column to the SWF frame, using the default regression coefficients.
df_swf_bbp = regress_phyto_c(df_swf_bbp)

In [16]:
# Add the phyto_c column to the Aqua and VIIRS frames as well.
df_aqua_bbp, df_viirs_bbp = (regress_phyto_c(df_aqua_bbp),
                             regress_phyto_c(df_viirs_bbp))

In [17]:
# Preview the first rows to confirm the phyto_c column was added.
df_aqua_bbp.head()

Unnamed: 0_level_0,bbp_443_giop_mean,phyto_c
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-07-16 22:10:00,0.001823,22.701891
2002-08-19 09:03:30,0.001815,22.599288
2002-09-16 01:52:30,0.001882,23.419383
2002-10-16 13:17:30,0.001882,23.416473
2002-11-16 01:15:00,0.001899,23.621193


In [18]:
# Compute annual cycle and monthly anomalies.
# The 2003-2011 Aqua monthly means serve as the reference cycle for all
# three sensors.
aqua_ann_cycle = get_monthly_means(
    df_aqua_bbp[['phyto_c']], year_start=2003, year_end=2011,
)
for dfi in (df_swf_bbp, df_aqua_bbp, df_viirs_bbp):
    get_anomaly(
        dfi, aqua_ann_cycle,
        name='phyto_c', anomaly_name='phyto_C_anomaly',
    )

In [40]:
# Display the monthly means (annual cycle) used as the anomaly baseline.
aqua_ann_cycle

Unnamed: 0,phyto_c
1,22.32502
2,21.942961
3,21.928124
4,22.338698
5,22.505619
6,22.223684
7,21.685982
8,21.919877
9,22.446192
10,23.328343


In [19]:
# Preview again: the frame now also carries the phyto_C_anomaly column.
df_aqua_bbp.head()

Unnamed: 0_level_0,bbp_443_giop_mean,phyto_c,phyto_C_anomaly
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2002-07-16 22:10:00,0.001823,22.701891,1.015909
2002-08-19 09:03:30,0.001815,22.599288,0.679411
2002-09-16 01:52:30,0.001882,23.419383,0.973191
2002-10-16 13:17:30,0.001882,23.416473,0.08813
2002-11-16 01:15:00,0.001899,23.621193,-0.191825


In [27]:
# Repeated preview of df_aqua_bbp (identical call to the previous preview cell).
df_aqua_bbp.head()

Unnamed: 0_level_0,bbp_443_giop_mean,phyto_c,phyto_C_anomaly
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2002-07-16 22:10:00,0.001823,22.701891,1.015909
2002-08-19 09:03:30,0.001815,22.599288,0.679411
2002-09-16 01:52:30,0.001882,23.419383,0.973191
2002-10-16 13:17:30,0.001882,23.416473,0.08813
2002-11-16 01:15:00,0.001899,23.621193,-0.191825


In [37]:
# Create a new dataframe to compute and store an "uncertainty" band based on SWF/Aqua overlapping data.
# Each sensor's anomaly series becomes one column, aligned on the datetime index.
anomaly_name = 'phyto_C_anomaly'
df_swf_aqua = pd.concat(
    [
        df_swf_bbp[anomaly_name].rename('swf_%s' % anomaly_name),
        df_aqua_bbp[anomaly_name].rename('aqua_%s' % anomaly_name),
    ],
    axis=1,
)

In [None]:
# NOTE(review): this cell was never executed and, as written, passed a bare
# DataFrame to pd.concat (parentheses without a comma do not make a tuple),
# which raises TypeError and breaks Restart & Run All. Presumably the SWF and
# Aqua frames were meant to be stacked; the result is not used by any later
# cell shown here -- confirm the intent or delete this cell.
df_swf_aqua_bbp = pd.concat((df_swf_bbp, df_aqua_bbp))

In [38]:
# synchronize data: average each sensor's anomalies onto a shared month-end
# index so SWF and Aqua values for the same month land on the same row.
# NOTE(review): the 'M' alias is deprecated in recent pandas releases in
# favour of 'ME' -- switch once the minimum pandas version allows it.
df_swf_aqua = df_swf_aqua.resample('M').mean()

In [39]:
# Build the "uncertainty" band from the months where both sensors have data:
# the two-sensor mean, plus/minus the average absolute SWF-Aqua difference.
# The sensor columns are named explicitly so that re-running this cell does
# not fold the derived columns (mean, diff, ...) back into the row mean.
sensor_cols = ['swf_phyto_C_anomaly', 'aqua_phyto_C_anomaly']
df_swf_aqua['mean'] = df_swf_aqua[sensor_cols].mean(axis=1)
# Keep only the overlap period (rows where both sensors reported a value).
df_swf_aqua.dropna(subset=sensor_cols, inplace=True)
df_swf_aqua['diff'] = (df_swf_aqua[sensor_cols[0]] - df_swf_aqua[sensor_cols[1]]).abs()
# A single half-width (the mean absolute difference) is used for the whole band.
mean_abs_diff = df_swf_aqua['diff'].mean()
df_swf_aqua['diff_pos'] = df_swf_aqua['mean'] + mean_abs_diff
df_swf_aqua['diff_neg'] = df_swf_aqua['mean'] - mean_abs_diff
df_swf_aqua.head()

Unnamed: 0_level_0,swf_phyto_C_anomaly,aqua_phyto_C_anomaly,mean,diff,diff_pos,diff_neg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-07-31,-1.308665,1.015909,-0.146378,2.324574,1.054594,-1.347351
2002-08-31,-0.667404,0.679411,0.006003,1.346814,1.206976,-1.194969
2002-09-30,-0.728367,0.973191,0.122412,1.701558,1.323385,-1.078561
2002-10-31,-1.626527,0.08813,-0.769198,1.714657,0.431774,-1.970171
2002-11-30,-2.258678,-0.191825,-1.225251,2.066854,-0.024279,-2.426224


In [23]:
# Pickle the dataframes
# All frames go into one dictionary keyed by sensor so the plotting notebook
# can load them with a single read.
datadict = {
    'swf': df_swf_bbp,
    'aqua': df_aqua_bbp,
    'viirs': df_viirs_bbp,
    'swf_aqua': df_swf_aqua,
}
with open('../PklJar/dict_df_phytoC.pkl', 'wb') as f:
    pickle.dump(datadict, f, protocol=pickle.HIGHEST_PROTOCOL)