This notebook loads data stored in text files, formats it, and stores it in pandas dataframes. Some additional products are computed and stored in the dataframes, including an uncertainty band and the chl anomaly. The dataframes are then pickled for plotting in a later notebook.

In [1]:
import pandas as pd
import pathlib
from datetime import datetime
import pickle
from scipy.stats import mode
import numpy as np
from IPython.core.display import HTML, display
from matplotlib import rcParams

In [2]:
%matplotlib inline

In [3]:
def convert_secs2dt(sec):
    """
    Converts seconds elapsed since 2000-01-01 00:00:00 to a python datetime object.

    The result is built by adding a timedelta to the 2000-01-01 epoch instead of
    going through datetime.utcfromtimestamp, which is deprecated since
    Python 3.12 and may fail for pre-1970 instants on some platforms.
    :param sec: seconds since 2000-01-01 00:00:00 (python or numpy scalar,
        int or float; negative values are allowed)
    :return: datetime object (naive, no tzinfo attached)
    """
    # Local import: only `datetime` is imported at file level.
    from datetime import timedelta

    zd00 = datetime(2000, 1, 1)
    # float() normalizes numpy scalars; timedelta rejects numpy integer types.
    return zd00 + timedelta(seconds=float(sec))


def get_doy(secs):
    """
    Converts seconds since 2000-01-01 00:00:00 to a fractional day of year.
    The value is zero-based: 0.0 is 00:00 on January 1st of the year the
    instant falls in.
    :param secs: seconds elapsed since 2000-01-01 00:00:00
    :return: fractional day of year
    """
    seconds_per_day = 86400
    epoch = datetime(2000, 1, 1)
    year_start = datetime(convert_secs2dt(secs).year, 1, 1)
    # Signed gap between the epoch and the start of the target year
    # (negative for years after 2000); adding it re-bases secs on that year.
    epoch_minus_year_start = (epoch - year_start).total_seconds()
    return (secs + epoch_minus_year_start) / seconds_per_day


def load_format_data(filepath, minimal=True, columns=None, quantity='chl_a'):
    """
    Loads a whitespace-delimited text file into a pandas dataframe,
    formats time entries, and creates a datetime index.
    :param filepath: string or pathlib object
    :param minimal: if True returns only the mean column; drops the rest.
    :param columns: list of column names assigned to the file's columns;
        defaults to ['time', 'nbins', 'mean', 'median', 'stdv']. A 'time'
        column (seconds, as expected by convert_secs2dt) is required.
    :param quantity: string, prefix used to rename the 'mean' column to
        '<quantity>_mean'
    :return: pandas datetime indexed dataframe
    """
    if columns is None:
        columns = ['time', 'nbins', 'mean', 'median', 'stdv']

    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(filepath, sep=r'\s+', names=columns)
    df['datetime'] = df.time.apply(convert_secs2dt)
    df = df.set_index('datetime')
    if minimal:
        df = df[['mean']]
    # rename returns a new frame, so the column slice taken above is never
    # modified in place.
    return df.rename(columns={'mean': '%s_mean' % quantity})


def regress_phyto_c(df, mean_col_label='bbp_443_giop_mean',
                   reg_slope=12128, reg_bias=0.59):
    """
    Adds a 'phyto_c' column computed as a linear function of an existing column:
    phyto_c = reg_slope * df[mean_col_label] + reg_bias.
    The dataframe is modified in place and also returned.
    :param df: pandas dataframe containing the mean_col_label column
    :param mean_col_label: str, label of the input column
    :param reg_slope: regression slope
    :param reg_bias: regression intercept
    :return: the same dataframe, with the 'phyto_c' column added
    """
    scaled = reg_slope * df[mean_col_label]
    df['phyto_c'] = scaled + reg_bias
    return df


def get_monthly_means(df, **kwargs):
    """
    Builds an annual cycle: restricts the data to a range of years, then
    averages all entries that share the same calendar month.
    :param df:
        datetime indexed pandas dataframe
    :param kwargs:
        year_start (optional): first year of the slice; defaults to the year
            of the first index entry
        year_end (optional): last year of the slice; defaults to the year
            of the last index entry
    :return:
        month-indexed pandas dataframe with monthly means
    """
    first_year, last_year = df.index.year[0], df.index.year[-1]
    start_label = str(kwargs.get('year_start', first_year))
    end_label = str(kwargs.get('year_end', last_year))
    selected = df.loc[start_label:end_label]
    return selected.groupby(lambda stamp: stamp.month).mean()


def get_anomaly(df, df_ann_cycle, name='chl_a_mean', anomaly_col='chl_anomaly'):
    """
    Computes anomaly by removing the monthly mean for a given month.
    The result is written in place into df[anomaly_col]. Rows whose month is
    absent from df_ann_cycle are left untouched (NaN if the column is new).
    :param df:
        datetime indexed pandas dataframe with [name] parameter column
    :param df_ann_cycle:
        month-indexed pandas dataframe containing monthly means of [name]
    :param name:
        str, label of quantity to get anomaly from
    :param anomaly_col:
        str, label of the column receiving the anomaly; defaults to
        'chl_anomaly' so existing callers are unaffected
    :return:
        None
    """
    row_months = df.index.month
    for month in df_ann_cycle.index:
        in_month = row_months == month
        monthly_mean = df_ann_cycle.loc[month, name]
        df.loc[in_month, anomaly_col] = df.loc[in_month, name] - monthly_mean


def test(datadir):
    """
    Sanity-checks load_format_data, convert_secs2dt and get_doy against the
    expected first and last records of 'ar2018.0m_AtlN55_chlor_a.txt'.
    :param datadir: pathlib object, directory containing the test file
    :return: None; raises AssertionError on the first failing check
    """
    testfile = datadir / 'ar2018.0m_AtlN55_chlor_a.txt'
    df_test = load_format_data(testfile, minimal=False)

    # First record. The frame is datetime-indexed, so positional access must
    # go through .iloc rather than an integer key.
    t0 = df_test.time.iloc[0]
    z0 = convert_secs2dt(t0)
    zstr0 = z0.strftime('%Y%j%H%M%S')
    # A real assertion (with the actual value as message) instead of a
    # swallowed one, so "all tests passed" is only printed when true.
    assert zstr0 + '000' == '2002197194740000', zstr0
    assert z0.year == 2002
    doy0 = get_doy(t0)
    # Tolerance instead of exact float equality.
    assert abs(doy0 - 196.82476851851851) < 1e-9, doy0

    # Last record, taken as a scalar rather than a 1-element array.
    tl = df_test.time.iloc[-1]
    zl = convert_secs2dt(tl)
    zstrl = zl.strftime('%Y%j%H%M%S')
    assert zstrl + '000' == '2016321013320000', zstrl
    assert zl.year == 2016
    doyl = get_doy(tl)
    assert abs(doyl - 320.06481481481484) < 1e-9, doyl
    print("all tests passed")

In [4]:
# Global graphics setup
# Widen the notebook container to 90% of the page.
display(HTML("<style>.container {width: 90%}</style>"))

# Hex color assigned to each series key.
plot_colors = dict(swf='#000000', aqua='#348ABD', viirs='#A60628', mei='#467821')

# Matplotlib defaults: 15pt text, longer ticks, ticks drawn inward and
# mirrored on the top axis.
rcParams.update({
    'axes.labelsize': 15,
    'xtick.labelsize': 15,
    'ytick.labelsize': 15,
    'font.size': 15,
    'ytick.major.size': 8,
    'ytick.minor.size': 3,
    'xtick.major.size': 8,
    'xtick.minor.size': 3,
    'xtick.top': True,
    'ytick.direction': 'in',
    'xtick.direction': 'in',
})

In [5]:
# Paths
home = pathlib.Path.home()
dataMain = home.joinpath('DEV-ALL', 'State_of_the_Climate', 'soc2018', 'TIMESERIES')

# "stats" directories, one per product family (chlorophyll and IOP)
# test_aquadir points at the same directory as aquadir.
test_aquadir = dataMain / 'ar2018.0m_ar2018.0m' / 'stats'
aquadir = dataMain / 'ar2018.0m_ar2018.0m' / 'stats'
aquaIOPdir = dataMain / 'ar2018.0IOPm_ar2018.0IOPm' / 'stats'
swfdir = dataMain / 'sr2018.0m_sr2018.0m' / 'stats'
swfIOPdir = dataMain / 'sr2018.0IOPm_sr2018.0IOPm' / 'stats'
viirsdir = dataMain / 'vr2018.0m_vr2018.0m' / 'stats'
viirsIOPdir = dataMain / 'vr2018.0IOPm_vr2018.0IOPm' / 'stats'

# chlor_a files
fp_chlor_a_aqua = aquadir.joinpath('ar2018.0m_eqsst_chlor_a.txt')
fp_chlor_a_swf = swfdir.joinpath('sr2018.0m_eqsst_chlor_a.txt')
fp_chlor_a_viirs = viirsdir.joinpath('vr2018.0m_eqsst_chlor_a.txt')

# bbp_443_giop files
fp_bbp_443_giop_aqua = aquaIOPdir.joinpath('ar2018.0IOPm_eqsst_bbp_443_giop.txt')
fp_bbp_443_giop_swf = swfIOPdir.joinpath('sr2018.0IOPm_eqsst_bbp_443_giop.txt')
fp_bbp_443_giop_viirs = viirsIOPdir.joinpath('vr2018.0IOPm_eqsst_bbp_443_giop.txt')
#test(test_aquadir)

In [7]:
# Load the data into pandas dataframes, with some datetime formatting
df_swf = load_format_data(fp_chlor_a_swf)
df_aqua = load_format_data(fp_chlor_a_aqua)
# df_viirs is required by the anomaly and pickling cells further down;
# without this load a fresh "Restart & Run All" fails with a NameError.
df_viirs = load_format_data(fp_chlor_a_viirs)

df_swf_bbp = load_format_data(fp_bbp_443_giop_swf, quantity='bbp_443_giop')
df_aqua_bbp = load_format_data(fp_bbp_443_giop_aqua, quantity='bbp_443_giop')

In [8]:
# Compute annual cycle and monthly anomalies
# The annual cycle is taken from the Aqua record over 2003-2011 and serves as
# the common reference for every sensor; get_anomaly adds a 'chl_anomaly'
# column to each dataframe in place.
aqua_ann_cycle = get_monthly_means(df_aqua.loc[:, ['chl_a_mean']],
                                   year_start=2003, year_end=2011)
for dfi in (df_swf, df_aqua, df_viirs):
    get_anomaly(dfi, aqua_ann_cycle)

In [20]:
# Mean of the whole Aqua chl_a_mean series
df_aqua['chl_a_mean'].mean()

0.14039351308900525

In [17]:
# Mean of the monthly means making up the Aqua annual cycle
aqua_ann_cycle.mean(axis=0)

chl_a_mean    0.141453
dtype: float64

In [9]:
# First rows of the Aqua bbp_443_giop dataframe
df_aqua_bbp.head(5)

Unnamed: 0_level_0,bbp_443_giop_mean
datetime,Unnamed: 1_level_1
2002-07-16 22:10:00,0.001823
2002-08-19 09:03:30,0.001815
2002-09-16 01:52:30,0.001882
2002-10-16 13:17:30,0.001882
2002-11-16 01:15:00,0.001899


In [10]:
# Add the 'phyto_c' column, using the default regression coefficients
df_aqua_bbp = regress_phyto_c(df_aqua_bbp, mean_col_label='bbp_443_giop_mean')

In [11]:
# First rows again, now including the 'phyto_c' column
df_aqua_bbp.head(5)

Unnamed: 0_level_0,bbp_443_giop_mean,phyto_c
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2002-07-16 22:10:00,0.001823,22.701891
2002-08-19 09:03:30,0.001815,22.599288
2002-09-16 01:52:30,0.001882,23.419383
2002-10-16 13:17:30,0.001882,23.416473
2002-11-16 01:15:00,0.001899,23.621193


In [12]:
# Create a new dataframe to compute and store an "uncertainty" band based on SWF/Aqua overlapping data.

df_swf_aqua = pd.concat([df_swf.loc[:, ['chl_anomaly']],
                         df_aqua.loc[:, ['chl_anomaly']]], axis=1)
df_swf_aqua.columns = ['swf_chl_anom', 'aqua_chl_anom']
# synchronize data: monthly bins, keeping only months where both series have a value
df_swf_aqua = df_swf_aqua.resample('M').mean().dropna()

# Center of the band (row-wise mean of both anomalies) and absolute
# difference between the two series.
df_swf_aqua = df_swf_aqua.assign(
    mean=df_swf_aqua.mean(axis=1),
    diff=(df_swf_aqua['swf_chl_anom'] - df_swf_aqua['aqua_chl_anom']).abs(),
)
# Band edges: center +/- the mean absolute difference over the overlap.
df_swf_aqua = df_swf_aqua.assign(
    diff_pos=df_swf_aqua['mean'] + df_swf_aqua['diff'].mean(),
    diff_neg=df_swf_aqua['mean'] - df_swf_aqua['diff'].mean(),
)
df_swf_aqua.head()

Unnamed: 0_level_0,swf_chl_anom,aqua_chl_anom,mean,diff,diff_pos,diff_neg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2002-07-31,0.00536,-0.011477,-0.003058,0.016837,0.002401,-0.008517
2002-08-31,0.012545,-0.000376,0.006085,0.012921,0.011544,0.000626
2002-09-30,0.009496,0.013813,0.011655,0.004317,0.017114,0.006196
2002-10-31,0.001834,0.005326,0.00358,0.003492,0.009039,-0.001879
2002-11-30,0.00429,0.004793,0.004541,0.000503,0.01,-0.000918


In [13]:
# Pickle the dataframes
# Assemble the payload BEFORE opening the file: 'wb' mode truncates an
# existing pickle immediately, so any failure while building the dict
# (e.g. a missing dataframe) must happen first.
datadict = {'swf': df_swf,
            'aqua': df_aqua,
            'viirs': df_viirs,
            'swf_aqua': df_swf_aqua,
            }
pkl_path = pathlib.Path('../PklJar/dict_df_chl.pkl')
# Make sure the output directory exists on a fresh checkout.
pkl_path.parent.mkdir(parents=True, exist_ok=True)
with pkl_path.open('wb') as f:
    pickle.dump(datadict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Last rows of the SWF/Aqua overlap dataframe
df_swf_aqua.tail(5)

Unnamed: 0_level_0,swf_chl_anom,aqua_chl_anom,mean,diff,diff_pos,diff_neg
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-08-31,0.016137,0.017594,0.016866,0.001457,0.022325,0.011407
2010-09-30,0.001314,-0.001443,-6.4e-05,0.002757,0.005395,-0.005523
2010-10-31,0.01135,0.005521,0.008435,0.005829,0.013894,0.002977
2010-11-30,0.006326,0.003743,0.005034,0.002583,0.010493,-0.000425
2010-12-31,0.006421,0.008752,0.007587,0.002331,0.013046,0.002128


In [15]:
# Last rows of the Aqua dataframe, including the chl_anomaly column
df_aqua.tail(5)

Unnamed: 0_level_0,chl_a_mean,chl_anomaly
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-16 13:13:20,0.144528,0.005903
2018-02-15 01:31:40,0.144109,-0.000643
2018-03-16 13:40:00,0.14247,-0.001128
2018-04-16 01:21:40,0.139971,-0.001643
2018-05-16 13:50:00,0.144199,0.003849
