As a follow-up to a previous [post](), I use here the historical dataset that was actually used to develop NASA OBPG's chlorophyll algorithms.

In [7]:
import pandas as pd
import matplotlib.pyplot as pl
from sklearn.linear_model import LinearRegression
import re
import os
import numpy as np
import seaborn as sb
from mpl_toolkits.basemap import Basemap
import pymc3 as pm
import warnings
warnings.filterwarnings('ignore')

In [2]:
% matplotlib inline

In [11]:
def ParseTextFile(textFileHandle, topickle=False, convert2DateTime=False, **kwargs):
    """
    Parse a header-prefixed, comma-separated text file into a DataFrame.

    The header must contain a ``/fields=<name>,<name>,...`` line naming the
    columns and a ``/end_header`` line. Every non-blank line after
    ``/end_header`` is read as one comma-separated data row. All values are
    kept as (whitespace-stripped) strings; no numeric conversion is done here.

    Parameters
    ----------
    * textFileHandle: open text file, or any iterable yielding lines
    * topickle: pickle resulting DataFrame if True
    * convert2DateTime: join date/time columns and convert entries to datetime objects
      (requires 'year', 'month', 'day', 'hour', 'minute', 'second' columns,
      which are replaced by a single 'Datetime' column)
    * kwargs:
        pkl_fname: pickle file name to save DataFrame by, if topickle=True
                   (defaults to 'dfNomad2.pkl')

    Returns
    -------
    pandas.DataFrame with one column per field, in header order.

    Raises
    ------
    ValueError if no '/fields=' line is found before the data, or if a data
    row has fewer values than there are fields.
    """
    # Pre-compute some regex
    columns = re.compile(r'^/fields=(.+)')  # to get field/column names
    endHeader = re.compile(r'^/end_header')  # to know when to start storing data
    # Set some milestones
    fieldList = None
    dataDict = None
    getData = False
    # loop through the text data
    for lineNo, line in enumerate(textFileHandle, start=1):
        if fieldList is None:
            fieldMatch = columns.match(line)
            if fieldMatch:
                fieldList = [name.strip() for name in fieldMatch.group(1).split(',')]
                # One independent list per column, so a header-only file
                # still yields a valid (empty) DataFrame.
                dataDict = {field: [] for field in fieldList}
                continue  # nothing left to do with this line, keep looping
        if not getData:
            if endHeader.match(line):
                # end of header reached, start acquiring data
                getData = True
            continue
        row = line.strip()
        if not row:
            # Blank lines would otherwise add '' to the first column only.
            continue
        if fieldList is None:
            raise ValueError("line %d: data found before any '/fields=' header line"
                             % lineNo)
        # Strip each value; in particular this removes the trailing newline
        # that would otherwise stick to the last column.
        dataList = [datum.strip() for datum in row.split(',')]
        if len(dataList) < len(fieldList):
            raise ValueError("line %d: expected %d values, found %d"
                             % (lineNo, len(fieldList), len(dataList)))
        # Values beyond the declared fields are ignored.
        for field, datum in zip(fieldList, dataList):
            dataDict[field].append(datum)
    if fieldList is None:
        raise ValueError("no '/fields=' header line found")
    df = pd.DataFrame(dataDict, columns=fieldList)
    if convert2DateTime:
        datetimelabels = ['year', 'month', 'day', 'hour', 'minute', 'second']
        # With a DataFrame argument, to_datetime assembles the timestamps from
        # the named component columns; no format string is involved.
        df['Datetime'] = pd.to_datetime(df[datetimelabels])
        df = df.drop(columns=datetimelabels)
    if topickle:
        fname = kwargs.get('pkl_fname', 'dfNomad2.pkl')
        df.to_pickle(fname)
    return df

def FindNaNs(df):
    """
    Print, for each column of `df`, how many entries are the placeholder
    strings 'NaN' and '-999'.

    Counting is done on the string form of each value, so columns of any
    dtype (e.g. datetime64) are handled uniformly. Consequently a real float
    NaN (string form 'nan') is NOT counted as 'NaN', while an integer -999
    is counted as '-999'.

    Parameters
    ----------
    * df: pandas.DataFrame to inspect

    Returns
    -------
    None; one line per column is printed as "<col>: <n> NaNs & <m> -999s".
    """
    for col in df.columns:
        asText = df[col].astype(str)
        sn = (asText == 'NaN').sum()
        # Element-wise comparison: counts every occurrence, rather than a
        # single membership test that could only ever yield 0 or 1.
        s9 = (asText == '-999').sum()
        print("%s: %d NaNs & %d -999s" % (col, sn, s9))

In [4]:
# Parse the raw text file into a DataFrame, merging the date/time columns into
# a single 'Datetime' column and pickling the result as 'JeremyOCx_data'.
with open('/accounts/ekarakoy/DATA/ocprep_v4_iop.txt') as fdata:
    df = ParseTextFile(
        textFileHandle=fdata,
        convert2DateTime=True,
        topickle=True,
        pkl_fname='JeremyOCx_data',
    )

In [None]:
# Summarise the parsed frame; the output is omitted here because it lists
# a lot of features that are unnecessary for this exercise.
df.info()

In [6]:
# Columns kept alongside the rrs columns in every subset built below.
basicCols = ['cruise', 'lat', 'lon', 'type', 'chl', 'Datetime']
# Add every column whose name contains 'rrs'.
IwantCols = basicCols + [col for col in df.columns if 'rrs' in col]
dfRrs = df[IwantCols]
# The six rrs columns retained for the rest of the analysis.
swflbls = ['rrs411','rrs443','rrs489','rrs510','rrs555','rrs670']
swfCols = basicCols + swflbls
# Explicit copy: dfSwf is cleaned and modified in a later cell, and must not
# be treated by pandas as a view/slice of dfRrs (SettingWithCopyWarning).
dfSwf = dfRrs[swfCols].copy()

In [9]:
# Directory receiving the pickled frames.
# NOTE(review): absolute, user-specific path; this cell only runs on a machine
# where that directory exists.
savDir = '/accounts/ekarakoy/DEV-ALL/BLOGS/DataScienceCorner/posts/bayesianChl_stuff/'
# Persist the full parsed frame and the rrs subset.
df.to_pickle(os.path.join(savDir, 'dfOcPrepHistoric.pkl'))
dfRrs.to_pickle(os.path.join(savDir, 'dfOcPrepRrs.pkl'))
# Remove both names from the namespace (presumably to free memory); only dfSwf
# is used from here on. Re-running earlier cells is required to get them back.
del df, dfRrs

In [10]:
# Inspect dtypes: apart from Datetime, every column is still object type at this point.
dfSwf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2483 entries, 0 to 2482
Data columns (total 12 columns):
cruise      2483 non-null object
lat         2483 non-null object
lon         2483 non-null object
type        2483 non-null object
chl         2483 non-null object
Datetime    2483 non-null datetime64[ns]
rrs411      2483 non-null object
rrs443      2483 non-null object
rrs489      2483 non-null object
rrs510      2483 non-null object
rrs555      2483 non-null object
rrs670      2483 non-null object
dtypes: datetime64[ns](1), object(11)
memory usage: 232.9+ KB


In [12]:
# Count the 'NaN' and '-999' placeholder strings in each column before cleaning.
FindNaNs(dfSwf)

cruise: 0 NaNs & 0 -999s
lat: 0 NaNs & 0 -999s
lon: 0 NaNs & 0 -999s
type: 0 NaNs & 0 -999s
chl: 535 NaNs & 0 -999s
Datetime: 0 NaNs & 0 -999s
rrs411: 22 NaNs & 0 -999s
rrs443: 0 NaNs & 0 -999s
rrs489: 0 NaNs & 0 -999s
rrs510: 0 NaNs & 0 -999s
rrs555: 0 NaNs & 0 -999s
rrs670: 0 NaNs & 0 -999s


In [14]:
numCols = ['chl','lat','lon','rrs411','rrs443','rrs489','rrs510','rrs555','rrs670']
# Turn the 'NaN' placeholder strings into real missing values and drop every
# row containing one. `np.nan` is used because the `np.NaN` alias no longer
# exists in NumPy 2.x. Reassigning (instead of inplace=True) keeps this cell
# idempotent and avoids mutating a frame derived from another one.
dfSwf = dfSwf.replace(to_replace='NaN', value=np.nan).dropna()
# Convert the string-valued measurement columns to numeric dtypes.
dfSwf = dfSwf.assign(**{col: pd.to_numeric(dfSwf[col]) for col in numCols})
dfSwf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1931 entries, 0 to 2482
Data columns (total 12 columns):
cruise      1931 non-null object
lat         1931 non-null float64
lon         1931 non-null float64
type        1931 non-null object
chl         1931 non-null float64
Datetime    1931 non-null datetime64[ns]
rrs411      1931 non-null float64
rrs443      1931 non-null float64
rrs489      1931 non-null float64
rrs510      1931 non-null float64
rrs555      1931 non-null float64
rrs670      1931 non-null float64
dtypes: datetime64[ns](1), float64(9), object(2)
memory usage: 196.1+ KB
