In [178]:
import pandas as pd
import numpy as np
from matplotlib import rcParams
import seaborn as sb
import os
from tqdm import tqdm_notebook

In [2]:
%matplotlib inline

In [4]:
# Restore seaborn/matplotlib rc defaults first, then apply the notebook-wide
# overrides (tick/font sizes, default figure size, scientific-notation limits).
sb.reset_defaults()
rcParams.update({
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'font.size': 14,
    'figure.figsize': (10, 5),
    'axes.formatter.limits': (-2, 3),
})

In [5]:
# Switch seaborn to its 'darkgrid' axes style for the figures drawn in this notebook.
sb.set_style('darkgrid')

In [307]:
def Cleanup(dataframe):
    """Index a raw buoy frame by date and blank out missing-value sentinels.

    The frame is modified in place: 'date' becomes the index, the 'MWD',
    'DEWP' and 'VIS' columns are dropped, and every sentinel value listed
    below is replaced by NaN.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain 'date', 'MWD', 'DEWP', 'VIS', 'WD', 'WSPD', 'GST',
        'WVHT', 'DPD', 'APD', 'ATMP' and 'WTMP'. 'BAR' and 'TIDE' are
        cleaned only when present.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be used either
        for its side effect or as ``frame = Cleanup(frame)``.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    dataframe.set_index('date', inplace=True)
    dataframe.drop(['MWD', 'DEWP', 'VIS'], axis=1, inplace=True)

    # (column, value that stands for "no measurement") for required columns.
    required_sentinels = (
        ('WD', 999), ('WSPD', 99), ('GST', 99), ('WVHT', 99),
        ('DPD', 99), ('APD', 99), ('ATMP', 999), ('WTMP', 999),
    )
    for column, sentinel in required_sentinels:
        # Series.mask upcasts integer columns to float when it inserts NaN,
        # so this also works for columns that were parsed as integers.
        dataframe[column] = dataframe[column].mask(dataframe[column] == sentinel)

    if 'BAR' in dataframe.columns:
        dataframe['BAR'] = dataframe['BAR'].mask(dataframe['BAR'] == 9999)

    # Explicit membership test instead of a bare try/except, so that real
    # errors are no longer swallowed together with "column not present".
    if 'TIDE' in dataframe.columns:
        dataframe['TIDE'] = dataframe['TIDE'].mask(dataframe['TIDE'] == 99)
        print('corrected TIDE')

    return dataframe

def GetWindPersistence(dataframe):
    """Add speed-weighted wind components and drop rows lacking wind data.

    Two columns are added in place:

    * 'SoutherlyPersistence' = cos(WD) * WSPD
    * 'EasterlyPersistence'  = sin(WD) * WSPD

    with WD interpreted as degrees. Rows whose 'WD' or 'WSPD' is NaN are then
    removed in place.

    NOTE(review): the sign convention implied by the column names depends on
    how WD is defined in the source files -- confirm against the data format.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain numeric 'WD' and 'WSPD' columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, so the call can be used either
        for its side effect or as ``frame = GetWindPersistence(frame)``.
    """
    direction_rad = np.deg2rad(dataframe.WD)
    dataframe['SoutherlyPersistence'] = np.cos(direction_rad) * dataframe.WSPD
    dataframe['EasterlyPersistence'] = np.sin(direction_rad) * dataframe.WSPD
    dataframe.dropna(subset=['WD', 'WSPD'], inplace=True)
    return dataframe

def AggregateDF(dataframe):
    """Return month-end mean and standard deviation of all columns but WD/WSPD.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a datetime-like index (needed by ``resample``) that
        contains 'WD' and 'WSPD' columns. The input is not modified.

    Returns
    -------
    (dfMonthMean, dfMonthStd) : tuple of pandas.DataFrame
        Monthly mean and monthly standard deviation, labelled by month end.
    """
    # A MonthEnd offset object bins exactly like the 'M' string alias, but is
    # unaffected by that alias being deprecated/renamed in recent pandas.
    monthly = dataframe.drop(['WD', 'WSPD'], axis=1).resample(pd.offsets.MonthEnd())
    dfMonthMean = monthly.mean()
    dfMonthStd = monthly.std()
    return dfMonthMean, dfMonthStd

def RunPrep(frame):
    """Run Cleanup and then GetWindPersistence on ``frame`` and return it.

    Both helpers modify the frame in place, so they are called for their side
    effects and their return values are ignored. Rebinding ``frame`` to a
    helper's return value would replace it with None whenever the helper does
    not return the frame, and the second step would then fail.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw frame accepted by Cleanup.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after both steps.
    """
    Cleanup(frame)
    GetWindPersistence(frame)
    return frame

In [308]:
# Root directory of the project data. The default is the original location;
# set the BEACHING_DATA_DIR environment variable to run the notebook elsewhere.
dataMain = os.environ.get('BEACHING_DATA_DIR',
                          '/home/madhatter106/DATA/Beaching_Project/CapeCodData/')
# Directory the pickled DataFrames are written to.
pklDir = os.path.join(dataMain, 'PklJar')
# Sub-directory with the yearly text files; the trailing slash matters because
# the file path is assembled by plain string formatting.
windDir = 'wind_44013/'
# File-name prefix: files are read as <fprfx><year>.txt
fprfx = '44013_'
# Year labels '1999' .. '2014' as a list of strings.
years = [str(year) for year in range(1999, 2015)]

In [323]:
# Read one text file per year, clean it, add the wind components and stack all
# years into a single frame `df` (left as None when `years` is empty).
df = None
dfUnits = None  # not used in this cell; kept so the name stays defined
yearly_frames = []
for yr in years:
    fp = '%s%s%s.txt' % (os.path.join(dataMain, windDir), fprfx, yr)
    # Columns 0-3 are combined into a single 'date' column, which read_csv
    # places first in the resulting frame.
    dfyear = pd.read_csv(fp, delim_whitespace=True,
                         parse_dates={'date': [0, 1, 2, 3]}, infer_datetime_format=True)
    if int(yr) >= 2007:
        # Files from 2007 on get extra treatment: the first row is discarded
        # (presumably a units line -- TODO confirm against the files), dates
        # are re-parsed, values are cast to float and WDIR is renamed to WD.
        # .copy() gives an independent frame so the assignments below do not
        # act on a slice of the frame returned by read_csv.
        dfyear = dfyear.iloc[1:].copy()
        dfyear['date'] = pd.to_datetime(dfyear.date, infer_datetime_format=True)
        # Replace the columns outright so the float dtype actually sticks;
        # assigning through .iloc can keep the existing (object) dtype.
        value_columns = dfyear.columns[1:]
        dfyear[value_columns] = dfyear[value_columns].astype('f8')
        dfyear.rename(columns={'WDIR': 'WD'}, inplace=True)
    Cleanup(dfyear)
    GetWindPersistence(dfyear)
    yearly_frames.append(dfyear)

# Concatenate once instead of growing `df` inside the loop, which re-copies all
# previously loaded years on every iteration.
if yearly_frames:
    df = pd.concat(yearly_frames)

corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE
corrected TIDE


50.0    54679
0.0     24391
Name: mm, dtype: int64

In [325]:
# Inspect index range, column dtypes and non-null counts of the combined frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 128793 entries, 1999-01-01 00:00:00 to 2014-12-31 22:00:00
Data columns (total 14 columns):
APD                     127182 non-null float64
ATMP                    128675 non-null float64
BAR                     64653 non-null float64
DPD                     125702 non-null float64
EasterlyPersistence     128793 non-null float64
GST                     128621 non-null float64
PRES                    64081 non-null float64
SoutherlyPersistence    128793 non-null float64
TIDE                    0 non-null float64
WD                      128793 non-null float64
WSPD                    128793 non-null float64
WTMP                    128442 non-null float64
WVHT                    127182 non-null float64
mm                      79070 non-null float64
dtypes: float64(14)
memory usage: 14.7 MB


In [327]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0_level_0,APD,ATMP,BAR,DPD,EasterlyPersistence,GST,PRES,SoutherlyPersistence,TIDE,WD,WSPD,WTMP,WVHT,mm
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1999-01-01 00:00:00,5.25,-4.3,1017.4,11.11,-3.542719,7.2,,-4.075432,,221.0,5.4,5.6,0.33,
1999-01-01 01:00:00,5.51,-4.0,1016.5,11.11,-3.447704,7.3,,-4.41286,,218.0,5.6,5.7,0.31,
1999-01-01 02:00:00,6.53,-3.9,1015.8,12.5,-4.100237,7.0,,-3.959553,,226.0,5.7,5.7,0.32,
1999-01-01 03:00:00,6.17,-3.9,1015.3,11.11,-4.235926,7.3,,-3.814044,,228.0,5.7,5.7,0.31,
1999-01-01 04:00:00,5.02,-3.8,1015.0,11.11,-4.948156,7.7,,-3.21337,,237.0,5.9,5.6,0.39,


In [328]:
# Discard columns that are not used further. errors='ignore' and rebinding
# (instead of inplace=True) make the cell safe to re-run: a second execution no
# longer raises KeyError for columns that are already gone.
# NOTE(review): df.info() reports ~64k non-null values in PRES and a similar
# count in BAR -- confirm PRES is not the later-years continuation of BAR
# before throwing it away.
df = df.drop(['mm', 'TIDE', 'PRES'], axis=1, errors='ignore')

In [329]:
# Persist the combined frame. Create the target directory first so the write
# does not fail when PklJar does not exist yet.
if not os.path.isdir(pklDir):
    os.makedirs(pklDir)
df.to_pickle(os.path.join(pklDir, 'dfAgg.pkl'))

In [337]:
# Columns carried into the monthly aggregation and per-month averaging below.
dfVars = ['EasterlyPersistence', 'SoutherlyPersistence', 'WTMP', 'WVHT']

In [334]:
# Month-end mean of the selected variables. A MonthEnd offset object bins
# exactly like the 'M' string alias but avoids that alias, which is deprecated
# in recent pandas releases.
dfMonthlyMean = df[dfVars].resample(pd.offsets.MonthEnd()).mean()

In [335]:
# Month-end standard deviation of the selected variables. A MonthEnd offset
# object bins exactly like the 'M' string alias but avoids that alias, which is
# deprecated in recent pandas releases.
dfMonthlyStd = df[dfVars].resample(pd.offsets.MonthEnd()).std()

In [336]:
# Preview the first monthly means.
dfMonthlyMean.head()

Unnamed: 0_level_0,EasterlyPersistence,SoutherlyPersistence,WTMP,WVHT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999-01-31,-2.412786,-0.296956,3.994452,1.100014
1999-02-28,-2.006042,2.114959,2.931061,1.263561
1999-03-31,-2.582759,1.144871,2.608287,1.33243
1999-04-30,-1.331365,1.005358,5.605139,0.669292
1999-05-31,0.807158,0.033573,10.294332,0.60897


In [372]:
# Mean of each selected variable per calendar month, pooled over every row of
# df that falls in that month (all years together). Indexed by month number.
indexbymonth = df[dfVars].groupby(df.index.month).mean()
indexbymonth

Unnamed: 0_level_0,EasterlyPersistence,SoutherlyPersistence,WTMP,WVHT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-4.06444,1.355612,4.897222,1.096931
2,-3.350717,1.404965,3.337487,1.15016
3,-1.50537,1.269199,3.451145,1.212966
4,-0.208256,0.21897,5.834349,1.007061
5,0.222151,-0.03746,9.930629,0.795442
6,-0.008352,-0.526092,14.783339,0.614636
7,-0.286936,-1.184806,18.320355,0.50551
8,-0.17738,-0.868926,18.927826,0.543007
9,-0.16108,-0.28821,16.978047,0.737372
10,-1.293812,0.642169,13.438311,0.998127


In [361]:
# Monthly anomaly: each monthly mean minus the pooled mean of its calendar
# month. The per-calendar-month table is expanded to one row per entry of
# dfMonthlyMean (calendar months absent from indexbymonth expand to NaN rows),
# then subtracted element-wise.
monthly_baseline = indexbymonth.reindex(index=dfMonthlyMean.index.month,
                                        columns=dfMonthlyMean.columns)
dfMonthlyIndex = (dfMonthlyMean - monthly_baseline.values).astype('f8')

In [362]:
# Preview the first monthly anomaly rows.
dfMonthlyIndex.head()

Unnamed: 0_level_0,EasterlyPersistence,SoutherlyPersistence,WTMP,WVHT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1999-01-31,1.651654,-1.652568,-0.90277,0.003083
1999-02-28,1.344675,0.709994,-0.406426,0.113401
1999-03-31,-1.077389,-0.124328,-0.842859,0.119464
1999-04-30,-1.123109,0.786388,-0.22921,-0.337769
1999-05-31,0.585007,0.071033,0.363703,-0.186472


In [363]:
# Check the index range and non-null counts of the anomaly frame.
dfMonthlyIndex.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 192 entries, 1999-01-31 to 2014-12-31
Freq: M
Data columns (total 4 columns):
EasterlyPersistence     183 non-null float64
SoutherlyPersistence    183 non-null float64
WTMP                    183 non-null float64
WVHT                    183 non-null float64
dtypes: float64(4)
memory usage: 12.5 KB


In [373]:
# Row-by-row check that every anomaly equals "monthly mean minus the pooled
# mean of the same calendar month". The baseline row is looked up by the
# timestamp's month rather than by position (i % 12), so the check stays valid
# if the series does not start in January or a calendar month is missing from
# indexbymonth (reindex then yields a NaN row, matching an unfilled anomaly).
for stamp, anomaly_row in dfMonthlyIndex.iterrows():
    baseline_row = indexbymonth.reindex([stamp.month]).iloc[0]
    expected_row = dfMonthlyMean.loc[stamp] - baseline_row
    np.testing.assert_array_equal(anomaly_row.values, expected_row.values)
print("test passed")

test passed


In [374]:
# Persist the monthly anomalies. Create the target directory first so the
# write does not fail when PklJar does not exist yet.
if not os.path.isdir(pklDir):
    os.makedirs(pklDir)
dfMonthlyIndex.to_pickle(os.path.join(pklDir, 'dfMonthlyIndex.pkl'))