Tidal Analysis:
Read tidal files and look at the monthly aggregated seasonal distribution of extreme low and extreme high tides.

Also, look for extreme tides that occur on same day as stranding events.

In [1]:
import pandas as pd
import os
import glob
import matplotlib.pyplot as pl
import seaborn as sb
import matplotlib.colors as mcolors
from calendar import month_name
import numpy as np
import warnings
from IPython.display import Image
import math as m
import pandas as pd
#import matplotlib as plt
%matplotlib notebook 
import numpy as np
import scipy as sp
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
import datetime as dt
import statsmodels.api as sm
idx=pd.IndexSlice

In [2]:
# Get list of tide files, read them and produce a dataframe of tide data.
# glob returns paths in arbitrary order, so the file list is sorted and the combined
# frame is sorted by timestamp; later cells slice this DatetimeIndex by date range,
# which needs a monotonic index.
# sep=r'\s+' is the equivalent, non-deprecated spelling of delim_whitespace=True.
# NOTE(review): the nested-list form of parse_dates is deprecated in recent pandas - confirm
# the pandas version in use before upgrading.
filelist = sorted(glob.glob('./data/*.txt'))
Tide_data = pd.concat([
    pd.read_csv(file, sep=r'\s+', header=12, usecols=[0,2,3,4], parse_dates=[[0,1]], index_col=0)
    for file in filelist
]).sort_index()

In [None]:
#pickle_file = './pickles/Tide_Data.pickle'
#Tide_data = pd.read_pickle(pickle_file)

In [3]:
# Summarize the combined tide frame: index range, columns, non-null counts and dtypes.
Tide_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 22583 entries, 1999-01-01 03:45:00 to 2014-12-31 19:26:00
Data columns (total 2 columns):
Pred        22583 non-null float64
High/Low    22583 non-null object
dtypes: float64(1), object(1)
memory usage: 529.3+ KB


In [4]:
# Split the tide records on the 'High/Low' flag: 'H' rows are high tides, 'L' rows are low tides.
Hi_data = Tide_data[Tide_data['High/Low'].eq('H')]
Lo_data = Tide_data[Tide_data['High/Low'].eq('L')]

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the high-tide predictions.
Hi_data.describe()

Unnamed: 0,Pred
count,11292.0
mean,2.937385
std,0.263696
min,2.25
25%,2.75
50%,2.92
75%,3.11
max,3.67


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) of the low-tide predictions.
Lo_data.describe()

Unnamed: 0,Pred
count,11291.0
mean,0.088642
std,0.258647
min,-0.63
25%,-0.08
50%,0.11
75%,0.28
max,0.7


In [7]:
# Produce a frequency count for the months that hi tides were at least 1 stdev above the mean high.
# The mean and stdev are computed from Hi_data instead of being copied by hand from the
# describe() output, so the cutoff always matches the data that was loaded.
hi_cutoff = Hi_data['Pred'].mean() + 1 * Hi_data['Pred'].std()
hi_months = pd.Series(Hi_data[Hi_data['Pred'] >= hi_cutoff].index.month)
pd.DataFrame(hi_months.value_counts()).sort_index()

Unnamed: 0,0
1,121
2,126
3,167
4,158
5,152
6,147
7,169
8,198
9,188
10,165


In [8]:
# Bar chart: number of high tides per calendar month that are at least 1 stdev above the
# mean high tide.  Mean and stdev come from Hi_data rather than hard-coded constants.
hi_cutoff = Hi_data['Pred'].mean() + 1 * Hi_data['Pred'].std()
hi_months = pd.Series(Hi_data[Hi_data['Pred'] >= hi_cutoff].index.month)
ax = pd.DataFrame(hi_months.value_counts()).sort_index().plot(kind='bar',title='Monthly Agg. Hi Tide: Num. of Tides > Mean High + 1 Stdev',legend=False)
ax.set_xlabel("Month")
ax.set_ylabel("Count");  # trailing ';' hides the Text repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x112395828>

In [9]:
# Frequency count, per calendar month, of hi tides at least 2 stdev above the mean high.
# Mean and stdev are computed from Hi_data instead of hard-coded constants.
hi_cutoff = Hi_data['Pred'].mean() + 2 * Hi_data['Pred'].std()
hi_months = pd.Series(Hi_data[Hi_data['Pred'] >= hi_cutoff].index.month)
pd.DataFrame(hi_months.value_counts()).sort_index()

Unnamed: 0,0
1,28
2,10
3,20
4,30
5,43
6,49
7,43
8,29
9,30
10,33


In [10]:
# Bar chart: number of high tides per calendar month that are at least 2 stdev above the
# mean high tide.  Mean and stdev come from Hi_data rather than hard-coded constants.
hi_cutoff = Hi_data['Pred'].mean() + 2 * Hi_data['Pred'].std()
hi_months = pd.Series(Hi_data[Hi_data['Pred'] >= hi_cutoff].index.month)
ax = pd.DataFrame(hi_months.value_counts()).sort_index().plot(kind='bar',title='Monthly Agg. Hi Tide: Num. of Tides > Mean High + 2 Stdev',legend=False)
ax.set_xlabel("Month")
ax.set_ylabel("Count");  # trailing ';' hides the Text repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x112616278>

In [11]:
# Bar chart: number of high tides per calendar month that are at least 2.2 stdev above the
# mean high tide.  Mean and stdev come from Hi_data rather than hard-coded constants.
hi_cutoff = Hi_data['Pred'].mean() + 2.2 * Hi_data['Pred'].std()
hi_months = pd.Series(Hi_data[Hi_data['Pred'] >= hi_cutoff].index.month)
ax = pd.DataFrame(hi_months.value_counts()).sort_index().plot(kind='bar',title='Monthly Agg. Hi Tide: Num. of Tides > Mean High + 2.2 Stdev',legend=False)
ax.set_xlabel("Month")
ax.set_ylabel("Count");  # trailing ';' hides the Text repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x112ccd080>

In [12]:
# Produce a frequency count for the months that lo tides were at least 1 stdev below the mean low.
# Mean and stdev are computed from Lo_data instead of being copied by hand from describe().
lo_cutoff = Lo_data['Pred'].mean() - 1 * Lo_data['Pred'].std()
lo_months = pd.Series(Lo_data[Lo_data['Pred'] <= lo_cutoff].index.month)
ax = pd.DataFrame(lo_months.value_counts()).sort_index().plot(kind='bar',title='Monthly Agg. Lo Tide: Num. of Tides < Mean Low - 1 Stdev',legend=False)
ax.set_xlabel("Month")
ax.set_ylabel("Count");  # trailing ';' hides the Text repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x112d95c88>

In [13]:
# Bar chart: number of low tides per calendar month that are at least 2 stdev below the
# mean low tide.  Mean and stdev come from Lo_data rather than hard-coded constants.
lo_cutoff = Lo_data['Pred'].mean() - 2 * Lo_data['Pred'].std()
lo_months = pd.Series(Lo_data[Lo_data['Pred'] <= lo_cutoff].index.month)
ax = pd.DataFrame(lo_months.value_counts()).sort_index().plot(kind='bar',title='Monthly Agg. Lo Tide: Num. of Tides < Mean Low - 2 Stdev',legend=False)
ax.set_xlabel("Month")
ax.set_ylabel("Count");  # trailing ';' hides the Text repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x1130c9b00>

In [14]:
# Bar chart: number of low tides per calendar month that are at least 2.2 stdev below the
# mean low tide.  Mean and stdev come from Lo_data rather than hard-coded constants.
lo_cutoff = Lo_data['Pred'].mean() - 2.2 * Lo_data['Pred'].std()
lo_months = pd.Series(Lo_data[Lo_data['Pred'] <= lo_cutoff].index.month)
ax = pd.DataFrame(lo_months.value_counts()).sort_index().plot(kind='bar',title='Monthly Agg. Lo Tide: Num. of Tides < Mean Low - 2.2 Stdev',legend=False)
ax.set_xlabel("Month")
ax.set_ylabel("Count");  # trailing ';' hides the Text repr in the cell output

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x1133f7c50>

#--------
Although the Hi and Lo extreme tides don't have a seasonal distribution like the MS events, an MS event may still happen when an extreme tide occurs.  To see if MS events are coupled to extreme tides, I'll select a set of extreme tides about equal in number to the MS events, then check whether the extreme tide dates match the MS dates.

In [15]:
# Find a cutoff value for the tide height that produces about the same number of rows as
# there are MS events (2.25 stdevs below the mean low).
# Mean and stdev are computed from Lo_data instead of hard-coded constants.
lo_cutoff = Lo_data['Pred'].mean() - 2.25 * Lo_data['Pred'].std()
Lo_data[Lo_data['Pred'] <= lo_cutoff].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 170 entries, 1999-04-17 06:47:00 to 2014-08-13 07:44:00
Data columns (total 2 columns):
Pred        170 non-null float64
High/Low    170 non-null object
dtypes: float64(1), object(1)
memory usage: 4.0+ KB


In [16]:
# Select low tides that are 2.25 stdev below mean low tide.  About same number of MS events.
# The cutoff is derived from Lo_data itself rather than from hand-copied statistics.
lo_cutoff = Lo_data['Pred'].mean() - 2.25 * Lo_data['Pred'].std()
Lo_tides = Lo_data[Lo_data['Pred'] <= lo_cutoff]

In [17]:
# I've already looked up the dates that MS events happen in an Excel sheet.  Read in those dates.
# Reads sheet index 6, column index 1 only, and names that column 'Date'.
# `sheet_name` / `usecols` are the current spellings of the removed `sheetname` /
# `parse_cols` keywords.
MS_dates = pd.read_excel('./MSID Dates.xlsx', sheet_name=6, usecols=[1], names=['Date'])

In [None]:
#pickle_file = './pickles/MS_Dates.pickle'
#MS_dates = pd.read_pickle(pickle_file)

In [18]:
# Summarize the MS event dates frame: row count, columns, non-null counts and dtypes.
MS_dates.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 1 columns):
Date    165 non-null datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 1.4 KB


In [19]:
# Peek at the first few MS event dates.
MS_dates.head()

Unnamed: 0,Date
0,1999-03-07
1,1999-03-18
2,1999-03-19
3,1999-03-20
4,1999-03-21


In [20]:
# Initialize new col to False.  Will indicate if we find an extreme low tide on an MS date.
MS_dates['Found'] = False

In [21]:
# For each MS date, look for any extreme low tides in the low tides df.  Select any low tides in the range
# from start of the MS event date to 1 day later (label slices include both end points).
# 'Found' is stored as a real boolean (len() > 0), and the loop runs over the dates
# themselves rather than a hard-coded row count of 165.
lo_tides_sorted = Lo_tides.sort_index()  # date-range slicing needs a monotonic DatetimeIndex
MS_dates['Found'] = [
    len(lo_tides_sorted.loc[ms_day:ms_day + pd.Timedelta(1,'D')]) > 0
    for ms_day in MS_dates['Date']
]

In [22]:
# Show the MS dates that had a matching extreme low tide.
# astype(bool) keeps the mask valid even if 'Found' holds match counts rather than booleans.
MS_dates.loc[MS_dates['Found'].astype(bool)]


Unnamed: 0,Date,Found
31,2003-10-27,True
143,2012-03-11,True
160,2014-01-01,True


---------- Now look for MS events that occurred during Extreme Hi Tide events. ------------

In [23]:
# Check how many high tides are at least 2.24 stdev above the mean high tide.
# Mean and stdev are computed from Hi_data instead of hard-coded constants.
hi_cutoff = Hi_data['Pred'].mean() + 2.24 * Hi_data['Pred'].std()
Hi_data[Hi_data['Pred'] >= hi_cutoff].info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 180 entries, 1999-04-17 00:31:00 to 2014-09-10 00:19:00
Data columns (total 2 columns):
Pred        180 non-null float64
High/Low    180 non-null object
dtypes: float64(1), object(1)
memory usage: 4.2+ KB


In [24]:
# Select hi tides that are 2.24 stdev above mean high tide.  About same number of MS events.
# The cutoff is derived from Hi_data itself rather than from hand-copied statistics.
hi_cutoff = Hi_data['Pred'].mean() + 2.24 * Hi_data['Pred'].std()
Hi_tides = Hi_data[Hi_data['Pred'] >= hi_cutoff]

In [25]:
# Reset the MS dates 'Found' col to False before the next matching pass.
MS_dates['Found'] = False

In [26]:
# For each MS date, look for any extreme hi tides in the hi tides df.  Select any hi tides in the range
# from start of the MS event date to 1 day later (label slices include both end points).
# 'Found' is stored as a real boolean (len() > 0), and the loop runs over the dates
# themselves rather than a hard-coded row count of 165.
hi_tides_sorted = Hi_tides.sort_index()  # date-range slicing needs a monotonic DatetimeIndex
MS_dates['Found'] = [
    len(hi_tides_sorted.loc[ms_day:ms_day + pd.Timedelta(1,'D')]) > 0
    for ms_day in MS_dates['Date']
]

In [27]:
# Show the MS dates that had a matching extreme hi tide.
# astype(bool) keeps the mask valid even if 'Found' holds match counts rather than booleans.
MS_dates.loc[MS_dates['Found'].astype(bool)]

Unnamed: 0,Date,Found
31,2003-10-27,True
160,2014-01-01,True


#--------- Expand date lookup to include extreme tides from one day before  -----------

In [28]:
# Initialize Found col to False.  Will indicate if we find an extreme low tide around an MS date.
MS_dates['Found'] = False

In [29]:
# For each MS date, look for any extreme low tides in the low tides df.  Select any low tides in the range
# from one day prior to start of the MS event date to 1 day later (both end points included).
# 'Found' is stored as a real boolean (len() > 0), and the loop runs over the dates
# themselves rather than a hard-coded row count of 165.
lo_tides_sorted = Lo_tides.sort_index()  # date-range slicing needs a monotonic DatetimeIndex
one_day = pd.Timedelta(1,'D')
MS_dates['Found'] = [
    len(lo_tides_sorted.loc[ms_day - one_day:ms_day + one_day]) > 0
    for ms_day in MS_dates['Date']
]

In [30]:
# Show the MS dates with an extreme low tide in the surrounding window.
# astype(bool) keeps the mask valid even if 'Found' holds match counts rather than booleans.
MS_dates.loc[MS_dates['Found'].astype(bool)]


Unnamed: 0,Date,Found
31,2003-10-27,True
56,2006-02-01,True
143,2012-03-11,True
144,2012-03-12,True
160,2014-01-01,True


In [31]:
# Initialize Found col to False.  Will indicate if we find an extreme hi tide around an MS date.
MS_dates['Found'] = False

In [32]:
# For each MS date, look for any extreme hi tides in the hi tides df.  Select any hi tides in the range
# from one day prior to start of the MS event date to 1 day later (both end points included).
# 'Found' is stored as a real boolean (len() > 0), and the loop runs over the dates
# themselves rather than a hard-coded row count of 165.
hi_tides_sorted = Hi_tides.sort_index()  # date-range slicing needs a monotonic DatetimeIndex
one_day = pd.Timedelta(1,'D')
MS_dates['Found'] = [
    len(hi_tides_sorted.loc[ms_day - one_day:ms_day + one_day]) > 0
    for ms_day in MS_dates['Date']
]

In [33]:
# Show the MS dates with an extreme hi tide in the surrounding window.
# astype(bool) keeps the mask valid even if 'Found' holds match counts rather than booleans.
MS_dates.loc[MS_dates['Found'].astype(bool)]


Unnamed: 0,Date,Found
31,2003-10-27,True
160,2014-01-01,True


In [34]:
# Pickle the data.
# Create the output directory if needed so a fresh checkout can run this cell.
os.makedirs('./pickles', exist_ok=True)
pickle_file = './pickles/MS_Dates.pickle'
# Drop the scratch 'Found' column before saving; guarded so re-running the cell
# does not raise KeyError once the column is already gone.
if 'Found' in MS_dates.columns:
    del MS_dates['Found']
MS_dates.to_pickle(pickle_file)
pickle_file = './pickles/Tide_Data.pickle'
Tide_data.to_pickle(pickle_file)