Here I try a variety of bayesian models on a few of the features to get a sense of what might be more important

In [2]:
import pandas as pd
import numpy as np
import pymc3 as pm
import pickle
import matplotlib.pyplot as pl
from sklearn.preprocessing import StandardScaler
from matplotlib import rcParams
import theano.tensor as tt

In [3]:
% matplotlib inline
rcParams['xtick.labelsize'] = 14
rcParams['ytick.labelsize'] = 14
rcParams['axes.formatter.limits'] = (-2, 3)
rcParams['font.size'] = 14
rcParams['axes.titlesize'] = 18
rcParams['figure.titlesize'] = 20

In [4]:
from IPython.core.display import HTML, display
display(HTML("<style>.container {width: 90% !important}</style>"))

In [71]:
# Load the dataset:
with open('../PklJar/CapeCodDataDict', 'rb') as f:
    dataDict = pickle.load(f)

I'll start with the wind data as feature and the total number of strandings as target

In [72]:
dataDict['Features'].keys()

dict_keys(['CENSUS', 'MAG', 'SUN', 'TIDE', 'WATER', 'WIND'])

In [63]:
df = pd.DataFrame(dataDict['Features']['WIND'])

In [67]:
df = pd.concat([dfi for dfi in dataDict['Features'].values()], axis=1)

In [73]:
df = df.loc['1999-03-07':]

In [74]:
df.head()

Unnamed: 0,Stene_count,Stene_roll_W_sum,Gramp_count,Gramp_roll_W_sum,Delph_count,Delph_roll_W_sum,Lagen_count,Lagen_roll_W_sum,Tursi_count,Tursi_roll_W_sum,...,EasterlyWS_1d,SoutherlyWS_1d,EasterlyWS_2d,SoutherlyWS_2d,EasterlyWS_3d,SoutherlyWS_3d,EasterlyWS_7d,SoutherlyWS_7d,EasterlyWS_30d,SoutherlyWS_30d
1999-03-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,102.0,0.0,0.0,...,3.570062,2.563992,-9.224087,-0.04657,3.17975,-9.103259,2.03688,-1.265833,-5.350695,5.959278
1999-03-08,0.0,0.0,0.0,0.0,0.0,0.0,10.0,62.0,0.0,0.0,...,-1.597614,11.183859,3.570062,2.563992,-9.224087,-0.04657,2.733924,-6.260351,-4.862086,1.981418
1999-03-09,0.0,0.0,14.0,14.0,350.0,350.0,0.0,62.0,30.0,30.0,...,-10.374381,7.568804,-1.597614,11.183859,3.570062,2.563992,-4.567401,-4.513101,-7.616287,2.540035
1999-03-10,0.0,0.0,0.0,14.0,0.0,350.0,10.0,22.0,0.0,30.0,...,-8.124423,5.41686,-10.374381,7.568804,-1.597614,11.183859,-1.074112,-1.885289,-0.946676,4.56945
1999-03-11,0.0,0.0,0.0,14.0,0.0,350.0,50.0,72.0,0.0,30.0,...,-4.053519,2.885376,-8.124423,5.41686,-10.374381,7.568804,3.17975,-9.103259,-2.245282,-2.983975


In [75]:
df['Month'] = df.index.month
df['Year' ] = df.index.year

In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5779 entries, 1999-03-07 to 2014-12-31
Freq: D
Data columns (total 58 columns):
Stene_count            5779 non-null float64
Stene_roll_W_sum       5779 non-null float64
Gramp_count            5779 non-null float64
Gramp_roll_W_sum       5779 non-null float64
Delph_count            5779 non-null float64
Delph_roll_W_sum       5779 non-null float64
Lagen_count            5779 non-null float64
Lagen_roll_W_sum       5779 non-null float64
Tursi_count            5779 non-null float64
Tursi_roll_W_sum       5779 non-null float64
Globi_count            5779 non-null float64
Globi_roll_W_sum       5779 non-null float64
Sum_Counts             5779 non-null float64
Sum_W_Rolls            5779 non-null float64
FRDH                   5771 non-null float64
FRDV                   5759 non-null float64
OTTH                   5414 non-null float64
OTTV                   5400 non-null float64
Kp_index               5779 non-null float64
Sunspot_No  

In [None]:
strandings = dataDict['Targets']['TOTAL']

In [79]:
strandings.head()

1999-01-01    0.0
1999-01-02    0.0
1999-01-03    0.0
1999-01-04    0.0
1999-01-05    0.0
Freq: D, dtype: float64

In [8]:
strandings = pd.DataFrame(strandings, columns=[strandings.name])

In [None]:
strandings.head()

In [10]:
wind.head()

Unnamed: 0,EasterlyWS,SoutherlyWS,EasterlyWS_1d,SoutherlyWS_1d,EasterlyWS_2d,SoutherlyWS_2d,EasterlyWS_3d,SoutherlyWS_3d,EasterlyWS_7d,SoutherlyWS_7d,EasterlyWS_30d,SoutherlyWS_30d
1999-01-01,-8.369086,-1.962736,,,,,,,,,,
1999-01-02,-10.358338,2.124278,-8.369086,-1.962736,,,,,,,,
1999-01-03,5.530758,-3.071964,-10.358338,2.124278,-8.369086,-1.962736,,,,,,
1999-01-04,-6.726604,-2.53516,5.530758,-3.071964,-10.358338,2.124278,-8.369086,-1.962736,,,,
1999-01-05,-8.472319,-0.380683,-6.726604,-2.53516,5.530758,-3.071964,-10.358338,2.124278,,,,


The stranding dataset starts on March 7, 1997. So I'll trim the dataset used here to start then:

In [11]:
strandings = strandings.loc['1999-03-07':]
wind = wind.loc['1999-03-07':]

In [12]:
strandings.head()

Unnamed: 0,None
1999-03-07,6.0
1999-03-08,0.0
1999-03-09,0.0
1999-03-10,0.0
1999-03-11,0.0


In [13]:
wind.head()

Unnamed: 0,EasterlyWS,SoutherlyWS,EasterlyWS_1d,SoutherlyWS_1d,EasterlyWS_2d,SoutherlyWS_2d,EasterlyWS_3d,SoutherlyWS_3d,EasterlyWS_7d,SoutherlyWS_7d,EasterlyWS_30d,SoutherlyWS_30d
1999-03-07,-1.597614,11.183859,3.570062,2.563992,-9.224087,-0.04657,3.17975,-9.103259,2.03688,-1.265833,-5.350695,5.959278
1999-03-08,-10.374381,7.568804,-1.597614,11.183859,3.570062,2.563992,-9.224087,-0.04657,2.733924,-6.260351,-4.862086,1.981418
1999-03-09,-8.124423,5.41686,-10.374381,7.568804,-1.597614,11.183859,3.570062,2.563992,-4.567401,-4.513101,-7.616287,2.540035
1999-03-10,-4.053519,2.885376,-8.124423,5.41686,-10.374381,7.568804,-1.597614,11.183859,-1.074112,-1.885289,-0.946676,4.56945
1999-03-11,-7.269797,4.485525,-4.053519,2.885376,-8.124423,5.41686,-10.374381,7.568804,3.17975,-9.103259,-2.245282,-2.983975


My first model will be that of a logistic regression, with the goal of predicting when any strandings (TOTAL>0) occurs. Note here that a stranding event was originally labeled as such when more than one animal had stranded.

There are some nans among the wind, so I'll make a new dataframe with both wind and strandings and drop rows where nans appear

In [51]:
df = pd.concat([wind[['EasterlyWS', 'SoutherlyWS']], strandings], axis=1)
df.rename(columns={None: 'TotalStranded'}, inplace=True)

In [52]:
df.head()

Unnamed: 0,EasterlyWS,SoutherlyWS,TotalStranded
1999-03-07,-1.597614,11.183859,6.0
1999-03-08,-10.374381,7.568804,0.0
1999-03-09,-8.124423,5.41686,0.0
1999-03-10,-4.053519,2.885376,0.0
1999-03-11,-7.269797,4.485525,0.0


In [43]:
boolStrand = strandings>0
boolStrand.rename(columns={None: 'stranded'}, inplace=True)

In [44]:
boolStrand.head()

Unnamed: 0,stranded
1999-03-07,True
1999-03-08,False
1999-03-09,False
1999-03-10,False
1999-03-11,False


In [45]:
dfs = pd.concat([wind, boolStrand], axis=1)

In [46]:
dfs.head()

Unnamed: 0,EasterlyWS,SoutherlyWS,EasterlyWS_1d,SoutherlyWS_1d,EasterlyWS_2d,SoutherlyWS_2d,EasterlyWS_3d,SoutherlyWS_3d,EasterlyWS_7d,SoutherlyWS_7d,EasterlyWS_30d,SoutherlyWS_30d,stranded
1999-03-07,-1.597614,11.183859,3.570062,2.563992,-9.224087,-0.04657,3.17975,-9.103259,2.03688,-1.265833,-5.350695,5.959278,True
1999-03-08,-10.374381,7.568804,-1.597614,11.183859,3.570062,2.563992,-9.224087,-0.04657,2.733924,-6.260351,-4.862086,1.981418,False
1999-03-09,-8.124423,5.41686,-10.374381,7.568804,-1.597614,11.183859,3.570062,2.563992,-4.567401,-4.513101,-7.616287,2.540035,False
1999-03-10,-4.053519,2.885376,-8.124423,5.41686,-10.374381,7.568804,-1.597614,11.183859,-1.074112,-1.885289,-0.946676,4.56945,False
1999-03-11,-7.269797,4.485525,-4.053519,2.885376,-8.124423,5.41686,-10.374381,7.568804,3.17975,-9.103259,-2.245282,-2.983975,False


In [55]:
df_clean = df.dropna()
x_wind = df_clean.as_matrix(columns=['EasterlyWS', 'SoutherlyWS'])
y_bool = df_clean.TotalStranded.values
ss = StandardScaler()
x_wind_s = ss.fit_transform(x_wind)

In [28]:
num_samples = y_bool.size

In [24]:
# Just easterly wind
with pm.Model() as mPoiss0:
    intercept = pm.Normal('intercept', mu=0, sd=10)
    coef = pm.Normal('coef', mu=0, sd=10)
    mu = pm.path.exp(intercept + pm.math.dot(x_wind_s[:,0], coef))
    y_pred = pm.Binomial('y_pred', n=num_samples, p=p, observed=y_bool)
    trace_mlog0 = pm.sample(2000, tune=1000)
pm.traceplot(trace_mlog0);

         Current function value: 3332867.581616
         Iterations: 23
         Function evaluations: 120
         Gradient evaluations: 108


Problem occurred during compilation with the command line below:
/usr/bin/g++ -shared -g -O3 -fno-math-errno -Wno-unused-label -Wno-unused-variable -Wno-write-strings -march=broadwell -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mbmi2 -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mno-avx512f -mno-avx512er -mno-avx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512ifma -mno-avx512vbmi -mno-clwb -mno-pcommit -mno-mwaitx --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=8192 -mtune=generic -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION -m64 -fPIC -I/accounts/ekarakoy/anaconda3/lib/python3.6/site-packages/numpy/core/include -I/accounts/ekarakoy/anaconda3/include/python3.6m -I/accounts/ekarakoy/ana

OSError: [Errno 12] Cannot allocate memory

In [29]:
# Just southerly wind
with pm.Model() as mlog1:
    intercept = pm.Normal('intercept', mu=0, sd=10)
    coef = pm.Normal('coef', mu=0, sd=10)
    mu = intercept = pm.math.dot(x_wind_s[:, 1], coef)
    p = pm.math.invlogit(mu)
    y_pred = pm.Binomial('y_pred', n=num_samples, p=p, observed=y_bool)
    trace_mlog1 = pm.sample(2000, tune=1000)
pm.traceplot(trace_mlog1);

INFO (theano.gof.compilelock): Refreshing lock /accounts/ekarakoy/.theano/compiledir_Linux-4.4--generic-x86_64-with-debian-stretch-sid-x86_64-3.6.3-64/lock_dir/lock
Problem occurred during compilation with the command line below:
/usr/bin/g++ -shared -g -O3 -fno-math-errno -Wno-unused-label -Wno-unused-variable -Wno-write-strings -march=broadwell -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mbmi2 -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mno-avx512f -mno-avx512er -mno-avx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512ifma -mno-avx512vbmi -mno-clwb -mno-pcommit -mno-mwaitx --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=8192 -mtune=generic -DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSIO

OSError: [Errno 12] Cannot allocate memory