In [95]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np

from keras.layers import Dense, LSTM, Input
from keras.callbacks import ModelCheckpoint

import matplotlib.pylab as plt

In [163]:
# Create new variables to be used in Keras and the CNN

# number of items to use for training
BATCH_SIZE = 20 

# Number of identifying classes 
#   WE have two, Bloom and no bloom 1/0
NUM_CLASSES = 2 

# number of times to repeat process
EPOCHS = 100

In [97]:
df = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
df.head()

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),Chlorophyll (ug/L),Chlorophyll RFU,ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,4.4,1.3,90.2,9.04,0.4
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,4.2,1.2,90.2,9.04,0.4
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,4.3,1.3,90.1,9.04,0.4
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,4.5,1.3,90.0,9.03,0.4
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,4.5,1.3,89.8,9.02,0.4


In [98]:
target = df['BGA-Phycocyanin RFU'].apply(lambda x : x / 0.2334)
target
dataset = df.drop(columns=['Chlorophyll (ug/L)', 'Chlorophyll RFU'])
dataset['BGA (ug/L)'] = target
dataset.head(5)

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L)
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,1.713796
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,1.713796
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,1.713796
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,1.713796
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,1.713796


In [99]:
from datetime import datetime

timestamp = dataset['Date (mm.dd.yyyy)'] + ' '+ dataset['Time 24hr']
timestamp = pd.to_datetime(timestamp)
dataset['Timestamp'] = timestamp
dataset.head()

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L),Timestamp
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,1.713796,2017-05-05 00:00:00
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,1.713796,2017-05-05 00:15:00
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,1.713796,2017-05-05 00:30:00
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,1.713796,2017-05-05 00:45:00
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,1.713796,2017-05-05 01:00:00


In [100]:
# dont need data and time now that we have Timestamp. Lets remove them

dataset = dataset.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
dataset.head(10)


Unnamed: 0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L),Timestamp
0,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,1.713796,2017-05-05 00:00:00
1,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,1.713796,2017-05-05 00:15:00
2,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,1.713796,2017-05-05 00:30:00
3,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,1.713796,2017-05-05 00:45:00
4,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,1.713796,2017-05-05 01:00:00
5,14.92,1850,-100.1,8.36,16.43,89.8,9.02,0.4,1.713796,2017-05-05 01:15:00
6,14.9,1851,-100.1,8.36,16.35,89.7,9.01,0.4,1.713796,2017-05-05 01:30:00
7,14.88,1852,-100.0,8.36,16.4,89.6,9.0,0.4,1.713796,2017-05-05 01:45:00
8,14.84,1850,-99.9,8.36,16.82,89.4,8.99,0.4,1.713796,2017-05-05 02:00:00
9,14.83,1851,-100.0,8.36,16.5,89.4,8.99,0.4,1.713796,2017-05-05 02:15:00


In [101]:
target = dataset['BGA (ug/L)'].apply(lambda x: 1 if x > 20 else 0)
dataset['Bloom'] = target
dataset.dtypes

Temp C                        float64
Sp Cond (uS/cm)                 int64
pH (mV)                       float64
pH                            float64
Turbidity (NTU)               float64
ODOSat%                       float64
ODO (mg/L)                    float64
BGA-Phycocyanin RFU           float64
BGA (ug/L)                    float64
Timestamp              datetime64[ns]
Bloom                           int64
dtype: object

In [102]:
# lets try to normalize this now....
from sklearn.preprocessing import MinMaxScaler
dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)', 'Bloom']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(dataset[dataset_columns])
dataset = pd.DataFrame(ds_scaled,columns=dataset_columns)
dataset.describe()

  return self.partial_fit(X, y)


Unnamed: 0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),Bloom
count,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0
mean,0.560516,0.893366,0.477889,0.541089,0.090355,0.165126,0.273546,0.00512
std,0.267413,0.070246,0.116241,0.113254,0.085879,0.093899,0.11622,0.071369
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.308195,0.853771,0.426166,0.461538,0.045893,0.117086,0.19,0.0
50%,0.640366,0.895882,0.492228,0.553846,0.065216,0.13887,0.264545,0.0
75%,0.784656,0.956502,0.53886,0.592308,0.102407,0.177672,0.370909,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [103]:
'''
determines the window size for the daata set
@param dataset - The dataset to get windows for
@param window_size - the size of the window  
@param shift - the amout to shift the window
'''
def windows(dataset, window_size, shift):
    start = 0
    while start+window_size < dataset.shape[0]: 
        yield (int(start), int(start+window_size))
        # shift the window five blocks of time
        start += shift
        if start % 300 == 0:
            print('Window Segmentation {0:.2f}% done'.format(((start+window_size) / dataset.shape[0]) * 100 ))


'''

Segments the dataset based on the parameters that are passed in.
@param dataset - the dataset to segment into window
@param columns - the array of columns from the dataset to be looked at
@param window_size - the size of the window you would like to be looked at. Defualt is 10

'''
def segment_dataset(dataset, columns, target, window_size=10):    
    print('WINDOW SIZE',window_size)
    print('NUMBER OF COULUMNS',len(columns))
    segments = np.empty((0, window_size, len(columns)))
    labels = np.empty((0))
    count = 0
    for (start, end) in windows(dataset, window_size, 1):
        count+=1
        values = dataset[columns][start:end]
        if(values.shape[0] == window_size):
            segments = np.vstack([segments, np.stack([values])])
            # Takes the larger of the two variables if there are more than one. 
            # This makes it more likly to predict a bloom. Can be changed to iloc[0] to
            # be less likly to predict a bloom (more 0s in the label array)
            
            labels = np.append(labels, dataset[target][start:end].mode().iloc[-1])
        else:
            print("No more Windows available... Exiting")
            break
    return (segments, labels)

In [69]:
feature_columns = dataset_columns[:-1]
(segments, labels) = segment_dataset(dataset, feature_columns, 'Bloom', 5)
print('done')

WINDOW SIZE 5
NUMBER OF COULUMNS 7
Window Segmentation 1.61% done
Window Segmentation 3.19% done
Window Segmentation 4.78% done
Window Segmentation 6.36% done
Window Segmentation 7.94% done
Window Segmentation 9.53% done
Window Segmentation 11.11% done
Window Segmentation 12.69% done
Window Segmentation 14.28% done
Window Segmentation 15.86% done
Window Segmentation 17.44% done
Window Segmentation 19.03% done
Window Segmentation 20.61% done
Window Segmentation 22.19% done
Window Segmentation 23.78% done
Window Segmentation 25.36% done
Window Segmentation 26.94% done
Window Segmentation 28.53% done
Window Segmentation 30.11% done
Window Segmentation 31.69% done
Window Segmentation 33.28% done
Window Segmentation 34.86% done
Window Segmentation 36.44% done
Window Segmentation 38.03% done
Window Segmentation 39.61% done
Window Segmentation 41.19% done
Window Segmentation 42.78% done
Window Segmentation 44.36% done
Window Segmentation 45.94% done
Window Segmentation 47.53% done
Window Segm

In [105]:
print(dataset.shape)
target = dataset['Bloom']
features = dataset.drop(columns={'Bloom'})

(18947, 8)


In [71]:
#labels.shape

(18942,)

In [72]:
#labels

array([0., 0., 0., ..., 0., 0., 0.])

In [16]:
#labels.shape

(18938,)

In [74]:
#labels = labels.reshape(labels.shape[0],1)
#labels.shape

(18942, 1)

In [106]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

In [107]:
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

x_train shape: (13262, 7)
x_test shape: (5685, 7)
y_train shape: (13262,)
y_test shape: (5685,)


In [123]:
y_train_mod = ks.utils.to_categorical(y_train, NUM_CLASSES)
y_test_mod = ks.utils.to_categorical(y_test, NUM_CLASSES)

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

## RNN 

In [109]:
from keras import backend as K

def precision(y_true, y_pred):
    '''
     Calculates the precision, a metric for multi-label classification of
     how many selected items are relevant.
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    """
     Recall metric.
     Only computes a batch-wise average of recall.
     Computes the recall, a metric for multi-label classification of
     how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [110]:
df.shape

(18947, 12)

In [265]:
use_dropout = True
hidden_size = 500
num_steps = 30

model = Sequential()

model.add(Embedding(x_train.shape[1],128,input_length=x_train.shape[1],dropout=0.2))
model.add(LSTM(hidden_size, dropout=0.2))
model.add(Dense(2,activation='softmax'))


  import sys


In [270]:
model.compile(loss=ks.losses.categorical_crossentropy,
              optimizer=ks.optimizers.Adam(lr=0.0001),
              metrics=[recall])

In [271]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 7, 128)            896       
_________________________________________________________________
lstm_51 (LSTM)               (None, 500)               1258000   
_________________________________________________________________
dense_26 (Dense)             (None, 2)                 1002      
Total params: 1,259,898
Trainable params: 1,259,898
Non-trainable params: 0
_________________________________________________________________


In [272]:
print(x_train.shape)
print(y_train_mod.shape)
x_train_reshaped = x_train.values.reshape(x_train.shape[0],x_train.shape[1],1)
print(x_train_reshaped.shape)

(13262, 7)
(13262, 2)
(13262, 7, 1)


In [273]:
model.fit(x_train, 
          y_train_mod,
          batch_size=BATCH_SIZE,          
          epochs=EPOCHS)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x2a029bbc518>

In [275]:
# score = model.evaluate(x_train, y_train_mod, verbose=0)
x_test_reshaped = x_test.values.reshape(x_test.shape[0],x_test.shape[1],1)
predictions = model.predict(x_test)

In [277]:
print(predictions)

[[0.9956938  0.00430618]
 [0.9956938  0.00430618]
 [0.9956938  0.00430618]
 ...
 [0.9956938  0.00430618]
 [0.9956938  0.00430618]
 [0.9956938  0.00430619]]
