In [1]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Flatten
from keras.callbacks import ModelCheckpoint

import matplotlib.pylab as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Create new variables to be used in Keras and the CNN

# number of items to use for training
BATCH_SIZE = 20 

# Number of identifying classes 
#   WE have two, Bloom and no bloom 1/0
NUM_CLASSES = 2 

# number of times to repeat process
EPOCHS = 100

In [3]:
df = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
df.head()

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),Chlorophyll (ug/L),Chlorophyll RFU,ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,4.4,1.3,90.2,9.04,0.4
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,4.2,1.2,90.2,9.04,0.4
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,4.3,1.3,90.1,9.04,0.4
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,4.5,1.3,90.0,9.03,0.4
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,4.5,1.3,89.8,9.02,0.4


In [4]:
target = df['BGA-Phycocyanin RFU'].apply(lambda x : x / 0.2334)
target
dataset = df.drop(columns=['Chlorophyll (ug/L)', 'Chlorophyll RFU'])
dataset['BGA (ug/L)'] = target
dataset.head(5)

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L)
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,1.713796
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,1.713796
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,1.713796
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,1.713796
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,1.713796


In [5]:
from datetime import datetime

timestamp = dataset['Date (mm.dd.yyyy)'] + ' '+ dataset['Time 24hr']
timestamp = pd.to_datetime(timestamp)
dataset['Timestamp'] = timestamp
dataset.head()

Unnamed: 0,Date (mm.dd.yyyy),Time 24hr,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L),Timestamp
0,5/5/2017,0:00,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,1.713796,2017-05-05 00:00:00
1,5/5/2017,0:15,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,1.713796,2017-05-05 00:15:00
2,5/5/2017,0:30,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,1.713796,2017-05-05 00:30:00
3,5/5/2017,0:45,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,1.713796,2017-05-05 00:45:00
4,5/5/2017,1:00,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,1.713796,2017-05-05 01:00:00


In [6]:
# dont need data and time now that we have Timestamp. Lets remove them

dataset = dataset.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
dataset.head(10)


Unnamed: 0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,BGA (ug/L),Timestamp
0,15.02,1848,-100.1,8.36,16.84,90.2,9.04,0.4,1.713796,2017-05-05 00:00:00
1,14.99,1847,-100.1,8.36,16.76,90.2,9.04,0.4,1.713796,2017-05-05 00:15:00
2,14.96,1847,-100.1,8.36,16.82,90.1,9.04,0.4,1.713796,2017-05-05 00:30:00
3,14.95,1848,-100.1,8.36,17.19,90.0,9.03,0.4,1.713796,2017-05-05 00:45:00
4,14.92,1848,-100.0,8.36,16.85,89.8,9.02,0.4,1.713796,2017-05-05 01:00:00
5,14.92,1850,-100.1,8.36,16.43,89.8,9.02,0.4,1.713796,2017-05-05 01:15:00
6,14.9,1851,-100.1,8.36,16.35,89.7,9.01,0.4,1.713796,2017-05-05 01:30:00
7,14.88,1852,-100.0,8.36,16.4,89.6,9.0,0.4,1.713796,2017-05-05 01:45:00
8,14.84,1850,-99.9,8.36,16.82,89.4,8.99,0.4,1.713796,2017-05-05 02:00:00
9,14.83,1851,-100.0,8.36,16.5,89.4,8.99,0.4,1.713796,2017-05-05 02:15:00


In [7]:
target = dataset['BGA (ug/L)'].apply(lambda x: 1 if x > 20 else 0)
dataset['Bloom'] = target
dataset.dtypes

Temp C                        float64
Sp Cond (uS/cm)                 int64
pH (mV)                       float64
pH                            float64
Turbidity (NTU)               float64
ODOSat%                       float64
ODO (mg/L)                    float64
BGA-Phycocyanin RFU           float64
BGA (ug/L)                    float64
Timestamp              datetime64[ns]
Bloom                           int64
dtype: object

In [8]:
# lets try to normalize this now....
from sklearn.preprocessing import MinMaxScaler
dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)', 'Bloom']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(dataset[dataset_columns])
dataset = pd.DataFrame(ds_scaled,columns=dataset_columns)
dataset.describe()

  return self.partial_fit(X, y)


Unnamed: 0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),ODOSat%,ODO (mg/L),Bloom
count,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0,18947.0
mean,0.560516,0.893366,0.477889,0.541089,0.090355,0.165126,0.273546,0.00512
std,0.267413,0.070246,0.116241,0.113254,0.085879,0.093899,0.11622,0.071369
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.308195,0.853771,0.426166,0.461538,0.045893,0.117086,0.19,0.0
50%,0.640366,0.895882,0.492228,0.553846,0.065216,0.13887,0.264545,0.0
75%,0.784656,0.956502,0.53886,0.592308,0.102407,0.177672,0.370909,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
'''
determines the window size for the daata set
@param dataset - The dataset to get windows for
@param window_size - the size of the window  
@param shift - the amout to shift the window
'''
def windows(dataset, window_size, shift):
    start = 0
    while start+window_size < dataset.shape[0]: 
        yield (int(start), int(start+window_size))
        # shift the window five blocks of time
        start += shift
        if start % 300 == 0:
            print('Window Segmentation {0:.2f}% done'.format(((start+window_size) / dataset.shape[0]) * 100 ))


'''

Segments the dataset based on the parameters that are passed in.
@param dataset - the dataset to segment into window
@param columns - the array of columns from the dataset to be looked at
@param window_size - the size of the window you would like to be looked at. Defualt is 10

'''
def segment_dataset(dataset, columns, target, window_size=10):    
    print('WINDOW SIZE',window_size)
    print('NUMBER OF COULUMNS',len(columns))
    segments = np.empty((0, window_size, len(columns)))
    labels = np.empty((0))
    count = 0
    for (start, end) in windows(dataset, window_size, 1):
        count+=1
        values = dataset[columns][start:end]
        if(values.shape[0] == window_size):
            segments = np.vstack([segments, np.stack([values])])
            # Takes the larger of the two variables if there are more than one. 
            # This makes it more likly to predict a bloom. Can be changed to iloc[0] to
            # be less likly to predict a bloom (more 0s in the label array)
            
            labels = np.append(labels, dataset[target][start:end].mode().iloc[-1])
        else:
            print("No more Windows available... Exiting")
            break
    return (segments, labels)

In [10]:
feature_columns = dataset_columns[:-1]
(segments, labels) = segment_dataset(dataset, feature_columns, 'Bloom', 5)
print('done')

WINDOW SIZE 5
NUMBER OF COULUMNS 7
Window Segmentation 1.61% done
Window Segmentation 3.19% done
Window Segmentation 4.78% done
Window Segmentation 6.36% done
Window Segmentation 7.94% done
Window Segmentation 9.53% done
Window Segmentation 11.11% done
Window Segmentation 12.69% done
Window Segmentation 14.28% done
Window Segmentation 15.86% done
Window Segmentation 17.44% done
Window Segmentation 19.03% done
Window Segmentation 20.61% done
Window Segmentation 22.19% done
Window Segmentation 23.78% done
Window Segmentation 25.36% done
Window Segmentation 26.94% done
Window Segmentation 28.53% done
Window Segmentation 30.11% done
Window Segmentation 31.69% done
Window Segmentation 33.28% done
Window Segmentation 34.86% done
Window Segmentation 36.44% done
Window Segmentation 38.03% done
Window Segmentation 39.61% done
Window Segmentation 41.19% done
Window Segmentation 42.78% done
Window Segmentation 44.36% done
Window Segmentation 45.94% done
Window Segmentation 47.53% done
Window Segm

In [11]:
print(dataset.shape)
target = dataset['Bloom']
features = dataset.drop(columns={'Bloom'})

(18947, 8)


In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

In [13]:
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

x_train shape: (13262, 7)
x_test shape: (5685, 7)
y_train shape: (13262,)
y_test shape: (5685,)


In [14]:
y_train_mod = ks.utils.to_categorical(y_train, NUM_CLASSES)
y_test_mod = ks.utils.to_categorical(y_test, NUM_CLASSES)

In [15]:
from keras import backend as K

def precision(y_true, y_pred):
    '''
     Calculates the precision, a metric for multi-label classification of
     how many selected items are relevant.
    '''
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def recall(y_true, y_pred):
    """
     Recall metric.
     Only computes a batch-wise average of recall.
     Computes the recall, a metric for multi-label classification of
     how many relevant items are selected.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [16]:
df.shape

(18947, 12)

## RNN 

In [17]:
use_dropout = True
hidden_size = 500
num_steps = 30

model = Sequential()

model.add(LSTM(hidden_size, dropout=0.2))
model.add(LSTM(hidden_size, dropout=0.2))
model.add(LSTM(hidden_size, dropout=0.2))
model.add(Flatten())
Dense(NUM_CLASSES, activation='softmax', input_dim=2)

<keras.layers.core.Dense at 0x290a7e7e5f8>

In [18]:
model.compile(loss=ks.losses.categorical_crossentropy,
              optimizer=ks.optimizers.Adam(lr=0.0001),
              metrics=[recall, 'accuracy', 'precision'])

In [19]:
print(x_train.shape)
print(y_train_mod.shape)
x_train_reshaped = x_train.values.reshape(x_train.shape[0],x_train.shape[1],1)
print(x_train_reshaped.shape)

(13262, 7)
(13262, 2)
(13262, 7, 1)


In [20]:
model.fit(x=x_train, 
          y=y_train_mod, 
          batch_size=BATCH_SIZE, 
          epochs=EPOCHS, 
          verbose=0)


ValueError: Please provide as model inputs either a single array or a list of arrays. You passed: x=         Temp C  Sp Cond (uS/cm)   pH (mV)        pH  Turbidity (NTU)  \
18935  0.009154         0.989820  0.502591  0.569231         0.165005   
11661  0.759372         0.912540  0.313472  0.692308         0.087164   
15423  0.219268         0.970847  0.487047  0.569231         0.165933   
12970  0.539669         0.946321  0.413212  0.607692         0.141060   
18743  0.068003         0.993984  0.507772  0.561538         0.065169   
17483  0.208806         0.993059  0.488342  0.569231         0.136469   
10274  0.788579         0.894031  0.252591  0.746154         0.039369   
6903   0.940715         0.854697  0.323834  0.661538         0.063487   
1612   0.426330         0.792226  0.638601  0.384615         0.034636   
4955   0.746731         0.779269  0.500000  0.500000         0.044180   
7811   0.843505         0.861638  0.430052  0.561538         0.057591   
15766  0.136007         0.975937  0.511658  0.546154         0.058911   
7744   0.863121         0.857936  0.334197  0.653846         0.063628   
10383  0.769398         0.894493  0.322539  0.676923         0.082007   
15669  0.126417         0.975012  0.510363  0.553846         0.098767   
18122  0.074978         0.984729  0.465026  0.607692         0.055783   
14090  0.330427         0.960204  0.455959  0.584615         0.045783   
13043  0.588492         0.938917  0.190415  0.838462         0.054179   
2675   0.643854         0.763535  0.598446  0.407692         0.072228   
11103  0.784656         0.903748  0.265544  0.738462         0.052528   
16989  0.207062         0.986580  0.494819  0.561538         0.051711   
12811  0.600262         0.943545  0.428756  0.584615         0.299981   
12923  0.545336         0.945396  0.440415  0.584615         0.070389   
14169  0.360506         0.959741  0.440415  0.600000         0.045264   
15595  0.140366         0.973623  0.503886  0.553846         0.238004   
15696  0.127724         0.975012  0.503886  0.561538         0.075153   
12429  0.707498         0.944933  0.295337  0.715385         0.051711   
1744   0.444638         0.821842  0.648964  0.376923         0.038189   
14692  0.278989         0.962517  0.507772  0.538462         0.137507   
4862   0.735833         0.693198  0.509067  0.492308         0.053676   
...         ...              ...       ...       ...              ...   
3556   0.631648         0.914854  0.580311  0.430769         0.144614   
11394  0.789451         0.901435  0.316062  0.684615         0.078564   
17912  0.087184         0.976863  0.481865  0.584615         0.298142   
1267   0.350915         0.899584  0.658031  0.376923         0.100654   
1899   0.604621         0.848218  0.585492  0.423077         0.011713   
3005   0.697908         0.827857  0.590674  0.415385         0.093956   
189    0.471665         0.884313  0.695596  0.323077         0.082778   
2747   0.686138         0.772790  0.588083  0.415385         0.026209   
18431  0.059721         0.984267  0.485751  0.584615         0.048032   
8666   0.872276         0.872744  0.514249  0.476923         0.024825   
6396   0.857890         0.832022  0.515544  0.476923         0.071002   
17568  0.201395         0.941231  0.512953  0.538462         0.154063   
6420   0.898867         0.831559  0.418394  0.569231         0.070184   
5051   0.829555         0.811199  0.465026  0.530769         0.034495   
5311   0.746731         0.828320  0.523316  0.476923         0.057827   
2433   0.545336         0.819991  0.602332  0.415385         0.030281   
769    0.593723         0.851920  0.673575  0.338462         0.223712   
1685   0.460331         0.867191  0.624352  0.400000         0.027137   
8322   0.941151         0.866266  0.427461  0.553846         0.024212   
16023  0.214037         0.975937  0.485751  0.569231         0.044038   
11363  0.764603         0.902360  0.367876  0.630769         0.096629   
14423  0.285527         0.954188  0.494819  0.553846         0.127398   
4426   0.711857         0.767700  0.527202  0.476923         0.040831   
16850  0.181343         0.995835  0.496114  0.561538         0.075042   
6265   0.841761         0.824155  0.510363  0.484615         0.115968   
11284  0.793374         0.902823  0.304404  0.692308         0.054415   
11964  0.778553         0.925035  0.208549  0.792308         0.053865   
5390   0.743243         0.816289  0.515544  0.484615         0.059635   
860    0.485615         0.863026  0.661917  0.353846         0.369065   
15795  0.137751         0.976400  0.496114  0.561538         0.052277   

        ODOSat%  ODO (mg/L)  
18935  0.135466    0.463636  
11661  0.125936    0.159091  
15423  0.163376    0.400909  
12970  0.093261    0.184545  
18743  0.121852    0.410000  
17483  0.140231    0.371818  
10274  0.385977    0.444545  
6903   0.605174    0.619091  
1612   0.162015    0.312727  
4955   0.170184    0.214545  
7811   0.091219    0.099091  
15766  0.129340    0.389091  
7744   0.287270    0.308182  
10383  0.204901    0.246364  
15669  0.129340    0.393636  
18122  0.164738    0.475455  
14090  0.142274    0.323636  
13043  0.384615    0.527273  
2675   0.149081    0.220909  
11103  0.244384    0.286364  
16989  0.153165    0.391818  
12811  0.042886    0.104545  
12923  0.057862    0.138182  
14169  0.158611    0.332727  
15595  0.138870    0.400909  
15696  0.137509    0.405455  
12429  0.184479    0.241818  
1744   0.140231    0.277273  
14692  0.125936    0.320909  
4862   0.161334    0.207273  
...         ...         ...  
3556   0.113683    0.181818  
11394  0.157931    0.187273  
17912  0.144997    0.437273  
1267   0.102110    0.260000  
1899   0.235534    0.339091  
3005   0.110279    0.159091  
189    0.157250    0.289091  
2747   0.166099    0.228182  
18431  0.136147    0.437273  
8666   0.063989    0.063636  
6396   0.150442    0.160909  
17568  0.130701    0.360909  
6420   0.350579    0.365455  
5051   0.224643    0.250000  
5311   0.132063    0.170909  
2433   0.138870    0.240000  
769    0.127297    0.210000  
1685   0.168822    0.308182  
8322   0.166099    0.156364  
16023  0.162015    0.401818  
11363  0.067393    0.091818  
14423  0.097345    0.277273  
4426   0.186521    0.244545  
16850  0.144997    0.391818  
6265   0.136828    0.150000  
11284  0.168822    0.199091  
11964  0.282505    0.330909  
5390   0.138870    0.179091  
860    0.123894    0.240909  
15795  0.145677    0.412727  

[13262 rows x 7 columns]

In [None]:
model.summary()

In [None]:
# score = model.evaluate(x_train, y_train_mod, verbose=0)
x_test_reshaped = x_test.values.reshape(x_test.shape[0],x_test.shape[1],1)
predictions = model.predict(x_test)

In [None]:
print(predictions)