In [60]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from scripts.model_functions import *
from scripts.helper_functions import *
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, Flatten, Activation

In [61]:
# Create new variables to be used in Keras and the CNN

# number of items to use for training
BATCH_SIZE = 400 

# number of times to repeat process
EPOCHS = 100

#The name that the time column wil be
TIME_COL = 'Timestamp'

In [62]:
# Load the data
df_train = pd.read_csv('../../data/cleaned/utah_2017_marina.csv')
df_train.head()

df_test = pd.read_csv('../../data/cleaned/utah_2018_marina.csv')
df_test.head()

Unnamed: 0,Time America/Boise UTC-06:00,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),Chlorophyll (ug/L),Chlorophyll RFU,ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,Wiper Pos V,Cable Pwr V,Battery V,FDOM RFU
0,4/11/2018 10:30,11.88,1782,-146.1,8.2,42.49,15.3,4.1,89.1,9.58,0.6,1,13,5.2,1.2
1,4/11/2018 10:45,11.78,1783,-143.7,8.15,42.89,16.6,4.4,87.4,9.42,0.6,1,13,5.3,1.1
2,4/11/2018 11:00,11.79,1784,-144.3,8.17,41.24,18.2,4.9,89.0,9.58,0.7,1,13,5.2,1.2
3,4/11/2018 11:15,11.66,1783,-143.1,8.14,42.65,20.3,5.4,87.1,9.41,0.7,1,13,5.2,1.2
4,4/11/2018 11:30,11.68,1784,-143.5,8.15,42.06,15.8,4.2,87.5,9.45,0.7,1,13,5.2,1.2


In [63]:
# Adding time stamp 
from datetime import datetime

timestamp = df_train['Date (mm.dd.yyyy)'] + ' '+ df_train['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df_train[TIME_COL] = timestamp
df_train['datetime'] = timestamp
df_train = df_train.set_index(TIME_COL)
df_train.head()

timestamp = pd.to_datetime(df_test['Time America/Boise UTC-06:00'])
df_test[TIME_COL] = timestamp
df_test['datetime'] = timestamp
df_test = df_test.set_index(TIME_COL)
df_test.head()


Unnamed: 0_level_0,Time America/Boise UTC-06:00,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),Chlorophyll (ug/L),Chlorophyll RFU,ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,Wiper Pos V,Cable Pwr V,Battery V,FDOM RFU,datetime
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2018-04-11 10:30:00,4/11/2018 10:30,11.88,1782,-146.1,8.2,42.49,15.3,4.1,89.1,9.58,0.6,1,13,5.2,1.2,2018-04-11 10:30:00
2018-04-11 10:45:00,4/11/2018 10:45,11.78,1783,-143.7,8.15,42.89,16.6,4.4,87.4,9.42,0.6,1,13,5.3,1.1,2018-04-11 10:45:00
2018-04-11 11:00:00,4/11/2018 11:00,11.79,1784,-144.3,8.17,41.24,18.2,4.9,89.0,9.58,0.7,1,13,5.2,1.2,2018-04-11 11:00:00
2018-04-11 11:15:00,4/11/2018 11:15,11.66,1783,-143.1,8.14,42.65,20.3,5.4,87.1,9.41,0.7,1,13,5.2,1.2,2018-04-11 11:15:00
2018-04-11 11:30:00,4/11/2018 11:30,11.68,1784,-143.5,8.15,42.06,15.8,4.2,87.5,9.45,0.7,1,13,5.2,1.2,2018-04-11 11:30:00


In [64]:
# dont need data and time now that we have Timestamp. Lets remove them

df_train = df_train.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
df_train.head(5)

df_test = df_test.drop(columns=['Time America/Boise UTC-06:00'])
df_test.head(5)

Unnamed: 0_level_0,Temp C,Sp Cond (uS/cm),pH (mV),pH,Turbidity (NTU),Chlorophyll (ug/L),Chlorophyll RFU,ODOSat%,ODO (mg/L),BGA-Phycocyanin RFU,Wiper Pos V,Cable Pwr V,Battery V,FDOM RFU,datetime
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-04-11 10:30:00,11.88,1782,-146.1,8.2,42.49,15.3,4.1,89.1,9.58,0.6,1,13,5.2,1.2,2018-04-11 10:30:00
2018-04-11 10:45:00,11.78,1783,-143.7,8.15,42.89,16.6,4.4,87.4,9.42,0.6,1,13,5.3,1.1,2018-04-11 10:45:00
2018-04-11 11:00:00,11.79,1784,-144.3,8.17,41.24,18.2,4.9,89.0,9.58,0.7,1,13,5.2,1.2,2018-04-11 11:00:00
2018-04-11 11:15:00,11.66,1783,-143.1,8.14,42.65,20.3,5.4,87.1,9.41,0.7,1,13,5.2,1.2,2018-04-11 11:15:00
2018-04-11 11:30:00,11.68,1784,-143.5,8.15,42.06,15.8,4.2,87.5,9.45,0.7,1,13,5.2,1.2,2018-04-11 11:30:00


In [65]:
# These are now are now a list of dataframes.
(x_train, y_train) = segment_dataset(df_train, 'datetime')
(x_test, y_test) = segment_dataset(df_test, 'datetime')


In [66]:
print(type(x_train))
print(type(x_test))
print(type(y_train))
print(type(y_test))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


First lets extract the values for all of the y-columns

In [167]:
def _extract_target(targets, target, percentile):
    """
    Extracts the target into a single 1D array
    
    :param targets: the list of targets to extract values for
    :param target: the target to be extracted
    :param percentile: the prercentile to get the quantile of
    :param time_col: 
    """
    ret_val = np.array([])
    print("Starting segmentation")
    for df in targets:
        ret_val = np.append(ret_val, df.quantile(percentile)[target])
    return ret_val



def convert_to_np_array(df):
    ret_val = np.empty((len(df), df[0].shape[0], df[0].shape[1]-1))
    for i in range(len(df)):
        ret_val[i] =  df[i].drop(columns='datetime').values
        print("converstion {:.2f}% complete".format((i / len(df) * 100)))
    return ret_val

In [100]:
y_train_arr = _extract_target(y_train, 'BGA-Phycocyanin RFU', 0.95)
y_test_arr = _extract_target(y_test, 'BGA-Phycocyanin RFU', 0.95)


Starting segmentation
Starting segmentation


In [1]:
x_train[610]

NameError: name 'x_train' is not defined

In [168]:
x_train_np = convert_to_np_array(x_train)
x_test_np = convert_to_np_array(x_test)

0
converstion 0.00% complete
1
converstion 0.02% complete
2
converstion 0.04% complete
3
converstion 0.06% complete
4
converstion 0.09% complete
5
converstion 0.11% complete
6
converstion 0.13% complete
7
converstion 0.15% complete
8
converstion 0.17% complete
9
converstion 0.19% complete
10
converstion 0.21% complete
11
converstion 0.24% complete
12
converstion 0.26% complete
13
converstion 0.28% complete
14
converstion 0.30% complete
15
converstion 0.32% complete
16
converstion 0.34% complete
17
converstion 0.36% complete
18
converstion 0.39% complete
19
converstion 0.41% complete
20
converstion 0.43% complete
21
converstion 0.45% complete
22
converstion 0.47% complete
23
converstion 0.49% complete
24
converstion 0.51% complete
25
converstion 0.54% complete
26
converstion 0.56% complete
27
converstion 0.58% complete
28
converstion 0.60% complete
29
converstion 0.62% complete
30
converstion 0.64% complete
31
converstion 0.66% complete
32
converstion 0.69% complete
33
converstion 0.71%

447
converstion 9.58% complete
448
converstion 9.60% complete
449
converstion 9.62% complete
450
converstion 9.64% complete
451
converstion 9.67% complete
452
converstion 9.69% complete
453
converstion 9.71% complete
454
converstion 9.73% complete
455
converstion 9.75% complete
456
converstion 9.77% complete
457
converstion 9.79% complete
458
converstion 9.82% complete
459
converstion 9.84% complete
460
converstion 9.86% complete
461
converstion 9.88% complete
462
converstion 9.90% complete
463
converstion 9.92% complete
464
converstion 9.94% complete
465
converstion 9.97% complete
466
converstion 9.99% complete
467
converstion 10.01% complete
468
converstion 10.03% complete
469
converstion 10.05% complete
470
converstion 10.07% complete
471
converstion 10.09% complete
472
converstion 10.12% complete
473
converstion 10.14% complete
474
converstion 10.16% complete
475
converstion 10.18% complete
476
converstion 10.20% complete
477
converstion 10.22% complete
478
converstion 10.24% compl

ValueError: could not broadcast input array from shape (191,11) into shape (193,11)

In [163]:
ret_val = np.empty((len(x_test), x_test[0].shape[0], x_test[0].shape[1]))
ret_val[0] =  x_test[0].values
print("converstion {:.2f}% complete".format((i / len(x_test) * 100)))

TypeError: float() argument must be a string or a number, not 'Timestamp'

Now that we have something a little bit more manageable, we can then keep only the columns that we need for training.

Lets first take a look at what columns are differing. Because we can see that x_test_df has a larger second dimension, we can use that to see what additional columns it has that x_train_df does not have

In [49]:
for col in x_test_df.columns:
    if col not in x_train_df:
        print(col)

Battery V
Cable Pwr V
Wiper Pos V


In [50]:
# These are useless stats when it comes to predicting algae blooms, lets just remove them and start the training
cols = ['Battery V', 'Cable Pwr V', 'Wiper Pos V', 'BGA-Phycocyanin RFU','datetime']
x_test_df = x_test_df.drop(columns=cols)
x_train_df = x_train_df.drop(columns=['BGA-Phycocyanin RFU','datetime'])

In [51]:
print(x_test_df.columns)
print(x_train_df.columns)

Index(['Chlorophyll (ug/L)', 'Chlorophyll RFU', 'FDOM RFU', 'ODO (mg/L)',
       'ODOSat%', 'Sp Cond (uS/cm)', 'Temp C', 'Turbidity (NTU)', 'pH',
       'pH (mV)'],
      dtype='object')
Index(['Chlorophyll (ug/L)', 'Chlorophyll RFU', 'FDOM RFU', 'ODO (mg/L)',
       'ODOSat%', 'Sp Cond (uS/cm)', 'Temp C', 'Turbidity (NTU)', 'pH',
       'pH (mV)'],
      dtype='object')


In [55]:
# time to normalize this!
dataset_columns = [ 'FDOM RFU', 'ODO (mg/L)', 'ODOSat%', 'Sp Cond (uS/cm)', 'Temp C', 
                   'Turbidity (NTU)', 'pH']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(x_train_df[dataset_columns])
x_train = pd.DataFrame(ds_scaled,columns=dataset_columns)
df_train.describe()

# lets try to normalize this now....

ds_scaled = scaler.fit_transform(x_test_df[dataset_columns])
x_test = pd.DataFrame(ds_scaled,columns=dataset_columns)
x_test.describe()

Unnamed: 0,FDOM RFU,ODO (mg/L),ODOSat%,Sp Cond (uS/cm),Temp C,Turbidity (NTU),pH
count,4801.0,4801.0,4801.0,4801.0,4801.0,4801.0,4801.0
mean,0.223649,0.371208,0.327573,0.552027,0.586564,0.156474,0.501846
std,0.172719,0.206651,0.221779,0.247621,0.265814,0.16493,0.195866
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.110749,0.222742,0.141146,0.357625,0.425512,0.059906,0.391631
50%,0.173724,0.335786,0.271623,0.483131,0.645141,0.100693,0.515021
75%,0.287731,0.495875,0.459051,0.813765,0.788043,0.182834,0.603004
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [56]:
print("TRAIN SHAPE",x_train.shape)
print("TEST SHAPE",x_test.shape)

TRAIN SHAPE (4666, 7)
TEST SHAPE (4801, 7)


In [59]:
print("TRAIN TARGETS", y_train_arr.shape)
print("TEST TARGETS", y_test_arr.shape)

TRAIN TARGETS (4666,)
TEST TARGETS (4801,)


## Shaping the data to be used in the model.

In [None]:
x_train = x_train.reshape(len(x_train),7,8)
x_train.shape

x_test = x_test.reshape(len(x_test),7,8)
x_test.shape

In [None]:
print(x_train[0])

In [None]:
plt.imshow(x_train[1])

## Breaking apart training and test data

In [None]:
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

In [None]:
if POINTS_AHEAD > 0:
    y_train = y_train.reshape(y_train.shape[0],y_train.shape[1])
    y_test = y_test.reshape(y_test.shape[0],y_test.shape[1])
else:
    y_train = y_train.reshape(y_train.shape[0], 1)
    y_test = y_test.reshape(y_test.shape[0], 1)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

## Build the CNN model

The determining factor of a CNN model is at least one Convolutional layer in the model. so lets try to build something to see what kind of metrics that we are getting. 

In [None]:
model = Sequential()
model.add(Conv1D(40,kernel_size=(4),input_shape=(x_test.shape[1],x_test.shape[2])))
model.add(Dense(40))
model.add(Dropout(0.2))
model.add(Dense(20))
model.add(Flatten())
model.add(Dense(y_test.shape[1]))
model.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['mse']
             )
model.summary()

In [None]:
print(y_test[0])

In [None]:
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
# What is our score?

score = model.evaluate(x_test, y_test, verbose=1)
cnn_predictions = model.predict(x_test)
print("Loss {} MSE {}".format(score[0], score[1]))

## Building a regression model

Experimenting to see if a regression model will help with doing predicting a 

In [None]:
model = Sequential()
model.add(Dense(40, input_shape=(x_test.shape[1],x_test.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(20))
model.add(Flatten())
model.add(Dense(y_test.shape[1]))
model.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['mse']
             )
model.summary()

In [None]:
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
# What is our score?
score = model.evaluate(x_test, y_test, verbose=1)
linear_predictions = model.predict(x_test)
print("Loss {} MSE {}".format(score[0], score[1]))

In [None]:
lines = pd.DataFrame({'True':y_test.reshape(y_test.shape[0]), 'Prediction':linear_predictions.reshape(linear_predictions.shape[0])})
lines.plot()

In [None]:
lines = pd.DataFrame({'True':y_test.reshape(y_test.shape[0]), 'Prediction':cnn_predictions.reshape(cnn_predictions.shape[0])})
lines.plot()