In [1]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from scripts.model_functions import *
from scripts.helper_functions import *
from sklearn.preprocessing import MinMaxScaler
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, Flatten, Activation

Using TensorFlow backend.


In [2]:
# Create new variables to be used in Keras and the CNN

# number of items to use for training
BATCH_SIZE = 400 

# number of times to repeat process
EPOCHS = 100

# how many points ahead to look
POINTS_AHEAD = 0


In [3]:
# Load the data
df_train = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
df_train.head()

df_test = pd.read_csv('../../data/cleaned/utah_2018_vineyard.csv')
df_test.head()

Unnamed: 0,Time America/Boise UTC-06:00,Temperature C,Sp Cond uS/cm,pH mV,pH,Turbidity NTU,Chlorophyll ug/L,Chlorophyll RFU,ODOSat %,ODO mg/L,BGA-Phycocyanin RFU,Wiper Pos V,Cable Pwr V,Battery V
0,4/11/2018 12:00,10.39,1885,-87.7,8.45,47.62,10.8,2.7,84.9,9.44,0.6,1,12.26,6.4
1,4/11/2018 12:15,11.67,1887,-89.3,8.48,43.17,9.1,2.3,90.4,9.75,0.5,1,12.23,6.4
2,4/11/2018 12:30,11.42,1890,-89.4,8.48,43.86,9.4,2.3,90.0,9.78,0.6,1,12.28,6.4
3,4/11/2018 12:45,11.4,1887,-89.7,8.49,43.42,10.2,2.5,89.6,9.73,0.6,1,12.43,6.39
4,4/11/2018 13:00,11.08,1885,-89.8,8.49,44.26,10.7,2.7,88.4,9.67,0.6,1,12.26,6.4


In [4]:
# Adding time stamp 
from datetime import datetime

timestamp = df_train['Date (mm.dd.yyyy)'] + ' '+ df_train['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df_train['Timestamp'] = timestamp
df_train.head()

timestamp = pd.to_datetime(df_test['Time America/Boise UTC-06:00'])
df_test['Timestamp'] = timestamp
df_test.head()


Unnamed: 0,Time America/Boise UTC-06:00,Temperature C,Sp Cond uS/cm,pH mV,pH,Turbidity NTU,Chlorophyll ug/L,Chlorophyll RFU,ODOSat %,ODO mg/L,BGA-Phycocyanin RFU,Wiper Pos V,Cable Pwr V,Battery V,Timestamp
0,4/11/2018 12:00,10.39,1885,-87.7,8.45,47.62,10.8,2.7,84.9,9.44,0.6,1,12.26,6.4,2018-04-11 12:00:00
1,4/11/2018 12:15,11.67,1887,-89.3,8.48,43.17,9.1,2.3,90.4,9.75,0.5,1,12.23,6.4,2018-04-11 12:15:00
2,4/11/2018 12:30,11.42,1890,-89.4,8.48,43.86,9.4,2.3,90.0,9.78,0.6,1,12.28,6.4,2018-04-11 12:30:00
3,4/11/2018 12:45,11.4,1887,-89.7,8.49,43.42,10.2,2.5,89.6,9.73,0.6,1,12.43,6.39,2018-04-11 12:45:00
4,4/11/2018 13:00,11.08,1885,-89.8,8.49,44.26,10.7,2.7,88.4,9.67,0.6,1,12.26,6.4,2018-04-11 13:00:00


In [5]:
# dont need data and time now that we have Timestamp. Lets remove them

df_train = df_train.drop(columns=['Date (mm.dd.yyyy)', 'Time 24hr'])
df_train.head(5)

df_test = df_test.drop(columns=['Time America/Boise UTC-06:00'])
df_test.head(5)

Unnamed: 0,Temperature C,Sp Cond uS/cm,pH mV,pH,Turbidity NTU,Chlorophyll ug/L,Chlorophyll RFU,ODOSat %,ODO mg/L,BGA-Phycocyanin RFU,Wiper Pos V,Cable Pwr V,Battery V,Timestamp
0,10.39,1885,-87.7,8.45,47.62,10.8,2.7,84.9,9.44,0.6,1,12.26,6.4,2018-04-11 12:00:00
1,11.67,1887,-89.3,8.48,43.17,9.1,2.3,90.4,9.75,0.5,1,12.23,6.4,2018-04-11 12:15:00
2,11.42,1890,-89.4,8.48,43.86,9.4,2.3,90.0,9.78,0.6,1,12.28,6.4,2018-04-11 12:30:00
3,11.4,1887,-89.7,8.49,43.42,10.2,2.5,89.6,9.73,0.6,1,12.43,6.39,2018-04-11 12:45:00
4,11.08,1885,-89.8,8.49,44.26,10.7,2.7,88.4,9.67,0.6,1,12.26,6.4,2018-04-11 13:00:00


In [6]:
# lets try to normalize this now....
train_target = df_train['BGA-Phycocyanin RFU']
dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)','Chlorophyll RFU']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(df_train[dataset_columns])
df_train = pd.DataFrame(ds_scaled,columns=dataset_columns)
df_train['BGA-Phycocyanin RFU'] = train_target
df_train.describe()

# lets try to normalize this now....
test_target = df_test['BGA-Phycocyanin RFU']
dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)','Chlorophyll RFU']
scaler = MinMaxScaler()
ds_scaled = scaler.fit_transform(df_test[dataset_columns])
df_test = pd.DataFrame(ds_scaled,columns=dataset_columns)
df_test['BGA-Phycocyanin RFU'] = test_target
df_test.describe()

  return self.partial_fit(X, y)


KeyError: "['Sp Cond (uS/cm)', 'pH (mV)', 'ODOSat%', 'Temp C', 'ODO (mg/L)', 'Turbidity (NTU)'] not in index"

In [None]:
dataset_columns = ['Temp C','Sp Cond (uS/cm)', 'pH (mV)','pH', 'Turbidity (NTU)', 'ODOSat%','ODO (mg/L)', 'Chlorophyll RFU']
(x_train, y_train) = segment_dataset(df_train, dataset_columns , 'BGA-Phycocyanin RFU', 7, pts_ahead=POINTS_AHEAD)
(x_test, y_test) = segment_dataset(df_test, dataset_columns , 'BGA-Phycocyanin RFU', 7, pts_ahead=POINTS_AHEAD)



In [None]:
print("TRAIN SHAPE",x_train.shape)
print("TEST SHAPE",x_test.shape)

In [None]:
print("TRAIN TARGETS", y_train.shape)
print("TEST TARGETS", y_test.shape)

## Shaping the data to be used in the model.

In [None]:
x_train = x_train.reshape(len(x_train),7,8)
x_train.shape

x_test = x_test.reshape(len(x_test),7,8)
x_test.shape

In [None]:
print(x_train[0])

In [None]:
plt.imshow(x_train[1])

## Breaking apart training and test data

In [None]:
print("x_train shape:",x_train.shape)
print("x_test shape:",x_test.shape)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

In [None]:
if POINTS_AHEAD > 0:
    y_train = y_train.reshape(y_train.shape[0],y_train.shape[1])
    y_test = y_test.reshape(y_test.shape[0],y_test.shape[1])
else:
    y_train = y_train.reshape(y_train.shape[0], 1)
    y_test = y_test.reshape(y_test.shape[0], 1)
print("y_train shape:",y_train.shape)
print("y_test shape:",y_test.shape)

## Build the CNN model

The determining factor of a CNN model is at least one Convolutional layer in the model. so lets try to build something to see what kind of metrics that we are getting. 

In [None]:
model = Sequential()
model.add(Conv1D(40,kernel_size=(4),input_shape=(x_test.shape[1],x_test.shape[2])))
model.add(Dense(40))
model.add(Dropout(0.2))
model.add(Dense(20))
model.add(Flatten())
model.add(Dense(y_test.shape[1]))
model.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['mse']
             )
model.summary()

In [None]:
print(y_test[0])

In [None]:
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
# What is our score?

score = model.evaluate(x_test, y_test, verbose=1)
cnn_predictions = model.predict(x_test)
print("Loss {} MSE {}".format(score[0], score[1]))

## Building a regression model

Experimenting to see if a regression model will help with doing predicting a 

In [None]:
model = Sequential()
model.add(Dense(40, input_shape=(x_test.shape[1],x_test.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(20))
model.add(Flatten())
model.add(Dense(y_test.shape[1]))
model.compile(loss='mean_squared_error',
              optimizer='rmsprop',
              metrics=['mse']
             )
model.summary()

In [None]:
model.fit(x=x_train, y=y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
# What is our score?
score = model.evaluate(x_test, y_test, verbose=1)
linear_predictions = model.predict(x_test)
print("Loss {} MSE {}".format(score[0], score[1]))

In [None]:
lines = pd.DataFrame({'True':y_test.reshape(y_test.shape[0]), 'Prediction':linear_predictions.reshape(linear_predictions.shape[0])})
lines.plot()

In [None]:
lines = pd.DataFrame({'True':y_test.reshape(y_test.shape[0]), 'Prediction':cnn_predictions.reshape(cnn_predictions.shape[0])})
lines.plot()