In [1]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import datetime
import time
from math import floor

from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, Flatten, Activation, MaxPooling2D
import matplotlib.pylab as plt
import seaborn as sns

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the data: read the CSV at the relative path below into a DataFrame.
# The path is resolved against the notebook's current working directory.
df = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
# Show the dtype pandas inferred for every column.
df.dtypes

Date (mm.dd.yyyy)       object
Time 24hr               object
Temp C                 float64
Sp Cond (uS/cm)          int64
pH (mV)                float64
pH                     float64
Turbidity (NTU)        float64
Chlorophyll (ug/L)     float64
Chlorophyll RFU        float64
ODOSat%                float64
ODO (mg/L)             float64
BGA-Phycocyanin RFU    float64
dtype: object

In [3]:
# Create a single datetime column from the date and time strings provided in
# our csv's. (We're already doing this bit for our models.)
# NOTE(review): this import rebinds the name `datetime` from the module that
# the first cell imports to the class of the same name; nothing in this cell
# uses it.
from datetime import datetime

timestamp = df['Date (mm.dd.yyyy)'] + ' ' + df['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df['timestamp'] = timestamp

# I'll drop the other columns for simple demo purposes. Take an explicit copy
# so the column added below is written to an independent frame instead of to a
# slice of the loaded data (this is what triggers SettingWithCopyWarning).
df = df[['timestamp']].copy()
# Placeholder "temp" values: the row number, 0 .. n-1.
df['temp'] = range(len(df))

df.dtypes

timestamp    datetime64[ns]
temp                  int64
dtype: object

In [4]:
# Now we set our datetime to be the index of the df for awesome indexing options.
# set_index() removes the column it is given, so a copy is added first under
# the name 'datetime' and stays available for feature engineering.
# assign() builds a new frame rather than writing into the existing one, so
# this works the same whether or not `df` is a slice of another frame.
df = df.assign(datetime=df['timestamp']).set_index('timestamp')
df.dtypes


temp                 int64
datetime    datetime64[ns]
dtype: object

In [5]:
# Preview only the first rows instead of rendering the whole frame
df.head(10)

Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-05 00:00:00,0,2017-05-05 00:00:00
2017-05-05 00:15:00,1,2017-05-05 00:15:00
2017-05-05 00:30:00,2,2017-05-05 00:30:00
2017-05-05 00:45:00,3,2017-05-05 00:45:00
2017-05-05 01:00:00,4,2017-05-05 01:00:00
2017-05-05 01:15:00,5,2017-05-05 01:15:00
2017-05-05 01:30:00,6,2017-05-05 01:30:00
2017-05-05 01:45:00,7,2017-05-05 01:45:00
2017-05-05 02:00:00,8,2017-05-05 02:00:00
2017-05-05 02:15:00,9,2017-05-05 02:15:00


In [6]:
# Select the rows lying between two timestamps (both endpoints are returned)
df.loc['2017-05-05 00:15:00':'2017-05-05 00:45:00']

Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-05 00:15:00,1,2017-05-05 00:15:00
2017-05-05 00:30:00,2,2017-05-05 00:30:00
2017-05-05 00:45:00,3,2017-05-05 00:45:00


In [7]:
# Index with an offset
# The Series is indexed by timestamp, so a bare [0] depends on pandas'
# deprecated integer-position fallback; .iloc states the intent explicitly.
start = df['datetime'].iloc[0]
delta = pd.Timedelta(1, unit='h')
print("start:", start)
print(delta)
# one-minute step, small enough to move just past a slice boundary
offset = pd.Timedelta(1, 'm')
# grab a slice over 1 hour (label slices include both endpoints)
df.loc[start:start + delta]

start: 2017-05-05 00:00:00
0 days 01:00:00


Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-05 00:00:00,0,2017-05-05 00:00:00
2017-05-05 00:15:00,1,2017-05-05 00:15:00
2017-05-05 00:30:00,2,2017-05-05 00:30:00
2017-05-05 00:45:00,3,2017-05-05 00:45:00
2017-05-05 01:00:00,4,2017-05-05 01:00:00


In [8]:
# Second hour of data: begin one `offset` past the end of the first hour so
# the boundary row is not selected again.
df.loc[start + delta + offset : start + 2 * delta]

Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-05 01:15:00,5,2017-05-05 01:15:00
2017-05-05 01:30:00,6,2017-05-05 01:30:00
2017-05-05 01:45:00,7,2017-05-05 01:45:00
2017-05-05 02:00:00,8,2017-05-05 02:00:00


In [9]:
def create_windows(dataset, window_size, shift, time_col, days_ahead=None, hours_ahead=None):
    """
    Generate the start time and extents of successive sliding windows.

    The feature window is ``window_size`` long and the target window ends
    ``hours_ahead`` (or ``days_ahead``) after the feature window does. Windows
    are produced while the end of the target window lies strictly before the
    last value of ``time_col``.

    :param dataset: the DataFrame to compute windows for
    :param window_size: length of the feature window, in hours when
        ``hours_ahead`` is given and in days otherwise
    :param shift: the number of hours to move the window start forward after
        each window; must be positive
    :param time_col: name of the column holding the timestamps; its first and
        last values bound the windows
    :param days_ahead: length of the target window in days (used only when
        ``hours_ahead`` is None)
    :param hours_ahead: length of the target window in hours; takes precedence
        over ``days_ahead``

    :yield: ``(start, end_delta, ahead_delta)`` where ``start`` is the window's
        first timestamp, ``end_delta`` the feature-window length and
        ``ahead_delta`` the combined feature + target length
    :raises ValueError: when iteration starts, if neither ``days_ahead`` nor
        ``hours_ahead`` is given or if ``shift`` is not positive
    """
    if hours_ahead is not None:
        unit, ahead = 'h', hours_ahead
    elif days_ahead is not None:
        unit, ahead = 'D', days_ahead
    else:
        raise ValueError("either days_ahead or hours_ahead must be provided")

    # A zero or negative shift would never move past the end of the data.
    if shift <= 0:
        raise ValueError("shift must be a positive number of hours")

    times = dataset[time_col]
    if len(times) == 0:
        # nothing to window over
        return

    end_delta = pd.Timedelta(window_size, unit=unit)
    ahead_delta = end_delta + pd.Timedelta(ahead, unit=unit)

    # Positional access: the column may be indexed by timestamp, where a bare
    # integer key is not a label.
    start = times.iloc[0]
    last = times.iloc[-1]

    while start + ahead_delta < last:
        yield (start, end_delta, ahead_delta)
        # shift the window 'shift' hour blocks of time
        start = update_indicies(shift, start)
        
def segment_dataset(dataset, time_col, shift=1, window_multiplier=2, days_ahead=1, hours_ahead=None):
    """
    Segments the dataset into feature windows and the target windows that follow them.

    The target window is ``hours_ahead`` hours long when that argument is
    given, otherwise ``days_ahead`` days long. The feature window is
    ``floor(horizon * window_multiplier)`` of the same unit.

    :param dataset: the dataset to segment into windows; rows are sliced by
        timestamp, so it is expected to be indexed by time
    :param time_col: the name of the time column in the dataset
    :param shift: the number of hours to move the window forward each step
    :param window_multiplier: how many times larger the feature window is than
        the target window. Default is twice the size
    :param days_ahead: the number of days covered by the target window, used
        when ``hours_ahead`` is None. Default is 1
    :param hours_ahead: the number of hours covered by the target window;
        takes precedence over ``days_ahead``
    :return: ``(segments, targets)``, two equally long lists of DataFrames
    """
    # if no option is selected will default to days ahead
    if hours_ahead is not None:
        horizon = {'hours_ahead': hours_ahead}
        window_size = floor(hours_ahead * window_multiplier)
    else:
        horizon = {'days_ahead': days_ahead}
        window_size = floor(days_ahead * window_multiplier)

    segments = []
    targets = []
    for start, end_delta, ahead_delta in create_windows(dataset, window_size, shift, time_col, **horizon):
        feature_end = start + end_delta
        segments.append(dataset.loc[start:feature_end])
        # Label slices include both endpoints, so the row at `feature_end`
        # would otherwise land in the features AND the target. Keep it in the
        # features only.
        target = dataset.loc[feature_end:start + ahead_delta]
        targets.append(target[target.index > feature_end])
    return segments, targets

def update_indicies(shift, value):
    """
    Move a time value forward by a number of hours.

    :param shift: how many hours to move forward by
    :param value: the time value to move

    :return: ``value`` advanced by ``shift`` hours
    """
    step = pd.Timedelta(shift, unit='h')
    return value + step
  

In [10]:
# Last timestamp in the data. .iloc is positional; a bare integer key on this
# timestamp-indexed Series relies on a deprecated fallback.
df['datetime'].iloc[-1]

Timestamp('2017-11-18 08:30:00')

In [11]:
# First timestamp, fetched by position rather than by a bare integer key
start = df['datetime'].iloc[0]
# Slice covering one minute from the start of the data
df.loc[start:start + pd.Timedelta(1, 'm')]

Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-05-05,0,2017-05-05


In [37]:
# Summarise the numeric columns only; newer pandas releases would otherwise
# also summarise the 'datetime' column.
df.describe(include=[np.number])

Unnamed: 0,temp
count,18947.0
mean,9473.0
std,5469.672111
min,0.0
25%,4736.5
50%,9473.0
75%,14209.5
max,18946.0


In [42]:
# Median of the numeric columns; numeric_only is pinned because its default
# differs between pandas versions and the frame also holds a datetime column.
df.quantile(.5, numeric_only=True)

temp    9473.0
Name: 0.5, dtype: float64

In [16]:
def segment(df, time_col, x_win, y_win, shift):
    """
    Cut a time-indexed frame into sliding feature windows and the target
    windows that immediately follow them.

    :param df: DataFrame to cut up; rows are sliced by timestamp, so it is
        expected to be indexed by time
    :param time_col: name of the column holding the timestamps; its first and
        last values bound the windows
    :param x_win: length of each feature window (added to a timestamp, e.g. a
        ``pd.Timedelta``)
    :param y_win: length of each target window
    :param shift: how far the window start advances between windows; must move
        the start forward in time

    :return: ``(segments, targets)``, two equally long lists of DataFrames.
        ``segments[i]`` covers ``[start, start + x_win]`` and ``targets[i]``
        covers ``(start + x_win, start + x_win + y_win]``
    :raises ValueError: if adding ``shift`` does not advance the start time
    """
    if len(df) == 0:
        # nothing to segment
        return [], []

    times = df[time_col]
    # Positional access: a bare integer key is not a label on a
    # timestamp-indexed Series.
    start = times.iloc[0]
    end = times.iloc[-1]

    # Without forward progress the loop below would never terminate.
    if not (start + shift > start):
        raise ValueError("shift must move the window forward in time")

    offset = pd.Timedelta(1, unit='s')  # to remove overlap of x and y
    segments = []
    targets = []
    while start + x_win + y_win <= end:
        x_end = start + x_win
        segments.append(df.loc[start:x_end])
        targets.append(df.loc[x_end + offset:x_end + y_win])
        start += shift
    return segments, targets
        

In [21]:
# Two-day feature windows, one-day target windows, advancing six hours at a time
s, t = segment(
    df,
    time_col='datetime',
    x_win=pd.Timedelta(2, unit='d'),
    y_win=pd.Timedelta(1, unit='d'),
    shift=pd.Timedelta(6, unit='h'),
)

In [24]:
# Inspect the last feature window returned by segment()
s[-1]

Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-15 06:00:00,18648,2017-11-15 06:00:00
2017-11-15 06:15:00,18649,2017-11-15 06:15:00
2017-11-15 06:30:00,18650,2017-11-15 06:30:00
2017-11-15 06:45:00,18651,2017-11-15 06:45:00
2017-11-15 07:00:00,18652,2017-11-15 07:00:00
2017-11-15 07:15:00,18653,2017-11-15 07:15:00
2017-11-15 07:30:00,18654,2017-11-15 07:30:00
2017-11-15 07:45:00,18655,2017-11-15 07:45:00
2017-11-15 08:00:00,18656,2017-11-15 08:00:00
2017-11-15 08:15:00,18657,2017-11-15 08:15:00


In [25]:
# Inspect the last target window returned by segment(); it starts after the
# matching feature window ends
t[-1]

Unnamed: 0_level_0,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-17 06:15:00,18841,2017-11-17 06:15:00
2017-11-17 06:30:00,18842,2017-11-17 06:30:00
2017-11-17 06:45:00,18843,2017-11-17 06:45:00
2017-11-17 07:00:00,18844,2017-11-17 07:00:00
2017-11-17 07:15:00,18845,2017-11-17 07:15:00
2017-11-17 07:30:00,18846,2017-11-17 07:30:00
2017-11-17 07:45:00,18847,2017-11-17 07:45:00
2017-11-17 08:00:00,18848,2017-11-17 08:00:00
2017-11-17 08:15:00,18849,2017-11-17 08:15:00
2017-11-17 08:30:00,18850,2017-11-17 08:30:00
