In [1]:
import pandas as pd
import tensorflow as tf
import keras as ks
import numpy as np
from sklearn.metrics import recall_score, precision_score, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import datetime
import time
from math import floor

from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, Flatten, Activation, MaxPooling2D
import matplotlib.pylab as plt
import seaborn as sns

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the data
df = pd.read_csv('../../data/cleaned/site1_vineyard.csv')
df.dtypes

Date (mm.dd.yyyy)       object
Time 24hr               object
Temp C                 float64
Sp Cond (uS/cm)          int64
pH (mV)                float64
pH                     float64
Turbidity (NTU)        float64
Chlorophyll (ug/L)     float64
Chlorophyll RFU        float64
ODOSat%                float64
ODO (mg/L)             float64
BGA-Phycocyanin RFU    float64
dtype: object

In [3]:
# Create a single datetime column from the strings provided in our csv's
# (We're already doing this bit for our models)
from datetime import datetime

timestamp = df['Date (mm.dd.yyyy)'] + ' '+ df['Time 24hr']
timestamp = pd.to_datetime(timestamp)
df['timestamp'] = timestamp

# I'll drop the other columns for simple demo purposes
df = df[['timestamp', 'pH', 'Temp C']]
df['temp'] = [x for x in range(0, df.shape[0])]

df.dtypes

timestamp    datetime64[ns]
pH                  float64
Temp C              float64
temp                  int64
dtype: object

In [4]:
# Now we set our datetime to be the index of the df for awesome indexing options
# Note, the column is removed, so it might be good to keep a copy by another
# name for feature engineering.
df['datetime'] = df['timestamp']
df = df.set_index('timestamp')
df.dtypes


pH                 float64
Temp C             float64
temp                 int64
datetime    datetime64[ns]
dtype: object

In [5]:
# Index into a range of dates
df['2017-05-05 00:15:00':'2017-05-05 00:45:00']

Unnamed: 0_level_0,pH,Temp C,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:15:00,8.36,14.99,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,14.96,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,14.95,3,2017-05-05 00:45:00


In [6]:
# Index with an offset
start = df['datetime'][0]
delta = pd.Timedelta(1, unit='h')
print("start:", start)
print(delta)
offset = pd.Timedelta(1, 'm')
# grab a slice over 1 hour
df[start:start + delta]

start: 2017-05-05 00:00:00
0 days 01:00:00


Unnamed: 0_level_0,pH,Temp C,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:00:00,8.36,15.02,0,2017-05-05 00:00:00
2017-05-05 00:15:00,8.36,14.99,1,2017-05-05 00:15:00
2017-05-05 00:30:00,8.36,14.96,2,2017-05-05 00:30:00
2017-05-05 00:45:00,8.36,14.95,3,2017-05-05 00:45:00
2017-05-05 01:00:00,8.36,14.92,4,2017-05-05 01:00:00


In [7]:
df[start+delta+offset: start+delta+delta]

Unnamed: 0_level_0,pH,Temp C,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 01:15:00,8.36,14.92,5,2017-05-05 01:15:00
2017-05-05 01:30:00,8.36,14.9,6,2017-05-05 01:30:00
2017-05-05 01:45:00,8.36,14.88,7,2017-05-05 01:45:00
2017-05-05 02:00:00,8.36,14.84,8,2017-05-05 02:00:00


In [8]:
df[start:start + delta].describe()

Unnamed: 0,pH,Temp C,temp
count,5.0,5.0,5.0
mean,8.36,14.968,2.0
std,0.0,0.038341,1.581139
min,8.36,14.92,0.0
25%,8.36,14.95,1.0
50%,8.36,14.96,2.0
75%,8.36,14.99,3.0
max,8.36,15.02,4.0


In [9]:
df[start:start + delta].quantile(.5, numeric_only=False)['datetime']

Timestamp('2017-05-05 00:30:00')

In [10]:
temp = df[start:start + delta]
a = temp[0:0]
a = a.append(temp.quantile(.5, numeric_only=False))

In [11]:
dfs = [df[start:start + delta], df[start+delta+offset: start+delta+delta]]


In [41]:
# extracted = dfs[0][0:0] # start with a blank data frame with the same setup
extracted = pd.DataFrame()
for df in dfs:
    extracted = extracted.append(df.quantile(.5, numeric_only=False))
extracted['datetimeIndex'] = extracted['datetime']
extracted = extracted.set_index('datetimeIndex')

In [42]:
extracted

Unnamed: 0_level_0,Temp C,datetime,pH,temp
datetimeIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 00:30:00,14.96,2017-05-05 00:30:00,8.36,2.0
2017-05-05 01:37:30,14.89,2017-05-05 01:37:30,8.36,6.5


In [31]:
a = pd.DataFrame()

pandas.core.frame.DataFrame

In [14]:
df['datetime'][len(df['datetime'])-1]

Timestamp('2017-05-05 02:00:00')

In [15]:
start = df['datetime'][0]
df[start:start + pd.Timedelta(1, 'm')]

Unnamed: 0_level_0,pH,Temp C,temp,datetime
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-05-05 01:15:00,8.36,14.92,5,2017-05-05 01:15:00


In [16]:
df.describe()

Unnamed: 0,pH,Temp C,temp
count,4.0,4.0,4.0
mean,8.36,14.885,6.5
std,0.0,0.034157,1.290994
min,8.36,14.84,5.0
25%,8.36,14.87,5.75
50%,8.36,14.89,6.5
75%,8.36,14.905,7.25
max,8.36,14.92,8.0


In [17]:
df.columns

Index(['pH', 'Temp C', 'temp', 'datetime'], dtype='object')

In [18]:
print(type(df.quantile(.5)))
df['datetime'].quantile(.5)

<class 'pandas.core.series.Series'>


Timestamp('2017-05-05 01:37:30')

In [19]:
def segment(df, time_col, x_win, y_win, shift):
    segments = []
    targets = []
    start = df[time_col][0]
    end = df[time_col][len(df[time_col])-1]
    offset = pd.Timedelta(1, unit='s') # to remove overlap of x and y
    while start + x_win + y_win <= end:
        segments.append(df[start:start + x_win])
        targets.append(df[start  + x_win + offset : start + x_win + y_win])
        start += shift
    return segments, targets
        

In [20]:
s, t = segment(df, 
               'datetime', 
               pd.Timedelta(2, unit='d'), pd.Timedelta(1, unit='d'),
        pd.Timedelta(6, unit='h'))