In [1]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os
import requests
import base64


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


# Encode text values to a single dummy variable.  The new columns (which do not replace the old) will have a 1
# at every location where the original column (name) matches each of the target_values.  One column is added for
# each target value(tv).
def encode_text_single_dummy(df, name, target_values):
    for tv in target_values:
        l = list(df[name].astype(str))
        l = [1 if str(x) == str(tv) else 0 for x in l]
        name2 = "{}-{}".format(name, tv)
        df[name2] = l


# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32)
    else:
        # Regression
        return df.as_matrix(result).astype(np.float32), df.as_matrix([target]).astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [2]:
from keras.utils.data_utils import get_file

path = "./data/"

filename_read = os.path.join(path,"Raw Data Gas Pipline.csv")
df = pd.read_csv(filename_read)

print("Read {} rows.".format(len(df)))

# display 5 rows
df[0:5]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Read 97019 rows.


Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,rate,setpoint,control_mode,control_scheme,pump,solenoid,crc_rate,measurement,time,result
0,4,4,183,233,9,18,3,10,3,10,...,0,20,2,1,0,0,0,0.528736,1.106868,0
1,4,4,183,233,9,18,3,10,3,10,...,0,20,2,1,0,0,0,0.54023,1.043098,0
2,4,4,183,233,9,18,3,10,3,10,...,0,20,2,1,0,0,0,0.54023,1.266332,0
3,4,4,183,233,9,18,3,10,3,10,...,0,20,2,1,0,0,0,0.54023,1.11628,0
4,4,4,183,233,9,18,3,10,3,10,...,0,20,2,1,0,0,0,0.528736,1.257916,0


In [4]:
ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

In [5]:
# Analyze Raw data set for water tank storage

import tensorflow.contrib.learn as skflow
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

path = "./data/"

filename_read = os.path.join(path,"Raw Data Gas Pipline.csv")
analyze(filename_read)


Analyzing: ./data/Raw Data Gas Pipline.csv
97019 rows
** command_address:149 (0%)
** response_address:[4:92.99%,0:7.01%]
** command_memory:[183:99.88%,255:0.02%,147:0.0%,42:0.0%,0:0.0%,120:0.0%,83:0.0%,15:0.0%,79:0.0%,143:0.0%,16:0.0%,80:0.0%,17:0.0%,81:0.0%,145:0.0%,18:0.0%,121:0.0%,89:0.0%,20:0.0%,84:0.0%,148:0.0%,22:0.0%,150:0.0%,87:0.0%,151:0.0%,88:0.0%,152:0.0%,14:0.0%,141:0.0%,13:0.0%,140:0.0%,64:0.0%,128:0.0%,129:0.0%,66:0.0%,130:0.0%,3:0.0%,67:0.0%,68:0.0%,132:0.0%,5:0.0%,134:0.0%,7:0.0%,71:0.0%,8:0.0%,136:0.0%,73:0.0%,138:0.0%,11:0.0%,75:0.0%,25:0.0%,153:0.0%,119:0.0%,174:0.0%,105:0.0%,57:0.0%,43:0.0%,171:0.0%,172:0.0%,45:0.0%,173:0.0%,46:0.0%,110:0.0%,47:0.0%,91:0.0%,111:0.0%,112:0.0%,49:0.0%,178:0.0%,51:0.0%,115:0.0%,53:0.0%,118:0.0%,55:0.0%,168:0.0%,40:0.0%,103:0.0%,102:0.0%,123:0.0%,28:0.0%,156:0.0%,29:0.0%,30:0.0%,158:0.0%,31:0.0%,95:0.0%,32:0.0%,160:0.0%,33:0.0%,97:0.0%,34:0.0%,35:0.0%,99:0.0%,100:0.0%,164:0.0%,37:0.0%,165:0.0%,155:0.0%]
** response_memory:[233:92.99%,0

In [6]:
# Now encode the feature vector

encode_numeric_zscore(df, 'command_address')
encode_numeric_zscore(df, 'response_address')
encode_numeric_zscore(df, 'command_memory') 
encode_numeric_zscore(df, 'response_memory')
encode_numeric_zscore(df, 'command_memory_count')
encode_numeric_zscore(df, 'response_memory_count')
encode_numeric_zscore(df, 'comm_read_function')
encode_numeric_zscore(df, 'comm_write_fun')
encode_numeric_zscore(df, 'resp_read_fun')
encode_numeric_zscore(df, 'resp_write_fun')
encode_numeric_zscore(df, 'sub_function')
encode_numeric_zscore(df, 'command_length')
encode_numeric_zscore(df, 'resp_length')
encode_numeric_zscore(df, 'gain')
encode_numeric_zscore(df, 'reset')
encode_numeric_zscore(df, 'deadband')
encode_numeric_zscore(df, 'cycletime')
encode_numeric_zscore(df, 'rate')
encode_numeric_zscore(df, 'setpoint')
encode_numeric_zscore(df, 'control_mode')
encode_numeric_zscore(df, 'control_scheme')
encode_numeric_zscore(df, 'pump')
encode_numeric_zscore(df, 'solenoid')
encode_numeric_zscore(df, 'crc_rate')
encode_numeric_zscore(df, 'measurement')
encode_numeric_zscore(df, 'time')
results = encode_text_index(df, 'result')
num_classes = len(results)

# display 5 rows

df.dropna(inplace=True,axis=1)
df[0:5]
# This is the numeric feature vector, as it goes to the neural net

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,resp_read_fun,resp_write_fun,sub_function,resp_length,reset,setpoint,control_mode,control_scheme,pump,solenoid,measurement,time,result
0,-0.06538,0.274647,0.022292,0.274647,0.035335,0.274647,-0.063322,0.615907,0.274647,-0.066974,-0.274647,-0.999995,-0.290916,1.11033,0.088973,-0.244436,-0.167736,0.015245,-0.358575,0
1,-0.06538,0.274647,0.022292,0.274647,0.035335,0.274647,-0.063322,0.615907,0.274647,-0.066974,-0.274647,-0.999995,-0.290916,1.11033,0.088973,-0.244436,-0.167736,0.015245,-1.056613,0
2,-0.06538,0.274647,0.022292,0.274647,0.035335,0.274647,-0.063322,0.615907,0.274647,-0.066974,-0.274647,-0.999995,-0.290916,1.11033,0.088973,-0.244436,-0.167736,0.015245,1.386956,0
3,-0.06538,0.274647,0.022292,0.274647,0.035335,0.274647,-0.063322,0.615907,0.274647,-0.066974,-0.274647,-0.999995,-0.290916,1.11033,0.088973,-0.244436,-0.167736,0.015245,-0.255541,0
4,-0.06538,0.274647,0.022292,0.274647,0.035335,0.274647,-0.063322,0.615907,0.274647,-0.066974,-0.274647,-0.999995,-0.290916,1.11033,0.088973,-0.244436,-0.167736,0.015245,1.294832,0


In [3]:
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping

# Break into X (predictors) & y (prediction)
x, y = to_xy(df,'result')

# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create neural net
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)

Train on 72764 samples, validate on 24255 samples
Epoch 1/1000
 - 11s - loss: 0.9930 - val_loss: 0.5454
Epoch 2/1000
 - 6s - loss: 0.5256 - val_loss: 0.4099
Epoch 3/1000
 - 4s - loss: 0.3977 - val_loss: 0.3579
Epoch 4/1000
 - 4s - loss: 0.3728 - val_loss: 0.3430
Epoch 5/1000
 - 4s - loss: 0.3559 - val_loss: 0.3318
Epoch 6/1000
 - 3s - loss: 0.3474 - val_loss: 0.3156
Epoch 7/1000
 - 3s - loss: 0.3350 - val_loss: 0.2938
Epoch 8/1000
 - 3s - loss: 0.3382 - val_loss: 0.2811
Epoch 9/1000
 - 3s - loss: 0.3037 - val_loss: 0.2685
Epoch 10/1000
 - 3s - loss: 0.2928 - val_loss: 0.2729
Epoch 11/1000
 - 3s - loss: 0.3058 - val_loss: 0.2631
Epoch 12/1000
 - 3s - loss: 0.2798 - val_loss: 0.2519
Epoch 13/1000
 - 3s - loss: 0.2834 - val_loss: 0.2603
Epoch 14/1000
 - 3s - loss: 0.2745 - val_loss: 0.2549
Epoch 15/1000
 - 3s - loss: nan - val_loss: nan


  if self.monitor_op(current - self.min_delta, self.best):


Epoch 16/1000
 - 3s - loss: nan - val_loss: nan
Epoch 17/1000
 - 3s - loss: nan - val_loss: nan
Epoch 00017: early stopping


<keras.callbacks.History at 0xca2724e080>

In [4]:
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.6349206349206349
