## Import Library

In [1]:
import os
import re
import string
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.python.keras.callbacks import Callback

# Never truncate the column list when a DataFrame is rendered.
pd.options.display.max_columns = None

## Load dataset

In [2]:
import pickle

data_filename = 'GSPP11_data_merge'
pickle_path = data_filename + '.pickle'

# Load the pickled copy of the dataset when it can be opened; otherwise read
# the Excel workbook and write the pickle so the next run can reuse it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load a cache file that this notebook produced itself.
try:
    # Context managers guarantee the file handles are closed even on error.
    with open(pickle_path, 'rb') as cache_file:
        df = pickle.load(cache_file)
except OSError:  # IOError is an alias of OSError on Python 3
    df = pd.read_excel(data_filename + '.xlsx')
    with open(pickle_path, 'wb') as cache_file:
        pickle.dump(df, cache_file)

df.head()

Unnamed: 0,Date,Conditions,GT Number,GT ACTIVE POWER,AMBIENT PRESSURE SELECTED VALUE 1,AMBIENT PRESSURE SELECTED VALUE 2,AMBIENT PRESSURE Point 1,AMBIENT PRESSURE Point 2,GT INLET SCREEN DIFF. PRESS.,GT INLET AIR DIFF. PRESS.,GT INLET FILTER HOUSE AIR TEMP. 1,GT INLET FILTER HOUSE AIR TEMP. 2,GT INLET FILTER HOUSE AIR HUMIDITY 1,GT INLET FILTER HOUSE AIR HUMIDITY 2,No.1 GT CHILLED WATER SUPPLY TEMP.,SPRINT WATER FLOW,LP SPRINT TOTAL WATER FLOW,LP SPRINT OPERATION HOUR,VIGV POSITION SELECTED,#1 VIGV POSITION,#2 VIGV POSITION,VBVBEL POSITION SELECTED,VSVSEL POSITION SELECTED,No.1 INLET AIR HEATER LEVEL CV POS. DEMAND,LP COMP. INLET TEMP. (T2) SELECTED,GT INLET TEMP. (T10) SELECTED,HP COMP. INLET TEMP. (T25) SELECTED,HP COMP. DISCH. TEMP. (T3) SELECTED,#1 HP COMP. INLET PRESS. (P25),#2 HP COMP. INLET PRESS. (P25),#1 HP COMP. DISCH. PRESS. (PS3),#2 HP COMP. DISCH. PRESS. (PS3),LP TURBINE INLET TEMP. (T48) SELECTED,DIFFERENACE OF T48MAX AND T48MIN,HP ROTOR SPEED (XN25) SELECTED,LP ROTOR SPEED (XNSD) SELECTED,Fuel Gas Volume Flow,LOWER HEATING VALUE 1,LOWER HEATING VALUE 2,Fuel Specific Gravity Selected,Compressibility Factor Selected,FUEL GAS SUPPLY PRESS.,FUEL GAS SUPPLY TEMP. SELECTED,Curnt. Methane (C1),Curnt. Ethane (C2),Curnt. Propane (C3),Curnt. i-Butane (iC4),Curnt. n-Butene (nC4),Curnt. i-Pentane (iC5),Curnt. n-Pentane (nC5),Curnt. Hexane Plus (C6+),Curnt. Nitrogen (N2),Curnt. Carbon Dioxide (CO2),HRSG INLET FUEL GAS PRESS.,HRSG INLET GAS DUCT TEMP. #1,HRSG INLET GAS DUCT TEMP. #2,HRSG OUTLET FUEL GAS PRESS.,HRSG OUTLET GAS DUCT TEMP. #1,HRSG OUTLET GAS DUCT TEMP. #2,LP TURBINE INLET TEMP. (T48) A,LP TURBINE INLET TEMP. (T48) B,LP TURBINE INLET TEMP. (T48) C,LP TURBINE INLET TEMP. (T48) D,LP TURBINE INLET TEMP. (T48) E,LP TURBINE INLET TEMP. (T48) F,LP TURBINE INLET TEMP. (T48) G,LP TURBINE INLET TEMP. (T48) H
0,2019-01-01 00:00:00,Normal,1,24.961611,14.445444,14.466763,1002.279208,1002.764703,0.579197,-0.876408,24.942641,24.955787,58.258275,59.77905,44.426224,0.00557,98132.634463,465.72837,28.269185,28.804726,27.735972,13.471984,72.954904,72.223297,40.599394,41.138421,121.961115,530.589712,201.459786,201.734075,2061.697022,2064.66486,852.376481,24.54763,10089.833398,3629.223758,193.406986,45657.459423,45700.30186,0.594959,0.943591,4.36025,58.322947,95.12428,0.993676,0.109514,0.02765,0.026884,0.003666,0.00289,0.009909,1.968795,1.732711,0.097487,488.504492,486.230573,0.041327,89.307232,102.802245,847.066968,863.186131,853.090418,858.288423,838.633159,859.917953,847.319691,850.996004
1,2019-01-01 01:00:00,Normal,1,24.955444,14.449742,14.466154,1002.073566,1002.430513,0.577017,-0.874152,24.495615,24.543611,58.275939,60.006663,44.912449,0.005458,98132.645325,465.729958,28.258613,28.784903,27.733255,13.214311,72.702084,69.044871,40.933119,41.474002,122.625872,531.528906,201.763867,202.032457,2062.207612,2064.943401,852.605385,24.563439,10089.025464,3629.442142,193.345791,45608.743144,45649.098929,0.595003,0.943467,4.360398,58.372839,95.081885,1.016808,0.102702,0.025571,0.024688,0.002276,0.001772,0.009802,1.969856,1.764604,0.110897,488.85658,486.5455,0.029931,89.15165,102.703886,847.225598,863.341621,853.268686,858.476324,838.720686,860.321558,847.653442,851.321655
2,2019-01-01 02:00:00,Normal,1,25.000464,14.452401,14.465545,1001.667664,1002.095941,0.576373,-0.874066,23.527854,23.712178,60.139831,61.588245,44.869438,0.005346,98132.656186,465.731547,28.235135,28.751349,27.717824,13.00669,72.62577,70.603622,40.795232,41.37323,122.755831,531.591234,202.296723,202.565203,2064.342381,2067.248927,852.410639,24.377013,10087.886271,3630.881414,192.98656,45572.619074,45612.402519,0.595048,0.9435,4.360448,57.630118,95.046427,1.030336,0.104356,0.025849,0.025095,0.00242,0.001761,0.004824,1.960737,1.79818,0.099417,488.327492,486.01103,0.018535,89.075548,102.628714,847.218651,863.074201,853.051051,857.873665,838.672786,860.240772,847.339965,851.294517
3,2019-01-01 03:00:00,Normal,1,24.991005,14.45504,14.464936,1001.346888,1001.791034,0.577296,-0.875245,23.437874,23.408884,61.75929,63.575023,44.769728,0.005234,98132.667047,465.733136,28.271122,28.787484,27.746527,13.205728,72.627522,66.960869,40.646761,41.177816,122.578655,531.221143,202.214404,202.485291,2064.332157,2067.008883,852.278768,24.58375,10085.513636,3632.125342,193.454552,45578.502964,45616.255091,0.595168,0.943538,4.360257,58.098353,95.072193,1.003221,0.103988,0.025653,0.025048,0.002473,0.001836,0.003524,1.974173,1.787917,0.104294,488.179049,485.829699,0.007138,89.280449,102.797421,847.135828,863.151972,853.03812,857.905328,838.617167,859.957468,846.830988,851.09892
4,2019-01-01 04:00:00,Normal,1,24.997137,14.45768,14.464328,1001.380376,1001.838734,0.575811,-0.873603,23.488287,23.156522,59.996988,62.821445,45.163687,0.005016,98132.677294,465.734635,28.257159,28.772606,27.744851,13.041475,72.401568,66.266544,40.977436,41.500466,123.142678,532.210247,202.454875,202.718491,2064.448622,2067.297641,852.446505,24.337068,10084.308876,3631.749749,193.056589,45598.746949,45631.582338,0.595501,0.943576,4.360392,57.926253,95.090598,0.999284,0.114693,0.027928,0.028202,0.002865,0.002018,0.002983,1.970817,1.760651,0.026045,488.224161,485.939179,-0.003747,89.084875,102.578282,847.241493,863.114046,853.204758,857.918985,838.874133,860.301252,847.035949,851.422351


## Prepare dataset

In [10]:
# 'Date' and 'GT Number' are not used as model inputs; 'Conditions' is popped
# off below and becomes the target.
features = df.drop(columns=['Date', 'GT Number'])
print('Total samples: {:d}'.format(len(features)))

# Binary target: 0 for 'Normal', 1 for every other condition value.
# reset_index keeps the fresh 0..n-1 index the label series is expected to have.
label = (
    features.pop('Conditions')
    .ne('Normal')
    .astype(int)
    .reset_index(drop=True)
)
# .get(..., 0) keeps the report working even if one class is absent.
class_counts = label.value_counts()
print('Normal samples: {:d}, Abnormal samples: {:d}'.format(
    int(class_counts.get(0, 0)), int(class_counts.get(1, 0))))

# Non-numeric cells become NaN so that they can be inspected and imputed.
features = features.apply(pd.to_numeric, errors='coerce')
nan_rows = features[features.isnull().any(axis=1)]

# Fill all NaN with value = 0
# fillna returns a new frame; the result must be assigned back to take effect.
features = features.fillna(0)

features.head()

Total samples: 29232
Normal samples: 23517, Abnormal samples: 5715


Unnamed: 0,GT ACTIVE POWER,AMBIENT PRESSURE SELECTED VALUE 1,AMBIENT PRESSURE SELECTED VALUE 2,AMBIENT PRESSURE Point 1,AMBIENT PRESSURE Point 2,GT INLET SCREEN DIFF. PRESS.,GT INLET AIR DIFF. PRESS.,GT INLET FILTER HOUSE AIR TEMP. 1,GT INLET FILTER HOUSE AIR TEMP. 2,GT INLET FILTER HOUSE AIR HUMIDITY 1,GT INLET FILTER HOUSE AIR HUMIDITY 2,No.1 GT CHILLED WATER SUPPLY TEMP.,SPRINT WATER FLOW,LP SPRINT TOTAL WATER FLOW,LP SPRINT OPERATION HOUR,VIGV POSITION SELECTED,#1 VIGV POSITION,#2 VIGV POSITION,VBVBEL POSITION SELECTED,VSVSEL POSITION SELECTED,No.1 INLET AIR HEATER LEVEL CV POS. DEMAND,LP COMP. INLET TEMP. (T2) SELECTED,GT INLET TEMP. (T10) SELECTED,HP COMP. INLET TEMP. (T25) SELECTED,HP COMP. DISCH. TEMP. (T3) SELECTED,#1 HP COMP. INLET PRESS. (P25),#2 HP COMP. INLET PRESS. (P25),#1 HP COMP. DISCH. PRESS. (PS3),#2 HP COMP. DISCH. PRESS. (PS3),LP TURBINE INLET TEMP. (T48) SELECTED,DIFFERENACE OF T48MAX AND T48MIN,HP ROTOR SPEED (XN25) SELECTED,LP ROTOR SPEED (XNSD) SELECTED,Fuel Gas Volume Flow,LOWER HEATING VALUE 1,LOWER HEATING VALUE 2,Fuel Specific Gravity Selected,Compressibility Factor Selected,FUEL GAS SUPPLY PRESS.,FUEL GAS SUPPLY TEMP. SELECTED,Curnt. Methane (C1),Curnt. Ethane (C2),Curnt. Propane (C3),Curnt. i-Butane (iC4),Curnt. n-Butene (nC4),Curnt. i-Pentane (iC5),Curnt. n-Pentane (nC5),Curnt. Hexane Plus (C6+),Curnt. Nitrogen (N2),Curnt. Carbon Dioxide (CO2),HRSG INLET FUEL GAS PRESS.,HRSG INLET GAS DUCT TEMP. #1,HRSG INLET GAS DUCT TEMP. #2,HRSG OUTLET FUEL GAS PRESS.,HRSG OUTLET GAS DUCT TEMP. #1,HRSG OUTLET GAS DUCT TEMP. #2,LP TURBINE INLET TEMP. (T48) A,LP TURBINE INLET TEMP. (T48) B,LP TURBINE INLET TEMP. (T48) C,LP TURBINE INLET TEMP. (T48) D,LP TURBINE INLET TEMP. (T48) E,LP TURBINE INLET TEMP. (T48) F,LP TURBINE INLET TEMP. (T48) G,LP TURBINE INLET TEMP. (T48) H
0,24.961611,14.445444,14.466763,1002.279208,1002.764703,0.579197,-0.876408,24.942641,24.955787,58.258275,59.77905,44.426224,0.00557,98132.634463,465.72837,28.269185,28.804726,27.735972,13.471984,72.954904,72.223297,40.599394,41.138421,121.961115,530.589712,201.459786,201.734075,2061.697022,2064.66486,852.376481,24.54763,10089.833398,3629.223758,193.406986,45657.459423,45700.30186,0.594959,0.943591,4.36025,58.322947,95.12428,0.993676,0.109514,0.02765,0.026884,0.003666,0.00289,0.009909,1.968795,1.732711,0.097487,488.504492,486.230573,0.041327,89.307232,102.802245,847.066968,863.186131,853.090418,858.288423,838.633159,859.917953,847.319691,850.996004
1,24.955444,14.449742,14.466154,1002.073566,1002.430513,0.577017,-0.874152,24.495615,24.543611,58.275939,60.006663,44.912449,0.005458,98132.645325,465.729958,28.258613,28.784903,27.733255,13.214311,72.702084,69.044871,40.933119,41.474002,122.625872,531.528906,201.763867,202.032457,2062.207612,2064.943401,852.605385,24.563439,10089.025464,3629.442142,193.345791,45608.743144,45649.098929,0.595003,0.943467,4.360398,58.372839,95.081885,1.016808,0.102702,0.025571,0.024688,0.002276,0.001772,0.009802,1.969856,1.764604,0.110897,488.85658,486.5455,0.029931,89.15165,102.703886,847.225598,863.341621,853.268686,858.476324,838.720686,860.321558,847.653442,851.321655
2,25.000464,14.452401,14.465545,1001.667664,1002.095941,0.576373,-0.874066,23.527854,23.712178,60.139831,61.588245,44.869438,0.005346,98132.656186,465.731547,28.235135,28.751349,27.717824,13.00669,72.62577,70.603622,40.795232,41.37323,122.755831,531.591234,202.296723,202.565203,2064.342381,2067.248927,852.410639,24.377013,10087.886271,3630.881414,192.98656,45572.619074,45612.402519,0.595048,0.9435,4.360448,57.630118,95.046427,1.030336,0.104356,0.025849,0.025095,0.00242,0.001761,0.004824,1.960737,1.79818,0.099417,488.327492,486.01103,0.018535,89.075548,102.628714,847.218651,863.074201,853.051051,857.873665,838.672786,860.240772,847.339965,851.294517
3,24.991005,14.45504,14.464936,1001.346888,1001.791034,0.577296,-0.875245,23.437874,23.408884,61.75929,63.575023,44.769728,0.005234,98132.667047,465.733136,28.271122,28.787484,27.746527,13.205728,72.627522,66.960869,40.646761,41.177816,122.578655,531.221143,202.214404,202.485291,2064.332157,2067.008883,852.278768,24.58375,10085.513636,3632.125342,193.454552,45578.502964,45616.255091,0.595168,0.943538,4.360257,58.098353,95.072193,1.003221,0.103988,0.025653,0.025048,0.002473,0.001836,0.003524,1.974173,1.787917,0.104294,488.179049,485.829699,0.007138,89.280449,102.797421,847.135828,863.151972,853.03812,857.905328,838.617167,859.957468,846.830988,851.09892
4,24.997137,14.45768,14.464328,1001.380376,1001.838734,0.575811,-0.873603,23.488287,23.156522,59.996988,62.821445,45.163687,0.005016,98132.677294,465.734635,28.257159,28.772606,27.744851,13.041475,72.401568,66.266544,40.977436,41.500466,123.142678,532.210247,202.454875,202.718491,2064.448622,2067.297641,852.446505,24.337068,10084.308876,3631.749749,193.056589,45598.746949,45631.582338,0.595501,0.943576,4.360392,57.926253,95.090598,0.999284,0.114693,0.027928,0.028202,0.002865,0.002018,0.002983,1.970817,1.760651,0.026045,488.224161,485.939179,-0.003747,89.084875,102.578282,847.241493,863.114046,853.204758,857.918985,838.874133,860.301252,847.035949,851.422351


In [13]:
# Pair each feature row with its label, one dataset element per sample.
feature_matrix = features.values
label_vector = label.values
ds = tf.data.Dataset.from_tensor_slices((feature_matrix, label_vector))

from typing import Tuple
def split_dataset(dataset: "tf.data.Dataset",
                  dataset_size: int,
                  train_ratio: float,
                  validation_ratio: float,
                  shuffle=True,
                  seed=None) -> Tuple["tf.data.Dataset", "tf.data.Dataset", "tf.data.Dataset"]:
    """Split a dataset into consecutive train / validation / test partitions.

    Args:
        dataset: Un-batched dataset to split (needs ``shuffle``, ``take`` and
            ``skip``).
        dataset_size: Number of elements in ``dataset``.
        train_ratio: Fraction of elements assigned to the training partition.
        validation_ratio: Fraction assigned to the validation partition.
            Whatever is left over (``1 - train_ratio - validation_ratio``)
            becomes the test partition.
        shuffle: Shuffle once, with a buffer covering the whole dataset, before
            splitting.
        seed: Optional seed forwarded to ``Dataset.shuffle``.

    Returns:
        ``(train_dataset, validation_dataset, test_dataset)``.

    Raises:
        AssertionError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Absorbs float rounding such as 0.7 + 0.3 == 0.9999999999999999, which
    # would otherwise leak a stray sample into an unintended test partition.
    tolerance = 1e-9

    assert train_ratio >= 0 and validation_ratio >= 0, 'ratios must be non-negative'
    assert (train_ratio + validation_ratio) <= 1 + tolerance, \
        'train_ratio + validation_ratio must not exceed 1'

    test_ratio = 1 - (train_ratio + validation_ratio)

    train_count = min(int(dataset_size * train_ratio), dataset_size)
    if test_ratio > tolerance:
        validation_count = int(dataset_size * validation_ratio)
        test_count = dataset_size - (train_count + validation_count)
    else:
        # No test partition requested: validation takes everything that is left.
        validation_count = dataset_size - train_count
        test_count = 0

    if shuffle and dataset_size > 0:
        # reshuffle_each_iteration=False is essential: with the default, every
        # pass over the data draws a new order, so the take/skip partitions
        # below would contain different (overlapping) samples on each epoch.
        dataset = dataset.shuffle(dataset_size, seed=seed,
                                  reshuffle_each_iteration=False)

    train_dataset = dataset.take(train_count)
    remainder = dataset.skip(train_count)
    validation_dataset = remainder.take(validation_count)
    test_dataset = remainder.skip(validation_count).take(test_count)

    return train_dataset, validation_dataset, test_dataset

# Order-preserving 80/20 train/validation split; the ratios sum to 1, so the
# test partition comes back empty.
train_ds, val_ds, test_ds = split_dataset(
    ds,
    dataset_size=len(df),
    train_ratio=0.8,
    validation_ratio=0.2,
    shuffle=False,
)

batch_size = 32
train_ds, val_ds, test_ds = [
    part.batch(batch_size) for part in (train_ds, val_ds, test_ds)
]

# Batch counts per partition, reported below as absolute numbers and shares.
train_batch_num, val_batch_num, test_batch_num = [
    tf.data.experimental.cardinality(part) for part in (train_ds, val_ds, test_ds)
]
total_batch_num = train_batch_num + val_batch_num + test_batch_num

print('Number of batches in train_ds: {:d}'.format(train_batch_num))
print('Number of batches in val_ds: {:d}'.format(val_batch_num))
print('Number of batches in test_ds: {:d}'.format(test_batch_num))
print('Ratio for number of batches of train/val/test: {:.1f}/{:.1f}/{:.1f} %'.format(
    train_batch_num/total_batch_num*100,
    val_batch_num/total_batch_num*100,
    test_batch_num/total_batch_num*100,
))

# Prefetch up to 32 batches for each partition.
train_ds, val_ds, test_ds = [
    part.prefetch(buffer_size=32) for part in (train_ds, val_ds, test_ds)
]

Number of batches in train_ds: 731
Number of batches in val_ds: 183
Number of batches in test_ds: 0
Ratio for number of batches of train/val/test: 80.0/20.0/0.0 %


## Analyze class imbalance in the targets

In [None]:
# NOTE(review): `label` holds every sample, so these counts cover the full
# dataset rather than only the training partition named in the message.
counts = label.value_counts()
positive_count = counts[1]
positive_share = 100 * float(positive_count) / len(label)
print('Number of positive samples in training data: {} ({:.2f}% of total)'.format(
    positive_count, positive_share)
)

# Inverse-frequency weights: the rarer class gets the larger weight.
weight_for_0, weight_for_1 = 1.0 / counts[0], 1.0 / counts[1]

## Normalizing numerical features

In [None]:
# Drop the labels so only the feature tensors are seen during adaptation.
feature_ds = train_ds.map(lambda sample, _target: sample)

# The Normalization layer sets its internal state from the training features
# only (validation data is not passed to adapt).
normalizer = layers.experimental.preprocessing.Normalization()
normalizer.adapt(feature_ds)

## Build a model

In [None]:
def make_model(num_features=None):
    """Build the binary classifier: normalizer -> 3 x Dense(256) -> sigmoid.

    Args:
        num_features: Width of the input vector. Defaults to the number of
            columns in the module-level ``features`` frame, so the input always
            matches the data the module-level ``normalizer`` is called on
            (a hard-coded width breaks as soon as the column count differs).

    Returns:
        An uncompiled ``keras.Model`` mapping a feature vector to one
        sigmoid output.
    """
    if num_features is None:
        num_features = features.shape[1]

    inputs = keras.Input(shape=(num_features,))

    # Normalization layer created and adapted at module level.
    x = normalizer(inputs)

    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)

    # Single sigmoid unit for the binary target.
    outputs = layers.Dense(1, activation='sigmoid')(x)

    return keras.Model(inputs, outputs)

# Instantiate the network and write a diagram of its layers (with tensor
# shapes) to model_arch.png.
model = make_model()
keras.utils.plot_model(
    model,
    show_shapes=True,
    to_file='model_arch.png',
)

## Train the model

In [None]:
model.compile(
    optimizer=keras.optimizers.Adam(1e-3),
    loss='binary_crossentropy',
    metrics=[
        keras.metrics.FalseNegatives(name="fn"),
        keras.metrics.FalsePositives(name="fp"),
        keras.metrics.TrueNegatives(name="tn"),
        keras.metrics.TruePositives(name="tp"),
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
    ],
)

# Load saved weight
# Only warm-start when the checkpoint exists, so a fresh run does not crash.
# NOTE(review): set_weights requires the saved model to have the same layer
# shapes as `model` -- confirm the checkpoint was trained on the same features.
weights_path = '4_model.h5'
additional_epochs = 50
if os.path.exists(weights_path):
    old_model = keras.models.load_model(weights_path)
    model.set_weights(old_model.get_weights())
    old_epoch = 50
else:
    print('No saved model at {}; training from scratch'.format(weights_path))
    old_epoch = 0

# Train the model with class_weight argument
class_weight = {0: weight_for_0, 1: weight_for_1}

# Keras treats `epochs` as the index of the FINAL epoch, not a count of epochs
# to run: with initial_epoch=50 and epochs=50 nothing would be trained.
model.fit(
    train_ds,
    epochs=old_epoch + additional_epochs,
    initial_epoch=old_epoch,
    validation_data=val_ds,
    class_weight=class_weight,
)
# model.save('4_model.h5')      # Save model

## Evaluate the model

In [None]:
from sklearn.metrics import classification_report

# Class names follow the label encoding used when the target was built:
# 0 = 'Normal', 1 = any other condition.
lookup = {0: 'Normal', 1: 'Abnormal'}

# The loop variable is deliberately not called `ds`, which would overwrite the
# full dataset object defined earlier in the notebook.
for eval_ds, name in [(train_ds, 'train'), (val_ds, 'val')]:
    print('Result for {} data'.format(name))
    y_pred = model.predict(eval_ds).flatten()
    # Threshold the sigmoid output at 0.5 (values below 0.5 -> class 0).
    y_pred_binary = np.where(y_pred < 0.5, 0, 1)
    # Collect the true labels in the same batch order that predict() consumed.
    y_test = np.concatenate([y for x, y in eval_ds], axis=0)

    y_test = pd.Series(y_test).map(lookup)
    y_pred_binary = pd.Series(y_pred_binary).map(lookup)

    print(pd.crosstab(y_test, y_pred_binary, rownames=['True'], colnames=['Predicted']))
    # print()
    # print(pd.crosstab(y_test, y_pred_binary, rownames=['True'], colnames=['Predicted']).apply(lambda r: 100.0 * r/r.sum()))
    print()
    print(classification_report(y_test, y_pred_binary, digits=4))