# Classification on imbalanced data

Part of the code was taken from the [tensorflow tutorial](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data).
We use [Keras](../../guide/keras/overview.ipynb) to define the model and [class weights](https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model) to help the model learn from the imbalanced data.

This notebook demonstrates how to classify a highly imbalanced dataset in which the number of examples in one class greatly outnumbers the examples in another. The aim is to detect a mere 10,001 attack packets among 298,278 packets in total.

## Setup

In [1]:
#%%writefile anomaly-detector.py
import tensorflow as tf
from tensorflow import keras
from datetime import datetime
from packaging import version

import os
import tempfile

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix, plot_precision_recall_curve
from sklearn.utils.multiclass import unique_labels
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
import mlflow
import mlflow.keras
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

import time
mlflow.keras.autolog()

#np.random.seed(5)



In [2]:
#%%writefile -a anomaly-detector.py

# Default figure size for the matplotlib figures created in this notebook.
mpl.rcParams['figure.figsize'] = (12, 10)
# Colors of the active matplotlib property cycle.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Report how many GPU devices TensorFlow lists.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  16


In [24]:
#%%writefile -a anomaly-detector.py

def print_table(title, scores_orig, scores_under, scores_over):
    """Print a CSV-style comparison of three cross-validation score sets.

    After a heading and a column header, one row is printed per metric.
    Each row holds the mean and standard deviation of that metric for the
    original, undersampled and oversampled scores, in that order.

    Args:
        title: Text placed in the ``== ...: ==`` heading line.
        scores_orig: Mapping from score key (e.g. ``'test_f1'``) to an
            object offering ``.mean()`` and ``.std()``.
        scores_under: Same mapping for the undersampled data.
        scores_over: Same mapping for the oversampled data.
    """
    # (row label, key into the score mappings), in output order.
    rows = (
        ('F1', 'test_f1'),
        ('ROC-AUC', 'test_roc_auc'),
        ('Average Precision', 'test_average_precision'),
        ('Balanced Accuracy', 'test_balanced_accuracy'),
        ('Precision', 'test_precision'),
        ('Recall', 'test_recall'),
        ('Fit Time', 'fit_time'),
    )

    print("== %s: == " % (title))
    print()
    print("Metric,Original Mean,Original Std,Undersampled Mean,Undersampled Std,Oversampled Mean,Oversampled Std")

    for label, score_key in rows:
        cells = []
        for score_set in (scores_orig, scores_under, scores_over):
            fold_values = score_set[score_key]
            cells.append("%.5f" % fold_values.mean())
            cells.append("%.5f" % fold_values.std())
        print(",".join([label, *cells]))

## Data processing and exploration

In [4]:
#%%writefile -a anomaly-detector.py

# Load the packet-feature tables (semicolon-delimited CSV files).
dataset_train = pd.read_csv("../data/raw/energy-informatics-2020/csvDataFeaturesTrain.csv", delimiter=';')
dataset_test = pd.read_csv("../data/raw/energy-informatics-2020/csvDataFeaturesTest.csv", delimiter=';')

# Stack both files into one frame. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# like append, it keeps each file's original row index.
X = pd.concat([dataset_train, dataset_test])
# Number of rows per original attack label.
X.groupby('attack').count()

Unnamed: 0_level_0,frameLen,vlan50,vlan60,vlan70,vlan80,isnanVlan,ipFlag,isnanIP,ipTtl,portFeatureSrc2404,...,isnanTPC,xTcpTdrLen,tcpWinSize,tcpPduSize,isnanPduSize,asduTypeid013,asduTypeid036,x104NaN,asduCause01,asduCause03
attack,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,288277,288277,288277,288277,288277,288277,288277,288277,288277,288277,...,288277,288277,288277,288277,288277,288277,288277,288277,288277,288277
1,2500,2500,2500,2500,2500,2500,2500,2500,2500,2500,...,2500,2500,2500,2500,2500,2500,2500,2500,2500,2500
2,2500,2500,2500,2500,2500,2500,2500,2500,2500,2500,...,2500,2500,2500,2500,2500,2500,2500,2500,2500,2500
3,2500,2500,2500,2500,2500,2500,2500,2500,2500,2500,...,2500,2500,2500,2500,2500,2500,2500,2500,2500,2500
4,2501,2501,2501,2501,2501,2501,2501,2501,2501,2501,...,2501,2501,2501,2501,2501,2501,2501,2501,2501,2501


# Create Training Dataset
- attack = 0 means normal
- attack = 1 means anomaly (the original attack labels 1–4 are merged into this single class)

In [5]:
#%%writefile -a anomaly-detector.py

# Collapse the attack labels 1-4 into a single positive class; 0 stays 0.
X['attack'] = X['attack'].map({0:0, 1:1, 2:1, 3:1, 4:1})
# Per-class row counts after the mapping.
print(np.bincount(X['attack']))

[288277  10001]


### Examine the class label imbalance

Let's look at the dataset imbalance:

In [6]:
#%%writefile -a anomaly-detector.py

# Row counts of class 0 (neg) and class 1 (pos); the two-name unpacking
# only works when bincount returns exactly two entries.
neg, pos = np.bincount(X['attack'])
total = neg + pos
# Report the dataset size and the share of positive rows.
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))

Examples:
    Total: 298278
    Positive: 10001 (3.35% of total)



This shows the small fraction of positive samples.

In [7]:
#%%writefile -a anomaly-detector.py

# Move the label column out of the feature frame; X keeps only the features.
y = X.pop('attack')
X

# Build two class-balanced variants with imbalanced-learn's random resamplers.
# NOTE(review): no random_state is passed, so the resampled sets presumably
# differ from run to run - confirm whether reproducibility is needed.
# NOTE(review): resampling happens before cross-validation, so rows copied
# by the oversampler may land in both the train and the test fold - confirm
# this is intended.
X_under, y_under = RandomUnderSampler(sampling_strategy='majority').fit_resample(X, y)
X_over, y_over = RandomOverSampler().fit_resample(X, y)

# Shapes and per-class counts of the original and resampled data.
print(X.shape)
print(X_under.shape)
print(np.bincount(y_under))
print(X_over.shape)
print(np.bincount(y_over))

(298278, 21)
(20002, 21)
[10001 10001]
(576554, 21)
[288277 288277]


In [8]:
#%%writefile -a anomaly-detector.py

def print_balance(y, title):
    """Print ``title`` followed by one line per class with its count and share.

    Args:
        y: Label container supporting ``len``, element-wise ``==`` and
            ``np.unique`` (the notebook passes a pandas Series).
        title: Heading printed before the per-class lines.
    """
    labels = np.unique(y)
    n_total = len(y)
    print(title)
    for label in labels:
        # Number of entries equal to this label.
        count = np.count_nonzero(y == label)
        share = count / n_total * 100
        print('  > Class=%d : %d/%d (%.1f%%)' % (label, count, n_total, share))

# Class distribution of the original labels and of both resampled label sets.
print_balance(y, "Original Dataset")
print_balance(y_under, "Undersampled Dataset")
print_balance(y_over, "Oversampled Dataset")

Original Dataset
  > Class=0 : 288277/298278 (96.6%)
  > Class=1 : 10001/298278 (3.4%)
Undersampled Dataset
  > Class=0 : 10001/20002 (50.0%)
  > Class=1 : 10001/20002 (50.0%)
Oversampled Dataset
  > Class=0 : 288277/576554 (50.0%)
  > Class=1 : 288277/576554 (50.0%)


## Define the model and metrics

Define a function that creates a simple neural network with five densely connected hidden layers, each followed by a [dropout](https://developers.google.com/machine-learning/glossary/#dropout_regularization) layer to reduce overfitting, and an output sigmoid layer that returns the probability of a packet being an attack:

In [9]:
#%%writefile -a anomaly-detector.py

# Create a TensorBoard callback
# The log directory name carries the current timestamp.
logs = "./logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")

# NOTE(review): tboard_callback is not referenced anywhere else in the visible
# notebook - confirm whether it should be passed to the training calls.
tboard_callback = tf.keras.callbacks.TensorBoard(log_dir = logs,
                                                 histogram_freq = 1,
                                                 profile_batch = '500,520')

# Metrics used as the default `metrics` argument of make_model(); each entry
# is given an explicit short name.
METRICS = [
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
      keras.metrics.AUC(name='prc', curve='PR'), # area under the precision-recall curve
]

def make_model(metrics=METRICS):
    """Build and compile the binary classifier network.

    The network expects 21 input features. It stacks five ReLU hidden layers
    with 64, 32, 16, 32 and 64 units, each followed by a dropout layer with
    rate 0.5, and ends in a single sigmoid unit.

    Args:
        metrics: Keras metrics passed to ``compile``.

    Returns:
        The compiled ``keras.Sequential`` model (Adam with learning rate
        1e-3, binary cross-entropy loss).
    """
    hidden_units = (64, 32, 16, 32, 64)

    stack = []
    for position, units in enumerate(hidden_units):
        if position == 0:
            # Only the first layer declares the input shape.
            stack.append(keras.layers.Dense(units, activation='relu', input_shape=(21,)))
        else:
            stack.append(keras.layers.Dense(units, activation='relu'))
        stack.append(keras.layers.Dropout(0.5))
    stack.append(keras.layers.Dense(1, activation='sigmoid'))

    model = keras.Sequential(stack)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics,
    )
    return model

In [10]:
# Model.summary() prints the layer table itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
make_model().summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                1408      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                5

### Understanding useful metrics

Notice that there are a few metrics defined above that can be computed by the model that will be helpful when evaluating the performance.



*   **False** negatives and **false** positives are samples that were **incorrectly** classified
*   **True** negatives and **true** positives are samples that were **correctly** classified
*   **Accuracy** is the percentage of examples correctly classified
>   $\frac{\text{true samples}}{\text{total samples}}$
*   **Precision** is the percentage of **predicted** positives that were correctly classified
>   $\frac{\text{true positives}}{\text{true positives + false positives}}$
*   **Recall** is the percentage of **actual** positives that were correctly classified
>   $\frac{\text{true positives}}{\text{true positives + false negatives}}$
*   **AUC** refers to the Area Under the Curve of a Receiver Operating Characteristic curve (ROC-AUC). This metric is equal to the probability that a classifier will rank a random positive sample higher than a random negative sample.

Note: Accuracy is not a helpful metric for this task. You can achieve about 96.6% accuracy on this task by predicting False all the time.

Read more:
*  [True vs. False and Positive vs. Negative](https://developers.google.com/machine-learning/crash-course/classification/true-false-positive-negative)
*  [Accuracy](https://developers.google.com/machine-learning/crash-course/classification/accuracy)
*   [Precision and Recall](https://developers.google.com/machine-learning/crash-course/classification/precision-and-recall)
*   [ROC-AUC](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc)

## Baseline model

### Build the model

In [11]:
#%%writefile -a anomaly-detector.py

# Training settings passed as `epochs` and `batch_size` to the KerasClassifier
# instances created further down.
EPOCHS = 500
BATCH_SIZE = 4096

### Train the model

In [12]:
#%%writefile -a anomaly-detector.py

# Source: https://scikit-learn.org/0.21/auto_examples/model_selection/plot_confusion_matrix.html

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """Print the confusion matrix and draw it as an annotated heat map.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        classes: Class names; indexed with the array of labels found in the
            data, so presumably a NumPy array - TODO confirm with callers.
        normalize: When true, each row is divided by its row sum and cells
            are shown with two decimals instead of as integers.
        title: Plot title; a default depending on ``normalize`` is used when
            this is empty or ``None``.
        cmap: Matplotlib colormap for the heat map.

    Returns:
        The matplotlib axes holding the plot.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    cm = confusion_matrix(y_true, y_pred)
    # Keep only the names of labels that occur in the data.
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    heatmap = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(heatmap, ax=ax)

    # One tick per matrix row/column, labelled with the class names.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    ax.set(xticks=np.arange(n_cols),
           yticks=np.arange(n_rows),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Slant the x tick labels.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write each cell's value into the plot; cells above half the maximum
    # get white text, the others black.
    cell_format = '.2f' if normalize else 'd'
    threshold = cm.max() / 2.
    for row, col in np.ndindex(n_rows, n_cols):
        value = cm[row, col]
        text_color = "white" if value > threshold else "black"
        ax.text(col, row, format(value, cell_format),
                ha="center", va="center",
                color=text_color)
    fig.tight_layout()
    return ax

# Print NumPy floats with two digits of precision from here on.
np.set_printoptions(precision=2)

In [13]:
#%%writefile -a anomaly-detector.py

def cross_validate_model(pipeline, X, y, cv, title):
    """Cross-validate a pipeline and print the mean and spread of each score.

    The class balance of ``y`` is printed first (via ``print_balance``),
    then one line per test metric and one line for the fit time.

    Args:
        pipeline: Estimator handed to ``cross_validate``.
        X: Feature table; its ``.values`` array is passed on.
        y: Labels; its ``.values`` array is passed on.
        cv: Cross-validation argument forwarded to ``cross_validate``.
        title: Heading printed above the class balance.

    Returns:
        The dict returned by ``cross_validate``; it is called with
        ``return_estimator=True`` and ``return_train_score=True``.
    """
    scoring = ('f1', 'roc_auc', 'average_precision', 'balanced_accuracy', 'precision', 'recall')
    # Printed label for each scorer, in the same order as `scoring`.
    labels = ('F1', 'ROC-AUC', 'Average Precision', 'Balanced Accuracy', 'Precision', 'Recall')

    print_balance(y, title)
    # Forward the caller's `cv` instead of depending on a module-level
    # splitter being defined before this function is called.
    scores = cross_validate(pipeline, X.values, y.values, cv=cv, scoring=scoring, return_estimator=True, return_train_score=True)

    for label, scorer in zip(labels, scoring):
        fold_scores = scores['test_' + scorer]
        print("    > %s: %.3f%% (+/-%.3f%%)" % (label, fold_scores.mean()*100, fold_scores.std()*100))
    print("    > Fit Time: %.3f s (+/-%.3f s)" % (scores['fit_time'].mean(), scores['fit_time'].std()))
    return scores

In [14]:
#%%writefile -a anomaly-detector.py

# Create stratified KFold Splits
# Ten shuffled folds; this splitter is passed to every cross_validate_model call.
# NOTE(review): no random_state is set, so the fold assignment presumably
# changes between runs - confirm whether that is acceptable.
kfold = StratifiedKFold(n_splits=10, shuffle=True)

# Random Forest

In [15]:
#%%writefile -a anomaly-detector.py

from sklearn.ensemble import RandomForestClassifier

# Cross-validate a freshly built scaler + random forest pipeline on the
# original, undersampled and oversampled data, in that order.
rf_original, rf_under, rf_over = [
    cross_validate_model(
        make_pipeline(StandardScaler(), RandomForestClassifier()),
        features, labels, kfold, heading,
    )
    for features, labels, heading in (
        (X, y, "Original Dataset"),
        (X_under, y_under, "Undersampled Dataset"),
        (X_over, y_over, "Oversampled Dataset"),
    )
]

print_table("Random Forest Classifier", rf_original, rf_under, rf_over)

Original Dataset
  > Class=0 : 288277/298278 (96.6%)
  > Class=1 : 10001/298278 (3.4%)
    > F1: 99.880% (+/-0.068%)
    > ROC-AUC: 99.994% (+/-0.016%)
    > Average Precision: 99.953% (+/-0.045%)
    > Balanced Accuracy: 99.880% (+/-0.068%)
    > Precision: 100.000% (+/-0.000%)
    > Recall: 99.760% (+/-0.136%)
    > Fit Time: 6.200 s (+/-0.156 s)
Undersampled Dataset
  > Class=0 : 10001/20002 (50.0%)
  > Class=1 : 10001/20002 (50.0%)
    > F1: 99.880% (+/-0.068%)
    > ROC-AUC: 99.994% (+/-0.015%)
    > Average Precision: 99.993% (+/-0.015%)
    > Balanced Accuracy: 99.880% (+/-0.068%)
    > Precision: 100.000% (+/-0.000%)
    > Recall: 99.760% (+/-0.136%)
    > Fit Time: 0.412 s (+/-0.006 s)
Oversampled Dataset
  > Class=0 : 288277/576554 (50.0%)
  > Class=1 : 288277/576554 (50.0%)
    > F1: 99.881% (+/-0.013%)
    > ROC-AUC: 99.999% (+/-0.000%)
    > Average Precision: 99.998% (+/-0.000%)
    > Balanced Accuracy: 99.882% (+/-0.013%)
    > Precision: 100.000% (+/-0.000%)
    > Recal

In [25]:
# Print the random forest summary table from the stored scores.
print_table("Random Forest Classifier", rf_original, rf_under, rf_over)

== Random Forest Classifier: == 

Metric,Original Mean,Original Std,Undersampled Mean,Undersampled Std,Oversampled Mean,Oversampled Std
F1,0.99880,0.00068,0.99880,0.00068,0.99881,0.00013
ROC-AUC,0.99994,0.00016,0.99994,0.00015,0.99999,0.00000
Average Precision,0.99953,0.00045,0.99993,0.00015,0.99998,0.00000
Balanced Accuracy,0.99880,0.00068,0.99880,0.00068,0.99882,0.00013
Precision,1.00000,0.00000,1.00000,0.00000,1.00000,0.00000
Recall,0.99760,0.00136,0.99760,0.00136,0.99763,0.00027
Fit Time,6.19979,0.15602,0.41161,0.00589,13.45703,0.10737


# Decision Tree

In [17]:
#%%writefile -a anomaly-detector.py

from sklearn import tree

# Cross-validate a freshly built scaler + decision tree pipeline on the
# original, undersampled and oversampled data, in that order.
dt_orig, dt_under, dt_over = [
    cross_validate_model(
        make_pipeline(StandardScaler(), tree.DecisionTreeClassifier()),
        features, labels, kfold, heading,
    )
    for features, labels, heading in (
        (X, y, "Original Dataset"),
        (X_under, y_under, "Undersampled Dataset"),
        (X_over, y_over, "Oversampled Dataset"),
    )
]

print_table("Decision Tree", dt_orig, dt_under, dt_over)

Original Dataset
  > Class=0 : 288277/298278 (96.6%)
  > Class=1 : 10001/298278 (3.4%)
    > F1: 99.880% (+/-0.075%)
    > ROC-AUC: 99.994% (+/-0.015%)
    > Average Precision: 99.953% (+/-0.040%)
    > Balanced Accuracy: 99.880% (+/-0.075%)
    > Precision: 100.000% (+/-0.000%)
    > Recall: 99.760% (+/-0.150%)
    > Fit Time: 0.397 s (+/-0.020 s)
Undersampled Dataset
  > Class=0 : 10001/20002 (50.0%)
  > Class=1 : 10001/20002 (50.0%)
    > F1: 99.880% (+/-0.072%)
    > ROC-AUC: 99.994% (+/-0.015%)
    > Average Precision: 99.993% (+/-0.015%)
    > Balanced Accuracy: 99.880% (+/-0.071%)
    > Precision: 100.000% (+/-0.000%)
    > Recall: 99.760% (+/-0.143%)
    > Fit Time: 0.016 s (+/-0.001 s)
Oversampled Dataset
  > Class=0 : 288277/576554 (50.0%)
  > Class=1 : 288277/576554 (50.0%)
    > F1: 99.881% (+/-0.011%)
    > ROC-AUC: 99.999% (+/-0.000%)
    > Average Precision: 99.998% (+/-0.000%)
    > Balanced Accuracy: 99.882% (+/-0.011%)
    > Precision: 100.000% (+/-0.000%)
    > Recal

In [26]:
# Print the decision tree summary table from the stored scores.
print_table("Decision Tree", dt_orig, dt_under, dt_over)

== Decision Tree: == 

Metric,Original Mean,Original Std,Undersampled Mean,Undersampled Std,Oversampled Mean,Oversampled Std
F1,0.99880,0.00075,0.99880,0.00072,0.99881,0.00011
ROC-AUC,0.99994,0.00015,0.99994,0.00015,0.99999,0.00000
Average Precision,0.99953,0.00040,0.99993,0.00015,0.99998,0.00000
Balanced Accuracy,0.99880,0.00075,0.99880,0.00071,0.99882,0.00011
Precision,1.00000,0.00000,1.00000,0.00000,1.00000,0.00000
Recall,0.99760,0.00150,0.99760,0.00143,0.99763,0.00022
Fit Time,0.39661,0.02043,0.01617,0.00081,0.77534,0.00924


# DNN Model (5 Hidden Layers, 1 Output Layer)

In [18]:
#%%writefile -a anomaly-detector.py

# Replace warnings.warn with a function that does nothing, so that later
# lookups of warnings.warn no longer emit anything.
# NOTE(review): this affects the whole session, not just this notebook's
# code; warnings.filterwarnings("ignore") may be the more conventional way
# to silence warnings - confirm before changing.
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

print("Epochs: %s, Batch Size: %s" % (EPOCHS, BATCH_SIZE))

# Cross-validate a freshly built scaler + Keras network pipeline on the
# original, undersampled and oversampled data, in that order.
dnn_original, dnn_under, dnn_over = [
    cross_validate_model(
        make_pipeline(
            StandardScaler(),
            KerasClassifier(build_fn=make_model, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0),
        ),
        features, labels, kfold, heading,
    )
    for features, labels, heading in (
        (X, y, "Original Dataset"),
        (X_under, y_under, "Undersampled Dataset"),
        (X_over, y_over, "Oversampled Dataset"),
    )
]

print_table("DNN", dnn_original, dnn_under, dnn_over)

Epochs: 500, Batch Size: 4096
Original Dataset
  > Class=0 : 288277/298278 (96.6%)
  > Class=1 : 10001/298278 (3.4%)
    > F1: 91.076% (+/-8.564%)
    > ROC-AUC: 99.991% (+/-0.016%)
    > Average Precision: 99.862% (+/-0.105%)
    > Balanced Accuracy: 99.452% (+/-0.228%)
    > Precision: 85.114% (+/-14.870%)
    > Recall: 99.640% (+/-0.441%)
    > Fit Time: 275.737 s (+/-2.700 s)
Undersampled Dataset
  > Class=0 : 10001/20002 (50.0%)
  > Class=1 : 10001/20002 (50.0%)
    > F1: 99.699% (+/-0.095%)
    > ROC-AUC: 99.989% (+/-0.021%)
    > Average Precision: 99.991% (+/-0.012%)
    > Balanced Accuracy: 99.700% (+/-0.095%)
    > Precision: 100.000% (+/-0.000%)
    > Recall: 99.400% (+/-0.190%)
    > Fit Time: 26.596 s (+/-0.308 s)
Oversampled Dataset
  > Class=0 : 288277/576554 (50.0%)
  > Class=1 : 288277/576554 (50.0%)
    > F1: 99.715% (+/-0.019%)
    > ROC-AUC: 99.996% (+/-0.001%)
    > Average Precision: 99.993% (+/-0.002%)
    > Balanced Accuracy: 99.716% (+/-0.019%)
    > Precision:

In [27]:
# Print the DNN summary table from the stored scores.
print_table("DNN", dnn_original, dnn_under, dnn_over)

== DNN: == 

Metric,Original Mean,Original Std,Undersampled Mean,Undersampled Std,Oversampled Mean,Oversampled Std
F1,0.91076,0.08564,0.99699,0.00095,0.99715,0.00019
ROC-AUC,0.99991,0.00016,0.99989,0.00021,0.99996,0.00001
Average Precision,0.99862,0.00105,0.99991,0.00012,0.99993,0.00002
Balanced Accuracy,0.99452,0.00228,0.99700,0.00095,0.99716,0.00019
Precision,0.85114,0.14870,1.00000,0.00000,0.99999,0.00002
Recall,0.99640,0.00441,0.99400,0.00190,0.99433,0.00039
Fit Time,275.73685,2.70026,26.59632,0.30760,525.86692,3.54290


# DNN Model with adjusted weights

In [19]:
#%%writefile -a anomaly-detector.py

# Each class is weighted by the inverse of its row count (neg / pos),
# using the counts computed earlier from the original labels.
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0

# Mapping from class label to weight, passed as `class_weight` to KerasClassifier.
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0 (normal): {:.2f}'.format(weight_for_0))
print('Weight for class 1 (attack): {:.2f}'.format(weight_for_1))

#estimators = []
#model = KerasClassifier(build_fn=make_model, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, class_weight=class_weight)
#model._estimator_type = "classifier"
#estimators.append(('standardize', StandardScaler()))
#estimators.append(('weighted-model', model))
#pipeline = Pipeline(estimators)

# Cross-validate a freshly built scaler + class-weighted Keras network
# pipeline on the original, undersampled and oversampled data, in that order.
weighted_orig, weighted_under, weighted_over = [
    cross_validate_model(
        make_pipeline(
            StandardScaler(),
            KerasClassifier(build_fn=make_model, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, class_weight=class_weight),
        ),
        features, labels, kfold, heading,
    )
    for features, labels, heading in (
        (X, y, "Original Dataset"),
        (X_under, y_under, "Undersampled Dataset"),
        (X_over, y_over, "Oversampled Dataset"),
    )
]

print_table("Weighted DNN Model", weighted_orig, weighted_under, weighted_over)

Weight for class 0 (normal): 0.52
Weight for class 1 (attack): 14.91
Original Dataset
  > Class=0 : 288277/298278 (96.6%)
  > Class=1 : 10001/298278 (3.4%)
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
    > F1: 99.694% (+/-0.097%)
    > ROC-AUC: 99.991% (+/-0.016%)
    > Average Precision: 99.820% (+/-0.067%)
    > Balanced Accuracy: 99.710% (+/-0.099%)
    > Precision: 99.970% (+/-0.046%)
    > Recall: 99.420% (+/-0.199%)
    > Fit Time: 272.545 s (+/-2.139 s)
Undersampled Dataset
  > Class=0 : 10001/20002 (50.0%)
  > Class=1 : 10001/20002 (50.0%)
    > F1: 99.256% (+/-0.228%)
    > ROC-AUC: 99.991% (+/-0.016%)
    > Average Precision: 99.990% (+/-0.016%)
    > Balanced Accuracy: 99.250% (+/-0.231%)
    > Precision: 98.584% (+/-0.578%)
    > Recall: 99.940% (+/-0.150%)
    > Fit Time: 26.430 s (+/-0.525 s)
Oversampled Dataset
  > Class=0 : 288277/576554 (50.0%)
  > Class=1 : 288277/576554 (50.

In [28]:
# Print the weighted DNN summary table from the stored scores.
print_table("Weighted DNN Model", weighted_orig, weighted_under, weighted_over)

== Weighted DNN Model: == 

Metric,Original Mean,Original Std,Undersampled Mean,Undersampled Std,Oversampled Mean,Oversampled Std
F1,0.99694,0.00097,0.99256,0.00228,0.99716,0.00028
ROC-AUC,0.99991,0.00016,0.99991,0.00016,0.99991,0.00004
Average Precision,0.99820,0.00067,0.99990,0.00016,0.99990,0.00003
Balanced Accuracy,0.99710,0.00099,0.99250,0.00231,0.99717,0.00028
Precision,0.99970,0.00046,0.98584,0.00578,0.99999,0.00001
Recall,0.99420,0.00199,0.99940,0.00150,0.99434,0.00056
Fit Time,272.54514,2.13897,26.42985,0.52505,524.58239,8.27065
