In [1]:
# Imports

########################################################################
# Python Standard Libraries
import os
import multiprocessing
from timeit import default_timer as timer
import random
import math

########################################################################
# Numpy Library
import numpy as np # linear algebra

########################################################################
# Pandas Library
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

########################################################################
# MATPLOT Library
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.ticker import MaxNLocator
%matplotlib inline

########################################################################
# SKLearn Library
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_recall_curve, classification_report, confusion_matrix, average_precision_score, roc_curve, auc, multilabel_confusion_matrix

########################################################################
# SCIPY Library
from scipy.stats import gaussian_kde
import scipy.stats as st


########################################################################
# Keras Library
from keras.models import Sequential
from keras.layers import Dense

########################################################################
# Init random seed
#seed = 13
#np.random.seed(seed)

In [2]:
# Utility functions
########################################################################
# Print system information
def print_system_info():
    mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')  # e.g. 4015976448
    mem_gib = mem_bytes/(1024.**3)  # e.g. 3.74
    print("{:<23}{:f} GB".format('RAM:', mem_gib))
    print("{:<23}{:d}".format('CORES:', multiprocessing.cpu_count()))
    !lscpu

########################################################################
# Walk through input files
def print_input_files():
    # Input data files are available in the "../input/" directory.
    for dirname, _, filenames in os.walk('/kaggle/input'):
        for filename in filenames:
            print(os.path.join(dirname, filename))

########################################################################
# Dump text files
def dump_text_file(fname):
    with open(fname, 'r') as f:
        print(f.read())

########################################################################
# Dump CSV files
def dump_csv_file(fname, count=5):
    # count: 0 - column names only, -1 - all rows, default = 5 rows max
    df = pd.read_csv(fname)
    if count < 0:
        count = df.shape[0]
    return df.head(count)

########################################################################
# Dataset related functions
ds_nbaiot = '/kaggle/input/nbaiot-dataset'
dn_nbaiot = ['Danmini_Doorbell', 'Ecobee_Thermostat', 'Ennio_Doorbell', 'Philips_B120N10_Baby_Monitor', 'Provision_PT_737E_Security_Camera', 'Provision_PT_838_Security_Camera', 'Samsung_SNH_1011_N_Webcam', 'SimpleHome_XCS7_1002_WHT_Security_Camera', 'SimpleHome_XCS7_1003_WHT_Security_Camera']

def fname(ds, f):
    if '.csv' not in f:
        f = f'{f}.csv'
    return os.path.join(ds, f)

def fname_nbaiot(f):
    return fname(ds_nbaiot, f)

def get_nbaiot_device_files():
    nbaiot_all_files = dump_csv_file(fname_nbaiot('data_summary'), -1)
    nbaiot_all_files = nbaiot_all_files.iloc[:,0:1].values
    device_id = 1
    indices = []
    for j in range(len(nbaiot_all_files)):
        if str(device_id) not in str(nbaiot_all_files[j]):
            indices.append(j)
            device_id += 1
    nbaiot_device_files = np.split(nbaiot_all_files, indices)
    return nbaiot_device_files

def get_nbaiot_device_data(device_id, count_norm=-1, count_anom=-1):
    if device_id < 1 or device_id > 9:
        assert False, "Please provide a valid device ID 1-9, both inclusive"
    if count_anom == -1:
        count_anom = count_norm
    device_index = device_id -1
    device_files = get_nbaiot_device_files()
    device_file = device_files[device_index]
    df = pd.DataFrame()
    y = []
    for i in range(len(device_file)):
        fname = str(device_file[i][0])
        df_c = pd.read_csv(fname_nbaiot(fname))
        count = count_anom
        if 'benign' in fname:
            count = count_norm
        rows = count if count >=0 else df_c.shape[0]
        print("processing", fname, "rows =", rows)
        y_np = np.ones(rows) if 'benign' in fname else np.zeros(rows)
        y.extend(y_np.tolist())
        df = pd.concat([df.iloc[:,:].reset_index(drop=True),
                      df_c.iloc[:rows,:].reset_index(drop=True)], axis=0)
    X = df.iloc[:,:].values
    y = np.array(y)
    Xdf = df
    return (X, y, Xdf)

def get_nbaiot_devices_data():
    devices_data = []
    for i in range(9):
        device_id = i + 1
        (X, y) = get_nbaiot_device_data(device_id)
        devices_data.append((X, y))
    return devices_data
#print_input_files()
print_system_info()

RAM:                   17.587090 GB
CORES:                 4
Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              4
On-line CPU(s) list: 0-3
Thread(s) per core:  2
Core(s) per socket:  2
Socket(s):           1
NUMA node(s):        1
Vendor ID:           GenuineIntel
CPU family:          6
Model:               85
Model name:          Intel(R) Xeon(R) CPU @ 2.00GHz
Stepping:            3
CPU MHz:             2000.182
BogoMIPS:            4000.36
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            1024K
L3 cache:            39424K
NUMA node0 CPU(s):   0-3
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic mo

In [3]:
def compute_time_complexity_single_pass(neurons_input, structure, neurons_output):
    count_hidden_layers = len(structure)
    neurons = [neurons_input, *structure, neurons_output]
    complexity = 0
    for i in range(count_hidden_layers + 1):
        complexity += neurons[i] * neurons[i+1]
    return complexity

In [4]:
dev_1 = {
    'name': 'Danmini Doorbell',
    'features': 47,
    'structure': [97, 48, 73, 58, 64, 53, 83],
    'struct_heuristics': [2384, 476],
    'struct_genetic': [873, 1],
    'struct_random': [57, 16, 18, 61, 59, 59, 71, 46, 16, 47, 70, 83, 50, 18, 91, 26, 86, 94, 65, 18, 45],
}

dev_2 = {
    'name': 'Ecobee Thermostat',
    'features': 47,
    'structure': [88, 52, 66, 58, 75],
    'struct_heuristics': [2160, 432],
    'struct_genetic': [804, 40],
    'struct_random': [79, 33, 72, 42, 58, 75, 70, 28, 45, 23, 97, 55, 72, 41, 55],
}
dev_3 = {
    'name': 'Ennio Doorbell',
    'features': 59,
    'structure': [57, 31, 43, 38, 34, 49],
    'struct_heuristics': [1408, 281],
    'struct_genetic': [1676, 114],
    'struct_random': [17, 76, 13, 79, 64, 65, 24, 26, 20, 69, 79, 89, 76, 89, 50, 94],
}
dev_4 = {
    'name': 'Philips B120N10',
    'features': 51,
    'structure': [101, 46, 67, 55, 60, 50, 75],
    'struct_heuristics': [2476, 495],
    'struct_genetic': [805, 112],
    'struct_random': [66, 30, 81, 53, 55, 12, 96, 91, 21, 28, 71, 50, 22, 61, 15, 34, 80, 37, 95, 57, 97, 65, 89],
}
dev_5 = {
    'name': 'Provision PT737E',
    'features': 40,
    'structure': [87, 43, 58, 52, 47, 75],
    'struct_heuristics': [2150, 430],
    'struct_genetic': [1217, 127],
    'struct_random': [19, 10, 19, 57, 24, 71, 25, 17, 87, 74, 70, 27, 94, 19, 36, 27, 86, 59, 23, 64],
}
dev_6 = {
    'name': 'Provision PT838',
    'features': 42,
    'structure': [88, 44, 58, 52, 48, 75],
    'struct_heuristics': [2161, 432],
    'struct_genetic': [1223, 20],
    'struct_random': [74, 11, 57, 20, 97, 18, 37, 82, 40, 90, 17, 65, 25, 45, 59, 67, 30, 55],
}
dev_7 = {
    'name': 'Samsung SNH1011N',
    'features': 56,
    'structure': [59, 32, 44, 39, 35, 50],
    'struct_heuristics': [1447, 289],
    'struct_genetic': [430, 60],
    'struct_random': [51, 16, 85, 93, 62, 45, 79, 68, 66, 70, 64, 47, 23, 88, 38, 92, 48, 62, 37, 62, 34, 25, 74],
}
dev_8 = {
    'name': 'SimpleHome XCS71002',
    'features': 56,
    'structure': [76, 48, 59, 53, 67],
    'struct_heuristics': [2195, 439],
    'struct_genetic': [896, 25],
    'struct_random': [91, 12, 32, 87, 39, 96, 20, 73, 58, 37, 57, 71, 81, 35, 55, 53, 56, 42, 92, 58],
}
dev_9 = {
    'name': 'SimpleHome XCS71003',
    'features': 56,
    'structure_': [88.0, 48.0, 66.0, 59.0, 53.0, 76.0],
    'structure': [88, 48, 59, 53, 76],
    'struct_heuristics': [2179, 435],
    'struct_genetic': [1801, 24],
    'struct_random': [34, 60, 64, 29, 17, 60, 14, 63, 92, 65, 95, 64, 86, 29, 84, 14, 22, 44, 46, 86],
}
devices = [dev_1, dev_2, dev_3, dev_4, dev_5, dev_6, dev_7, dev_8, dev_9]

In [5]:
def construct_ae(i, o, b, s):
    ae = []
    '''
       i = number of input neurons
       o = number of output neurons
       b = number of neurons in the bottleneck
       s = layer structure
    '''
    sr = s[::-1]
    # left section
    ln = compute_time_complexity_single_pass(i, s, b)
    lr = compute_time_complexity_single_pass(i, sr, b)
    rn = compute_time_complexity_single_pass(b, s, o)
    rr = compute_time_complexity_single_pass(b, sr, o)
    if (ln <= lr):
        ae.extend(s)
    else:
        ae.extend(sr)
    ae.append(b)
    if (rn <= rr):
        ae.extend(s)
    else:
        ae.extend(sr)
    return ae

In [6]:
print("Evaluation for ANN")
print("Naive vs Optimized")
print("id,name,Naive,Optimized,Gain")
for i, d in enumerate(devices):
    cb = int(compute_time_complexity_single_pass(d['features'], sorted(d['structure'], reverse=True), 1))
    ca = int(compute_time_complexity_single_pass(d['features'], d['structure'], 1))
    g = (100.0 * (cb - ca))/cb
    print(f"{i + 1},{d['name']},{cb},{ca},{g:0.2f}")

Evaluation for ANN
Naive vs Optimized
id,name,Naive,Optimized,Gain
1,Danmini Doorbell,32719,28539,12.78
2,Ecobee Thermostat,22582,20397,9.68
3,Ennio Doorbell,12274,11104,9.53
4,Philips B120N10,30167,26689,11.53
5,Provision PT737E,21879,18775,14.19
6,Provision PT838,22314,19307,13.48
7,Samsung SNH1011N,12687,11481,9.51
8,SimpleHome XCS71002,19020,17481,8.09
9,SimpleHome XCS71003,21819,19215,11.93


In [7]:
print("Evaluation for ANN")
print("Comparison with other techniques")
print("id,name,Dahlia,Heuristics,Genetic,Random")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['structure']
    h = d['struct_heuristics']
    g = d['struct_genetic']
    r = d['struct_random']
    cs = int(compute_time_complexity_single_pass(nbi, s, nbo))
    ch = int(compute_time_complexity_single_pass(nbi, h, nbo))
    cg = int(compute_time_complexity_single_pass(nbi, g, nbo))
    cr = int(compute_time_complexity_single_pass(nbi, r, nbo))
    print(f"{i + 1},{d['name']},{cs},{ch},{cg},{cr}")

Evaluation for ANN
Comparison with other techniques
id,name,Dahlia,Heuristics,Genetic,Random
1,Danmini Doorbell,28539,1247308,41905,57609
2,Ecobee Thermostat,20397,1035072,69988,44799
3,Ennio Doorbell,11104,479001,290062,52864
4,Philips B120N10,26689,1352391,131327,69754
5,Provision PT737E,18775,1010930,203366,38751
6,Provision PT838,19307,1024746,75846,35663
7,Samsung SNH1011N,11481,499504,49940,73823
8,SimpleHome XCS71002,17481,1086964,72601,60413
9,SimpleHome XCS71003,19215,1070324,144104,55858


In [8]:
print("Evaluation for AE")
print("Naive vs Optimized")
print("id,name,Naive,Optimized,Gain")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['structure']
    nbb = math.ceil(min(s)/2)
    ae = construct_ae(nbi, nbo, nbb, s)
    ae_naive = []
    ae_naive.extend(sorted(s, reverse=True))
    ae_naive.append(nbb)
    ae_naive.extend(sorted(s, reverse=False))
    cb = int(compute_time_complexity_single_pass(nbi, ae_naive, nbo))
    ca = int(compute_time_complexity_single_pass(nbi, ae, nbo))
    g = (100.0 * (cb - ca))/cb
    print(f"{i + 1},{d['name']},{cb},{ca},{g:0.2f}")

Evaluation for AE
Naive vs Optimized
id,name,Naive,Optimized,Gain
1,Danmini Doorbell,63184,56112,11.19
2,Ecobee Thermostat,43716,40223,7.99
3,Ennio Doorbell,22172,20028,9.67
4,Philips B120N10,57308,50900,11.18
5,Provision PT737E,42171,37091,12.05
6,Provision PT838,42868,37896,11.60
7,Samsung SNH1011N,23089,20857,9.67
8,SimpleHome XCS71002,36068,33576,6.91
9,SimpleHome XCS71003,41006,36702,10.50


In [9]:
print("Evaluation for AE")
print("NN Structure")
print("id,name,i,structure,i")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['structure']
    nbb = math.ceil(min(nbi,min(s))/2)
    ae = construct_ae(nbi, nbo, nbb, s)
    ae_naive = []
    ae_naive.extend(sorted(s, reverse=True))
    ae_naive.append(nbb)
    ae_naive.extend(sorted(s, reverse=False))
    cb = int(compute_time_complexity_single_pass(nbi, ae_naive, nbo))
    ca = int(compute_time_complexity_single_pass(nbi, ae, nbo))
    g = (100.0 * (cb - ca))/cb
    print(f"{i + 1},{d['name']},{nbi},{ae},{nbi}")

Evaluation for AE
NN Structure
id,name,i,structure,i
1,Danmini Doorbell,47,[83, 53, 64, 58, 73, 48, 97, 24, 83, 53, 64, 58, 73, 48, 97],47
2,Ecobee Thermostat,47,[75, 58, 66, 52, 88, 24, 75, 58, 66, 52, 88],47
3,Ennio Doorbell,59,[49, 34, 38, 43, 31, 57, 16, 49, 34, 38, 43, 31, 57],59
4,Philips B120N10,51,[75, 50, 60, 55, 67, 46, 101, 23, 75, 50, 60, 55, 67, 46, 101],51
5,Provision PT737E,40,[75, 47, 52, 58, 43, 87, 20, 75, 47, 52, 58, 43, 87],40
6,Provision PT838,42,[75, 48, 52, 58, 44, 88, 21, 75, 48, 52, 58, 44, 88],42
7,Samsung SNH1011N,56,[50, 35, 39, 44, 32, 59, 16, 50, 35, 39, 44, 32, 59],56
8,SimpleHome XCS71002,56,[67, 53, 59, 48, 76, 24, 67, 53, 59, 48, 76],56
9,SimpleHome XCS71003,56,[76, 53, 59, 48, 88, 24, 76, 53, 59, 48, 88],56


In [10]:
print("Evaluation for AE")
print("NN Structure Heuristics")
print("id,name,i,structure,i")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['struct_heuristics']
    nbb = math.ceil(min(nbi,min(s))/2)
    ae = construct_ae(nbi, nbo, nbb, s)
    ae_naive = []
    ae_naive.extend(sorted(s, reverse=True))
    ae_naive.append(nbb)
    ae_naive.extend(sorted(s, reverse=False))
    cb = int(compute_time_complexity_single_pass(nbi, ae_naive, nbo))
    ca = int(compute_time_complexity_single_pass(nbi, ae, nbo))
    g = (100.0 * (cb - ca))/cb
    print(f"{i + 1},{d['name']},{nbi},{ae},{nbi}")

Evaluation for AE
NN Structure Heuristics
id,name,i,structure,i
1,Danmini Doorbell,47,[476, 2384, 24, 476, 2384],47
2,Ecobee Thermostat,47,[432, 2160, 24, 432, 2160],47
3,Ennio Doorbell,59,[281, 1408, 30, 281, 1408],59
4,Philips B120N10,51,[495, 2476, 26, 495, 2476],51
5,Provision PT737E,40,[430, 2150, 20, 430, 2150],40
6,Provision PT838,42,[432, 2161, 21, 432, 2161],42
7,Samsung SNH1011N,56,[289, 1447, 28, 289, 1447],56
8,SimpleHome XCS71002,56,[439, 2195, 28, 439, 2195],56
9,SimpleHome XCS71003,56,[435, 2179, 28, 435, 2179],56


In [11]:
print("Evaluation for AE")
print("NN Structure Genetic")
print("id,name,i,structure,i")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['struct_genetic']
    nbb = math.ceil(min(nbi,min(s))/2)
    ae = construct_ae(nbi, nbo, nbb, s)
    ae_naive = []
    ae_naive.extend(sorted(s, reverse=True))
    ae_naive.append(nbb)
    ae_naive.extend(sorted(s, reverse=False))
    cb = int(compute_time_complexity_single_pass(nbi, ae_naive, nbo))
    ca = int(compute_time_complexity_single_pass(nbi, ae, nbo))
    g = (100.0 * (cb - ca))/cb
    print(f"{i + 1},{d['name']},{nbi},{ae},{nbi}")

Evaluation for AE
NN Structure Genetic
id,name,i,structure,i
1,Danmini Doorbell,47,[1, 873, 1, 873, 1],47
2,Ecobee Thermostat,47,[40, 804, 20, 40, 804],47
3,Ennio Doorbell,59,[114, 1676, 30, 114, 1676],59
4,Philips B120N10,51,[112, 805, 26, 112, 805],51
5,Provision PT737E,40,[127, 1217, 20, 127, 1217],40
6,Provision PT838,42,[20, 1223, 10, 20, 1223],42
7,Samsung SNH1011N,56,[60, 430, 28, 60, 430],56
8,SimpleHome XCS71002,56,[25, 896, 13, 25, 896],56
9,SimpleHome XCS71003,56,[24, 1801, 12, 24, 1801],56


In [12]:
print("Evaluation for AE")
print("NN Structure Random")
print("id,name,i,structure,i")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['struct_random']
    nbb = math.ceil(min(nbi,min(s))/2)
    ae = construct_ae(nbi, nbo, nbb, s)
    ae_naive = []
    ae_naive.extend(sorted(s, reverse=True))
    ae_naive.append(nbb)
    ae_naive.extend(sorted(s, reverse=False))
    cb = int(compute_time_complexity_single_pass(nbi, ae_naive, nbo))
    ca = int(compute_time_complexity_single_pass(nbi, ae, nbo))
    g = (100.0 * (cb - ca))/cb
    print(f"{i + 1},{d['name']},{nbi},{ae},{nbi}")

Evaluation for AE
NN Structure Random
id,name,i,structure,i
1,Danmini Doorbell,47,[45, 18, 65, 94, 86, 26, 91, 18, 50, 83, 70, 47, 16, 46, 71, 59, 59, 61, 18, 16, 57, 8, 45, 18, 65, 94, 86, 26, 91, 18, 50, 83, 70, 47, 16, 46, 71, 59, 59, 61, 18, 16, 57],47
2,Ecobee Thermostat,47,[55, 41, 72, 55, 97, 23, 45, 28, 70, 75, 58, 42, 72, 33, 79, 12, 55, 41, 72, 55, 97, 23, 45, 28, 70, 75, 58, 42, 72, 33, 79],47
3,Ennio Doorbell,59,[17, 76, 13, 79, 64, 65, 24, 26, 20, 69, 79, 89, 76, 89, 50, 94, 7, 17, 76, 13, 79, 64, 65, 24, 26, 20, 69, 79, 89, 76, 89, 50, 94],59
4,Philips B120N10,51,[66, 30, 81, 53, 55, 12, 96, 91, 21, 28, 71, 50, 22, 61, 15, 34, 80, 37, 95, 57, 97, 65, 89, 6, 66, 30, 81, 53, 55, 12, 96, 91, 21, 28, 71, 50, 22, 61, 15, 34, 80, 37, 95, 57, 97, 65, 89],51
5,Provision PT737E,40,[19, 10, 19, 57, 24, 71, 25, 17, 87, 74, 70, 27, 94, 19, 36, 27, 86, 59, 23, 64, 5, 19, 10, 19, 57, 24, 71, 25, 17, 87, 74, 70, 27, 94, 19, 36, 27, 86, 59, 23, 64],40
6,Provision PT838,42,[55, 30, 67, 59

In [13]:
print("Evaluation for AE")
print("Comparison with other techniques")
print("id,name,Dahlia,Heuristics,Genetic,Random")
for i, d in enumerate(devices):
    nbi = d['features']
    nbo = 1
    s = d['structure']
    h = d['struct_heuristics']
    g = d['struct_genetic']
    r = d['struct_random']
    nbbs = math.ceil(min(nbi,min(s))/2)
    nbbh = math.ceil(min(nbi,min(h))/2)
    nbbg = math.ceil(min(nbi,min(g))/2)
    nbbr = math.ceil(min(nbi,min(r))/2)
    aes = construct_ae(nbi, nbo, nbbs, s)
    aeh = construct_ae(nbi, nbo, nbbh, h)
    aeg = construct_ae(nbi, nbo, nbbg, g)
    aer = construct_ae(nbi, nbo, nbbr, r)
    cs = int(compute_time_complexity_single_pass(nbi, aes, nbo))
    ch = int(compute_time_complexity_single_pass(nbi, aeh, nbo))
    cg = int(compute_time_complexity_single_pass(nbi, aeg, nbo))
    cr = int(compute_time_complexity_single_pass(nbi, aer, nbo))
    print(f"{i + 1},{d['name']},{cs},{ch},{cg},{cr}")

Evaluation for AE
Comparison with other techniques
id,name,Dahlia,Heuristics,Genetic,Random
1,Danmini Doorbell,56112,2362964,3540,112758
2,Ecobee Thermostat,39897,1950912,83884,86334
3,Ennio Doorbell,20028,859953,444230,105408
4,Philips B120N10,50900,2556207,210679,136983
5,Provision PT737E,36767,1919950,342295,77093
6,Provision PT838,37733,1941862,63413,68158
7,Samsung SNH1011N,20857,902605,69110,145716
8,SimpleHome XCS71002,33576,2027741,59069,114751
9,SimpleHome XCS71003,36702,1995461,111493,110566
