In [8]:
import numpy as np
import pandas as pd
from models import Hankel,Corrhankel,RobustPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score

In [9]:
df1 = pd.read_csv('~/data/ctown/dataset03.csv')
df2 = pd.read_csv('~/data/ctown/dataset04.csv')

train_normal = pd.concat((df1,df2),axis=0,ignore_index=True)
train_attack = df2[df2['ATT_FLAG']==1]

In [10]:
sensors = [col for col in train_normal.columns if col not in ['DATETIME','ATT_FLAG','S_PU6','F_PU6','S_PU11','F_PU11']]

In [11]:
y_truth=train_normal.loc[:,'ATT_FLAG']
y_truth

0        0
1        0
2        0
3        0
4        0
        ..
12933    0
12934    0
12935    0
12936    0
12937    0
Name: ATT_FLAG, Length: 12938, dtype: int64

In [12]:
scaler = StandardScaler()
X_normal = pd.DataFrame(index=train_normal.index, columns=sensors, data=scaler.fit_transform(train_normal[sensors]))


In [13]:
#  train_attack[sensors]
X_attack = pd.DataFrame(index=train_attack.index, columns=sensors, data=scaler.fit_transform(train_attack[sensors])).reset_index().drop(columns=['index'])
X_attack

Unnamed: 0,L_T1,L_T2,L_T3,L_T4,L_T5,L_T6,L_T7,F_PU1,S_PU1,F_PU2,...,P_J300,P_J256,P_J289,P_J415,P_J302,P_J306,P_J307,P_J317,P_J14,P_J422
0,0.510851,-0.546397,-0.878085,-2.084040,-1.526839,-2.239309,-0.533841,-0.623119,0.0,0.548804,...,-2.059407,0.794886,-2.054304,0.013617,-1.323365,0.506761,-1.344766,1.507526,0.763488,-2.086719
1,0.623110,-1.147382,-0.297260,-2.101038,-1.035659,-0.755638,-0.193180,-0.784230,0.0,0.506152,...,-2.298174,0.975545,-2.298500,0.105519,0.590949,1.095478,0.604112,-1.873753,0.980724,-2.343036
2,0.757819,-1.734552,0.341648,-1.710086,-0.312534,0.613905,0.182722,-0.896547,0.0,0.476418,...,-2.578631,0.907219,-2.577037,0.232052,0.696182,1.172992,0.708874,-1.758276,1.095377,-2.634306
3,0.735368,-1.858894,0.995076,-0.911185,0.451524,0.785098,0.770069,-0.477660,0.0,0.587312,...,-0.797353,1.370448,-0.783719,0.457146,0.419171,1.141594,0.433104,-1.461149,-0.877852,-0.618722
4,0.690464,-1.617118,1.503298,-0.078287,1.270157,-0.013802,1.745065,-0.474898,0.0,0.587799,...,-0.069683,-0.380559,-0.089286,0.707547,-1.112899,0.855085,-1.130620,2.164060,-0.657599,-0.051718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,-1.824121,0.323996,-0.268218,-0.316258,1.720405,0.785098,0.558624,-0.129661,0.0,0.679439,...,0.059176,1.170102,0.082415,0.530402,-0.947312,0.784439,-0.944206,0.232086,-0.570101,0.173530
488,-1.741798,0.241102,0.428772,0.431650,0.778977,0.785098,1.145971,-0.233692,0.0,0.651898,...,0.055386,1.181682,0.078599,0.405202,0.106567,-1.176969,0.104953,0.627823,-0.565575,0.185181
489,-1.622056,0.185839,1.111241,0.737612,-0.176095,0.785098,1.944763,-0.284327,0.0,0.638493,...,0.062966,1.538369,0.082415,0.575687,0.573926,-1.154401,0.587165,-1.086172,-0.564067,0.189065
490,-1.427475,0.234194,1.633983,1.434527,-0.899221,-0.812702,0.417661,-0.369946,0.0,0.615827,...,0.532920,-0.321497,0.517390,0.856722,0.654398,-1.162251,0.667277,-1.221112,-0.437345,0.515286


In [14]:
hankel = Hankel()
corrhankel = Corrhankel() 
lag = 60
stride = 0.5

In [15]:
df_test = pd.read_csv('~/data/ctown/test_dataset.csv')

In [27]:
corr_normal,nolag_normal = corrhankel.fit(X_normal.to_numpy(),lag,stride)

In [28]:
corr_attack,nolag_attack = corrhankel.fit(X_attack.to_numpy(),lag,stride)

# Epasad with 1 cluster

In [17]:
X_test = pd.DataFrame(index=df_test.index, columns=sensors, data=scaler.fit_transform(df_test[sensors]))
Y_test = df_test.loc[:,'ATT_FLAG']

In [18]:
labels = hankel.fit(np.array(Y_test),lag,stride)
y_actual = np.any(labels>0,axis=0).astype(int)

In [19]:
sensor_models = []
sensor_predicted = []
accuracy = []
precision = []
recall = []
fscore = []
for i,sens in enumerate(sensors):
    print("sensor -> ",sens)
    train_normal = X_normal.loc[:,sens].values
    model = RobustPipeline()
    model.fit(train_normal,train_attack,lag,stride,optimal_k=1,kscore_init='inertia',tune=False,alpha=0.05,y_truth=y_truth)
    test = X_test.loc[:,sens].values
    y_predicted = model.predict(test)
    sensor_predicted.append(y_predicted)
    accuracy.append(accuracy_score(y_actual,y_predicted))
    precision.append(precision_score(y_actual,y_predicted))
    recall.append(recall_score(y_actual,y_predicted))
    fscore.append(f1_score(y_actual,y_predicted))
    sensor_models.append(model) 
   

sensor ->  L_T1
Percentage of attack points are inactive in 1th: (cluster) iteration is 3.23 %
Percentage of attack points are inactive in 2th: (cluster) iteration is 12.90 %
Percentage of attack points are inactive in 3th: (cluster) iteration is 12.90 %
Percentage of attack points are inactive in 4th: (cluster) iteration is 9.68 %
Percentage of attack points are inactive in 5th: (cluster) iteration is 9.68 %
Percentage of attack points are inactive in 6th: (cluster) iteration is 9.68 %
Percentage of attack points are inactive in 7th: (cluster) iteration is 9.68 %
Clustering Done
Percentage of attack points considered inactive in 1th: (PCA) iteration is 22.58 %
Percentage of attack points considered inactive in 2th: (PCA) iteration is 38.71 %
Percentage of attack points considered inactive in 3th: (PCA) iteration is 41.94 %
               PCA done
Percentage of attack points considered inactive in 1th: (MEEPC) iteration is 3.23 %
Percentage of attack points considered inactive in 2th: 

In [20]:
sensor_predicted = np.asarray(sensor_predicted)
y_predicted = np.any(sensor_predicted,axis=0).astype(int)
y_predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0])

In [21]:
print("Accuracy ",accuracy_score(y_actual,y_predicted))
print("Precision ",precision_score(y_actual,y_predicted))
print("Recall ",recall_score(y_actual,y_predicted))
print("F1-score ",f1_score(y_actual,y_predicted))

Accuracy  0.7647058823529411
Precision  0.8823529411764706
Recall  0.5172413793103449
F1-score  0.6521739130434783


In [22]:
print("Accuracy -  Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(accuracy).mean(), np.median(np.asarray(accuracy)),np.asarray(accuracy).min(), np.asarray(accuracy).max()))
print("precision - Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(precision).mean(), np.median(np.asarray(precision)),np.asarray(precision).min(), np.asarray(precision).max()))
print("recall -    Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(recall).mean(), np.median(np.asarray(recall)),np.asarray(recall).min(), np.asarray(recall).max()))
print("f1 -        Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(fscore).mean(), np.median(np.asarray(fscore)),np.asarray(fscore).min(), np.asarray(fscore).max()))

Accuracy -  Mean: 0.583710407239819 Median : 0.5735294117647058 Min : 0.5735294117647058 Max : 0.6470588235294118
precision - Mean: 0.24908424908424906 Median : 0.0 Min : 0.0 Max : 1.0
recall -    Mean: 0.02564102564102564 Median : 0.0 Min : 0.0 Max : 0.1724137931034483
f1 -        Mean: 0.0453389106852105 Median : 0.0 Min : 0.0 Max : 0.29411764705882354


# Multiple clusters + No threshold tuning

In [23]:
cluster_fscore = []
for k in range(2,8):
    print('-------------------------------------',k,'-------------------------------------')
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    for i,sens in enumerate(sensors):
        print(sens)
        if sens is not 'L_T3':
            continue
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = RobustPipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,tune=False,kscore_init='inertia',alpha=0.05,y_truth=y_truth)
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    print('------Number of Clusters: ',k,'-----------')
    print("Accuracy ",accuracy_score(y_actual,y_predicted))
    print("Precision ",precision_score(y_actual,y_predicted))
    print("Recall ",recall_score(y_actual,y_predicted))
    print("F1-score ",f1_score(y_actual,y_predicted),'\n')
    cluster_fscore.append(f1_score(y_actual,y_predicted))

------------------------------------- 2 -------------------------------------
L_T1
Percentage of attack points are inactive in 1th: (cluster) iteration is 9.68 %
Percentage of attack points are inactive in 2th: (cluster) iteration is 12.90 %
Percentage of attack points are inactive in 3th: (cluster) iteration is 16.13 %
Percentage of attack points are inactive in 4th: (cluster) iteration is 16.13 %
Clustering Done
Percentage of attack points considered inactive in 1th: (PCA) iteration is 23.08 %
Percentage of attack points considered inactive in 2th: (PCA) iteration is 38.46 %
               PCA done
Percentage of attack points considered inactive in 1th: (MEEPC) iteration is 0.00 %
Percentage of attack points considered inactive in 2th: (MEEPC) iteration is 0.00 %
Percentage of attack points considered inactive in 3th: (MEEPC) iteration is 0.00 %
Percentage of attack points considered inactive in 4th: (MEEPC) iteration is 0.00 %
Percentage of attack points considered inactive in 5th: 

ZeroDivisionError: division by zero

In [None]:
cluster_fscore

[0.7234042553191489,
 0.7234042553191489,
 0.6229508196721311,
 0.6849315068493151,
 0.6329113924050632,
 0.6206896551724138]

# Epasad with 1 cluster + No threshold tuning (only Correlation)

In [29]:
X_test = pd.DataFrame(index=df_test.index, columns=sensors, data=scaler.fit_transform(df_test[sensors]))
corr_test,nolag_test = corrhankel.fit(X_test.to_numpy(),lag,stride)
Y_test = df_test.loc[:,'ATT_FLAG']

In [30]:
labels = hankel.fit(np.array(Y_test),lag,stride)
y_actual = np.any(labels>0,axis=0).astype(int)

In [31]:
sensor_models = []
sensor_predicted = []
accuracy = []
precision = []
recall = []
fscore = []
for i,sens in enumerate(sensors):
    train_normal = X_normal.loc[:,sens].values
    train_attack = X_attack.loc[:,sens].values
    model = RobustPipeline()
    model.fit(train_normal,train_attack,lag,stride,optimal_k=1,tune=False,kscore_init='inertia',corr_normal=corr_normal[:,i].reshape(nolag_normal,len(X_normal.columns)).T,
              corr_attack=corr_attack[:,i].reshape(nolag_attack,len(X_attack.columns)).T,only_corr=True,alpha=0.05,y_truth=y_truth)
    test = X_test.loc[:,sens].values
    y_predicted = model.predict(test,corr_test=corr_test[:,i].reshape(nolag_test,len(X_normal.columns)).T)
    sensor_predicted.append(y_predicted)
    accuracy.append(accuracy_score(y_actual,y_predicted))
    precision.append(precision_score(y_actual,y_predicted))
    recall.append(recall_score(y_actual,y_predicted))
    fscore.append(f1_score(y_actual,y_predicted))
    sensor_models.append(model)    

Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering Done
               PCA done
               Meepc done
Clustering

In [None]:
sensor_predicted = np.asarray(sensor_predicted)
y_predicted = np.any(sensor_predicted,axis=0).astype(int)
y_predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0])

In [None]:
print("Accuracy ",accuracy_score(y_actual,y_predicted))
print("Precision ",precision_score(y_actual,y_predicted))
print("Recall ",recall_score(y_actual,y_predicted))
print("F1-score ",f1_score(y_actual,y_predicted))

Accuracy  0.8088235294117647
Precision  0.9
Recall  0.6206896551724138
F1-score  0.7346938775510204


In [None]:
print("Accuracy -  Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(accuracy).mean(), np.median(np.asarray(accuracy)),np.asarray(accuracy).min(), np.asarray(accuracy).max()))
print("precision - Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(precision).mean(), np.median(np.asarray(precision)),np.asarray(precision).min(), np.asarray(precision).max()))
print("recall -    Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(recall).mean(), np.median(np.asarray(recall)),np.asarray(recall).min(), np.asarray(recall).max()))
print("f1 -        Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(fscore).mean(), np.median(np.asarray(fscore)),np.asarray(fscore).min(), np.asarray(fscore).max()))

Accuracy -  Mean: 0.5935143288084465 Median : 0.5735294117647058 Min : 0.5441176470588235 Max : 0.6764705882352942
precision - Mean: 0.32967032967032966 Median : 0.0 Min : 0.0 Max : 1.0
recall -    Mean: 0.05039787798408488 Median : 0.0 Min : 0.0 Max : 0.2413793103448276
f1 -        Mean: 0.0851332210203178 Median : 0.0 Min : 0.0 Max : 0.3888888888888889


# Multiple clusters + No threshold tuning (No concat of training and test data)

In [None]:
cluster_fscore = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    for i,sens in enumerate(sensors):
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = RobustPipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,tune=False,corr_normal=corr_normal[:,i].reshape(nolag_normal,len(X_normal.columns)).T,
                corr_attack=corr_attack[:,i].reshape(nolag_attack,len(X_attack.columns)).T,only_corr=True,alpha=0.05,y_truth=y_truth)
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test,corr_test=corr_test[:,i].reshape(nolag_test,len(X_normal.columns)).T)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    print('------Number of Clusters: ',k,'-----------')
    print("Accuracy ",accuracy_score(y_actual,y_predicted))
    print("Precision ",precision_score(y_actual,y_predicted))
    print("Recall ",recall_score(y_actual,y_predicted))
    print("F1-score ",f1_score(y_actual,y_predicted),'\n')
    cluster_fscore.append(f1_score(y_actual,y_predicted))

clustering done
n:  357
(340, 42)
(340, 42)
(340, 42)
pca done
R value 15
meepc done
n:  73
(70, 42)
(70, 42)
pca done
R value 13
meepc done
clustering done
n:  383
(364, 42)
(364, 42)
(364, 42)
pca done
R value 15
meepc done
n:  47
(45, 42)
(45, 42)
pca done
R value 15
meepc done
clustering done
n:  363
(345, 42)
(345, 42)
(345, 42)
pca done
R value 13
meepc done
n:  67
(64, 42)
(64, 42)
pca done
R value 12
meepc done
clustering done
n:  341
(324, 42)
(324, 42)
pca done
R value 13
meepc done
n:  89
(85, 42)
(85, 42)
pca done
R value 13
meepc done
clustering done
n:  291
(277, 42)
(277, 42)
pca done
R value 12
meepc done
n:  139
(133, 42)
(133, 42)
pca done
R value 12
meepc done
clustering done
n:  116
(111, 42)
(111, 42)
pca done
R value 13
meepc done
n:  314
(299, 42)
(299, 42)
pca done
R value 13
meepc done
clustering done
n:  289
(275, 42)
(275, 42)
(275, 42)
pca done
R value 14
meepc done
n:  141
(134, 42)
(134, 42)
pca done
R value 13
meepc done
clustering done
n:  199
(190, 42)


In [None]:
cluster_fscore

[0.7547169811320755,
 0.7812500000000001,
 0.71875,
 0.7428571428571429,
 0.7532467532467533,
 0.6582278481012658]

# Multiple clusters + Threshold tuning (No concat of training and test data)

In [None]:
cluster_fscore = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    for i,sens in enumerate(sensors):
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = RobustPipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,kscore_init='inertia',corr_normal=corr_normal[:,i].reshape(nolag_normal,len(X_normal.columns)).T,
                corr_attack=corr_attack[:,i].reshape(nolag_attack,len(X_attack.columns)).T,only_corr=True,alpha=0.05,y_truth=y_truth)
        test = X_test.loc[:,sens].values    
        y_predicted = model.predict(test,corr_test=corr_test[:,i].reshape(nolag_test,len(X_normal.columns)).T)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    print('------Number of Clusters: ',k,'-----------')
    print("Accuracy ",accuracy_score(y_actual,y_predicted))
    print("Precision ",precision_score(y_actual,y_predicted))
    print("Recall ",recall_score(y_actual,y_predicted))
    print("F1-score ",f1_score(y_actual,y_predicted),'\n')
    cluster_fscore.append(f1_score(y_actual,y_predicted))

clustering done
n:  398
(379, 42)
(379, 42)
(379, 42)
pca done
R value 16
meepc done
n:  32
(31, 42)
(31, 42)
pca done
R value 13
meepc done
clustering done
n:  62
(59, 42)
(59, 42)
pca done
R value 15
meepc done
n:  368
(350, 42)
(350, 42)
(350, 42)
pca done
R value 15
meepc done
clustering done
n:  297
(283, 42)
(283, 42)
(283, 42)
pca done
R value 13
meepc done
n:  133
(127, 42)
(127, 42)
pca done
R value 12
meepc done
clustering done
n:  155
(148, 42)
(148, 42)
pca done
R value 13
meepc done
n:  275
(262, 42)
(262, 42)
(262, 42)
pca done
R value 13
meepc done
clustering done
n:  172
(164, 42)
(164, 42)
(164, 42)
pca done
R value 11
meepc done
n:  258
(246, 42)
(246, 42)
(246, 42)
pca done
R value 12
meepc done
clustering done
n:  200
(190, 42)
(190, 42)
(190, 42)
pca done
R value 13
meepc done
n:  230
(219, 42)
(219, 42)
(219, 42)
pca done
R value 13
meepc done
clustering done
n:  97
(93, 42)
(93, 42)
pca done
R value 13
meepc done
n:  333
(317, 42)
(317, 42)
(317, 42)
pca done
R v

In [None]:
cluster_fscore

[0.7605633802816901,
 0.7272727272727274,
 0.7200000000000001,
 0.7000000000000002,
 0.6829268292682926,
 0.6585365853658537]