In [1]:
import numpy as np
import pandas as pd
from models import Hankel,Corrhankel,RobustPipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score

In [2]:
df1 = pd.read_csv('~/data/ctown/dataset03.csv')
df2 = pd.read_csv('~/data/ctown/dataset04.csv')

train_normal = pd.concat((df1,df2),axis=0,ignore_index=True)
train_attack = df2[df2['ATT_FLAG']==1]

In [3]:
sensors = [col for col in train_normal.columns if col not in ['DATETIME','ATT_FLAG']]

In [4]:
scaler = StandardScaler()
X_normal = pd.DataFrame(index=train_normal.index, columns=sensors, data=scaler.fit_transform(train_normal[sensors]))


In [5]:
#  train_attack[sensors]
X_attack = pd.DataFrame(index=train_attack.index, columns=sensors, data=scaler.fit_transform(train_attack[sensors])).reset_index().drop(columns=['index'])
X_attack

Unnamed: 0,L_T1,L_T2,L_T3,L_T4,L_T5,L_T6,L_T7,F_PU1,S_PU1,F_PU2,...,P_J300,P_J256,P_J289,P_J415,P_J302,P_J306,P_J307,P_J317,P_J14,P_J422
0,0.510851,-0.546397,-0.878085,-2.084040,-1.526839,-2.239309,-0.533841,-0.623119,0.0,0.548804,...,-2.059407,0.794886,-2.054304,0.013617,-1.323365,0.506761,-1.344766,1.507526,0.763488,-2.086719
1,0.623110,-1.147382,-0.297260,-2.101038,-1.035659,-0.755638,-0.193180,-0.784230,0.0,0.506152,...,-2.298174,0.975545,-2.298500,0.105519,0.590949,1.095478,0.604112,-1.873753,0.980724,-2.343036
2,0.757819,-1.734552,0.341648,-1.710086,-0.312534,0.613905,0.182722,-0.896547,0.0,0.476418,...,-2.578631,0.907219,-2.577037,0.232052,0.696182,1.172992,0.708874,-1.758276,1.095377,-2.634306
3,0.735368,-1.858894,0.995076,-0.911185,0.451524,0.785098,0.770069,-0.477660,0.0,0.587312,...,-0.797353,1.370448,-0.783719,0.457146,0.419171,1.141594,0.433104,-1.461149,-0.877852,-0.618722
4,0.690464,-1.617118,1.503298,-0.078287,1.270157,-0.013802,1.745065,-0.474898,0.0,0.587799,...,-0.069683,-0.380559,-0.089286,0.707547,-1.112899,0.855085,-1.130620,2.164060,-0.657599,-0.051718
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
487,-1.824121,0.323996,-0.268218,-0.316258,1.720405,0.785098,0.558624,-0.129661,0.0,0.679439,...,0.059176,1.170102,0.082415,0.530402,-0.947312,0.784439,-0.944206,0.232086,-0.570101,0.173530
488,-1.741798,0.241102,0.428772,0.431650,0.778977,0.785098,1.145971,-0.233692,0.0,0.651898,...,0.055386,1.181682,0.078599,0.405202,0.106567,-1.176969,0.104953,0.627823,-0.565575,0.185181
489,-1.622056,0.185839,1.111241,0.737612,-0.176095,0.785098,1.944763,-0.284327,0.0,0.638493,...,0.062966,1.538369,0.082415,0.575687,0.573926,-1.154401,0.587165,-1.086172,-0.564067,0.189065
490,-1.427475,0.234194,1.633983,1.434527,-0.899221,-0.812702,0.417661,-0.369946,0.0,0.615827,...,0.532920,-0.321497,0.517390,0.856722,0.654398,-1.162251,0.667277,-1.221112,-0.437345,0.515286


In [6]:
hankel = Hankel()
corrhankel = Corrhankel()
lag = 60
stride = 0.5

In [7]:
corr_normal,nolag_normal = corrhankel.fit(X_normal.to_numpy(),lag,stride)

In [8]:
corr_attack,nolag_attack = corrhankel.fit(X_attack.to_numpy(),lag,stride)

In [9]:
df_test = pd.read_csv('~/data/ctown/test_dataset.csv')

# Epasad with 1 cluster

In [10]:
X_test = pd.DataFrame(index=df_test.index, columns=sensors, data=scaler.fit_transform(df_test[sensors]))
corr_test,nolag_test = corrhankel.fit(X_test.to_numpy(),lag,stride)
Y_test = df_test.loc[:,'ATT_FLAG']

In [11]:
labels = hankel.fit(np.array(Y_test),lag,stride)
y_actual = np.any(labels>0,axis=0).astype(int)

In [12]:
sensor_models = []
sensor_predicted = []
accuracy = []
precision = []
recall = []
fscore = []
for i,sens in enumerate(sensors):
    train_normal = X_normal.loc[:,sens].values
    model = RobustPipeline()
    model.fit(train_normal,train_attack,lag,stride,optimal_k=1,kscore_init='inertia',tune=False,alpha=0.05)
    test = X_test.loc[:,sens].values
    y_predicted = model.predict(test)
    sensor_predicted.append(y_predicted)
    accuracy.append(accuracy_score(y_actual,y_predicted))
    precision.append(precision_score(y_actual,y_predicted))
    recall.append(recall_score(y_actual,y_predicted))
    fscore.append(f1_score(y_actual,y_predicted))
    sensor_models.append(model)    

clustering done
n:  430
(409, 60)
(409, 60)
(409, 60)
pca done
R value 8
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 11
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 14
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 36
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 14
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 33
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 32
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
(409, 60)
pca done
R value 15
meepc done
clustering done
n:  430
(409, 60)
pca done
R value 59
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
(409, 60)
pca done
R value 15
meepc done
clustering done
n:  430
(409, 60)
(409, 60)
pca done
R value 15
meepc done
clustering done
n:  430
(409, 60)
pca done
R value 59
meepc done
clustering done
n:  430
(409, 60)
pca done
R value 59
meepc done
clustering done
n:  430
(4

In [None]:
sensor_predicted = np.asarray(sensor_predicted)
y_predicted = np.any(sensor_predicted,axis=0).astype(int)
y_predicted

array([1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1])

In [None]:
print("Accuracy ",accuracy_score(y_actual,y_predicted))
print("Precision ",precision_score(y_actual,y_predicted))
print("Recall ",recall_score(y_actual,y_predicted))
print("F1-score ",f1_score(y_actual,y_predicted))

Accuracy  0.5
Precision  0.45098039215686275
Recall  0.7931034482758621
F1-score  0.5750000000000001


In [None]:
print("Accuracy -  Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(accuracy).mean(), np.median(np.asarray(accuracy)),np.asarray(accuracy).min(), np.asarray(accuracy).max()))
print("precision - Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(precision).mean(), np.median(np.asarray(precision)),np.asarray(precision).min(), np.asarray(precision).max()))
print("recall -    Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(recall).mean(), np.median(np.asarray(recall)),np.asarray(recall).min(), np.asarray(recall).max()))
print("f1 -        Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(fscore).mean(), np.median(np.asarray(fscore)),np.asarray(fscore).min(), np.asarray(fscore).max()))

Accuracy -  Mean: 0.580316742081448 Median : 0.5735294117647058 Min : 0.5147058823529411 Max : 0.6617647058823529
precision - Mean: 0.37145909645909647 Median : 0.4166666666666667 Min : 0.0 Max : 1.0
recall -    Mean: 0.07869142351900972 Median : 0.034482758620689655 Min : 0.0 Max : 0.27586206896551724
f1 -        Mean: 0.12373239279321178 Median : 0.06451612903225806 Min : 0.0 Max : 0.4102564102564103


# Multiple clusters + No threshold tuning

In [None]:
cluster_fscore = []
for k in range(2,8):
    print('-------------------------------------',k,'-------------------------------------')
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    for i,sens in enumerate(sensors):
        print(sens)
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = RobustPipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,tune=False,kscore_init='inertia',alpha=0.05)
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    print('------Number of Clusters: ',k,'-----------')
    print("Accuracy ",accuracy_score(y_actual,y_predicted))
    print("Precision ",precision_score(y_actual,y_predicted))
    print("Recall ",recall_score(y_actual,y_predicted))
    print("F1-score ",f1_score(y_actual,y_predicted),'\n')
    cluster_fscore.append(f1_score(y_actual,y_predicted))

------------------------------------- 2 -------------------------------------
L_T1
clustering done
n:  236
(225, 60)
(225, 60)
pca done
R value 8
meepc iterations done
meepc done
n:  194
(185, 60)
(185, 60)
pca done
R value 9
meepc iterations done
meepc done
L_T2
clustering done
n:  216
(206, 60)
(206, 60)
pca done
R value 10
meepc iterations done
meepc done
n:  214
(204, 60)
(204, 60)
pca done
R value 10
meepc iterations done
meepc done
L_T3
clustering done
n:  215
(205, 60)
(205, 60)
(205, 60)
pca done
R value 14
meepc iterations done
meepc done
n:  215
(205, 60)
(205, 60)
pca done
R value 13
meepc iterations done
meepc done
L_T4
clustering done
n:  190
(181, 60)
(181, 60)
pca done
R value 32
meepc iterations done
meepc done
n:  240
(228, 60)
(228, 60)
(228, 60)
pca done
R value 32
meepc iterations done
meepc done
L_T5
clustering done
n:  207
(197, 60)
(197, 60)
pca done
R value 12
meepc iterations done
meepc done
n:  223
(212, 60)
(212, 60)
pca done
R value 12
meepc iterations done


In [None]:
cluster_fscore

[0.7659574468085107,
 0.6296296296296295,
 0.676470588235294,
 0.5538461538461539,
 0.5945945945945945,
 0.5750000000000001]

# Epasad with 1 cluster + No threshold tuning (No concat of training and test data)

In [None]:
X_test = pd.DataFrame(index=df_test.index, columns=sensors, data=scaler.fit_transform(df_test[sensors]))
corr_test,nolag_test = corrhankel.fit(X_test.to_numpy(),lag,stride)
Y_test = df_test.loc[:,'ATT_FLAG']

In [None]:
labels = hankel.fit(np.array(Y_test),lag,stride)
y_actual = np.any(labels>0,axis=0).astype(int)

In [None]:
sensor_models = []
sensor_predicted = []
accuracy = []
precision = []
recall = []
fscore = []
for i,sens in enumerate(sensors):
    train_normal = X_normal.loc[:,sens].values
    train_attack = X_attack.loc[:,sens].values
    model = Pipeline()
    model.fit(train_normal,train_attack,lag,stride,optimal_k=1,tune=False,kscore_init='inertia',corr_normal=corr_normal[:,i].reshape(nolag_normal,len(X_normal.columns)).T,
              corr_attack=corr_attack[:,i].reshape(nolag_attack,len(X_attack.columns)).T,only_corr=True)
    test = X_test.loc[:,sens].values
    y_predicted = model.predict(test,corr_test=corr_test[:,i].reshape(nolag_test,len(X_normal.columns)).T)
    sensor_predicted.append(y_predicted)
    accuracy.append(accuracy_score(y_actual,y_predicted))
    precision.append(precision_score(y_actual,y_predicted))
    recall.append(recall_score(y_actual,y_predicted))
    fscore.append(f1_score(y_actual,y_predicted))
    sensor_models.append(model)    

In [None]:
sensor_predicted = np.asarray(sensor_predicted)
y_predicted = np.any(sensor_predicted,axis=0).astype(int)
y_predicted

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1])

In [None]:
print("Accuracy ",accuracy_score(y_actual,y_predicted))
print("Precision ",precision_score(y_actual,y_predicted))
print("Recall ",recall_score(y_actual,y_predicted))
print("F1-score ",f1_score(y_actual,y_predicted))

Accuracy  0.75
Precision  0.6764705882352942
Recall  0.7931034482758621
F1-score  0.7301587301587301


In [None]:
print("Accuracy -  Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(accuracy).mean(), np.median(np.asarray(accuracy)),np.asarray(accuracy).min(), np.asarray(accuracy).max()))
print("precision - Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(precision).mean(), np.median(np.asarray(precision)),np.asarray(precision).min(), np.asarray(precision).max()))
print("recall -    Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(recall).mean(), np.median(np.asarray(recall)),np.asarray(recall).min(), np.asarray(recall).max()))
print("f1 -        Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(fscore).mean(), np.median(np.asarray(fscore)),np.asarray(fscore).min(), np.asarray(fscore).max()))

Accuracy -  Mean: 0.6063611491108071 Median : 0.6029411764705882 Min : 0.5294117647058824 Max : 0.7058823529411765
precision - Mean: 0.5817183462532299 Median : 0.75 Min : 0.0 Max : 1.0
recall -    Mean: 0.10344827586206896 Median : 0.06896551724137931 Min : 0.0 Max : 0.3103448275862069
f1 -        Mean: 0.16933723168106843 Median : 0.12903225806451613 Min : 0.0 Max : 0.4736842105263158


# Multiple clusters + No threshold tuning (No concat of training and test data)

In [None]:
cluster_fscore = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    for i,sens in enumerate(sensors):
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = Pipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,tune=False,corr_normal=corr_normal[:,i].reshape(nolag_normal,len(X_normal.columns)).T,
                corr_attack=corr_attack[:,i].reshape(nolag_attack,len(X_attack.columns)).T,only_corr=True)
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test,corr_test=corr_test[:,i].reshape(nolag_test,len(X_normal.columns)).T)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    print('------Number of Clusters: ',k,'-----------')
    print("Accuracy ",accuracy_score(y_actual,y_predicted))
    print("Precision ",precision_score(y_actual,y_predicted))
    print("Recall ",recall_score(y_actual,y_predicted))
    print("F1-score ",f1_score(y_actual,y_predicted),'\n')
    cluster_fscore.append(f1_score(y_actual,y_predicted))

------Number of Clusters:  2 -----------
Accuracy  0.75
Precision  0.6363636363636364
Recall  0.9655172413793104
F1-score  0.7671232876712328 

------Number of Clusters:  3 -----------
Accuracy  0.6470588235294118
Precision  0.5471698113207547
Recall  1.0
F1-score  0.7073170731707317 

------Number of Clusters:  4 -----------
Accuracy  0.5441176470588235
Precision  0.4827586206896552
Recall  0.9655172413793104
F1-score  0.6436781609195403 

------Number of Clusters:  5 -----------
Accuracy  0.5588235294117647
Precision  0.4915254237288136
Recall  1.0
F1-score  0.6590909090909091 

------Number of Clusters:  6 -----------
Accuracy  0.5294117647058824
Precision  0.47540983606557374
Recall  1.0
F1-score  0.6444444444444445 

------Number of Clusters:  7 -----------
Accuracy  0.5294117647058824
Precision  0.47540983606557374
Recall  1.0
F1-score  0.6444444444444445 



In [None]:
cluster_fscore

[0.7671232876712328,
 0.7073170731707317,
 0.6436781609195403,
 0.6590909090909091,
 0.6444444444444445,
 0.6444444444444445]

# Multiple clusters + Threshold tuning (No concat of training and test data)

In [None]:
cluster_fscore = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    for i,sens in enumerate(sensors):
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = Pipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,kscore_init='inertia',corr_normal=corr_normal[:,i].reshape(nolag_normal,len(X_normal.columns)).T,
                corr_attack=corr_attack[:,i].reshape(nolag_attack,len(X_attack.columns)).T,only_corr=True)
        test = X_test.loc[:,sens].values    
        y_predicted = model.predict(test,corr_test=corr_test[:,i].reshape(nolag_test,len(X_normal.columns)).T)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    print('------Number of Clusters: ',k,'-----------')
    print("Accuracy ",accuracy_score(y_actual,y_predicted))
    print("Precision ",precision_score(y_actual,y_predicted))
    print("Recall ",recall_score(y_actual,y_predicted))
    print("F1-score ",f1_score(y_actual,y_predicted),'\n')
    cluster_fscore.append(f1_score(y_actual,y_predicted))

------Number of Clusters:  2 -----------
Accuracy  0.75
Precision  0.6363636363636364
Recall  0.9655172413793104
F1-score  0.7671232876712328 

------Number of Clusters:  3 -----------
Accuracy  0.6323529411764706
Precision  0.5370370370370371
Recall  1.0
F1-score  0.6987951807228916 

------Number of Clusters:  4 -----------
Accuracy  0.6176470588235294
Precision  0.5272727272727272
Recall  1.0
F1-score  0.6904761904761904 

------Number of Clusters:  5 -----------
Accuracy  0.45588235294117646
Precision  0.4393939393939394
Recall  1.0
F1-score  0.6105263157894737 

------Number of Clusters:  6 -----------
Accuracy  0.5735294117647058
Precision  0.5
Recall  1.0
F1-score  0.6666666666666666 

------Number of Clusters:  7 -----------
Accuracy  0.4852941176470588
Precision  0.453125
Recall  1.0
F1-score  0.6236559139784946 



In [None]:
cluster_fscore

[0.7671232876712328,
 0.6987951807228916,
 0.6904761904761904,
 0.6105263157894737,
 0.6666666666666666,
 0.6236559139784946]