In [1]:
import sys
sys.path.append("C:/Users/user/meepc")
import numpy as np
import pandas as pd
from models import Hankel,Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score

C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\user\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\numpy\.libs\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll


In [2]:
def calculate_fpr(y_actual, y_prediction):
    fp = 0
    tn = 0
    
    for i in range(len(y_actual)):
        if y_actual[i] == 0 and y_prediction[i] == 1:
            fp += 1
        elif y_actual[i] == 0 and y_prediction[i] == 0:
            tn += 1
    fpr= fp/(fp+tn)
    return fpr

In [3]:
train_normal = pd.read_csv('~/data/te/normal_training.csv')
train_attack = pd.read_csv('~/data/te/attack_training.csv')

In [4]:
sensors = [col for col in train_normal.columns if col not in ['faultNumber', 'simulationRun', 'sample','LABEL']]

In [5]:
scaler = StandardScaler()
X_normal = pd.DataFrame(index=train_normal.index, columns=sensors, data=scaler.fit_transform(train_normal[sensors]))
X_attack = pd.DataFrame(data=scaler.fit_transform(train_attack.loc[:,sensors]),index=train_attack.index,columns=sensors).reset_index().drop(columns=['index'])

In [6]:
hankel = Hankel()
lag = 360
stride = 0.5

In [7]:
df_test1 = pd.read_csv('~/data/te/normal_testing.csv')
df_test2 = pd.read_csv('~/data/te/attack_testing.csv')
df_test=pd.concat((df_test1,df_test2),axis=0)

# Epasad with 1 cluster and no threshold tuning (training attack included in test data)

In [8]:
test_combined = pd.concat((df_test,train_attack),axis=0)
X_test = pd.DataFrame(index=test_combined.index, columns=sensors, data=scaler.fit_transform(test_combined[sensors]))
Y_test = test_combined.loc[:,'LABEL']

In [9]:
labels = hankel.fit(np.array(Y_test),lag,stride)
y_actual = np.any(labels>0,axis=0).astype(int)

In [10]:
sensor_models = []
sensor_predicted = []
accuracy = []
precision = []
recall = []
fscore = []
fpr = []
for sens in sensors:
    train_normal = X_normal.loc[:,sens].values
    train_attack = X_attack.loc[:,sens].values
    model = Pipeline()
    model.fit(train_normal,train_attack,lag,stride,optimal_k=1,tune=False)
    test = X_test.loc[:,sens].values
    y_predicted = model.predict(test)
    sensor_predicted.append(y_predicted)
    accuracy.append(accuracy_score(y_actual,y_predicted))
    precision.append(precision_score(y_actual,y_predicted))
    recall.append(recall_score(y_actual,y_predicted))
    fscore.append(f1_score(y_actual,y_predicted))
    fpr.append(calculate_fpr(y_actual,y_predicted))
    sensor_models.append(model)    

In [11]:
sensor_predicted = np.asarray(sensor_predicted)
y_predicted = np.any(sensor_predicted,axis=0).astype(int)
y_predicted

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
print("Accuracy ",accuracy_score(y_actual,y_predicted))
print("Precision ",precision_score(y_actual,y_predicted))
print("Recall ",recall_score(y_actual,y_predicted))
print("F1-score ",f1_score(y_actual,y_predicted))
print("False Positive Rate ",calculate_fpr(y_actual,y_predicted))

Accuracy  0.963855421686747
Precision  0.975609756097561
Recall  0.9876543209876543
F1-score  0.9815950920245398


In [13]:
print("Accuracy -  Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(accuracy).mean(), np.median(np.asarray(accuracy)),np.asarray(accuracy).min(), np.asarray(accuracy).max()))
print("precision - Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(precision).mean(), np.median(np.asarray(precision)),np.asarray(precision).min(), np.asarray(precision).max()))
print("recall -    Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(recall).mean(), np.median(np.asarray(recall)),np.asarray(recall).min(), np.asarray(recall).max()))
print("f1 -        Mean: {} Median : {} Min : {} Max : {}".format(np.asarray(fscore).mean(), np.median(np.asarray(fscore)),np.asarray(fscore).min(), np.asarray(fscore).max()))

Accuracy -  Mean: 0.22949490268767378 Median : 0.14457831325301204 Min : 0.07228915662650602 Max : 0.9036144578313253
precision - Mean: 0.9964204550252901 Median : 1.0 Min : 0.9663865546218487 Max : 1.0
recall -    Mean: 0.21355650522317188 Median : 0.12345679012345678 Min : 0.04938271604938271 Max : 0.9259259259259259
f1 -        Mean: 0.2988380079986286 Median : 0.21978021978021975 Min : 0.09411764705882353 Max : 0.949367088607595


# Multiple clusters + No threshold tuning (training attack mixed in test data)

In [14]:
cluster_accuracy_1 = []
cluster_precision_1 = []
cluster_recall_1 = []
cluster_fscore_1 = []
cluster_fpr_1 = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    fpr = []
    for i,sens in enumerate(sensors):
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = Pipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,tune=False,kscore_init='inertia')
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        fpr.append(calculate_fpr(y_actual,y_predicted))
        sensor_models.append(model)  
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int) 
    cluster_accuracy_1.append(accuracy_score(y_actual, y_predicted))
    cluster_precision_1.append(precision_score(y_actual,y_predicted))
    cluster_recall_1.append(recall_score(y_actual,y_predicted))
    cluster_fscore_1.append(f1_score(y_actual,y_predicted))
    cluster_fpr_1.append(calculate_fpr(y_actual,y_predicted))
    print('------Number of Clusters: ',k,'-----------') 
    print("Accuracy ",cluster_accuracy_1[-1])
    print("Precision ",cluster_precision_1[-1])
    print("Recall ",cluster_recall_1[-1])
    print("F1-score ",cluster_fscore_1[-1])
    print("False Positive Rate ",cluster_fpr_1[-1],"\n") 

------Number of Clusters:  2 -----------
Accuracy  0.963855421686747
Precision  0.975609756097561
Recall  0.9876543209876543
F1-score  0.9815950920245398
False Positive Rate  1.0 

------Number of Clusters:  3 -----------
Accuracy  0.8012048192771084
Precision  0.9777777777777777
Recall  0.8148148148148148
F1-score  0.8888888888888888
False Positive Rate  0.75 



KeyboardInterrupt: 

In [None]:
print("Accuracy Scores: ",cluster_accuracy_1)
print("Precision Scores: ",cluster_precision_1)
print("Recall Scores: ",cluster_recall_1)
print("F1 Scores: ",cluster_fscore_1)
print("False Positive Rates: ",cluster_fpr_1)

[0.9847094801223242,
 0.8835616438356164,
 0.7727272727272727,
 0.8805460750853241,
 0.8689655172413794,
 0.9320388349514563]

# Multiple clusters + No threshold tuning (No concat of training and test data)

In [None]:
X_test = pd.DataFrame(index=df_test.index, columns=sensors, data=scaler.fit_transform(df_test[sensors]))
Y_test = df_test.loc[:,'LABEL']

In [None]:
labels = hankel.fit(np.array(Y_test),lag,stride)
y_actual = np.any(labels>0,axis=0).astype(int)

In [None]:
cluster_accuracy_2 = []
cluster_precision_2 = []
cluster_recall_2 = []
cluster_fscore_2 = []
cluster_fpr_2 = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    fpr = []
    for sens in sensors:
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = Pipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k=k,tune=False,kscore_init='inertia')
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        fpr.append(calculate_fpr(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    cluster_accuracy_2.append(accuracy_score(y_actual, y_predicted))
    cluster_precision_2.append(precision_score(y_actual,y_predicted))
    cluster_recall_2.append(recall_score(y_actual,y_predicted))
    cluster_fscore_2.append(f1_score(y_actual,y_predicted))
    cluster_fpr_2.append(calculate_fpr(y_actual,y_predicted))
    print('------Number of Clusters: ',k,'-----------') 
    print("Accuracy ",cluster_accuracy_2[-1])
    print("Precision ",cluster_precision_2[-1])
    print("Recall ",cluster_recall_2[-1])
    print("F1-score ",cluster_fscore_2[-1])
    print("False Positive Rate ",cluster_fpr_2[-1],"\n") 

-------number of clusters------ 2
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815
-------number of clusters------ 3
Accuracy  0.8090909090909091
Precision  0.9775280898876404
Recall  0.8207547169811321
F1-score  0.8923076923076922
-------number of clusters------ 4
Accuracy  0.6363636363636364
Precision  1.0
Recall  0.6226415094339622
F1-score  0.7674418604651162
-------number of clusters------ 5
Accuracy  0.7363636363636363
Precision  0.9753086419753086
Recall  0.7452830188679245
F1-score  0.8449197860962566
-------number of clusters------ 6
Accuracy  0.7818181818181819
Precision  0.9659090909090909
Recall  0.8018867924528302
F1-score  0.8762886597938144
-------number of clusters------ 7
Accuracy  0.8272727272727273
Precision  0.978021978021978
Recall  0.839622641509434
F1-score  0.9035532994923858


In [None]:
print("Accuracy Scores: ",cluster_accuracy_2)
print("Precision Scores: ",cluster_precision_2)
print("Recall Scores: ",cluster_recall_2)
print("F1 Scores: ",cluster_fscore_2)
print("False Positive Rates: ",cluster_fpr_2)

[0.9814814814814815,
 0.8923076923076922,
 0.7674418604651162,
 0.8449197860962566,
 0.8762886597938144,
 0.9035532994923858]

# Multiple clusters + Threshold tuning (No concat of training and test data)

In [None]:
cluster_accuracy_3 = []
cluster_precision_3 = []
cluster_recall_3 = []
cluster_fscore_3 = []
cluster_fpr_3 = []
for k in range(2,8):
    sensor_models = []
    sensor_predicted = []
    accuracy = []
    precision = []
    recall = []
    fscore = []
    fpr = []
    for sens in sensors:
        train_normal = X_normal.loc[:,sens].values
        train_attack = X_attack.loc[:,sens].values
        model = Pipeline()
        model.fit(train_normal,train_attack,lag,stride,optimal_k = k, kscore_init='inertia')
        test = X_test.loc[:,sens].values
        y_predicted = model.predict(test)
        sensor_predicted.append(y_predicted)
        accuracy.append(accuracy_score(y_actual,y_predicted))
        precision.append(precision_score(y_actual,y_predicted))
        recall.append(recall_score(y_actual,y_predicted))
        fscore.append(f1_score(y_actual,y_predicted))
        fpr.append(calculate_fpr(y_actual,y_predicted))
        sensor_models.append(model)    
    sensor_predicted = np.asarray(sensor_predicted)
    y_predicted = np.any(sensor_predicted,axis=0).astype(int)
    cluster_accuracy_3.append(accuracy_score(y_actual, y_predicted))
    cluster_precision_3.append(precision_score(y_actual,y_predicted))
    cluster_recall_3.append(recall_score(y_actual,y_predicted))
    cluster_fscore_3.append(f1_score(y_actual,y_predicted))
    cluster_fpr_3.append(calculate_fpr(y_actual,y_predicted))
    print('------Number of Clusters: ',k,'-----------') 
    print("Accuracy ",cluster_accuracy_3[-1])
    print("Precision ",cluster_precision_3[-1])
    print("Recall ",cluster_recall_3[-1])
    print("F1-score ",cluster_fscore_3[-1])
    print("False Positive Rate ",cluster_fpr_3[-1],"\n")

-------number of clusters------ 2
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815
-------number of clusters------ 3
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815
-------number of clusters------ 4
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815
-------number of clusters------ 5
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815
-------number of clusters------ 6
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815
-------number of clusters------ 7
Accuracy  0.9636363636363636
Precision  0.9636363636363636
Recall  1.0
F1-score  0.9814814814814815


In [None]:
print("Accuracy Scores: ",cluster_accuracy_3)
print("Precision Scores: ",cluster_precision_3)
print("Recall Scores: ",cluster_recall_3)
print("F1 Scores: ",cluster_fscore_3)
print("False Positive Rates: ",cluster_fpr_3)

[0.9814814814814815,
 0.9814814814814815,
 0.9814814814814815,
 0.9814814814814815,
 0.9814814814814815,
 0.9814814814814815]