# Supervised Learning

The following implements a KNN model for supervised learning, after undersampling and keeping only those features whose absolute correlation with the Normal/Attack label is higher than 0.5.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [None]:
# Load the dataset from CSV; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv("SWaT_Dataset_Attack_v0.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,Timestamp,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,MV201,P201,P202,P203,P204,P205,P206,DPIT301,FIT301,LIT301,MV301,MV302,MV303,MV304,P301,P302,AIT401,AIT402,FIT401,LIT401,P401,P402,P403,P404,UV401,AIT501,AIT502,AIT503,AIT504,FIT501,FIT502,FIT503,FIT504,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack
0,28/12/2015 10:00:00 AM,2.427057,522.8467,2,2,1,262.0161,8.396437,328.6337,2.445391,2,1,1,2,1,2,1,19.74838,2.206835,956.1651,1,2,1,1,1,2,148.808,156.0882,1.713517,942.0662,1,2,1,1,2,7.878621,145.1166,264.5475,12.03538,1.723789,1.279621,0.735269,0.307786,2,1,250.8652,1.649953,189.5988,0.000128,1,1,1,Normal
1,28/12/2015 10:00:01 AM,2.446274,522.886,2,2,1,262.0161,8.396437,328.6337,2.445391,2,1,1,2,1,2,1,19.74838,2.208244,956.1651,1,2,1,1,1,2,148.808,156.0882,1.715952,942.0277,1,2,1,1,2,7.878621,145.1166,264.5475,12.03538,1.723789,1.297554,0.735269,0.307786,2,1,250.8652,1.649953,189.6789,0.000128,1,1,1,Normal
2,28/12/2015 10:00:02 AM,2.489191,522.8467,2,2,1,262.0161,8.394514,328.6337,2.442316,2,1,1,2,1,2,1,19.69076,2.208628,956.4855,1,2,1,1,1,2,148.808,156.0882,1.715952,941.8739,1,2,1,1,2,7.878621,145.1166,264.5475,12.03538,1.723404,1.293967,0.735269,0.308619,2,1,250.8812,1.649953,189.6789,0.000128,1,1,1,Normal
3,28/12/2015 10:00:03 AM,2.53435,522.9645,2,2,1,262.0161,8.394514,328.6337,2.442316,2,1,1,2,1,2,1,19.69076,2.208628,956.806,1,2,1,1,1,2,148.808,156.0882,1.71467,941.797,1,2,1,1,2,7.878621,145.0141,264.5475,12.03538,1.723404,1.281158,0.735269,0.308619,2,1,250.8812,1.649953,189.6148,0.000128,1,1,1,Normal
4,28/12/2015 10:00:04 AM,2.56926,523.4748,2,2,1,262.0161,8.394514,328.6337,2.443085,2,1,1,2,1,2,1,19.69076,2.208628,957.0864,1,2,1,1,1,2,148.808,156.0882,1.71467,942.22,1,2,1,1,2,7.878621,144.8859,264.5475,12.03538,1.723404,1.281158,0.735269,0.308619,2,1,250.8812,1.649953,189.5027,0.000128,1,1,1,Normal


In [None]:
# Keep the column index in `cols` and display it. Several names carry a
# leading space (e.g. ' Timestamp', ' MV101'), so later cells must use the
# names exactly as printed here.
cols = df.columns
cols

Index([' Timestamp', 'FIT101', 'LIT101', ' MV101', 'P101', 'P102', ' AIT201',
       'AIT202', 'AIT203', 'FIT201', ' MV201', ' P201', ' P202', 'P203',
       ' P204', 'P205', 'P206', 'DPIT301', 'FIT301', 'LIT301', 'MV301',
       'MV302', ' MV303', 'MV304', 'P301', 'P302', 'AIT401', 'AIT402',
       'FIT401', 'LIT401', 'P401', 'P402', 'P403', 'P404', 'UV401', 'AIT501',
       'AIT502', 'AIT503', 'AIT504', 'FIT501', 'FIT502', 'FIT503', 'FIT504',
       'P501', 'P502', 'PIT501', 'PIT502', 'PIT503', 'FIT601', 'P601', 'P602',
       'P603', 'Normal/Attack'],
      dtype='object')

In [None]:

# Convert the Normal/Attack label into a binary variable (Normal -> 0, Attack -> 1).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place
# form is deprecated and leaves `df` unchanged when pandas copy-on-write is
# enabled. astype(int) makes the dtype explicit and fails loudly if a label
# other than 'Normal'/'Attack' is present.
df['Normal/Attack'] = (
    df['Normal/Attack']
    .replace({'Normal': 0, 'Attack': 1})
    .astype(int)
)


In [None]:
# Class balance after encoding (0 = Normal, 1 = Attack).
df['Normal/Attack'].value_counts()

0    395298
1     54621
Name: Normal/Attack, dtype: int64

In [None]:
# Drop the timestamp column (note the leading space in its name).
# Rebinding `df` with errors='ignore' keeps this cell idempotent: the in-place
# version raised KeyError when the cell was run a second time.
df = df.drop(columns=' Timestamp', errors='ignore')

Calculating correlation with the target column Normal/Attack

In [None]:
# Correlation of every feature with the binary target. The target's own entry
# is removed by label, so this no longer relies on 'Normal/Attack' being the
# last column of `df`.
df_num_corr = df.corr()['Normal/Attack'].drop('Normal/Attack')
# Keep features whose absolute correlation exceeds 0.5, most positive first.
golden_features_list = df_num_corr[df_num_corr.abs() > 0.5].sort_values(ascending=False)
# The message previously referred to "SalePrice", a leftover from another notebook.
print("There are {} features strongly correlated with Normal/Attack:\n{}".format(len(golden_features_list), golden_features_list))

There is 15 strongly correlated values with SalePrice:
AIT402    0.737683
AIT502    0.707564
MV304     0.609133
AIT501   -0.577902
LIT401   -0.670578
P402     -0.743164
FIT502   -0.752143
PIT503   -0.753205
FIT501   -0.753403
PIT501   -0.755207
P501     -0.757980
UV401    -0.758003
FIT503   -0.758781
FIT504   -0.761722
FIT401   -0.763314
Name: Normal/Attack, dtype: float64


Dropping the other columns and keeping only the golden features (absolute correlation > 0.5)

In [None]:
# Keep only the strongly correlated features plus the target (target last).
# The feature names come from `golden_features_list`, computed in the
# correlation cell, instead of a hand-typed list, so the selection cannot
# drift from the 0.5 threshold. The resulting column order matches the
# previous hard-coded list.
df_reduced = df[[*golden_features_list.index, 'Normal/Attack']]

In [None]:
# Split the reduced frame into the feature matrix and the target vector.
# 'Normal/Attack' is the last column of df_reduced, so dropping it by name
# selects the same columns as slicing off the final position.
X_df = df_reduced.drop(columns='Normal/Attack')
train_y = df_reduced['Normal/Attack']

# Undersampling
Since the dataset is highly imbalanced, **undersampling** is performed to reduce the imbalance to a 1:2 attack-to-normal ratio (`sampling_strategy=0.5`). Undersampling is preferred here because the dataset is large.

In [None]:
from imblearn.under_sampling import RandomUnderSampler



Under sampling

In [None]:
# Define the under-sampling strategy. sampling_strategy=0.5 requests a
# minority-to-majority ratio of 1:2 after resampling. random_state is fixed so
# that the randomly retained majority rows, and every metric computed from
# them, are reproducible across kernel restarts.
under_sample = RandomUnderSampler(sampling_strategy=0.5, random_state=0)
# Fit and apply the transform.
X_under, y_under = under_sample.fit_resample(X_df, train_y)



Splitting the dataset using **stratify** to get the same class ratio in the test and training datasets

In [None]:
# Split into train and test sets (70/30), stratified on the label so both
# parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_under, y_under, random_state=0, test_size=0.3, stratify=y_under
)
# Standardize: fit the scaler on the training features only, then apply the
# same fitted transform to the test features.
scalar = StandardScaler()
SX_train = scalar.fit_transform(X_train)
# Bug fix: this previously transformed X_train again, so SX_test was a copy of
# the scaled training set rather than the scaled test set.
SX_test = scalar.transform(X_test)

In [None]:
from collections import Counter

# Print the per-class sample counts, training labels first, then test labels.
for labels in (y_train, y_test):
    print(Counter(labels))

Counter({0: 76469, 1: 38235})
Counter({0: 32773, 1: 16386})


The class ratio is now improved (about 2:1 normal to attack) in both the training and test datasets.

# KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier with k = 5 and fit it on the
# standardized training features. fit() is left as the last expression so the
# fitted estimator is echoed as the cell output.
model = KNeighborsClassifier(
    n_neighbors=5,
)
model.fit(SX_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# The model was fitted on standardized features, so the test features must go
# through the same fitted scaler before prediction. Previously the raw X_test
# was passed, putting train and test inputs on different scales.
# The scaler is applied here directly so this cell does not depend on SX_test.
y_pred = model.predict(scalar.transform(X_test))

In [None]:
# Build the per-class precision/recall/F1 report as a dict, then show it as a
# table with one row per class or aggregate and one column per metric.
report = classification_report(
    y_test,
    y_pred,
    output_dict=True,
)
x = pd.DataFrame(report).T
x

Unnamed: 0,precision,recall,f1-score,support
0,0.826679,0.999542,0.90493,32773.0
1,0.998427,0.580862,0.734442,16386.0
accuracy,0.859985,0.859985,0.859985,0.859985
macro avg,0.912553,0.790202,0.819686,49159.0
weighted avg,0.883927,0.859985,0.848102,49159.0


In [None]:
# NOTE(review): `L` is not assigned in any cell visible in this notebook, so
# this cell raises NameError on a fresh kernel unless `L` is created
# elsewhere — confirm where it comes from and add the defining cell.
L

array([[  2.58424942, 522.84670268,   2.00003459, ...,   0.99999646,
          0.99999587,   0.99999646],
       [  2.58424236, 522.88600266,   2.00003534, ...,   0.99999778,
          0.99999718,   0.99999778],
       [  2.58456411, 522.84670267,   2.0000352 , ...,   0.99999783,
          0.99999723,   0.99999783],
       ...,
       [  2.53147094, 520.6877919 ,   1.99484618, ...,   1.00000318,
          1.0000032 ,   1.00000318],
       [  2.52121878, 520.72709185,   1.98819699, ...,   1.00000328,
          1.00000329,   1.00000328],
       [  2.51074078, 521.11959156,   1.98304875, ...,   1.00000334,
          1.00000334,   1.00000334]])

In [None]:
# Dimensions of L.
L.shape

(449919, 51)

In [None]:
############ basis vector
##################################  pre-defined function
import numpy as np

def gs(X):
    """Return the Q factor of the QR decomposition of ``X``.

    The columns of the returned matrix are orthonormal. The factorization is
    delegated to ``np.linalg.qr`` with its default mode; the triangular factor
    is discarded.

    Parameters
    ----------
    X : array_like
        Matrix to factorize.

    Returns
    -------
    numpy.ndarray
        The Q factor of ``X``.
    """
    return np.linalg.qr(X)[0]

In [None]:
# Take the Q factor of L's QR decomposition via gs() and store it in A.
A = gs(L)

In [None]:
# Dimensions of A, for comparison with L.shape.
A.shape

(449919, 51)

In [None]:
# Display A (NumPy abbreviates the repr of large arrays).
A

array([[-0.00197435, -0.00027501, -0.00116224, ..., -0.00242623,
        -0.02058128, -0.01406679],
       [-0.00197435, -0.00027483, -0.00116376, ...,  0.02283848,
         0.02155911,  0.00724438],
       [-0.00197459, -0.00027532, -0.0011661 , ..., -0.05275858,
         0.03146894,  0.02786435],
       ...,
       [-0.00193403, -0.00023256, -0.00055887, ..., -0.00170343,
        -0.00213372, -0.00192036],
       [-0.0019262 , -0.00022233, -0.00060792, ...,  0.00024263,
        -0.00043183,  0.00165853],
       [-0.00191819, -0.00021036, -0.00062957, ...,  0.00053063,
         0.0008598 ,  0.00319622]])