In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import plot_tree
from sklearn.svm import SVC
from sklearn import preprocessing
from tqdm import tqdm

%matplotlib inline

## Load the data

In [2]:
# Importing the dataset
data = pd.read_csv('./data/SWaT_train.csv', delimiter=',')
data

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14
0,11.239323,-0.015946,0.019378,-1.177505,-1.262729,0.922708,-0.061910,0.660440,-0.079713,-1.752990,0.229442,2.389817,-0.284886,-0.494532
1,11.250785,-0.028606,-0.484913,-1.264875,-0.429987,0.967385,-1.001235,4.449296,-1.148676,-1.297496,-1.480187,2.006869,0.072566,-0.151499
2,8.773312,-1.304100,-2.975731,15.844832,-1.014392,9.681100,6.901244,-3.125790,27.095738,46.955617,-4.045413,12.653101,4.766548,-0.784740
3,7.140906,-0.706576,-4.087434,7.171380,1.410655,5.293826,2.058185,-0.095913,11.753912,22.351361,-2.424112,7.005019,2.397368,0.369435
4,4.788603,-1.268151,-2.704121,2.395107,1.352515,2.798837,0.755153,-0.633777,2.202329,6.586427,-0.850385,3.342348,-0.497214,0.446388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10924,0.185606,-4.434843,2.391681,-1.051042,-1.561775,-1.053438,0.469733,1.111261,0.890847,0.521523,-0.326381,-1.044522,-1.487023,-0.180109
10925,0.198419,-4.433672,2.386375,-1.040680,-1.558411,-1.044374,0.476529,1.107094,0.886195,0.533418,-0.326928,-1.046730,-1.487050,-0.179652
10926,0.188516,-4.436008,2.388583,-1.053560,-1.554844,-1.042339,0.479398,1.103145,0.855416,0.497227,-0.324437,-1.053632,-1.505631,-0.173749
10927,0.204008,-4.431134,2.380313,-1.019775,-1.554725,-1.048638,0.482756,1.096638,0.811040,0.456664,-0.326047,-1.090093,-1.589117,-0.173931


## Clustering Models

### Isolation forest

In [4]:
from sklearn.ensemble import IsolationForest
from joblib import dump

# Define a function
def IsolationForestC(data, n_estimators, max_samples, contamination):
  model = IsolationForest(
      n_estimators = n_estimators, 
      max_samples = max_samples, 
      contamination = contamination,
      random_state = np.random.RandomState(42)
  )
  model.fit(data)
  return model

# Hyper parameter tunning?

# Create an instance
clf = IsolationForestC(
    data, 
    n_estimators=50, 
    max_samples=500, 
    contamination=0.01
)

# Get labels
preds = []
for i, row in data.iterrows():
  pred = clf.predict(np.asarray(row).reshape(1, -1))
  preds.append(pred[0])































































































































































































































































































































































































































































In [15]:
data_full = data.copy()
data_full.insert(len(data_full.columns), 'Y', preds)
data_full.to_csv('./data/SWaT_labeled.csv')

In [None]:
#data_labeled = pd.read_csv('./data/SWaT_labeled.csv', delimiter=',')

## Classification Models

### Split the data into train and validation

In [10]:
i = round(len(data)*.8)
# 80% of the data
X_train = np.asarray(data[0:i])
y_train = np.asarray(preds[0:i])
# 20% of the data
X_valid = np.asarray(data[i+1:len(data)])
y_valid = np.asarray(preds[i+1:len(preds)])
# Print shapes
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

(8743, 14) (8743,) (2185, 14) (2185,)


### Support Vector Classifier

In [18]:
# Linear SVC
def Linear_SupVectC(X_train, y_train, X_test, y_test):
  params = []
  accuracy_scores = []
  # Fitting the model using the training set
  for C in np.linspace(0.01, 5, 20):
    model = SVC(C = .55, kernel = 'linear')
    model.fit(X_train, y_train)
    # Evaluating the model using the testing set
    test_prediction = model.predict(X_test)
    model_acc = accuracy_score(y_test, test_prediction)
    # Saving the values
    params.append({'C': C})
    accuracy_scores.append(model_acc)

  return model, accuracy_scores, params

# Polynomial SVC
def Poly_SupVectC(X_train, y_train, X_valid, y_valid):
  params = []
  accuracy_scores = []

  # Nested for loops to try different params and find the best ones
  for C in np.linspace(0.01, 5, 20):
      for degree in range(2, 10):
          # Fitting the model using the training set
          model = SVC(C = C, degree = degree, kernel = "poly")
          model.fit(X_train, y_train)
          # Evaluating the model using the testing set
          test_prediction = model.predict(X_valid)
          test_accuracy = accuracy_score(y_valid, test_prediction)
          # Saving the values
          params.append({'C': C, 'degree': degree})
          accuracy_scores.append(test_accuracy)

  return model, accuracy_scores, params

# RBF
def RBF_SupVectC(X_train, y_train, X_valid, y_valid):
  params = []
  accuracy_scores = []

  for C in tqdm(np.linspace(0.01, 5, 20)):
      for gamma in np.linspace(0.001, 2, 40):

          model = SVC(C = C, gamma = gamma, kernel = 'rbf')
          model.fit(X_train, y_train)

          test_prediction = model.predict(X_valid)
          test_accuracy = accuracy_score(y_valid, test_prediction)

          params.append({'C': C, 'gamma': gamma})
          accuracy_scores.append(test_accuracy)

  return model, accuracy_scores, params

# Sigmoid
def Sigmoid_SupVectC(X_train, y_train, X_valid, y_valid):
  params = []
  accuracy_scores = []

  for C in tqdm(np.linspace(0.01, 5, 20)):
      for gamma in np.linspace(0.001, 2, 40):

          model = SVC(C = C, gamma = gamma, kernel = 'sigmoid')
          model.fit(X_train, y_train)

          test_prediction = model.predict(X_valid)
          test_accuracy = accuracy_score(y_valid, test_prediction)

          params.append({'C': C, 'gamma': gamma})
          accuracy_scores.append(test_accuracy)

  return model, accuracy_scores, params

### Evaluation of SVC

In [21]:
model_lsvc, accuracy_lsvc, params_lsvc             = Linear_SupVectC(X_train, y_train, X_valid, y_valid)
model_psvc, accuracy_psvc, params_psvc             = Poly_SupVectC(X_train, y_train, X_valid, y_valid)
#model_rbf, accuracy_rbf, params_rbf                = RBF_SupVectC(X_train, y_train, X_valid, y_valid)
#model_sigmoid, accuracy_sigmoid, params_sigmoid    = Sigmoid_SupVectC(X_train, y_train, X_valid, y_valid)

print("\n|#| Best accuracy score %.4f"%max(accuracy_lsvc) + " achieved with params " + str(params_lsvc[np.argmax(accuracy_lsvc)]))
print("|#| Best accuracy score %.4f"%max(accuracy_psvc) + " achieved with params " + str(params_psvc[np.argmax(accuracy_psvc)]))
#print("|#| Best accuracy score %.4f"%max(accuracy_rbf) + " achieved with params " + str(params_rbf[np.argmax(accuracy_rbf)]))
#print("|#| Best accuracy score %.4f"%max(accuracy_sigmoid) + " achieved with params " + str(params_sigmoid[np.argmax(accuracy_sigmoid)]))


|#| Best accuracy score 0.9991 achieved with params {'C': 0.01}
|#| Best accuracy score 0.9995 achieved with params {'C': 0.01, 'degree': 2}


### Save the model

In [22]:
# Save model
import pickle

# It is important to use binary access
with open('./models/svc_poly.pickle', 'wb') as f:
    pickle.dump(model_psvc, f)