In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import OneClassSVM
from sklearn import preprocessing
from tqdm import tqdm

%matplotlib inline

## Load the data

In [None]:
# Read the comma-separated file SWaT_train.csv into a DataFrame.
train_csv = './data/SWaT_train.csv'
data = pd.read_csv(train_csv, sep=',')
# Bare last expression so the notebook renders the frame as the cell output.
data

## Clustering Models

### Isolation forest

In [None]:
from sklearn.ensemble import IsolationForest
from joblib import dump

# Define a function
def IsolationForestC(data, n_estimators, max_samples, contamination):
    """Build an IsolationForest with the given settings and fit it on `data`.

    Parameters
    ----------
    data
        Training samples, handed unchanged to ``fit``.
    n_estimators, max_samples, contamination
        Forwarded unchanged to the ``IsolationForest`` constructor.

    Returns
    -------
    The fitted ``IsolationForest`` instance.
    """
    # A fresh RandomState seeded with 42 is created on every call.
    forest_kwargs = dict(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination=contamination,
        random_state=np.random.RandomState(42),
    )
    forest = IsolationForest(**forest_kwargs)
    forest.fit(data)
    return forest

# Hyperparameter tuning?

# Create an instance
# NOTE(review): the forest is fitted on every column of `data`, while the SVM cell
# further down drops the 'Time' column before fitting — confirm whether 'Time'
# should be excluded here as well.
clf = IsolationForestC(
    data,
    n_estimators=50,
    max_samples=500,
    contamination=0.01
)

# Get labels.
# The row-by-row prediction loop was commented out, which left `preds` undefined
# for the lines below (NameError on a fresh kernel). A single vectorised call
# yields one label per row; IsolationForest.predict returns +1 for inliers and
# -1 for outliers.
preds = clf.predict(data)

# Append the labels as a trailing 'Y' column on a copy, leaving `data` untouched,
# and persist the labelled frame.
data_full = data.copy()
data_full.insert(len(data_full.columns), 'Y', preds)
data_full.to_csv('./data/SWaT_labeled.csv')

#data_labeled = pd.read_csv('./data/SWaT_labeled.csv', delimiter=',')

## Classification Models

### Split the data into train and validation

In [None]:
# Positional 80/20 split (no shuffling): rows before index `i` train the model,
# rows from `i` onwards validate it.
i = round(len(data)*.8)
# 80% of the data
X_train = np.asarray(data[0:i])
y_train = np.asarray(preds[0:i])
# Remaining 20% of the data. The slice starts at `i` (not `i+1`) so that row `i`
# is not silently dropped from both splits.
X_valid = np.asarray(data[i:])
y_valid = np.asarray(preds[i:])
# Every row must land in exactly one of the two splits.
assert len(X_train) + len(X_valid) == len(data)
assert len(y_train) + len(y_valid) == len(preds)
# Print shapes
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)

### Support Vector Classifier

In [None]:
# Train the one-class SVM and save it
import pickle

# Fit on every column except 'Time'.
X = data.iloc[:, data.columns != 'Time']
# Bound to its own name instead of `clf`: `clf` holds the isolation forest, and a
# later cell pickles `clf` to isolation_forest.pickle. Reusing the name here made
# that cell save the SVM under the isolation-forest filename.
ocsvm = OneClassSVM(gamma='auto').fit(X)

# It is important to use binary access
with open('../ocsvm.pickle', 'wb') as f:
    pickle.dump(ocsvm, f)

### Evaluation of SVC

In [None]:
# Run the linear and the polynomial SVC evaluation on the same train/validation split.
# NOTE(review): Linear_SupVectC and Poly_SupVectC are not defined in the cells
# visible here — confirm they are defined or imported before this cell runs.
model_lsvc, accuracy_lsvc, params_lsvc = Linear_SupVectC(
    X_train, y_train, X_valid, y_valid
)
model_psvc, accuracy_psvc, params_psvc = Poly_SupVectC(
    X_train, y_train, X_valid, y_valid
)

# Position of the highest accuracy, used to look up the matching parameters.
best_lsvc = np.argmax(accuracy_lsvc)
best_psvc = np.argmax(accuracy_psvc)

print(
    "\n|#| Best accuracy score %.4f" % max(accuracy_lsvc)
    + " achieved with params "
    + str(params_lsvc[best_lsvc])
)
print(
    "|#| Best accuracy score %.4f" % max(accuracy_psvc)
    + " achieved with params "
    + str(params_psvc[best_psvc])
)

In [None]:
# Re-run the polynomial SVC evaluation.
# NOTE(review): this was labelled "Evaluation in test data", but the call receives
# the validation split (X_valid / y_valid) — confirm whether a separate held-out
# test set was intended here.
model_psvc, accuracy_psvc, params_psvc = Poly_SupVectC(
    X_train, y_train,
    X_valid, y_valid,
)

### Save the model

In [None]:
# Persist the model held in `clf` to disk.
# NOTE(review): verify that `clf` still refers to the isolation forest at this
# point and has not been rebound by a cell in between.
import pickle

forest_path = './models/isolation_forest.pickle'
# Pickle streams are bytes, so the file must be opened in binary mode.
with open(forest_path, 'wb') as out_file:
    pickle.dump(clf, out_file)