<a href="https://colab.research.google.com/github/moniquebeaulieu/510_project/blob/main/data_cleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data cleansing and Preprocessing

# Downloading Data From Kaggle

In [1]:
# how to add kaggle data sets to colab
# https://medium.com/analytics-vidhya/how-to-download-kaggle-datasets-into-google-colab-via-google-drive-dcb348d7af07

# mounting google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# setting working directory to Kaggle
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/Kaggle"

In [3]:
# downloading API command
!kaggle datasets download -d icsdataset/hai-security-dataset --force --unzip

Downloading hai-security-dataset.zip to /content
 99% 460M/465M [00:03<00:00, 133MB/s]
100% 465M/465M [00:03<00:00, 159MB/s]


In [4]:
%ls

[0m[01;34mdrive[0m/      [01;34mhai-21.03[0m/  hai_dataset_technical_details_v3.0.pdf
[01;34mhai-20.07[0m/  [01;34mhai-22.04[0m/  [01;34msample_data[0m/


Following code is a similar structure to code found here https://dacon.io/en/competitions/official/235757/codeshare/3652?page=1&dtype=recent

# Import libraries

In [5]:
import sys
from pathlib import Path
from datetime import timedelta
import dateutil
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

# Utility functions

In [6]:
def dataframe_from_csv(target):
    return pd.read_csv(target).rename(columns=lambda x: x.strip())

def dataframe_from_csvs(targets):
    return pd.concat([x for x in targets])

# Dataframe preparation

In [7]:
# downloading each train csv (differnt csv b/c taken in different time periods)

train1_raw = dataframe_from_csv("/content/hai-22.04/train1.csv") 
train2_raw = dataframe_from_csv("/content/hai-22.04/train2.csv") 
train3_raw = dataframe_from_csv("/content/hai-22.04/train3.csv") 
train4_raw = dataframe_from_csv("/content/hai-22.04/train4.csv") 
train5_raw = dataframe_from_csv("/content/hai-22.04/train5.csv") 
train6_raw = dataframe_from_csv("/content/hai-22.04/train6.csv") 

In [8]:
# concatenating all train data

train = [train1_raw, train2_raw, train3_raw, train4_raw, train5_raw, train6_raw]
train_df_raw = dataframe_from_csvs(train)
train_df_raw.shape

(1004402, 88)

In [9]:
train_df_raw.dtypes 

timestamp      object
P1_B2004      float64
P1_B2016      float64
P1_B3004      float64
P1_B3005      float64
               ...   
P4_ST_PO      float64
P4_ST_PS      float64
P4_ST_PT01      int64
P4_ST_TT01      int64
Attack          int64
Length: 88, dtype: object

In [10]:
train_df_raw.head(3)

Unnamed: 0,timestamp,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,...,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,Attack
0,2021-07-11 10:00:00,0.08771,0.88504,476.76703,1014.79321,26.92264,15.07713,595.06104,30.08042,15.50464,...,7.08818,276.40338,-0.00087,14742,276.45758,267.90363,8.90254,9914,27170,0
1,2021-07-11 10:00:01,0.08771,0.88619,476.76703,1014.79321,26.92264,14.97197,531.50317,30.08423,16.20023,...,7.08818,276.18634,0.00058,14781,278.91705,268.95258,8.90254,9914,27171,0
2,2021-07-11 10:00:02,0.08771,0.88836,476.76703,1014.79321,26.92264,14.90129,451.06253,30.09148,16.53352,...,7.08818,279.85754,-0.00072,14831,278.89899,269.76636,8.90254,9914,27170,0


In [11]:
train_df_raw.Attack.unique() # Train data has no attacks which is what we want. can drop attack column

array([0])

In [12]:
# downloading each test csv (differnt csv b/c taken in different time periods) NOTE: train and test data have been pre-split in this public dataset

test1_raw = dataframe_from_csv("/content/hai-22.04/test1.csv") 
test2_raw = dataframe_from_csv("/content/hai-22.04/test2.csv") 
test3_raw = dataframe_from_csv("/content/hai-22.04/test3.csv") 
test4_raw = dataframe_from_csv("/content/hai-22.04/test4.csv") 

In [13]:
# concatenating all test data

test = [test1_raw, test2_raw, test3_raw, test4_raw]
test_df_raw = dataframe_from_csvs(test)
test_df_raw.shape

(361200, 88)

In [14]:
test_df_raw.dtypes

timestamp      object
P1_B2004      float64
P1_B2016      float64
P1_B3004      float64
P1_B3005      float64
               ...   
P4_ST_PO      float64
P4_ST_PS      float64
P4_ST_PT01      int64
P4_ST_TT01      int64
Attack          int64
Length: 88, dtype: object

In [15]:
test_df_raw.head(3)

Unnamed: 0,timestamp,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,...,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,Attack
0,2021-07-10 00:00:01,0.059445,1.04366,431.328095,1034.712769,27.736847,14.9131,386.51172,31.521,15.09755,...,0.716042,356.64423,0.00058,18131,292.86029,334.58112,20.989594,10000,27181,0
1,2021-07-10 00:00:02,0.059445,1.04217,431.328095,1034.712769,27.736847,14.9554,411.33905,31.51756,14.71477,...,0.716042,352.08698,-0.00282,18111,295.82605,332.97162,20.989594,9999,27174,0
2,2021-07-10 00:00:03,0.059445,1.0392,431.328095,1034.712769,27.736847,14.992,438.15259,31.50269,14.30731,...,0.716042,347.63818,-0.00398,18100,300.02173,332.15784,20.989594,9999,27172,0


In [16]:
test_df_raw.Attack.unique()

array([0, 1])

In [17]:
test_df_raw.Attack.value_counts()[1] # 12030 attacks in the test df, 349170 normal = 3.44%   0.03445313171234642

12030

In [18]:
# change the labels in the datasets from 0 and 1 for the majority and minority classes respectively, to +1 and -1.
train_df_raw['Attack'] = train_df_raw['Attack'].replace([0, 1], [1, -1]) # normal is now 1 and attack is now -1
test_df_raw['Attack'] = test_df_raw['Attack'].replace([0, 1], [1, -1])

In [19]:
test_df_raw.Attack.unique()

array([ 1, -1])

In [20]:
# creating target with attack labels
train_target = train_df_raw['Attack'].copy()
test_target = test_df_raw['Attack'].copy()

# dropping timestamp and attack labels (might need timestamp for window later)... can also drop features that are not used.. do principal features later
train_raw = train_df_raw.drop(['timestamp','Attack'], axis=1).copy()
test_raw = test_df_raw.drop(['timestamp','Attack'], axis=1).copy()

# now 86 parameters and no attack label

In [21]:
total = 1004402+361200 
1004402/total * 100

# 74% train 26 % test split 

73.55012661082804

# Normalize data

- use min max values so they fall within 0-1
- using the exponential weighted function after normalization to smooth noise generated by sensors

In [21]:
def normalize(df):
  min = df.min()
  max = df.max()
  new_df = df.copy()
  for col in df.columns:
      if min[col] == max[col]:
          new_df[col] = df[col] - min[col]
      else:
          new_df[col] = (df[col] - min[col]) / (max[col] - min[col])
  return new_df

  # using this method to keep as pandas df for now

In [22]:
train_df = normalize(train_raw).ewm(alpha=0.9).mean() # normalize and smooth noise
test_df = normalize(test_raw).ewm(alpha=0.9).mean() # normalize and smooth noise
test_df

Unnamed: 0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PO,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01
0,0.000000,0.316773,0.246834,0.865356,0.971259,0.149131,0.121069,0.685155,0.240077,0.254971,...,0.831028,0.000000,0.468399,0.467667,0.384527,0.236058,0.391359,0.654622,0.571675,0.587387
1,0.000000,0.315166,0.246834,0.865356,0.971259,0.149516,0.128272,0.684496,0.234543,0.249680,...,0.857520,0.000000,0.453013,0.357238,0.383226,0.246136,0.385738,0.654622,0.570105,0.575921
2,0.000000,0.311848,0.246834,0.865356,0.971259,0.149880,0.136630,0.681614,0.228208,0.232601,...,0.820810,0.000000,0.436742,0.309953,0.382400,0.261173,0.382416,0.654622,0.569964,0.571642
3,0.000000,0.304068,0.246834,0.865356,0.971259,0.150110,0.145446,0.676915,0.231046,0.229928,...,0.701315,0.000000,0.410708,0.384379,0.375876,0.269659,0.377961,0.654622,0.569950,0.564731
4,0.000000,0.300356,0.246834,0.865356,0.971259,0.150440,0.142333,0.676446,0.241279,0.230534,...,0.666735,0.000000,0.390879,0.463841,0.365692,0.280363,0.370826,0.654622,0.569948,0.570527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129595,0.072406,0.323699,0.674401,0.653310,0.096372,0.298944,0.333665,0.249624,0.226953,0.235446,...,0.422847,0.182591,0.530327,0.460777,0.373407,0.330826,0.364010,0.921419,0.429155,0.209645
129596,0.072406,0.324841,0.674401,0.653310,0.096372,0.299480,0.326022,0.250297,0.237697,0.216137,...,0.446059,0.182591,0.541507,0.297203,0.379679,0.329371,0.363680,0.921419,0.428408,0.207451
129597,0.072406,0.322628,0.674401,0.653310,0.096372,0.299920,0.312425,0.248991,0.252867,0.212787,...,0.467248,0.182591,0.546313,0.390170,0.374510,0.333728,0.361083,0.921419,0.426779,0.212096
129598,0.072406,0.316173,0.674401,0.653310,0.096372,0.300228,0.293328,0.245170,0.265238,0.212234,...,0.491442,0.182591,0.550904,0.469242,0.372254,0.326315,0.360323,0.921419,0.426616,0.215804


In [23]:
def boundary_check(df):
    x = np.array(df, dtype=np.float32)
    return np.any(x > 1.0), np.any(x < 0), np.any(np.isnan(x))

boundary_check(train_df) # making sure normalization within boundaries
boundary_check(test_df)

(False, False, False)

In [25]:
# transform the dataframes?

In [26]:
## plot the test data without the timestamp but keep attack for colouring, 


## Convert dataframes to np.arrays?? is this necesary?

In [27]:
train = np.array(train_raw)
test = np.array(test_raw)
train_target = np.array(train_target)
test_target = np.array(test_target)
test

array([[5.94447180e-02, 1.04366000e+00, 4.31328095e+02, ...,
        2.09895935e+01, 1.00000000e+04, 2.71810000e+04],
       [5.94447180e-02, 1.04217000e+00, 4.31328095e+02, ...,
        2.09895935e+01, 9.99900000e+03, 2.71740000e+04],
       [5.94447180e-02, 1.03920000e+00, 4.31328095e+02, ...,
        2.09895935e+01, 9.99900000e+03, 2.71720000e+04],
       ...,
       [6.94400000e-02, 1.04839000e+00, 4.67038540e+02, ...,
        2.67609400e+01, 9.91600000e+03, 2.69730000e+04],
       [6.94400000e-02, 1.04255000e+00, 4.67038540e+02, ...,
        2.67609400e+01, 9.91600000e+03, 2.69750000e+04],
       [6.94400000e-02, 1.04072000e+00, 4.67038540e+02, ...,
        2.67609400e+01, 9.91600000e+03, 2.69680000e+04]])

## One-Class SVM
- A one-class classifier is fit on a training dataset that only has examples from the normal class ( no anomalies ) 
- negative case (class 0) is taken as “normal” and the positive case (class 1) is taken as an outlier or anomaly
- OneClass SVM  is fit in an unsupervised manner and does not provide the normal hyperparameters for tuning the margin like C. Instead, it provides a hyperparameter “nu” that controls the sensitivity of the support vectors and should be tuned to the approximate ratio of outliers in the data
  - here our train data is 0% but our test data is 3.44% nu = 3.44 in terms of test but idk if thats what we want to do will test to see
- kernel is the kernel type to be used - svms ability to use non-linear function to project the space to higher dimensions. default is rbf = radial basis function
- gamma is a parameter of the rbf kernel type and controls the influence of individual training samples- affects the smootheness of the model
  - low gamma improves smoothenss and generalizability
  - high gamma reduces smoothness but makes model tighter- fitted
  - here we are going to start with 0.00005
- When calling the predict() function on the model, it will output a +1 for normal examples, so-called inliers, and a -1 for outliers.
- If we want to evaluate the performance of the model as a binary classifier, we must change the labels in the test dataset from 0 and 1 for the majority and minority classes respectively, to +1 and -1.


see https://machinelearningmastery.com/one-class-classification-algorithms/ 
and https://medium.com/@jamesstradling/unsupervised-machine-learning-with-one-class-support-vector-machines-129579a49d1d

In [None]:
# instantiate a model and train it with training data
from sklearn import svm
nu = 0.03445313171234642 # ratio of attack instances in test data
model = svm.OneClassSVM(nu=nu, kernel='rbf', gamma = 0.00005)
model.fit(train) 

checking accuracy of the model

In [None]:
# use the predict function on our data and then use sklearn's built-in analysis functions to compare the labels between the predict output and our target
from sklearn import metrics
preds = model.predict(train)
targs = train_target 
print("accuracy: ", metrics.accuracy_score(targs, preds))
print("precision: ", metrics.precision_score(targs, preds)) 
print("recall: ", metrics.recall_score(targs, preds))
print("f1: ", metrics.f1_score(targs, preds))
print("area under curve (auc): ", metrics.roc_auc_score(targs, preds))

# shows the model predicts with __ accuacy the class of the data from the training set (which is all normal)
# Precision, recall, F1, and AUC are all measures of the model’s effectiveness at predicting classes
# Closer to 1.0 they are, the better

checking accuracy of the test set

In [None]:
preds = model.predict(test)
targs = test_target 
print("accuracy: ", metrics.accuracy_score(targs, preds))
print("precision: ", metrics.precision_score(targs, preds)) 
print("recall: ", metrics.recall_score(targs, preds))
print("f1: ", metrics.f1_score(targs, preds))
print("area under curve (auc): ", metrics.roc_auc_score(targs, preds))

# shows the model predicts with __ accuacy the class of the data from the test set

## test svm from internet

In [None]:
## THIS IS AN EXAMPLE OF ONE-CLASS SVM JUST TO TEST 


# one-class svm for imbalanced binary classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM
# generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
	n_clusters_per_class=1, weights=[0.999], flip_y=0, random_state=4)


In [None]:
X

In [None]:
# just to see what X looks like in a dataframe
pdf = pd.DataFrame(X)
pdf.head(10)

In [None]:
y

In [None]:
pdf = pd.DataFrame(y)
pdf.head(10)

In [None]:
values = pdf.values
np.unique(values)

In [None]:
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)

In [None]:
trainX

In [None]:
testX

In [None]:
trainy

In [None]:
testy

In [None]:
np.unique(testy)

In [None]:
# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class
trainX = trainX[trainy==0]
model.fit(trainX)

In [None]:
# detect outliers in the test set
yhat = model.predict(testX)

In [None]:
# mark inliers 1, outliers -1
testy[testy == 1] = -1
testy[testy == 0] = 1
# calculate score
score = f1_score(testy, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)

In [None]:
# today todo:
# do something with labels...
# do X and y split for train and test data - need to see how this will be done
# try model out of box

In [None]:
# notes:
# no na values
# attack column will be used for labels/ classification
# I have been looking at code from this competition website https://dacon.io/en/competitions/official/235757/codeshare?page=1&dtype=recent&ptype=pub&keyword=
# can adjust nu to see how it performs
# adjust gamma to see how it performs
# find principal components? - feature extraction if performs poorly
# will try without timestamp to see how model performs, then try with a time window?
# look at hyper parameters to improve model

# next steps:
# vector quantization- I think we should do OCSVM model and see results then add the VQ to compare how it improves
# model = one class vector machine - look at class notes 
#         - One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set.

# Vector Quantization
- class imbalance problem here due to many instances of no anomolies vs few instances of attacks (skewed dataset)
- performance of ML will decline 
- instead of undersampling (eliminating instances) VQ compresses datasets by clustering them
- VQ is a lossy data compression method based on principal block coding 
https://link.springer.com/content/pdf/10.1007/978-0-387-34747-9_9.pdf

![algorithm for LVQ](https://d2mk45aasx86xg.cloudfront.net/Implementing_learning_vector_quantization_in_Python_a126c1c235.webp)