<a href="https://colab.research.google.com/github/moniquebeaulieu/510_project/blob/main/data_cleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data cleansing and Preprocessing

# Downloading Data From Kaggle

In [21]:
# Kaggle datasets are brought into Colab by way of Google Drive; guide followed:
# https://medium.com/analytics-vidhya/how-to-download-kaggle-datasets-into-google-colab-via-google-drive-dcb348d7af07

# Mount Google Drive at /content/drive (force_remount=True asks for a fresh
# mount even when one is already present).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [22]:
# Set the KAGGLE_CONFIG_DIR environment variable to a folder on the mounted
# Drive. This does NOT change the working directory; it only sets the variable.
# NOTE(review): presumably this folder holds the kaggle.json API token - confirm.
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/Kaggle"

In [23]:
# Download the HAI security dataset with the Kaggle CLI and extract it
# (--unzip) into the current directory. --force is passed so the download
# runs even if a local copy already exists.
!kaggle datasets download -d icsdataset/hai-security-dataset --force --unzip

Downloading hai-security-dataset.zip to /content
 98% 454M/465M [00:03<00:00, 72.6MB/s]
100% 465M/465M [00:03<00:00, 126MB/s] 


In [24]:
# List the working directory to confirm the dataset folders were extracted.
%ls

[0m[01;34mdrive[0m/      [01;34mhai-21.03[0m/  hai_dataset_technical_details_v3.0.pdf
[01;34mhai-20.07[0m/  [01;34mhai-22.04[0m/  [01;34msample_data[0m/


The following code follows a structure similar to the code found here: https://dacon.io/en/competitions/official/235757/codeshare/3652?page=1&dtype=recent

# Import libraries

In [25]:
import sys
from pathlib import Path
from datetime import timedelta
import dateutil
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

# Utility functions

In [26]:
def dataframe_from_csv(target):
    """Read a CSV into a DataFrame and strip whitespace from its column labels.

    Args:
        target: Anything ``pd.read_csv`` accepts (file path or file-like object).

    Returns:
        The loaded DataFrame, with every column label replaced by its
        ``.strip()``-ed form.
    """
    frame = pd.read_csv(target)
    # Build an explicit old-label -> cleaned-label mapping for the rename.
    cleaned_labels = {label: label.strip() for label in frame.columns}
    return frame.rename(columns=cleaned_labels)

def dataframe_from_csvs(targets, ignore_index=False):
    """Stack already-loaded DataFrames row-wise into a single DataFrame.

    Despite the name, ``targets`` holds DataFrames (for example the results of
    ``dataframe_from_csv``), not file paths.

    Args:
        targets: Iterable of DataFrames to concatenate, in order.
        ignore_index: When True, drop each frame's own row labels and number
            the result 0..n-1. The default (False) keeps the original
            behaviour, in which row labels restart for every input frame and
            therefore repeat in the combined frame.

    Returns:
        One DataFrame containing the rows of every input frame.

    Raises:
        ValueError: Raised by ``pd.concat`` when ``targets`` is empty.
    """
    # list() materialises any iterable (including generators) exactly once.
    return pd.concat(list(targets), ignore_index=ignore_index)

# Dataframe preparation

In [27]:
# Load each training CSV from the extracted hai-22.04 folder
# (separate CSVs because they were recorded in different time periods).

train1_raw = dataframe_from_csv("/content/hai-22.04/train1.csv")
train2_raw = dataframe_from_csv("/content/hai-22.04/train2.csv")
train3_raw = dataframe_from_csv("/content/hai-22.04/train3.csv")
train4_raw = dataframe_from_csv("/content/hai-22.04/train4.csv")
train5_raw = dataframe_from_csv("/content/hai-22.04/train5.csv")
train6_raw = dataframe_from_csv("/content/hai-22.04/train6.csv")

In [28]:
# Concatenate all six training frames into one, in file order, and show the
# combined (rows, columns) shape.

train = [train1_raw, train2_raw, train3_raw, train4_raw, train5_raw, train6_raw]
train_df_raw = dataframe_from_csvs(train)
train_df_raw.shape

(1004402, 88)

In [29]:
# Inspect the dtype of every column in the combined training frame.
train_df_raw.dtypes

timestamp      object
P1_B2004      float64
P1_B2016      float64
P1_B3004      float64
P1_B3005      float64
               ...   
P4_ST_PO      float64
P4_ST_PS      float64
P4_ST_PT01      int64
P4_ST_TT01      int64
Attack          int64
Length: 88, dtype: object

In [30]:
# Preview the first three rows of the combined training frame.
train_df_raw.head(3)

Unnamed: 0,timestamp,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,...,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,Attack
0,2021-07-11 10:00:00,0.08771,0.88504,476.76703,1014.79321,26.92264,15.07713,595.06104,30.08042,15.50464,...,7.08818,276.40338,-0.00087,14742,276.45758,267.90363,8.90254,9914,27170,0
1,2021-07-11 10:00:01,0.08771,0.88619,476.76703,1014.79321,26.92264,14.97197,531.50317,30.08423,16.20023,...,7.08818,276.18634,0.00058,14781,278.91705,268.95258,8.90254,9914,27171,0
2,2021-07-11 10:00:02,0.08771,0.88836,476.76703,1014.79321,26.92264,14.90129,451.06253,30.09148,16.53352,...,7.08818,279.85754,-0.00072,14831,278.89899,269.76636,8.90254,9914,27170,0


In [31]:
# Load each test CSV from the extracted hai-22.04 folder
# (separate CSVs because they were recorded in different time periods).
# NOTE: train and test data have been pre-split in this public dataset.

test1_raw = dataframe_from_csv("/content/hai-22.04/test1.csv")
test2_raw = dataframe_from_csv("/content/hai-22.04/test2.csv")
test3_raw = dataframe_from_csv("/content/hai-22.04/test3.csv")
test4_raw = dataframe_from_csv("/content/hai-22.04/test4.csv")

In [32]:
# Concatenate all four test frames into one, in file order, and show the
# combined (rows, columns) shape.

test = [test1_raw, test2_raw, test3_raw, test4_raw]
test_df_raw = dataframe_from_csvs(test)
test_df_raw.shape

(361200, 88)

In [33]:
# Inspect the dtype of every column in the combined test frame.
test_df_raw.dtypes

timestamp      object
P1_B2004      float64
P1_B2016      float64
P1_B3004      float64
P1_B3005      float64
               ...   
P4_ST_PO      float64
P4_ST_PS      float64
P4_ST_PT01      int64
P4_ST_TT01      int64
Attack          int64
Length: 88, dtype: object

In [34]:
# Preview the first three rows of the combined test frame.
test_df_raw.head(3)

Unnamed: 0,timestamp,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,...,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,Attack
0,2021-07-10 00:00:01,0.059445,1.04366,431.328095,1034.712769,27.736847,14.9131,386.51172,31.521,15.09755,...,0.716042,356.64423,0.00058,18131,292.86029,334.58112,20.989594,10000,27181,0
1,2021-07-10 00:00:02,0.059445,1.04217,431.328095,1034.712769,27.736847,14.9554,411.33905,31.51756,14.71477,...,0.716042,352.08698,-0.00282,18111,295.82605,332.97162,20.989594,9999,27174,0
2,2021-07-10 00:00:03,0.059445,1.0392,431.328095,1034.712769,27.736847,14.992,438.15259,31.50269,14.30731,...,0.716042,347.63818,-0.00398,18100,300.02173,332.15784,20.989594,9999,27172,0


In [35]:
# Remove the timestamp column from both splits; the results are treated as
# not depending on the time series.
train_raw = train_df_raw.drop(columns=["timestamp"])
test_raw = test_df_raw.drop(columns=["timestamp"])

# now 86 parameters and one attack classification column

## TODO: decide whether the Attack column should be removed before normalizing

# Normalize data

- use min max values so they fall within 0-1
- after normalization, apply an exponentially weighted moving average to smooth the noise generated by the sensors

In [39]:
def normalize(df, reference=None):
    """Min-max scale every column of ``df`` and return the result as a new DataFrame.

    Each column is shifted by its minimum and divided by its (max - min) range.
    A column whose minimum equals its maximum is only shifted, so it becomes
    all zeros instead of triggering a division by zero. The work is done with
    pandas so the result stays a DataFrame.

    Args:
        df: DataFrame to scale. It is not modified.
        reference: Optional DataFrame supplying the per-column min and max.
            It must contain every column of ``df``. Pass the training frame
            here when scaling a test frame so both share one scale; values
            can then fall outside [0, 1]. Defaults to None, which uses
            ``df``'s own min and max (the original behaviour).

    Returns:
        A scaled copy of ``df`` with the same index and columns.
    """
    stats_source = df if reference is None else reference
    # Named col_min / col_max so the builtins min() and max() are not shadowed.
    col_min = stats_source.min()
    col_max = stats_source.max()

    scaled = df.copy()
    for col in df.columns:
        low, high = col_min[col], col_max[col]
        if low == high:
            # Constant column: shifting alone avoids dividing by zero.
            scaled[col] = df[col] - low
        else:
            scaled[col] = (df[col] - low) / (high - low)
    return scaled

In [40]:
# Min-max scale the training frame, then smooth it with an exponentially
# weighted mean (alpha=0.9). Both steps run over every column of train_raw,
# and the smoothing follows the row order of the concatenated frame.
# NOTE(review): train_raw still contains the Attack label column at this
# point - confirm that scaling/smoothing the label is intended.
train_df = normalize(train_raw).ewm(alpha=0.9).mean() # normalize and smooth noise
train_df

Unnamed: 0,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,P1_FCV01Z,...,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,Attack
0,0.22613,0.132221,0.798452,0.538147,0.209725,0.150771,0.183693,0.194185,0.155046,0.116706,...,0.708818,0.169066,0.501686,0.135313,0.172978,0.133292,0.178051,0.207547,0.449040,0.0
1,0.22613,0.133443,0.798452,0.538147,0.209725,0.149815,0.165517,0.194535,0.161370,0.121738,...,0.708818,0.168332,0.517560,0.137850,0.181368,0.136943,0.178051,0.207547,0.450383,0.0
2,0.22613,0.135839,0.798452,0.538147,0.209725,0.149092,0.141083,0.195226,0.164942,0.139408,...,0.708818,0.180570,0.504886,0.141302,0.182062,0.140079,0.178051,0.207547,0.449173,0.0
3,0.22613,0.140053,0.798452,0.538147,0.209725,0.148311,0.125447,0.196440,0.163216,0.154058,...,0.708818,0.187292,0.522383,0.147183,0.187506,0.146871,0.178051,0.207547,0.450383,0.0
4,0.22613,0.140232,0.798452,0.538147,0.209725,0.147409,0.117700,0.196492,0.158453,0.157091,...,0.708818,0.187419,0.517845,0.152922,0.198128,0.151538,0.178051,0.207547,0.450504,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259195,0.57358,0.368856,0.632541,0.414192,0.748145,1.000000,0.996289,0.696846,1.000000,0.994085,...,1.000000,0.517805,0.515751,0.318660,0.291538,0.326726,0.093523,0.410377,0.511453,0.0
259196,0.57358,0.362542,0.632541,0.414192,0.748145,1.000000,0.995484,0.695028,1.000000,0.994085,...,1.000000,0.518674,0.509379,0.316453,0.290860,0.324073,0.093523,0.410377,0.496492,0.0
259197,0.57358,0.359870,0.632541,0.414192,0.748145,1.000000,0.995510,0.694256,1.000000,0.994085,...,1.000000,0.511131,0.528358,0.312046,0.292563,0.320878,0.093523,0.410377,0.501643,0.0
259198,0.57358,0.358277,0.632541,0.414192,0.748145,1.000000,0.995761,0.693797,1.000000,0.993949,...,1.000000,0.510014,0.520068,0.311541,0.282717,0.317817,0.093523,0.410377,0.507476,0.0


In [56]:
def boundary_check(df):
    """Report whether any value lies outside [0, 1] or is NaN.

    The input is converted to a float32 NumPy array before it is checked.

    Args:
        df: DataFrame (or other array-like) of numeric values.

    Returns:
        A 3-tuple of booleans:
        (any value > 1.0, any value < 0, any value is NaN).
    """
    values = np.array(df, dtype=np.float32)
    above_one = (values > 1.0).any()
    below_zero = (values < 0).any()
    has_nan = np.isnan(values).any()
    return above_one, below_zero, has_nan

# Expect (False, False, False): nothing above 1, nothing below 0, no NaN.
boundary_check(train_df) # making sure normalization within boundaries

(False, False, False)

In [36]:
# today todo:
# get rid of attack column since unsupervised?
# study model to use
# find principal components? - feature extraction, need to see if this is done for this model
# do X and y split for train and test data 
# validation

In [37]:
# notes about dataset!!!
# no na values
# attack column will be used for labels/ classification
# I have been looking at code from this competition website https://dacon.io/en/competitions/official/235757/codeshare?page=1&dtype=recent&ptype=pub&keyword=

# next steps:
# vector quantization- I think we should do OCSVM model and see results then add the VQ to compare how it improves
# unbalanced dataset - how to deal with it
# model = one-class support vector machine (OCSVM) - look at class notes 
#         - One-class SVM is an unsupervised algorithm that learns a decision function for novelty detection: classifying new data as similar or different to the training set.

# Vector Quantization
- there is a class imbalance problem here: many instances with no anomalies vs. few instances of attacks (skewed dataset)
- the performance of an ML model will decline if it is trained on such skewed data
- instead of undersampling (eliminating instances), VQ compresses the dataset by clustering its instances
- VQ is a lossy data compression method based on the principle of block coding 
https://link.springer.com/content/pdf/10.1007/978-0-387-34747-9_9.pdf

![algorithm for LVQ](https://d2mk45aasx86xg.cloudfront.net/Implementing_learning_vector_quantization_in_Python_a126c1c235.webp)