<a href="https://colab.research.google.com/github/moniquebeaulieu/510_project/blob/main/data_cleansing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Cleansing and Preparation

# Downloading Data From Kaggle

In [60]:
# Make Google Drive visible inside this Colab runtime. The overall approach for
# pulling Kaggle datasets into Colab via Drive is described here:
# https://medium.com/analytics-vidhya/how-to-download-kaggle-datasets-into-google-colab-via-google-drive-dcb348d7af07

from google.colab import drive

# force_remount=True asks Colab to redo the mount even if one already exists.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [61]:
import os

# Tell the Kaggle CLI which folder holds its configuration (a folder on the
# mounted Drive). This only sets an environment variable; the working
# directory is not changed.
os.environ.update({'KAGGLE_CONFIG_DIR': "/content/drive/MyDrive/Kaggle"})

In [63]:
# Download the HAI security dataset with the Kaggle CLI.
# --force downloads again even if a local copy exists; --unzip extracts the archive.
!kaggle datasets download -d icsdataset/hai-security-dataset --force --unzip

Downloading hai-security-dataset.zip to /content
 98% 456M/465M [00:02<00:00, 163MB/s]
100% 465M/465M [00:02<00:00, 169MB/s]


In [64]:
# List the current directory to confirm the dataset folders were extracted.
%ls

[0m[01;34mdrive[0m/  [01;34mhai-20.07[0m/  [01;34mhai-22.04[0m/                              [01;34msample_data[0m/
[01;34mhai[0m/    [01;34mhai-21.03[0m/  hai_dataset_technical_details_v3.0.pdf


The following code follows a structure similar to the code found here: https://dacon.io/en/competitions/official/235757/codeshare/3652?page=1&dtype=recent

# Import libraries

In [67]:
import sys
from pathlib import Path
from datetime import timedelta
import dateutil
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

# Utility functions

In [83]:
def dataframe_from_csv(target):
    """Load one CSV into a DataFrame with whitespace-trimmed column names.

    Args:
        target: Anything ``pd.read_csv`` accepts (path, URL, or file-like object).

    Returns:
        The parsed DataFrame, with leading/trailing whitespace removed from
        every column label.
    """
    frame = pd.read_csv(target)
    # Strip padding around header names so columns can be addressed by their
    # bare name.
    return frame.rename(columns=str.strip)

def dataframe_from_csvs(targets, ignore_index=False):
    """Stack several already-loaded DataFrames row-wise into one DataFrame.

    Despite the name, the elements of ``targets`` are handed straight to
    ``pd.concat``, so they must be DataFrames (for example the results of
    ``dataframe_from_csv``), not file paths.

    Args:
        targets: Iterable of DataFrames to concatenate, in order.
        ignore_index: When False (the default, matching the original
            behaviour) every frame keeps its own row labels, so labels may
            repeat in the result. Pass True to get a fresh 0..n-1 index.

    Returns:
        A single DataFrame containing the rows of every input frame.

    Raises:
        ValueError: If ``targets`` is empty (raised by ``pd.concat``).
    """
    # Materialise once so generators and other one-shot iterables are accepted.
    frames = list(targets)
    return pd.concat(frames, ignore_index=ignore_index)

# Dataframe preparation

In [66]:
# Load every training CSV of the HAI 22.04 release. The data ships as several
# files because each one was recorded during a different time period.

train1_raw, train2_raw, train3_raw, train4_raw, train5_raw, train6_raw = (
    dataframe_from_csv(f"/content/hai-22.04/train{part}.csv")
    for part in range(1, 7)
)

In [86]:
# Stack the six training frames row-wise into one DataFrame, then show its
# (rows, columns) shape as a sanity check.

train = [train1_raw, train2_raw, train3_raw, train4_raw, train5_raw, train6_raw]
train_df_raw = dataframe_from_csvs(train)
train_df_raw.shape

(1004402, 88)

In [87]:
# Inspect the dtype pandas inferred for each column of the training frame.
train_df_raw.dtypes 

timestamp      object
P1_B2004      float64
P1_B2016      float64
P1_B3004      float64
P1_B3005      float64
               ...   
P4_ST_PO      float64
P4_ST_PS      float64
P4_ST_PT01      int64
P4_ST_TT01      int64
Attack          int64
Length: 88, dtype: object

In [92]:
# Preview the first three rows of the combined training frame.
train_df_raw.head(3)

Unnamed: 0,timestamp,P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV01D,...,P4_HT_PS,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PS,P4_ST_PT01,P4_ST_TT01,Attack
0,2021-07-11 10:00:00,0.08771,0.88504,476.76703,1014.79321,26.92264,15.07713,595.06104,30.08042,15.50464,...,7.08818,276.40338,-0.00087,14742,276.45758,267.90363,8.90254,9914,27170,0
1,2021-07-11 10:00:01,0.08771,0.88619,476.76703,1014.79321,26.92264,14.97197,531.50317,30.08423,16.20023,...,7.08818,276.18634,0.00058,14781,278.91705,268.95258,8.90254,9914,27171,0
2,2021-07-11 10:00:02,0.08771,0.88836,476.76703,1014.79321,26.92264,14.90129,451.06253,30.09148,16.53352,...,7.08818,279.85754,-0.00072,14831,278.89899,269.76636,8.90254,9914,27170,0


In [73]:
# Load every test CSV of the HAI 22.04 release (again one file per recording
# period). NOTE: train and test data have been pre-split in this public dataset.

test1_raw, test2_raw, test3_raw, test4_raw = (
    dataframe_from_csv(f"/content/hai-22.04/test{part}.csv")
    for part in range(1, 5)
)

In [89]:
# Stack the four test frames row-wise into one DataFrame, then show its
# (rows, columns) shape as a sanity check.

test = [test1_raw, test2_raw, test3_raw, test4_raw]
test_df_raw = dataframe_from_csvs(test)
test_df_raw.shape

(361200, 88)

In [90]:
# Inspect the dtype pandas inferred for each column of the test frame.
test_df_raw.dtypes

timestamp      object
P1_B2004      float64
P1_B2016      float64
P1_B3004      float64
P1_B3005      float64
               ...   
P4_ST_PO      float64
P4_ST_PS      float64
P4_ST_PT01      int64
P4_ST_TT01      int64
Attack          int64
Length: 88, dtype: object

In [None]:
# Notes about the dataset:
# - I think we can drop the timestamp column.
# - The Attack column will be used as the label for classification.
# - I have been looking at code from this competition website: https://dacon.io/en/competitions/official/235757/codeshare?page=1&dtype=recent&ptype=pub&keyword=