# Preparation of Datasets
The datasets must first be downloaded and copied into the folder "raw_data". The procedure is explained below for each dataset. 

## Preparation of BeRfiPl Dataset
Please first download the datasets from the simulation ds1 at https://drive.google.com/drive/folders/1JJ80uSWyBDHvwBgc2UcIOlMgUsk7YhWt and save them in the folder raw_data/BeRfiPl/. <br>
Once this is done, please execute the following lines of code. <br>
The code will save the preprocessed files in the directory preprocessed_data/BeRfiPl/.


In [None]:
# Modules used by the cells below.
import os
import pandas as pd 
# IPython shell escape (not plain Python): installs openpyxl, which
# pandas.read_excel needs for the .xlsx files read in the SWaT section.
!pip install openpyxl
# Step up one directory so that the relative 'raw_data/...' and
# 'preprocessed_data/...' paths used below resolve -- presumably this notebook
# sits one level below the project root (TODO confirm).
# NOTE(review): re-running this cell moves up one more level each time.
os.chdir('..')

In [None]:
def prepare_BeRfiPl(raw_data_path, preprocessed_data_path):
    """Reduce a raw BeRfiPl CSV file to its relevant signal columns.

    The raw file is read with its first column as index. A column is kept
    when its name contains at least one of the keywords listed below and
    does not start with ``der(``. The original index is discarded and
    replaced by a fresh 0..n-1 index, which is written to the output CSV as
    its first column.

    Args:
        raw_data_path: Path of the raw CSV file to read.
        preprocessed_data_path: Path of the CSV file to write. Missing
            parent directories are created.
    """
    # Substrings that mark a column as relevant.
    relevant_keywords = ("time", "v_flow", "level", "m_flow", "fluidVolume",
                         "N_in", "opening", "medium.t", "port_a.p", "port_b.p")

    df_raw = pd.read_csv(raw_data_path, index_col=0)

    column_names = [str(col) for col in df_raw.columns]
    selected_cols = [
        name for name in column_names
        if any(keyword in name for keyword in relevant_keywords)
        # Drop columns that start with "der(".
        and not name.startswith('der(')
    ]
    df_preprocessed = df_raw.loc[:, selected_cols].reset_index(drop=True)

    # to_csv does not create folders: make sure the target directory exists
    # so that a fresh checkout without preprocessed_data/ works.
    out_path = str(preprocessed_data_path)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_preprocessed.to_csv(out_path)

In [None]:
# Raw input files, expected below raw_data/BeRfiPl/.
BeRfiPl_raw_anom_data_path, BeRfiPl_raw_norm_data_path = (
    'raw_data/BeRfiPl/ds1c.csv',
    'raw_data/BeRfiPl/ds1n.csv',
)
# Destinations of the corresponding preprocessed files.
BeRfiPl_anom_data_path, BeRfiPl_norm_data_path = (
    'preprocessed_data/BeRfiPl/ds1c.csv',
    'preprocessed_data/BeRfiPl/ds1n.csv',
)

In [None]:
# Preprocess the ds1c file first, then the ds1n file.
prepare_BeRfiPl(
    raw_data_path=BeRfiPl_raw_anom_data_path,
    preprocessed_data_path=BeRfiPl_anom_data_path,
)
prepare_BeRfiPl(
    raw_data_path=BeRfiPl_raw_norm_data_path,
    preprocessed_data_path=BeRfiPl_norm_data_path,
)

## Preparation of SmA Dataset
Please first download the dataset from https://github.com/thomasbierweiler/FaultsOf4-TankBatchProcess/blob/main/SmA-Four-Tank-Batch-Process_V2.zip, extract the archive, and save the contained file SmA-Four-Tank-Batch-Process_V2.csv in the folder raw_data/SmA/. <br>
Once this is done, please execute the following lines of code. <br>
The code will save the preprocessed files in the directory preprocessed_data/SmA/.  <br>
The first file contains exclusively the data of Deviation ID1. <br>
Each of the following files is a concatenation of ID1 (nominal behavior) and one anomalous Deviation ID (ID2 - ID10).


In [None]:
def prepare_SmA(raw_data_path, preprocessed_data_path):
    """Split the raw SmA CSV file into one nominal and nine anomaly files.

    The raw file is read as a ``;``-separated CSV with its first column as
    index. Rows are selected by the value of the first remaining column,
    and the column ``DeviationID ValueY`` is removed from every output.

    Files written (all without an index column):
        * ``<prefix>id1_norm.csv``: the rows with deviation ID 1.
        * ``<prefix>id<k>_anomaly.csv`` for k = 2..10: the ID 1 rows
          followed by the rows with deviation ID k.

    Args:
        raw_data_path: Path of the raw CSV file to read.
        preprocessed_data_path: Prefix to which the file names above are
            appended, e.g. a directory path with a trailing separator.
            A missing output directory is created.
    """
    df = pd.read_csv(raw_data_path, delimiter=';', index_col=0)

    # to_csv does not create folders: make sure the target directory exists.
    out_dir = os.path.dirname(str(preprocessed_data_path))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The deviation ID is taken from the first column after the index.
    deviation_id = df[df.columns[0]]

    def rows_with_id(dev_id):
        # Rows of one deviation ID, renumbered and without the label column.
        return (df.loc[deviation_id == dev_id]
                .reset_index(drop=True)
                .drop(columns=['DeviationID ValueY']))

    df_norm = rows_with_id(1)
    df_norm.to_csv(f'{preprocessed_data_path}id1_norm.csv', index=False)

    for dev_id in range(2, 11):
        df_preprocessed = pd.concat([df_norm, rows_with_id(dev_id)])
        df_preprocessed.to_csv(f'{preprocessed_data_path}id{dev_id}_anomaly.csv', index=False)
    


In [None]:
# Raw input file and output prefix; prepare_SmA appends the individual file
# names (e.g. 'id1_norm.csv') to the prefix, hence the trailing slash.
SmA_raw_data_path, SmA_data_path = (
    'raw_data/SmA/SmA-Four-Tank-Batch-Process_V2.csv',
    'preprocessed_data/SmA/',
)

In [None]:
# Write the nominal file and the anomaly files for the SmA dataset.
prepare_SmA(
    raw_data_path=SmA_raw_data_path,
    preprocessed_data_path=SmA_data_path,
)

## Preparation of SWaT Dataset
Please first request access from https://itrust.sutd.edu.sg/testbeds/secure-water-treatment-swat/ and download the datasets from directory SWaT/SWat.A1&A2_Dec2015/Physical via the provided link to a shared drive and save them in the folder raw_data/SWaT/. <br>
Once this is done, please execute the following lines of code. <br>
The code will save an excerpt of 60,000 samples (rows 110000 to 169999) of each file to the directory preprocessed_data/SWaT/.  <br>
Only the columns of process P1 (column names containing "101") are kept; all other columns are excluded. <br>


In [None]:
def prepare_SWaT(raw_data_path, preprocessed_data_path):
    """Extract a fixed row window of the "101" columns from a SWaT Excel file.

    The workbook is read with its second row as header and its first column
    as index. Columns of dtype float64, int64 or object are kept, the rows at
    positions 110000 up to (but excluding) 170000 are selected, and of those
    only the columns whose name contains ``"101"`` are written to a CSV file
    without an index column.

    Args:
        raw_data_path: Path of the raw Excel file to read.
        preprocessed_data_path: Path of the CSV file to write. Missing
            parent directories are created.
    """
    df = pd.read_excel(raw_data_path, index_col=0, header=1)

    # Fixed 60000-row excerpt of the recording.
    df_window = df.select_dtypes(include=['float64', 'int64', 'object']).iloc[110000:170000]

    # str() guards against non-string column labels.
    p1_cols = [col for col in df_window.columns if "101" in str(col)]
    df_p1 = df_window.loc[:, p1_cols]

    # to_csv does not create folders: make sure the target directory exists.
    out_path = str(preprocessed_data_path)
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df_p1.to_csv(out_path, index=False)


In [None]:
# Raw Excel files, expected below raw_data/SWaT/.
# NOTE: 'SWat_raw_anom_data_path' (lower-case 't') is spelled differently from
# the other names; it is kept because the call cell below refers to it.
SWat_raw_anom_data_path, SWaT_raw_norm_data_path = (
    'raw_data/SWaT/SWaT_Dataset_Attack_v0.xlsx',
    'raw_data/SWaT/SWaT_Dataset_Normal_v0.xlsx',
)
# Destinations of the corresponding preprocessed CSV files.
SWaT_anom_data_path, SWaT_norm_data_path = (
    'preprocessed_data/SWaT/swat_anom_p1.csv',
    'preprocessed_data/SWaT/swat_norm_p1.csv',
)

In [None]:
# Preprocess the "Normal" workbook first, then the "Attack" workbook.
prepare_SWaT(
    raw_data_path=SWaT_raw_norm_data_path,
    preprocessed_data_path=SWaT_norm_data_path,
)
prepare_SWaT(
    raw_data_path=SWat_raw_anom_data_path,
    preprocessed_data_path=SWaT_anom_data_path,
)