In [1]:
import pandas as pd
import glob

from sklearn.preprocessing import StandardScaler


def load_dataset(data_folder_dir, selected_cols=None, scaler=None, ewm=True, validation=False):
    """
    Read every CSV file of a folder, standardize the selected columns and return one array per file.

    Files are processed in sorted path order so that the order of the returned arrays
    (and which file counts as the "first" one) is reproducible across runs and machines.

    :param data_folder_dir: directory of dataset folder, including the trailing separator
        ('.../folder/'); the pattern '*.csv' is appended to it as-is.
    :param selected_cols: selected columns to use. If None/empty (or if ``scaler`` is None)
        the function runs in "train" mode and derives them with ``get_selected_columns``.
    :param scaler: scaler already fitted on the train dataset. If None, the function runs in
        "train" mode and fits a new ``StandardScaler`` on the loaded data.
    :param ewm: whether to smooth each file's scaled values with an exponentially
        weighted mean (alpha=0.9).
    :param validation: only used in "test" mode; when True the 'attack' column of the
        first loaded file is returned alongside the arrays.
    :return: train mode -> ``(list of np arrays, selected_cols, scaler)``;
        test mode -> ``list of np arrays``, or ``(list of np arrays, attack values)``
        when ``validation`` is True. The list holds one array per CSV file.
    :raises FileNotFoundError: if no file matches ``data_folder_dir + '*.csv'``.
    :raises AssertionError: if the selected columns contain NaN.
    """
    # glob returns paths in arbitrary (filesystem) order; sort for determinism.
    paths = sorted(glob.glob(data_folder_dir + '*.csv'))
    if not paths:
        raise FileNotFoundError("no csv files found in '{}'".format(data_folder_dir))
    data_list = [pd.read_csv(path) for path in paths]

    # itc ("indices to cut"): cumulative row offsets, so that rows
    # itc[i]:itc[i + 1] of the stacked data belong to file i.
    itc = [0]
    for data in data_list:
        itc.append(itc[-1] + len(data))

    def split_per_file(data_np):
        # Cut the stacked array back into one chunk per file. Smoothing is done
        # per chunk so that it never leaks across file boundaries.
        chunks = [data_np[start:stop] for start, stop in zip(itc[:-1], itc[1:])]
        if ewm:
            chunks = [pd.DataFrame(chunk).ewm(alpha=0.9).mean().values for chunk in chunks]
        return chunks

    # Explicit None/length checks instead of truthiness, so array-like column
    # collections (np.ndarray, pd.Index) do not raise on ``not selected_cols``.
    train_mode = selected_cols is None or len(selected_cols) == 0 or scaler is None

    if train_mode:
        # One concat instead of repeated DataFrame.append (removed in pandas 2.0
        # and quadratic when used in a loop).
        integrated = pd.concat(data_list)
        selected_cols = get_selected_columns(integrated)
        data_pd = integrated[selected_cols]
        assert check_isnan(data_pd), "there exist nan in dataset"
        scaler = StandardScaler()
        data_np = scaler.fit_transform(data_pd)
        return split_per_file(data_np), selected_cols, scaler

    # test mode: reuse the columns and the scaler obtained from the train data
    integrated = pd.concat([data[selected_cols] for data in data_list])
    assert check_isnan(integrated), "there exist nan in dataset"
    data_np = scaler.transform(integrated)
    chunks = split_per_file(data_np)
    if validation:
        # NOTE(review): only the labels of the first file are returned, even when
        # several files were loaded -- confirm this is intended.
        return chunks, data_list[0]['attack'].values
    return chunks


def check_isnan(df):
    """
    Tell whether a DataFrame is free of missing values.

    Despite its name, the function returns True when the frame is clean.

    :param df: DataFrame to inspect.
    :return: True if no column of ``df`` contains NaN, False as soon as one does.
    """
    nan_flags = df.isna().any()  # one boolean per column
    return not any(nan_flags[name] for name in df.columns)


def get_selected_columns(integrated):
    """
    Pick the columns that carry information: everything except 'timestamp' and
    except columns whose value never changes.

    :param integrated: DataFrame holding the stacked data of all files.
    :return: alphabetically sorted list of the names of the non-constant columns.
    """
    # Only these columns may be selected. errors='ignore' keeps the function
    # usable on frames that have no 'timestamp' column at all.
    candidates = integrated.columns.drop('timestamp', errors='ignore')
    mins, maxes = integrated.min(), integrated.max()
    # Test the candidates only: looping over every column and removing from the
    # candidate set raised KeyError when 'timestamp' itself was constant
    # (e.g. a single-row frame), because it was never part of the set.
    return sorted({col for col in candidates if mins[col] != maxes[col]})


def train_valid_split(np_data_list, valid_ratio=0.2):
    """
    Split every array into a leading training part and a trailing validation part.

    The split is positional (no shuffling): the first ``int(len * (1 - valid_ratio))``
    rows go to training, the remaining rows to validation.

    :param np_data_list: iterable of arrays indexable as ``arr[rows, :]``.
    :param valid_ratio: fraction of rows, taken from the end, kept for validation.
    :return: ``(train_list, valid_list)``, two lists aligned with ``np_data_list``.
    """
    def split_one(arr):
        boundary = int(len(arr) * (1 - valid_ratio))
        return arr[:boundary, :], arr[boundary:, :]

    pairs = [split_one(arr) for arr in np_data_list]
    head_parts = [head for head, _ in pairs]
    tail_parts = [tail for _, tail in pairs]
    return head_parts, tail_parts


if __name__ == "__main__":
    # No columns/scaler are passed, so load_dataset takes its "train" branch and
    # returns the per-file arrays together with the selected columns and the
    # fitted scaler.
    train, selected_cols, scaler = load_dataset('../datasets/train/')
    # Reuse those columns and that scaler on the test folder. With the default
    # validation=False only the list of per-file arrays is returned.
    test = load_dataset('../datasets/test/', selected_cols, scaler)

In [2]:
from abc import ABC

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator


class HAIDataLoader(tf.data.Dataset, ABC):
    """
    Factory for sliding-window batches built from a list of arrays.

    Calling the class never produces a ``HAIDataLoader`` instance: ``__new__`` returns
    either the result of ``tf.data.Dataset.from_generator`` (``train=True``) or a plain
    list of ``TimeseriesGenerator`` objects (``train=False``). Every array is passed as
    both ``data`` and ``targets`` of its generator.
    """

    def __new__(cls, np_data_list, length=50, stride=3,
                batch_size=32, train=True):
        """
        :param np_data_list: arrays to window; one generator is built per array
            (presumably 2D with one row per time step -- TODO confirm).
        :param length: window length handed to ``TimeseriesGenerator``.
        :param stride: step between windows; only used when ``train`` is True,
            otherwise a stride of 1 is used.
        :param batch_size: batch size handed to ``TimeseriesGenerator``.
        :param train: True -> shuffled generators chained into one dataset;
            False -> list of unshuffled generators.
        """
        if not train:
            # Evaluation: stride 1, no shuffling, one generator per array.
            return [TimeseriesGenerator(data=series, targets=series, length=length,
                                        stride=1, batch_size=batch_size)
                    for series in np_data_list]

        # Training: strided, shuffled windows.
        generators = []
        for series in np_data_list:
            generators.append(
                TimeseriesGenerator(data=series, targets=series, length=length,
                                    stride=stride, batch_size=batch_size,
                                    shuffle=True))

        def iterate_batches():
            # Go through the generators one after another, in list order.
            for generator in generators:
                for inputs, targets in generator:
                    yield inputs, targets

        # The first batch of the first generator fixes the shapes declared for
        # the whole dataset.
        first_generator = generators[0]
        inputs_shape = first_generator[0][0].shape
        targets_shape = first_generator[0][1].shape
        signature = (
            tf.TensorSpec(shape=inputs_shape, dtype=tf.float32),
            tf.TensorSpec(shape=targets_shape, dtype=tf.float32),
        )
        return tf.data.Dataset.from_generator(iterate_batches,
                                              output_signature=signature)

In [4]:
# No columns/scaler are passed, so both are derived from the training folder itself.
np_data_list, selected_cols, scaler = load_dataset('../datasets/train/')
# train=False makes HAIDataLoader return a plain list with one unshuffled,
# stride-1 TimeseriesGenerator per array, which the cells below index directly.
dataset = HAIDataLoader(np_data_list, train=False)

In [6]:
# Columns kept by get_selected_columns: non-constant, 'timestamp' excluded, sorted by name.
print(selected_cols)

['C01', 'C03', 'C04', 'C05', 'C06', 'C07', 'C08', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C20', 'C21', 'C23', 'C24', 'C25', 'C27', 'C28', 'C30', 'C31', 'C32', 'C33', 'C34', 'C35', 'C37', 'C40', 'C41', 'C42', 'C43', 'C44', 'C45', 'C46', 'C47', 'C48', 'C50', 'C51', 'C53', 'C54', 'C56', 'C57', 'C58', 'C59', 'C60', 'C61', 'C62', 'C64', 'C65', 'C66', 'C67', 'C68', 'C70', 'C71', 'C72', 'C73', 'C74', 'C75', 'C76', 'C77', 'C78', 'C79', 'C80', 'C81', 'C83', 'C84', 'C86']


In [11]:
# First three rows of the first array returned by load_dataset (reference values
# for the generator check in the next cell).
print(np_data_list[0][:3,:])

[[-1.95926209e-01  1.30952046e-01 -5.10446621e-01 -1.28661028e+00
  -2.32210743e-01  3.00756290e-01 -9.00937651e-01 -7.05245585e-01
   7.88264664e-01  3.39643611e-01 -3.87053905e-01 -1.31450374e+00
  -1.85595674e+00 -2.95073030e-01 -5.31484389e-01 -3.68616921e-01
  -1.93207539e+00  5.75111309e-01  2.34990968e+00 -1.88097732e+00
  -2.11158060e+00  7.88647317e-01 -2.05664261e-02 -7.74039303e-01
  -4.79355424e-01 -2.95075024e-01  1.60011801e+00 -1.09891282e+00
  -1.37607847e+00 -1.08797947e+00 -2.04969881e-02  1.02198387e+00
  -1.56255249e-01 -5.85094476e-01  9.00937651e-01  4.77126483e-01
   2.95075024e-01  1.77641832e+00  8.19694100e-01 -1.11881031e+00
   1.30502241e-01 -1.50065908e+00  9.24611852e-01 -4.09670978e-01
  -4.62213835e-01 -1.24225649e+00  2.95073030e-01  8.85886205e-01
  -2.95073030e-01 -1.11281275e+00 -8.21038554e-01 -1.29086301e+00
  -2.32335225e+00 -7.06858020e-01 -4.86283897e-01 -1.20888696e+00
  -8.56572369e-02 -1.31689083e+00 -1.79728748e+00 -3.98040336e-01
  -2.05961

In [17]:
# First generator -> first batch -> inputs -> first sample, first three time steps.
# The output matches np_data_list[0][:3, :] printed above.
print(dataset[0][0][0][0,:3,:])

[[-1.95926209e-01  1.30952046e-01 -5.10446621e-01 -1.28661028e+00
  -2.32210743e-01  3.00756290e-01 -9.00937651e-01 -7.05245585e-01
   7.88264664e-01  3.39643611e-01 -3.87053905e-01 -1.31450374e+00
  -1.85595674e+00 -2.95073030e-01 -5.31484389e-01 -3.68616921e-01
  -1.93207539e+00  5.75111309e-01  2.34990968e+00 -1.88097732e+00
  -2.11158060e+00  7.88647317e-01 -2.05664261e-02 -7.74039303e-01
  -4.79355424e-01 -2.95075024e-01  1.60011801e+00 -1.09891282e+00
  -1.37607847e+00 -1.08797947e+00 -2.04969881e-02  1.02198387e+00
  -1.56255249e-01 -5.85094476e-01  9.00937651e-01  4.77126483e-01
   2.95075024e-01  1.77641832e+00  8.19694100e-01 -1.11881031e+00
   1.30502241e-01 -1.50065908e+00  9.24611852e-01 -4.09670978e-01
  -4.62213835e-01 -1.24225649e+00  2.95073030e-01  8.85886205e-01
  -2.95073030e-01 -1.11281275e+00 -8.21038554e-01 -1.29086301e+00
  -2.32335225e+00 -7.06858020e-01 -4.86283897e-01 -1.20888696e+00
  -8.56572369e-02 -1.31689083e+00 -1.79728748e+00 -3.98040336e-01
  -2.05961

In [21]:
# Row 50 of the first array, i.e. the row right after the first window of the
# default length 50 (rows 0-49).
print(np_data_list[0][50,:])

[-0.18244645  0.08420561 -0.23369552 -1.24720949 -0.24959887  0.30075629
 -0.90093765  0.4916414   2.37258583 -0.70467889 -0.37882322 -1.31450374
 -1.19878687 -0.29507303 -0.53148439 -0.46428533 -0.36652708  0.56665383
  2.15453118 -0.25536223 -0.61146558  0.78793409  0.06932728  0.85055393
 -0.58537466 -0.29507502  1.60011801 -1.09891282 -1.38696206 -1.16797475
 -0.01002031  2.3842519  -0.0818203  -0.48985892  0.90093765  0.71959823
  0.29507502  1.78914765  0.82390725 -1.11881031 -0.26562488  0.14105688
  0.92461185 -0.40967098 -0.34644011 -1.69037945  0.29507303  0.8486869
 -0.29507303 -1.11281275 -0.82103855 -1.35786655 -3.27704922  0.73339056
 -0.54309488 -1.20380516 -0.02466257 -1.39678985 -1.39525817 -0.25080892
 -2.98191149 -1.13369196  0.84023597 -1.36885203 -0.49656436  0.12097539
 -0.00560655  1.23787531]


In [19]:
# First generator -> first batch -> targets -> first sample. The output equals
# np_data_list[0][50, :]: the target is the row that follows the input window.
print(dataset[0][0][1][0,:])

[-0.18244645  0.08420561 -0.23369552 -1.24720949 -0.24959887  0.30075629
 -0.90093765  0.4916414   2.37258583 -0.70467889 -0.37882322 -1.31450374
 -1.19878687 -0.29507303 -0.53148439 -0.46428533 -0.36652708  0.56665383
  2.15453118 -0.25536223 -0.61146558  0.78793409  0.06932728  0.85055393
 -0.58537466 -0.29507502  1.60011801 -1.09891282 -1.38696206 -1.16797475
 -0.01002031  2.3842519  -0.0818203  -0.48985892  0.90093765  0.71959823
  0.29507502  1.78914765  0.82390725 -1.11881031 -0.26562488  0.14105688
  0.92461185 -0.40967098 -0.34644011 -1.69037945  0.29507303  0.8486869
 -0.29507303 -1.11281275 -0.82103855 -1.35786655 -3.27704922  0.73339056
 -0.54309488 -1.20380516 -0.02466257 -1.39678985 -1.39525817 -0.25080892
 -2.98191149 -1.13369196  0.84023597 -1.36885203 -0.49656436  0.12097539
 -0.00560655  1.23787531]
