In [1]:
import itertools
import pickle
import time
from typing import List

import numpy as np
import pandas as pd
from blist import blist
from scipy import stats
from sklearn.feature_extraction import image

from time_utils import Timebar
import multiprocessing

import scalogram

ModuleNotFoundError: No module named 'scalogram'

In [None]:
def cols_names():
    """Return the column labels for the feature table built by compute_features.

    The order mirrors the row that compute_features assembles with
    ``np.hstack``: the entity name, then one full vector per statistic
    (all means, all skews, all kurtoses, all variances -- each vector holding
    one value per traffic feature), then the scalar features and finally the
    ten wavelet samples.

    Returns:
        list[str]: 64 labels (1 + 4 * 12 + 5 + 10).
    """
    feature_descriptions = ['packet_count',
                            'ip_external',
                            'ip_internal',
                            'port_high',
                            'port_low',
                            'tcp_syn',
                            'tcp_fin',
                            'tcp_rst',
                            'volume_down',
                            'volume_up',
                            'volume_internal',
                            'less_64kb']

    # Same order in which compute_features stacks the statistic vectors.
    statistics = ['mean', 'skew', 'kurtosis', 'var']

    cols = ['entity']

    # Statistic-major: each statistic is computed with axis=0 over a block and
    # therefore contributes one contiguous run of per-feature values, so the
    # labels have to be grouped by statistic rather than by feature.
    for stat in statistics:
        for description in feature_descriptions:
            cols.append(f"{description}:{stat}")

    # Scalar features ('is_weekday' is currently not emitted).
    cols.extend(['count_nonzero',
                 'relative_day_position',
                 'silent_occr',
                 'silent_mean',
                 'silent_var'])

    cols.extend(f"wavelet_{i}" for i in range(10))

    return cols

def _silent_period_stats(packet_counts):
    """Summarise the runs of consecutive zero entries in ``packet_counts``.

    Returns:
        tuple: (number of runs, mean run length, variance of run length).
        Mean and variance are the int 0 when there is no silent run at all.
    """
    run_lengths = []
    current_run = 0

    for count in packet_counts:
        if count != 0:
            if current_run != 0:
                run_lengths.append(current_run)
            current_run = 0
        else:
            current_run += 1

    # A run that is still open when the block ends is recorded with its real
    # length. The previous code compared the run length against the number of
    # runs recorded so far and stored 0 whenever the two happened to be equal.
    # NOTE(review): that comparison may have been meant to special-case a
    # block that is silent from start to end -- confirm the intended value.
    if current_run != 0:
        run_lengths.append(current_run)

    if not run_lengths:
        return 0, 0, 0
    return len(run_lengths), np.mean(run_lengths), np.var(run_lengths)


def _to_numeric_or_keep(column):
    """Convert ``column`` to a numeric dtype, returning it unchanged if parsing fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def compute_features(params):
    """Compute sliding-window features for one capture file and pickle them.

    Args:
        params: ``(np_file, split_size)`` tuple. ``np_file`` is the path of a
            pickled collection of per-day 2-D arrays; ``split_size`` is the
            window length, used as ``86400 / split_size`` windows per day
            (presumably seconds -- TODO confirm).

    Each emitted row is laid out as: entity name, per-column means, skews,
    kurtoses and variances of the block, the non-zero count of the whole
    block, the block's position within its day, the silent-period statistics
    of column 0 and ten samples of the scalogram of column 0. The labels come
    from cols_names(), which must list the columns in this same order.

    The resulting DataFrame is pickled to
    ``df_{split_size}m_0.3step_{np_file}-v3.raw``; nothing is returned.

    Raises:
        ValueError: if a day has too few rows for the requested window, or
            the stride derived from ``split_size`` is zero.
    """
    np_file, split_size = params

    features = []

    nr_splits = 86400 / split_size
    step_size = split_size * 1/3
    # NOTE(review): the stride is applied as a row offset, so it equals one
    # third of a window only if every row covers one unit of split_size.
    stride = int(step_size)

    print(f"Nr_splits: {nr_splits}, step_size: {time.strftime('%H:%M:%S', time.gmtime(step_size))} ({int(step_size)})")

    print(f"{np_file}:{split_size} Loading {np_file}...")
    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    with open(np_file, 'rb') as raw_fh:
        days = pickle.load(raw_fh)  # type: np.ndarray
    print(f"{np_file}:{split_size} done loading.")

    nr_days = 0
    nr_blocks = 0
    nr_skip_days = 0  # never incremented; kept so the progress message is unchanged

    for timebar_lst in days:
        day_rows = np.asarray(timebar_lst)
        n_rows = len(day_rows)
        block_rows = int(n_rows / nr_splits)

        if not 1 <= block_rows <= n_rows:
            raise ValueError(
                f"{np_file}:{split_size} cannot cut blocks of {block_rows} rows "
                f"out of a day with {n_rows} rows")

        # Slice every `stride`-th full-width window directly instead of
        # materialising all one-row-shifted windows first; the blocks visited
        # are the same ones, without the huge intermediate copy.
        window_starts = range(0, n_rows - block_rows + 1, stride)

        for relative_day_position, start in enumerate(window_starts):
            day_block = day_rows[start:start + block_rows]

            nr_blocks = nr_blocks + 1

            name = np_file
            block_mean = np.mean(day_block, axis=0)
            block_skew = stats.skew(day_block, axis=0)
            block_kurtosis = stats.kurtosis(day_block, axis=0)
            block_var = np.var(day_block, axis=0)
            nonzero_count = np.count_nonzero(day_block)

            # Column 0 drives both the silence and the wavelet features.
            pckts_arr = day_block[:, 0]

            silent_occr, silent_mean, silent_var = _silent_period_stats(pckts_arr)

            scales = np.arange(1, int(np.floor(np.sqrt(pckts_arr.shape[0] / 2))))

            if np.sum(pckts_arr) == 0:
                # No traffic at all: skip the transform and use a flat scalogram.
                S = np.zeros(scales.shape[0])
            else:
                S, _ = scalogram.scalogramCWT(pckts_arr, scales)

            # Down-sample the scalogram to ten evenly spaced entries.
            indices = np.linspace(0, S.shape[0] - 1, 10).astype(int)
            wavelet_samples = np.take(S, indices)

            # Stacking with the string name turns the whole row into strings;
            # the numeric dtypes are restored once the DataFrame is built.
            features.append(np.hstack(
                (name, block_mean, block_skew, block_kurtosis, block_var,
                 nonzero_count, relative_day_position,
                 silent_occr,
                 silent_mean,
                 silent_var,
                 wavelet_samples)))

        nr_days = nr_days + 1

        print(f"- {np_file}:{split_size} {nr_days} days splitted in {nr_blocks} blocks, skipped {nr_skip_days} days.")

    print(f"{np_file}:{split_size} Creating df...")

    df = pd.DataFrame(features, columns=cols_names())

    # Same effect as the deprecated to_numeric(errors='ignore'): columns that
    # cannot be parsed (e.g. the entity name) are left as they are.
    df = df.apply(_to_numeric_or_keep)

    filename = f"df_{split_size}m_0.3step_{np_file}-v3.raw"

    print(f"{np_file}:{split_size} Dumping DataFrame {df.shape} table to {filename}...")

    with open(filename, "wb") as out_fh:
        pickle.dump(df, out_fh)

    print(f"{np_file}:{split_size} Done.", end="\n\n")

In [None]:
raw_files = [
    "np_freebsd.raw",
    "np_nuc.raw",
    "np_tohiba.raw",
    "np_ubuntu.raw",
    "np_windows10x86.raw",
    "np_x58pc.raw"
]

workers = []

# Window lengths and the number of blocks each one yields per day:
#   24 hours = 1440 min ->  1 block   (86400)
#    6 hours =  360 min ->  4 blocks  (21600)
#    2 hours =  120 min -> 12 blocks  ( 7200)
#    1 hour  =   60 min -> 24 blocks  ( 3600)
#   15 min   =   15 min -> 96 blocks  (  900)
# compute_features derives nr_splits = 86400 / split_size.

# One worker process per (file, window length) combination.
for split_size in [86400, 21600, 7200, 3600, 900]:

    for np_file in raw_files:

        w = multiprocessing.Process(
            name=f"{np_file}:{split_size}",
            target=compute_features,
            args=((np_file, split_size),)
        )

        workers.append(w)

# Start every worker before waiting, so they all run concurrently.
for worker in workers:
    worker.start()

for worker in workers:
    worker.join()

# A child that raised exits with a non-zero code and its traceback may never
# reach the notebook output, so failures are reported explicitly here.
failed_workers = [worker.name for worker in workers if worker.exitcode != 0]

if failed_workers:
    print(f"{len(failed_workers)} of {len(workers)} workers failed: {', '.join(failed_workers)}")