#### Load serialized sniffed timebar data and convert it to a pandas DataFrame

For each sniffed machine:
1. Load the sniffed data that is now arranged in `Timebar`s
2. Unroll the `Timebar` counter values to a list (up to 86400 lines - one aggregated line per second)
3. Split the unrolled list into, e.g., 24 blocks (one set of features per hour)
4. Convert to a pandas DataFrame

In [1]:
import pickle

import numpy as np
from scipy import stats

from time_utils import Timebar

In [1]:
def compute_features(raw_files: list = None, nr_splits: int = 24, skip_weekends: bool = True):
    """
    Computes features for the sniffed aggregated data

    Every file is unpickled into a list of day objects. Empty days (and
    weekend days when ``skip_weekends`` is set) are skipped. Each remaining
    day is unrolled into an array, split into ``nr_splits`` blocks along the
    first axis, and one feature row is produced per block.

    Layout of a feature row (a 1-D numpy array built with ``np.hstack``, so
    all values are coerced to strings because of the leading file name):

    ``[file name,
       mean, skew, kurtosis, var of column 0,
       mean, skew, kurtosis, var of column 1,
       ...,
       count_nonzero of the whole block,
       relative_day_position (0-based block index inside the day),
       is_weekday (1 for monday-friday, 0 for weekend)]``

    The four statistics are grouped per column so that the row lines up with
    column labels generated as ``for description: for statistic``
    (``packet_count:mean, packet_count:skew, ...``).

    :param raw_files: list of files containing serialized pickle Timebar data
                      (``None`` is treated as an empty list)
    :param nr_splits: Split each day in how many blocks
    :param skip_weekends: Compute features only from monday to friday
    :return: list with all features from all raw_files
    """

    if raw_files is None:
        raw_files = []

    raw_data = []  # list containing all features

    for file in raw_files:

        print(f"Loading {file}...", end=" ")
        # NOTE: pickle can execute arbitrary code while loading -- only use
        # this on capture files that come from a trusted source.
        with open(file, 'rb') as fh:
            days = pickle.load(fh)  # type: list[Timebar]
        print("done.", end="\t")

        nr_days = 0
        nr_blocks = 0
        nr_skip_days = 0

        for day in days:

            # days without any data contribute no features
            if day.is_empty():
                nr_skip_days += 1
                continue

            is_weekend = day.is_weekend()
            if is_weekend and skip_weekends:
                nr_skip_days += 1
                continue

            # constant for every block of the same day
            is_weekday = 0 if is_weekend else 1

            np_tb = np.array(day.unroll_to_lst())

            # split the whole day in how many blocks? 24 for every hour. 96 for every 15 minutes
            day_blocks = np.array_split(np_tb, nr_splits)

            # calculate features for each day slice
            for relative_day_position, day_block in enumerate(day_blocks):
                nr_blocks += 1

                # Stack the statistics on a new last axis and flatten, which
                # yields mean/skew/kurtosis/var of column 0, then of column 1,
                # and so on (instead of all means, then all skews, ...).
                per_column_stats = np.stack(
                    (
                        np.mean(day_block, axis=0),
                        stats.skew(day_block, axis=0),
                        stats.kurtosis(day_block, axis=0),
                        np.var(day_block, axis=0),
                    ),
                    axis=-1,
                ).ravel()

                # append feature line
                raw_data.append(
                    np.hstack((
                        file,
                        per_column_stats,
                        np.count_nonzero(day_block),
                        relative_day_position,
                        is_weekday,
                    )))

            nr_days += 1

        print(f"{nr_days} days splitted in {nr_blocks} blocks and skipped {nr_skip_days} days.")

    return raw_data


In [2]:
# All capture files that are available for feature extraction.
raw_files = [
    "counters_freebsd.raw",
    "counters_toshiba.raw",
    "counters_windows10x86.raw",
    "counters_nuc.raw",
    "counters_ubuntu.raw",
    "counters_x58pc.raw"
]

# Subset actually processed by this run; it overrides the full list above.
raw_files = [
    "counters_freebsd.raw",
    "counters_windows10x86.raw",
]

import pandas as pd

# columns names aka feature names

feature_labels = ['entity']

feature_descriptions = ['packet_count',
     'ip_external',
     'ip_internal',
     'port_high',
     'port_low',
     'tcp_syn',
     'tcp_fin',
     'tcp_rst',
     'volume_down',
     'volume_up',
     'volume_internal',
     'less_64kb']

feature_descriptions_combinations = ['mean', 'skew', 'kurtosis', 'var']

# one label per (description, statistic) pair, grouped by description
for d in feature_descriptions:
    for f in feature_descriptions_combinations:
        feature_labels.append(f"{d}:{f}")

feature_labels.append('count_nonzero')
feature_labels.append('relative_day_position')
feature_labels.append('is_weekday')


def _to_numeric_if_possible(column):
    """Return ``column`` converted with ``pd.to_numeric``, or unchanged if it cannot be parsed."""
    # Explicit replacement for ``pd.to_numeric(errors='ignore')``, which is
    # deprecated in recent pandas versions.
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


for nr_splits in [1, 4, 12, 24, 96]:
    print(f"Split size: {nr_splits}")

    raw_data = compute_features(raw_files, nr_splits, skip_weekends=True)

    # convert to panda dataframe and guess types: the feature rows arrive as
    # strings, so numeric columns are converted back and the rest is kept
    df = pd.DataFrame(raw_data, columns=feature_labels)
    df = df.apply(_to_numeric_if_possible)

    filename = f"df_{nr_splits}.raw"

    print(f"Dumping DataFrame {df.shape} table to {filename}...", end=" ")

    # the context manager guarantees the dump is flushed and the file closed
    with open(filename, "wb") as out_file:
        pickle.dump(df, out_file)

    print("Done.", end="\n\n")

#df_backup = df.copy()


Split size: 1
Loading counters_freebsd.raw... done.	32 days splitted in 32 blocks and skipped 335 days.
Loading counters_windows10x86.raw... done.	27 days splitted in 27 blocks and skipped 340 days.
Dumping DataFrame (59, 52) table to df_1.raw... Done.

Split size: 4
Loading counters_freebsd.raw... done.	32 days splitted in 128 blocks and skipped 335 days.
Loading counters_windows10x86.raw... done.	27 days splitted in 108 blocks and skipped 340 days.
Dumping DataFrame (236, 52) table to df_4.raw... Done.

Split size: 12
Loading counters_freebsd.raw... done.	32 days splitted in 384 blocks and skipped 335 days.
Loading counters_windows10x86.raw... done.	27 days splitted in 324 blocks and skipped 340 days.
Dumping DataFrame (708, 52) table to df_12.raw... Done.

Split size: 24
Loading counters_freebsd.raw... done.	32 days splitted in 768 blocks and skipped 335 days.
Loading counters_windows10x86.raw... done.	27 days splitted in 648 blocks and skipped 340 days.
Dumping DataFrame (1416, 52)

In [1]:
# Names of the sniffed counters; every counter gets one column per statistic.
feature_descriptions = [
    'packet_count',
    'ip_external',
    'ip_internal',
    'port_high',
    'port_low',
    'tcp_syn',
    'tcp_fin',
    'tcp_rst',
    'volume_down',
    'volume_up',
    'volume_internal',
    'less_64kb',
]

# Statistics computed for every counter.
feature_descriptions_combinations = ['mean', 'skew', 'kurtosis', 'var']

# Column names: the entity first, then "<counter>:<statistic>" grouped by
# counter, then the three per-block columns.
feature_labels = ['entity']
feature_labels.extend(
    f"{counter}:{statistic}"
    for counter in feature_descriptions
    for statistic in feature_descriptions_combinations
)
feature_labels.extend(['count_nonzero', 'relative_day_position', 'is_weekday'])

In [2]:
feature_labels

['entity',
 'packet_count:mean',
 'packet_count:skew',
 'packet_count:kurtosis',
 'packet_count:var',
 'ip_external:mean',
 'ip_external:skew',
 'ip_external:kurtosis',
 'ip_external:var',
 'ip_internal:mean',
 'ip_internal:skew',
 'ip_internal:kurtosis',
 'ip_internal:var',
 'port_high:mean',
 'port_high:skew',
 'port_high:kurtosis',
 'port_high:var',
 'port_low:mean',
 'port_low:skew',
 'port_low:kurtosis',
 'port_low:var',
 'tcp_syn:mean',
 'tcp_syn:skew',
 'tcp_syn:kurtosis',
 'tcp_syn:var',
 'tcp_fin:mean',
 'tcp_fin:skew',
 'tcp_fin:kurtosis',
 'tcp_fin:var',
 'tcp_rst:mean',
 'tcp_rst:skew',
 'tcp_rst:kurtosis',
 'tcp_rst:var',
 'volume_down:mean',
 'volume_down:skew',
 'volume_down:kurtosis',
 'volume_down:var',
 'volume_up:mean',
 'volume_up:skew',
 'volume_up:kurtosis',
 'volume_up:var',
 'volume_internal:mean',
 'volume_internal:skew',
 'volume_internal:kurtosis',
 'volume_internal:var',
 'less_64kb:mean',
 'less_64kb:skew',
 'less_64kb:kurtosis',
 'less_64kb:var',
 'count_n