In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error, r2_score,
    mean_absolute_error, accuracy_score,
    f1_score, roc_auc_score,
    log_loss
)
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
# --- Size multipliers -------------------------------------------------------
B = 1
KB = 1024
MB = KB * KB
GB = KB * MB

# --- Time-related constants -------------------------------------------------
TIME_DELTA = 5000
NANO_SECONDS = 1
NANO_TO_MICRO = 1000
NANO_TO_SECONDS = 1e9

# --- Miscellaneous limits ---------------------------------------------------
NROWS = 30e6
MAX_LEN = 5

# --- Column names -----------------------------------------------------------
PACKETS = "packets"
SENDS = "sends"
ALLOCS = "cpu_allocations"

DISK_READS = "disk_read"
DISK_WRITES = "disk_write"

VIRTUAL_MEMORY = "memory"
RSS_MEMORY = "rss_memory"
DATA_MEMORY = "data_memory"

S_TIME = "s_time"
U_TIME = "u_time"

PACKETS_1 = "packets_1"
PACKETS_2 = "packets_2"
PACKETS_3 = "packets_3"
PACKETS_4 = "packets_4"
PACKETS_5 = "packets_5"

TIMES_1 = "times_1"
TIMES_2 = "times_2"
TIMES_3 = "times_3"

START_TIME = 'start_time'
END_TIME = 'end_time'
SIZE = 'size'
GAP = 'gap'

NETWORK_IN = 'networkin'
NETWORK_OUT = 'networkout'

SRC_IP = 'src_ip'
DEST_IP = 'dest_ip'
SRC_PORT = 'src_port'
DEST_PORT = 'dest_port'

# --- Feature sets -----------------------------------------------------------
# The address/port columns are left out of NSDI_FEATURES; they only appear in
# ALL_FEATURES.
NSDI_FEATURES = [
    DISK_READS,
    DISK_WRITES,
    VIRTUAL_MEMORY,
    S_TIME,
    START_TIME,
    END_TIME,
    GAP,
    NETWORK_IN,
    SIZE,
    NETWORK_OUT,
]
ALL_FEATURES = NSDI_FEATURES + [
    ALLOCS,
    RSS_MEMORY,
    DATA_MEMORY,
    U_TIME,
    SRC_IP,
    DEST_IP,
    SRC_PORT,
    DEST_PORT,
]
FS_FEATURES = [SIZE, GAP, DISK_WRITES]

# Root directory the input CSVs are read from.
FULL_PATH = "../files"

In [4]:
def calculate_scaling(training_path):
    """Return the per-column maximum of the CSV stored at ``training_path``.

    The file is loaded with ``pd.read_csv(..., index_col=False)`` and the
    maximum of every column is converted to a Python ``float``.

    Args:
        training_path: path of the CSV file to inspect.

    Returns:
        dict mapping each column name to that column's maximum as a float.
    """
    frame = pd.read_csv(training_path, index_col=False)
    return {name: float(frame[name].max()) for name in frame.columns}

def prepare_files(file, window_size, scaling, target_column='size'):
    """Load a CSV and build a lagged feature frame plus the target series.

    Args:
        file: path of the CSV to read.
        window_size: number of consecutive rows per sample; lags
            ``1 .. window_size - 1`` are added as extra columns.
        scaling: mapping handed to ``resize`` for every column, or ``None``
            to leave the values untouched.
        target_column: column returned as the target and removed from the
            (un-lagged) features.

    Returns:
        ``(final_df, flow_size)``: ``final_df`` holds the lagged columns
        (oldest lag first, each suffixed with its lag number) followed by the
        current row's columns without ``target_column``; missing values are
        filled with 0. ``flow_size`` is the (possibly scaled) target column.
    """
    df = pd.read_csv(file, index_col=False)
    if scaling is not None:
        df = df.apply(lambda col: resize(col, scaling), axis=0)
    flow_size = df[target_column]

    # Lagged copies go left of the current row, largest lag first.
    lagged_frames = [
        df.shift(lag).rename(columns=lambda name, lag=lag: name + str(lag))
        for lag in range(window_size - 1, 0, -1)
    ]
    final_df = pd.concat(lagged_frames + [df], axis=1)

    # Rows near the start have no history: those cells become 0. Only the
    # un-suffixed target column is dropped; its lagged copies stay as features.
    final_df = final_df.fillna(0).drop(target_column, axis=1)

    return (final_df, flow_size)

def make_io(f_df, f_size):
    """Convert a feature frame and a target series into model inputs/outputs.

    Args:
        f_df: DataFrame of features.
        f_size: Series of target values.

    Returns:
        ``(inputs, outputs)`` where ``inputs`` is ``f_df.values`` (a NumPy
        array) and ``outputs`` is ``f_size`` as a plain Python list.
    """
    # There is only ever one frame to convert, so no accumulation is needed.
    return (f_df.values, f_size.tolist())

def resize(s, scaling):
    """Divide series ``s`` by the factor stored under ``s.name`` in ``scaling``.

    Args:
        s: a named pandas Series (e.g. one DataFrame column).
        scaling: mapping from series name to its scaling factor.

    Returns:
        A new Series with every value divided by the factor. When the factor
        is 0 the values are divided by 1 instead, i.e. returned unscaled.

    Raises:
        KeyError: if ``s.name`` is not present in ``scaling``.
    """
    factor = scaling[s.name]
    # A zero factor (e.g. a column whose maximum is 0) would turn the whole
    # column into NaN/inf; leave such a column unscaled instead.
    if factor == 0:
        factor = 1.0
    return s / factor

def print_metrics(real, prediction):
    """Compute regression error metrics for ``prediction`` against ``real``.

    Despite the name, nothing is printed; the scores are returned.

    Args:
        real: ground-truth target values.
        prediction: predicted values, aligned with ``real``.

    Returns:
        dict with keys ``'mse'``, ``'mae'`` and ``'r2'``.
    """
    metric_fns = (
        ('mse', mean_squared_error),
        ('mae', mean_absolute_error),
        ('r2', r2_score),
    )
    return {label: fn(real, prediction) for label, fn in metric_fns}

def print_performance(file, model, scaling, window_size=None,
                      target_column='size'):
    """Evaluate ``model`` on the flows stored in ``file``.

    Despite the name, nothing is printed; the metric dict is returned.

    Args:
        file: path of the CSV to evaluate on.
        model: object whose ``predict`` accepts an ``xgb.DMatrix``.
        scaling: scaling mapping forwarded to ``prepare_files`` (or ``None``).
        window_size: window length forwarded to ``prepare_files``. When left
            as ``None`` the module-level ``WINDOW_SIZE`` is used.
        target_column: name of the target column forwarded to
            ``prepare_files``.

    Returns:
        The dict produced by ``print_metrics`` for the true vs. predicted
        target values.
    """
    if window_size is None:
        # Looked up at call time, so the notebook-level setting still applies
        # to callers that do not pass an explicit window.
        window_size = WINDOW_SIZE

    df, flow_size = prepare_files(file, window_size, scaling, target_column)
    inputs, outputs = make_io(df, flow_size)

    # Hand xgboost a plain list of names rather than a pandas Index.
    y_pred = model.predict(
        xgb.DMatrix(inputs, feature_names=list(df.columns))
    )

    return print_metrics(outputs, y_pred.tolist())


In [5]:
# Name of the sub-directory under FULL_PATH to load from.
# NOTE(review): presumably a time-delta setting (cf. TIME_DELTA) — confirm.
td = 500
# Read <FULL_PATH>/<td>/full.csv into memory.
flows_df = pd.read_csv(f"{FULL_PATH}/{td}/full.csv")

In [6]:
# Keep only rows with a non-negative 'gap', then cap the data at the first
# 6,000,000 remaining rows.
flows_df = flows_df[flows_df['gap'] >= 0][:6_000_000]

In [7]:
# File name shared by the train and test CSVs.
target_file_name = "flows.csv"

# Scratch directory and its per-split sub-directories.
tmp_path = 'tmp-data'
tmp_train_path, tmp_test_path = f"{tmp_path}/train", f"{tmp_path}/test"

In [8]:
# to_csv does not create missing directories, so make sure both targets exist
# (a fresh checkout has no tmp-data/ tree).
os.makedirs(tmp_train_path, exist_ok=True)
os.makedirs(tmp_test_path, exist_ok=True)

# shuffle=False keeps the row order: the first 70% of rows become the training
# set and the last 30% the test set.
flows_train, flows_test = train_test_split(
    flows_df, shuffle=False, test_size=0.3
)

# Persist both splits so the later cells can re-read them from disk.
flows_train.to_csv(
    f"{tmp_train_path}/{target_file_name}",
    index=False
)
flows_test.to_csv(
    f"{tmp_test_path}/{target_file_name}",
    index=False
)

In [9]:
# Build the paths from target_file_name so they always point at the files the
# split cell wrote, even if that name is changed.
training_path = f"{tmp_train_path}/{target_file_name}"
test_path = f"{tmp_test_path}/{target_file_name}"

# Scaling factors come from the training split only; the same factors are
# reused when the test split is evaluated.
scaling = calculate_scaling(training_path)

In [10]:
# Number of consecutive rows combined into one sample.
WINDOW_SIZE = 5

# Build the scaled, lagged training frame and convert it to arrays.
df, flow_size = prepare_files(training_path, WINDOW_SIZE, scaling, 'size')
inputs, outputs = make_io(df, flow_size)

# fit model on training data
number_of_trees = 20
extra_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
}
param = {
    'max_depth': 12,
    'booster': 'gbtree',
    "predictor": "gpu_predictor",
    'tree_method': 'gpu_hist',
    'colsample_bytree': 0.7,
    **extra_params,
}

training = xgb.DMatrix(inputs, label=outputs, feature_names=df.columns)
model = xgb.train(param, training, num_boost_round=number_of_trees)


In [11]:
# Score the fitted model on both splits, using the training-set scaling.
result = {
    'train': print_performance(training_path, model, scaling),
    'test': print_performance(test_path, model, scaling),
}

result

{'train': {'mse': 0.0010277209106226437,
  'mae': 0.012392375580179377,
  'r2': 0.46641987672979357},
 'test': {'mse': 0.0015914562328401773,
  'mae': 0.017461205469279494,
  'r2': 0.19677862794596923}}