# Preparing the environment


## Libraries

In [1]:
#%pip install --upgrade pip
#%pip install pandas
#%pip install scipy 
#%pip install scikit-learn 
#%pip install tqdm 
#%pip install nbformat
#%pip install pyarrow

In [2]:
# requirements
import pandas as pd
from tqdm import tqdm
import scripts.learner as l
import scripts.observer as o
import scripts.indicator.calculator as ic
import numpy as np

## Experiment

In [3]:
def compute_lambda_series(train_df):
    return (
        train_df
        .sort_values("window_id")
        .groupby("endpoint")["lam"]
        .expanding()
        .mean()
        .reset_index(level=0, drop=True)
    )

def compute_anomaly_score_series(lambda_series, obs_by_window, train_obs_by_window, window_size, eas):
    window_ids = sorted(obs_by_window.keys())
    eta_series = []

    for window_id in window_ids:
        observed = obs_by_window.get(window_id)
        if observed is None:
            continue

        if window_size > 0:
            eas.lam = lambda_series.iloc[window_id-1]

        out = eas.calculate_eta(
            current_window=observed["total_requests"],
            seconds_in_window=window_size
        )

        out["lambda"] = eas.lam
        out["expected"] = train_obs_by_window.get(window_id - 1, [])
        out["window_start"] = observed["window_start"]
        out["window_id"] = window_id
        eta_series.append(out)

    return pd.DataFrame(eta_series)

def compute_ra_series(eta_series, indicators_calculators):
    window_ids = sorted(eta_series.keys())

    for window_id in window_ids:
        for ind in indicators_calculators:
            ind.update_ra(eta=eta_series[window_id]['eta'])
            ind.record(window_id, eta_series[window_id])
    return indicators_calculators

def format_window_info(obs_df, train_obs_df):
    obs_by_window = {
        wid: {
            "total_requests":g["total_requests"].values,
            "window_start": g['window_start'].values[0],
        }
        for wid, g in obs_df.groupby("window_id")
    }

    train_obs_by_window = {
        wid: g["total_requests"].values
        for wid, g in train_obs_df.groupby("window_id")
    }

    return obs_by_window, train_obs_by_window

def format_eta_info(df_eta):
    eta_by_window = {
        wid: {
            "eta": g["eta"].values[0] if len(g["eta"].values) == 1 else np.nan,
            "window_start": g["window_start"].values[0] if len(g["window_start"].values) == 1 else np.nan,
            "fDeltap":g["fDeltap"].values[0] if len(g["fDeltap"].values) == 1 else np.nan,
            "fDp":g["fDp"].values[0] if len(g["fDp"].values) == 1 else np.nan,
            "fZp":g["fZp"].values[0] if len(g["fZp"].values) == 1 else np.nan
        }
        for wid, g in df_eta.groupby("window_id")
    }

    return eta_by_window

def experiment_eta(train_df, obs_df, train_obs_df, window_size):
    results_anomaly_detection = []

    for endpoint in tqdm(train_df.endpoint.unique()):
        lambda_series = compute_lambda_series(train_df[train_df.endpoint == endpoint])

        obs_by_window, train_obs_by_window = format_window_info(
            obs_df[obs_df.endpoint == endpoint],
            train_obs_df[train_obs_df.endpoint == endpoint]
        )

        eas = o.EndpointAnomalySensor(
            endpoint=endpoint,
            lam=lambda_series.iloc[0]
        )

        df_anomaly_score_series = compute_anomaly_score_series(
            lambda_series,
            obs_by_window,
            train_obs_by_window,
            window_size,
            eas
        )

        df_anomaly_score_series["endpoint"] = endpoint

        results_anomaly_detection.append(df_anomaly_score_series)
    return pd.concat(results_anomaly_detection, ignore_index=True)

def experiment_ra(train_df, obs_df, train_obs_df, window_size, params):
    results = []
    for endpoint in tqdm(train_df.endpoint.unique()):
        lambda_series = compute_lambda_series(train_df[train_df.endpoint == endpoint])

        obs_by_window, train_obs_by_window = format_window_info(
            obs_df[obs_df.endpoint == endpoint],
            train_obs_df[train_obs_df.endpoint == endpoint]
        )

        ra_calculators = [
            #RaCalculator(endpoint, model="sigmoid", params=params),
            #RaCalculator(endpoint, model="exponential", params=params),
            #RaCalculator(endpoint, model="recovery", params=params),
            ic.RaCalculator(endpoint, model="kalman", params=params),
        ]

        eas = o.EndpointAnomalySensor(
            endpoint=endpoint,
            lam=lambda_series.iloc[0]
        )

        df_anomaly_score_series = compute_anomaly_score_series(
            lambda_series,
            obs_by_window,
            train_obs_by_window,
            window_size,
            eas
        )

        eta_series = format_eta_info(df_anomaly_score_series)
        indicators_report = compute_ra_series(eta_series, ra_calculators)

        for ir in indicators_report:
            results.extend(ir.history)

    return pd.DataFrame(results)

# Evaluation

In [4]:
def add_gt(results_history, window_gt, window_sizes):
    """
        @description Add the corresponding ground truth for each endpoint-window,
                     so it can be compared later to generate perfomance metrics
    """

    labeled_data= []
    for window_size in window_sizes:
        size = int(window_size.replace('s', ''))
        labels = window_gt[window_gt['window_size']==size][['endpoint', 'window_id', 'has_anomaly']]
        result_df = results_history[results_history['window_size'] == size]
        df = pd.merge(result_df, labels, on=['endpoint','window_id'], how='left')
        labeled_data.append(df)
    return pd.concat(labeled_data, ignore_index=True)

## Full Pipeline

Dataset path

In [5]:
ton_iot_train_path = "../0-datasets/treated_dataset/ton_treated_train.csv"
ton_iot_test_path = "../0-datasets/treated_dataset/ton_treated_test.csv"

Experiment params

In [6]:
WINDOW_SIZES = ['30s', '60s', '120s','300s']
FIXED_WINDOW_SIZE = 120
k_VAR = [1., 2., 3., 4., 5.]
R_VAR = [0.01,0.05,0.1,0.2]
Q_VAR = [0.001,0.005,0.01,0.02]

Training

In [7]:
tl = l.TrafficLearner(
    window_sizes=WINDOW_SIZES,
    path_normal_traffic_df=ton_iot_train_path,
)

normal_traffic_lambda_df = tl.learn_traffic_information()
normal_traffic_wind_observations_df = tl.index_windows(tl.get_normal_traffic_df())

Collecting observations

In [8]:
df_ton_test = pd.read_csv(ton_iot_test_path, low_memory=False)
df_ton_test['time_local'] = pd.to_datetime(df_ton_test['time_local'])
anomalous_traffic_label_df = tl.label_test_windows(df_ton_test.copy())
anomalous_traffic_win_observations_df = tl.index_windows(df_ton_test)

In [9]:
anomalous_traffic_win_observations_df.groupby('window_size')['window_id'].count()

window_size
30     939250
60     939250
120    939250
300    939250
Name: window_id, dtype: int64

### Eta experiments

In [10]:
def window_size_experiment_eta(normal_windows_lambda_df, window_obs, window_normal, window_sizes):
    results_eta = []
    for window_size in window_sizes:
        print(f"Running experiment: window_size={window_size}")
        size = int(window_size.replace('s', ''))

        df_eta = experiment_eta(
            normal_windows_lambda_df[normal_windows_lambda_df['window_size'] == size],
            window_obs[window_obs['window_size'] == size],
            window_normal[window_normal['window_size'] == size],
            size,
        )

        df_eta["window_size"] = size
        results_eta.append(df_eta)

    return pd.concat(results_eta, ignore_index=True)

In [11]:
df_eta_results = window_size_experiment_eta(normal_traffic_lambda_df, anomalous_traffic_win_observations_df, normal_traffic_wind_observations_df, window_sizes=WINDOW_SIZES)
df_eta_results = add_gt(df_eta_results, anomalous_traffic_label_df, WINDOW_SIZES)

Running experiment: window_size=30s


100%|██████████| 10/10 [00:15<00:00,  1.57s/it]


Running experiment: window_size=60s


100%|██████████| 10/10 [00:08<00:00,  1.22it/s]


Running experiment: window_size=120s


100%|██████████| 10/10 [00:04<00:00,  2.19it/s]


Running experiment: window_size=300s


100%|██████████| 10/10 [00:02<00:00,  4.27it/s]


### Ra experiments

k variation

In [12]:
k_results=[]
for k in k_VAR:
    params={
        "beta": 0.5,
        "k": k,
    }
    print(f"Running experiment: k={k}")
    results = experiment_ra(normal_traffic_lambda_df, anomalous_traffic_win_observations_df, normal_traffic_wind_observations_df, window_size=30, params=params)
    results["k"] = k
    results["window_size"] = FIXED_WINDOW_SIZE
    k_results.append(results)

df_k_results = pd.concat(k_results, ignore_index=True)
df_k_l_results = add_gt(df_k_results, anomalous_traffic_label_df, WINDOW_SIZES)

Running experiment: k=1.0


100%|██████████| 10/10 [00:24<00:00,  2.47s/it]


Running experiment: k=2.0


100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


Running experiment: k=3.0


100%|██████████| 10/10 [00:24<00:00,  2.46s/it]


Running experiment: k=4.0


100%|██████████| 10/10 [00:24<00:00,  2.47s/it]


Running experiment: k=5.0


100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


q variation

In [13]:
q_results = []
for q in Q_VAR:
    params={
        "beta": 0.5,
        "Q": q,
    }
    print(f"Running experiment: Q={q}")
    results = experiment_ra(normal_traffic_lambda_df, anomalous_traffic_win_observations_df, normal_traffic_wind_observations_df, window_size=30, params=params)
    results["Q"] = q
    results["window_size"] = FIXED_WINDOW_SIZE
    q_results.append(results)

df_q_results = pd.concat(q_results, ignore_index=True)
df_q_l_results = add_gt(df_q_results, anomalous_traffic_label_df, WINDOW_SIZES)

Running experiment: Q=0.001


100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


Running experiment: Q=0.005


100%|██████████| 10/10 [00:25<00:00,  2.52s/it]


Running experiment: Q=0.01


100%|██████████| 10/10 [00:25<00:00,  2.51s/it]


Running experiment: Q=0.02


100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


R variation

In [14]:
r_results = []
for r in R_VAR:
    params={
        "beta": 0.5,
        "R": r,
    }
    print(f"Running experiment: R={r}")
    results = experiment_ra(normal_traffic_lambda_df, anomalous_traffic_win_observations_df, normal_traffic_wind_observations_df, window_size=30, params=params)
    results["R"] = r
    results["window_size"] = FIXED_WINDOW_SIZE
    r_results.append(results)

df_r_results = pd.concat(r_results, ignore_index=True)
df_r_l_results = add_gt(df_r_results, anomalous_traffic_label_df, WINDOW_SIZES)


Running experiment: R=0.01


100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


Running experiment: R=0.05


100%|██████████| 10/10 [00:25<00:00,  2.52s/it]


Running experiment: R=0.1


100%|██████████| 10/10 [00:25<00:00,  2.53s/it]


Running experiment: R=0.2


100%|██████████| 10/10 [00:25<00:00,  2.51s/it]


Window variation

In [15]:
def window_size_experiment_ra(normal_windows_lambda_df, window_obs, window_normal, window_sizes, params):
    results_ra = []
    for window_size in window_sizes:
        print(f"Running experiment: window_size={window_size}")
        size = int(window_size.replace('s', ''))

        df_ra = experiment_ra(
            normal_windows_lambda_df[normal_windows_lambda_df['window_size'] == size],
            window_obs[window_obs['window_size'] == size],
            window_normal[window_normal['window_size'] == size],
            window_size=size,
            params=params
        )

        df_ra["window_size"] = size
        results_ra.append(df_ra)

    return pd.concat(results_ra, ignore_index=True)

In [16]:
df_ra_win_results = window_size_experiment_ra(normal_traffic_lambda_df, anomalous_traffic_win_observations_df, normal_traffic_wind_observations_df, window_sizes=WINDOW_SIZES, params={})
df_ra_win_results = add_gt(df_ra_win_results, anomalous_traffic_label_df, WINDOW_SIZES)

Running experiment: window_size=30s


100%|██████████| 10/10 [00:22<00:00,  2.23s/it]


Running experiment: window_size=60s


100%|██████████| 10/10 [00:11<00:00,  1.16s/it]


Running experiment: window_size=120s


100%|██████████| 10/10 [00:06<00:00,  1.60it/s]


Running experiment: window_size=300s


100%|██████████| 10/10 [00:03<00:00,  3.26it/s]


In [17]:
df_eta_results.to_csv("outputs/kalman_eta_results.csv", index=False)
df_k_l_results.to_csv("outputs/kalman_k_results.csv", index=False)
df_q_l_results.to_csv("outputs/kalman_q_results.csv", index=False)
df_r_l_results.to_csv("outputs/kalman_r_results.csv", index=False)
df_ra_win_results.to_csv("outputs/kalman_ra_win_results.csv", index=False)
anomalous_traffic_win_observations_df.to_csv("../0-datasets/treated_dataset/anomalous_traffic_win_observations.csv")