# Preparing the environment


## Libraries

In [1]:
#%pip install --upgrade pip
#%pip install pandas
#%pip install scipy 
#%pip install scikit-learn 
#%pip install tqdm 
#%pip install plotly 
#%pip install matplotlib
#%pip install nbformat
#%pip install fastparquet
#%pip install pyarrow
#%pip install seaborn

In [2]:
# requirements
import math
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from scipy.stats import chi2, poisson, chisquare
from scipy.stats import entropy  # for KL (use small-smoothing)
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix
from sklearn.metrics import accuracy_score
from tqdm import tqdm

## Training phase: identifying typical traffic behavior

In [3]:
class TrafficLearner:
    """Indexes traffic rows into fixed-size time windows and learns per-window means.

    Parameters
    ----------
    window_sizes : list of str
        Window lengths such as ``'30s'``. Each value is passed to ``Series.dt.floor``
        and also parsed to an integer by stripping the ``'s'`` characters.
    path_normal_traffic_df : str
        Path of the CSV holding the normal (training) traffic.
    """

    # Columns (and their order) of every frame returned by ``index_windows``.
    _WINDOW_COLUMNS = [
        'endpoint',
        'window_size',
        'window_id',
        'window_start',
        'time_local',
        'total_requests',
        'is_anomaly',
    ]

    def __init__(self, window_sizes, path_normal_traffic_df):
        self.window_sizes = window_sizes
        self.path_normal_traffic_df = path_normal_traffic_df
        # Filled lazily by ``get_normal_traffic_df`` so the CSV is read only once.
        self.raw_normal_traffic_df = None

    def get_normal_traffic_df(self):
        """Return the normal-traffic frame, reading and caching the CSV on first use.

        ``time_local`` is converted with ``pd.to_datetime``. The same cached object
        is returned on every call.
        """
        if self.raw_normal_traffic_df is None:
            normal_df = pd.read_csv(self.path_normal_traffic_df, low_memory=False)
            normal_df['time_local'] = pd.to_datetime(normal_df['time_local'])
            self.raw_normal_traffic_df = normal_df
        return self.raw_normal_traffic_df

    def index_windows(self, df):
        """Tag every row of ``df`` with its window, once per configured window size.

        ``df`` must provide ``endpoint``, ``time_local`` (datetime),
        ``total_requests`` and ``is_anomaly``. The input frame is left untouched:
        the window columns are built on a projection of it, so the cached training
        frame and any caller-owned frame keep their original columns.

        Returns
        -------
        pandas.DataFrame
            One copy of the rows per window size, concatenated with a fresh index.
            ``window_id`` is the zero-based dense rank of ``window_start`` computed
            over all rows of ``df`` (all endpoints together).
        """
        base_columns = ['endpoint', 'time_local', 'total_requests', 'is_anomaly']
        frames = []

        for window_size in self.window_sizes:
            window_start = df['time_local'].dt.floor(window_size)
            # Explicit int64: the width of plain ``int`` is platform dependent and
            # datetimes can only be viewed as 64-bit integers.
            window_id = (
                window_start
                .astype('int64')
                .rank(method='dense')
                .astype('int64') - 1
            )

            indexed = df[base_columns].assign(
                window_start=window_start,
                window_id=window_id,
                window_size=int(window_size.replace('s', '')),
            )
            frames.append(indexed[self._WINDOW_COLUMNS])

        return pd.concat(frames, ignore_index=True)

    def learn_traffic_information(self):
        """Return the mean ``total_requests`` (column ``lam``) per
        (window_size, endpoint, window_id) of the normal traffic."""
        indexed = self.index_windows(self.get_normal_traffic_df())

        window_stats = (
            indexed.groupby(["window_size", "endpoint", "window_id"])
            .agg(lam=("total_requests", "mean"))
            .reset_index()
        )

        return window_stats

    def label_test_windows(self, df):
        """Return ``has_anomaly`` (max of ``is_anomaly``) per
        (window_size, endpoint, window_id) of ``df``."""
        indexed = self.index_windows(df)

        window_stats = (
            indexed.groupby(["window_size", "endpoint", "window_id"])
            .agg(has_anomaly=("is_anomaly", "max"))
            .reset_index()
        )

        return window_stats


## Monitoring phase: observing traffic for atypical behavior

In [4]:
#Traffic Features Extractor
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence KL(p || q).

    Both inputs are cast to float and shifted by a small constant before being
    handed to ``scipy.stats.entropy``, so zero entries never produce a division
    by zero.
    """
    smoothing = 1e-10
    p_smoothed, q_smoothed = (
        np.asarray(dist, dtype=float) + smoothing for dist in (p, q)
    )
    return entropy(p_smoothed, q_smoothed)

def js_divergence(p, q, eps=1e-12):
    """Return the Jensen-Shannon divergence between ``p`` and ``q``.

    ``eps`` is added to every entry, each vector is normalised to sum to one,
    and the result is the average of the two KL divergences to their midpoint.
    """
    p_dist = np.asarray(p) + eps
    q_dist = np.asarray(q) + eps
    for dist in (p_dist, q_dist):
        # In-place normalisation of the smoothed copies (never of the inputs).
        dist /= dist.sum()

    midpoint = 0.5 * (p_dist + q_dist)
    p_term = 0.5 * entropy(p_dist, midpoint)
    q_term = 0.5 * entropy(q_dist, midpoint)
    return p_term + q_term

def calculate_D(pmf_y, dX):
    """Distribution-shape feature D: the Jensen-Shannon divergence of the two
    distributions, as computed by ``js_divergence``."""
    divergence = js_divergence(pmf_y, dX)
    return divergence

def calculate_Delta(Xbar, lambda_ep):
    """Relative deviation of the observed mean ``Xbar`` from ``lambda_ep``.

    The denominator is floored at 1, so rates below one are not amplified.
    """
    scale = max(lambda_ep, 1)
    deviation = Xbar - lambda_ep
    return deviation / scale

def calculate_ZScore(Xbar, lambda_ep, seconds_in_window):
    """Deviation of ``Xbar`` from ``lambda_ep`` divided by
    ``sqrt(max(lambda_ep, 1) / seconds_in_window)``."""
    variance_of_mean = max(lambda_ep, 1) / seconds_in_window
    deviation = Xbar - lambda_ep
    return deviation / math.sqrt(variance_of_mean)

def sample_expected_distribution(max_count, lambda_ep):
    """Poisson pmf with mean ``lambda_ep`` on the counts ``0..max_count``,
    renormalised so the truncated probabilities sum to one."""
    support = np.arange(0, max_count+1)
    truncated_pmf = poisson.pmf(support, mu=lambda_ep)
    return truncated_pmf / truncated_pmf.sum()

def get_observed_distribution(max_count, current_window):
    """Empirical distribution of the per-sample counts in ``current_window``.

    The samples are histogrammed into unit-wide bins with edges
    ``0, 1, ..., max_count + 1`` and the counts are divided by their total.
    Samples outside that range are not counted by ``np.histogram``.

    Returns
    -------
    numpy.ndarray
        ``max_count + 1`` relative frequencies. When no sample falls in any bin
        (for example an empty window) an all-zero vector is returned instead of
        the NaNs a 0/0 division would produce.
    """
    obs_counts, _ = np.histogram(
        current_window,
        bins=np.arange(0, max_count+2)
    )
    total = obs_counts.sum()
    if total == 0:
        # Nothing to normalise: avoid 0/0 -> NaN (and the RuntimeWarning).
        return np.zeros(obs_counts.shape, dtype=float)
    return obs_counts / total

def extract_traffic_changes(current_window, lambda_ep, seconds_in_window):
    """Compute the three change features ``(D, Delta, Z)`` for one window.

    ``current_window`` holds the observed per-sample request counts and
    ``lambda_ep`` is the learned rate. Observed and expected distributions
    share one support whose upper bound is the larger of the window maximum
    and ``int(3 * lambda_ep) + 5``.
    """
    observed_peak = int(current_window.max())
    expected_reach = int(lambda_ep*3)+5
    max_count = max(observed_peak, expected_reach)

    dX = get_observed_distribution(max_count, current_window)
    dY = sample_expected_distribution(max_count, lambda_ep)
    Xbar = current_window.mean()

    # D is returned un-normalised (no division by log 2).
    D = calculate_D(dX, dY)
    Delta = calculate_Delta(Xbar, lambda_ep)
    Z = calculate_ZScore(Xbar, lambda_ep, seconds_in_window)
    return D, Delta, Z


In [5]:
# Fuzzy Score
def gaussian_membership(u, mu=0.0, sigma=1.0):
    """Gaussian membership of ``u``: 1.0 at ``u == mu``, decaying with the
    squared distance from ``mu`` at a width set by ``sigma``."""
    squared_distance = (u-mu)**2
    exponent = -squared_distance / (2*(sigma**2))
    return math.exp(exponent)

def fuzzification(current_window, D, Delta, Z):
    """Map the features ``D``, ``Delta`` and ``Z`` to Gaussian memberships
    centred at zero.

    All three use the same width: the standard deviation of ``current_window``,
    floored at 1.0. Returns ``(fD, fDelta, fZ)``.
    """
    sigma_u = max(1.0, np.std(current_window))  # adaptive width
    fD, fDelta, fZ = (
        gaussian_membership(feature, mu=0.0, sigma=sigma_u)
        for feature in (D, Delta, Z)
    )
    return fD, fDelta, fZ

def anomaly_score(fD, fDelta, fZ):
    """Average of the complements of the three memberships.

    Returns 0.0 when every membership is 1 and 1.0 when every membership is 0.
    """
    complement_delta = 1 - fDelta
    complement_d = 1 - fD
    complement_z = 1 - fZ
    return (complement_delta + complement_d + complement_z) / 3


In [6]:
### MONITORING MODULE: observes the traffic and compares it with the registered lambdas
def analyze_window(current_window, lambda_endpoint, beta, seconds_in_window):
    """Score one observed window against the learned rate ``lambda_endpoint``.

    The change features are extracted, fuzzified and averaged into the anomaly
    score ``eta``; ``C2`` is ``-tanh(2 * (eta - beta))``, so it is positive while
    ``eta`` stays below ``beta`` and negative above it.

    Returns a dict with the keys ``D``, ``Delta``, ``Z``, ``eta``, ``C2``,
    ``fD``, ``fDelta`` and ``fZ``.
    """
    D, Delta, Z = extract_traffic_changes(current_window, lambda_endpoint, seconds_in_window)
    memberships = fuzzification(current_window, D, Delta, Z)
    fD, fDelta, fZ = memberships
    eta = anomaly_score(fD, fDelta, fZ)

    C2 = -math.tanh((eta - beta)*2)

    report = {'D': D, 'Delta': Delta, 'Z': Z, 'eta': eta, 'C2': C2}
    report.update({'fD': fD, 'fDelta': fDelta, 'fZ': fZ})
    return report

In [7]:
class IndicatorCalculator:
    """Tracks an indicator ``Ra`` in [0, 1] for one endpoint, window by window.

    Parameters
    ----------
    endpoint : hashable
        Identifier copied into every recorded row.
    initial_Ra : float
        Starting value of the indicator.
    model : str
        Update rule: ``"sigmoid"``, ``"logistic"``, ``"exponential"``,
        ``"recovery"`` or ``"kalman"``.
    params : dict, optional
        Per-model tuning values (``alpha``, ``beta``, ``gamma``, ``delta``, ``k``,
        ``Q``, ``R``, ``P0``); anything missing falls back to the defaults written
        next to each rule.
    """

    _MODELS = ("sigmoid", "logistic", "exponential", "recovery", "kalman")

    def __init__(self, endpoint, initial_Ra=1.0, model="logistic", params=None):
        self.endpoint = endpoint
        self.Ra = float(initial_Ra)
        self.model = model
        self.params = params or {}
        # Estimate uncertainty, only used by the "kalman" rule.
        self.P = self.params.get("P0", 0.1)
        self.history = []

    @staticmethod
    def _require(value, name, model):
        """Fail with a readable message when a rule's input was not supplied."""
        if value is None:
            raise TypeError(f"update() needs `{name}` for the {model!r} model")

    def update(self, eta=None, C2=None):
        """Advance ``Ra`` by one window using the configured model.

        ``sigmoid`` and ``logistic`` consume ``C2``; ``exponential``, ``recovery``
        and ``kalman`` consume ``eta``. The result is clipped to [0, 1].

        Raises
        ------
        TypeError
            If the input required by the model is ``None``.
        ValueError
            If ``self.model`` is not one of the supported names (previously such
            a model was silently ignored and ``Ra`` never changed).
        """
        if self.model == "sigmoid":
            self._require(C2, "C2", self.model)
            alpha = self.params.get("alpha", 3.063)
            beta = self.params.get("beta", 0.5)
            self.Ra = 1 / (1 + np.exp(-alpha * ((self.Ra + C2) - beta)))

        elif self.model == "logistic":
            self._require(C2, "C2", self.model)
            gamma = self.params.get("gamma", 0.2)
            # Ra == 0 and Ra == 1 are fixed points of this rule: the increment
            # is multiplied by Ra * (1 - Ra).
            self.Ra += gamma * C2 * self.Ra * (1 - self.Ra)

        elif self.model == "exponential":
            self._require(eta, "eta", self.model)
            k = self.params.get("k", 0.5)
            # NOTE(review): the threshold is the literal 0.5 here, not
            # params["beta"] as in the other rules; confirm this is intended.
            anomaly = max(0, eta - 0.5)
            self.Ra *= np.exp(-k * anomaly)

        elif self.model == "recovery":
            self._require(eta, "eta", self.model)
            gamma = self.params.get("gamma", 0.02)
            delta = self.params.get("delta", 0.2)
            beta = self.params.get("beta", 0.2)

            anomaly = max(0, eta - beta)

            recovery = gamma * (1 - self.Ra)
            damage = delta * anomaly * self.Ra
            self.Ra += recovery - damage

        elif self.model == "kalman":
            self._require(eta, "eta", self.model)
            self._update_kalman(eta)

        else:
            raise ValueError(
                f"Unknown model {self.model!r}; expected one of {self._MODELS}"
            )

        # Keep Ra a plain Python float, as set in __init__.
        self.Ra = float(np.clip(self.Ra, 0, 1))

    def _update_kalman(self, eta):
        """One scalar Kalman step with ``exp(-k * eta)`` as the observation."""
        # parameters
        Q = self.params.get("Q", 0.005)   # process noise
        R = self.params.get("R", 0.05)    # observation noise
        k = self.params.get("k", 3.0)     # eta sensitivity

        # convert eta -> health observation
        z = np.exp(-k * eta)

        # prediction step (the state is carried over unchanged)
        Ra_pred = self.Ra
        P_pred = self.P + Q

        # Kalman gain
        K = P_pred / (P_pred + R)

        # correction step
        self.Ra = Ra_pred + K * (z - Ra_pred)

        # update uncertainty
        self.P = (1 - K) * P_pred

    def record(self, window_id, info):
        """Append a history row with the current ``Ra`` plus the entries of ``info``.

        Keys in ``info`` override the base keys ``endpoint``, ``window_id``,
        ``Ra`` and ``model`` when they collide.
        """
        row = {
            "endpoint": self.endpoint,
            "window_id": window_id,
            "Ra": self.Ra,
            "model": self.model
        }

        row.update(info)

        self.history.append(row)


In [8]:
def compute_lambda_series(train_df):
    """Running (expanding) mean of ``lam`` per endpoint, in ``window_id`` order.

    The returned Series keeps the original row labels of ``train_df``; the
    ``endpoint`` level added by the group-by is dropped.
    """
    ordered = train_df.sort_values("window_id")
    running_mean = ordered.groupby("endpoint")["lam"].expanding().mean()
    return running_mean.reset_index(level=0, drop=True)


def simulate_traffic_monitor(train_df, obs_df, train_obs_df, window_size, indicators_calculators, beta):
    """Replay the observed windows of one endpoint through every indicator.

    Parameters
    ----------
    train_df : DataFrame
        Learned statistics with ``endpoint``, ``window_id`` and ``lam``.
    obs_df : DataFrame
        Observed rows with ``window_id``, ``window_start``, ``total_requests``.
    train_obs_df : DataFrame
        Training rows with ``window_id`` and ``total_requests``; the values of
        window ``window_id - 1`` are stored as ``expected`` in each record.
    window_size : int
        Passed to ``analyze_window`` as ``seconds_in_window``.
    indicators_calculators : list
        Objects exposing ``update(eta=..., C2=...)`` and ``record(window_id, info)``.
    beta : float
        Threshold forwarded to ``analyze_window``.

    Returns
    -------
    list
        The same ``indicators_calculators``, updated and with their history filled.

    Raises
    ------
    ValueError
        If there are observed windows but ``train_df`` yields no lambda at all.
    """
    obs_by_window = {
        wid: {
            "total_requests": g["total_requests"].values,
            "window_start": g['window_start'].values[0],
        }
        for wid, g in obs_df.groupby("window_id")
    }

    train_obs_by_window = {
        wid: g["total_requests"].values
        for wid, g in train_obs_df.groupby("window_id")
    }

    lambda_series = compute_lambda_series(train_df)
    if obs_by_window and lambda_series.empty:
        raise ValueError("train_df provides no lambda values to compare against")
    last_position = len(lambda_series) - 1

    for window_id in sorted(obs_by_window):
        observed = obs_by_window[window_id]

        # The lambda for window w is the running mean at position w - 1
        # (position 0 for the first window). Positions past the end of training
        # reuse the last learned value instead of raising IndexError.
        # NOTE(review): this is a positional lookup; it matches window ids only
        # if the training windows of this endpoint are contiguous from 0.
        position = min(max(window_id - 1, 0), last_position)
        lam = lambda_series.iloc[position]

        expected = train_obs_by_window.get(window_id - 1, [])

        out = analyze_window(
            current_window=observed["total_requests"],
            lambda_endpoint=lam,
            beta=beta,
            seconds_in_window=window_size
        )

        info = {
            "lambda": lam,
            "eta": out["eta"],
            "C2": out["C2"],
            "fDp": 1-out["fD"],
            "fDeltap": 1-out["fDelta"],
            "fZp": 1-out["fZ"],
            "expected": expected,
            "window_start": observed["window_start"]
        }

        for ind in indicators_calculators:
            ind.update(
                eta=out["eta"],
                C2=out["C2"]
            )

            ind.record(window_id, info)
    return indicators_calculators

## Experiment

In [9]:
def execute_experiment(train_df, obs_df, train_obs_df, window_size, beta):
    """Run the monitor for every endpoint in ``train_df`` and collect the histories.

    Four indicators (sigmoid, exponential, recovery, kalman) are created per
    endpoint, all sharing ``{"beta": beta}`` as parameters. Returns one
    DataFrame with every recorded row.
    """
    model_names = ("sigmoid", "exponential", "recovery", "kalman")
    history_rows = []

    for endpoint in tqdm(train_df.endpoint.unique()):
        params = {"beta": beta}
        calculators = [
            IndicatorCalculator(endpoint, model=name, params=params)
            for name in model_names
        ]

        monitored = simulate_traffic_monitor(
            train_df[train_df.endpoint == endpoint],
            obs_df[obs_df.endpoint == endpoint],
            train_obs_df[train_obs_df.endpoint == endpoint],
            window_size,
            calculators,
            beta
        )

        for calculator in monitored:
            history_rows.extend(calculator.history)

    return pd.DataFrame(history_rows)

def experiment(normal_windows_lambda_df, window_obs, window_normal, window_sizes, beta_var):
    """Sweep every (beta, window size) pair and stack the per-run results.

    Each input frame is filtered to the current window size before being passed
    to ``execute_experiment``; the run's ``window_size`` and ``beta`` are added
    as columns to its result.
    """
    runs = []

    for beta in beta_var:
        for window_size in window_sizes:
            print(f"Running experiment: window_size={window_size} beta={beta}")
            size = int(window_size.replace('s', ''))

            lambdas_for_size = normal_windows_lambda_df[normal_windows_lambda_df['window_size'] == size]
            observed_for_size = window_obs[window_obs['window_size'] == size]
            normal_for_size = window_normal[window_normal['window_size'] == size]

            run_df = execute_experiment(
                lambdas_for_size,
                observed_for_size,
                normal_for_size,
                size,
                beta
            )
            runs.append(run_df.assign(window_size=size, beta=beta))

    return pd.concat(runs, ignore_index=True)

# Evaluation

In [10]:
def add_gt(results_history, window_gt, window_sizes):
    """
        @description Attach the ground-truth ``has_anomaly`` label to every
                     endpoint-window of the results, so predictions can later
                     be compared with it to produce performance metrics.
                     Rows without a matching label keep a missing value
                     (left join on endpoint and window_id, per window size).
    """
    join_keys = ['endpoint', 'window_id']
    labeled_frames = []

    for window_size in window_sizes:
        size = int(window_size.replace('s', ''))
        gt_for_size = window_gt[window_gt['window_size']==size]
        labels = gt_for_size[join_keys + ['has_anomaly']]
        results_for_size = results_history[results_history['window_size'] == size]
        labeled_frames.append(
            results_for_size.merge(labels, on=join_keys, how='left')
        )

    return pd.concat(labeled_frames, ignore_index=True)

## Full Pipeline

Dataset path

In [11]:
# Input CSVs, relative to the notebook's working directory.
# The train file is loaded by TrafficLearner as the normal-traffic baseline;
# the test file is the traffic replayed through the monitor.
ton_iot_train_path = "../datasets/treated_dataset/ton_treated_train.csv"
ton_iot_test_path = "../datasets/treated_dataset/ton_treated_test.csv"

Experiment params

In [12]:
# Window lengths to evaluate. Each string is used as the `dt.floor` frequency and
# is also converted to an integer number of seconds by stripping the 's'.
WINDOW_SIZES = ['30s', '40s', '50s']
# Values of beta swept by `experiment`; beta is the offset subtracted from eta
# in C2 = -tanh(2 * (eta - beta)) and is also passed to the indicators as params.
BETA_VAR=[0.0, 0.3, 0.5, 0.7, 1.0]

Training

In [13]:
# Build the learner over the normal-traffic CSV; the file is only read on the
# first `get_normal_traffic_df()` call and cached afterwards.
tl = TrafficLearner(
    window_sizes=WINDOW_SIZES,
    path_normal_traffic_df=ton_iot_train_path,
)

# Mean `total_requests` (column `lam`) per (window_size, endpoint, window_id).
normal_traffic_lambda_df = tl.learn_traffic_information()
# Row-level normal traffic tagged with window_size / window_id / window_start.
normal_traffic_wind_observations_df = tl.index_windows(tl.get_normal_traffic_df())

Monitoring

In [14]:
# Load the test traffic and parse the timestamps so they can be floored into windows.
df_ton_test = pd.read_csv(ton_iot_test_path, low_memory=False)
df_ton_test['time_local'] = pd.to_datetime(df_ton_test['time_local'])
# Per-window ground truth: `has_anomaly` is the max of `is_anomaly` in the window.
# A copy is passed so this call works on its own frame, not on `df_ton_test`.
anomalous_traffic_label_df = tl.label_test_windows(df_ton_test.copy())
# Row-level test traffic tagged with window ids; this is what the monitor replays.
anomalous_traffic_win_observations_df = tl.index_windows(df_ton_test)

In [15]:
# Create the output directory before the sweep: the run below takes several
# minutes, and `to_csv` raises if the target directory does not exist.
Path("../results").mkdir(parents=True, exist_ok=True)

# Sweep every (beta, window size) combination over the test traffic.
results = experiment(normal_traffic_lambda_df, anomalous_traffic_win_observations_df, normal_traffic_wind_observations_df, window_sizes=WINDOW_SIZES, beta_var=BETA_VAR)

# Attach the per-window ground truth, then keep the two models that are exported.
final = add_gt(results, anomalous_traffic_label_df, WINDOW_SIZES)
final_sigmoid = final[final['model'] == 'sigmoid'].copy()
final_kalman = final[final['model'] == 'kalman'].copy()

final_sigmoid.to_csv("../results/sigmoid_results.csv", index=False)
final_kalman.to_csv("../results/kalman_results.csv", index=False)
# NOTE(review): unlike the two files above, this one is written with its index
# column; left as is in case a downstream reader relies on it.
anomalous_traffic_win_observations_df.to_csv("../datasets/treated_dataset/anomalous_traffic_win_observations.csv")

Running experiment: window_size=30s beta=0.0


100%|██████████| 16/16 [00:55<00:00,  3.46s/it]


Running experiment: window_size=40s beta=0.0


100%|██████████| 16/16 [00:43<00:00,  2.69s/it]


Running experiment: window_size=50s beta=0.0


100%|██████████| 16/16 [00:35<00:00,  2.23s/it]


Running experiment: window_size=30s beta=0.3


100%|██████████| 16/16 [00:52<00:00,  3.30s/it]


Running experiment: window_size=40s beta=0.3


100%|██████████| 16/16 [00:40<00:00,  2.53s/it]


Running experiment: window_size=50s beta=0.3


100%|██████████| 16/16 [00:32<00:00,  2.05s/it]


Running experiment: window_size=30s beta=0.5


100%|██████████| 16/16 [01:20<00:00,  5.02s/it]


Running experiment: window_size=40s beta=0.5


100%|██████████| 16/16 [01:11<00:00,  4.47s/it]


Running experiment: window_size=50s beta=0.5


100%|██████████| 16/16 [01:09<00:00,  4.32s/it]


Running experiment: window_size=30s beta=0.7


100%|██████████| 16/16 [01:27<00:00,  5.45s/it]


Running experiment: window_size=40s beta=0.7


100%|██████████| 16/16 [01:13<00:00,  4.61s/it]


Running experiment: window_size=50s beta=0.7


100%|██████████| 16/16 [01:11<00:00,  4.46s/it]


Running experiment: window_size=30s beta=1.0


100%|██████████| 16/16 [01:33<00:00,  5.82s/it]


Running experiment: window_size=40s beta=1.0


100%|██████████| 16/16 [01:06<00:00,  4.14s/it]


Running experiment: window_size=50s beta=1.0


100%|██████████| 16/16 [00:56<00:00,  3.55s/it]
