# Synthetic Dataset Generator - CERT backup

In [None]:
def generate_synthetic(n_users=50, days=30, seed=None):
    """Generate a toy per-user, per-day login dataset with rare malicious labels.

    Args:
        n_users: Number of synthetic users (ids ``0 .. n_users - 1``).
        days: Number of days simulated per user (``0 .. days - 1``).
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState`` is used, so the output is reproducible
            and the global NumPy RNG state is left untouched. When ``None``
            (the default), draws come from the global ``np.random`` state,
            exactly as before.

    Returns:
        pandas.DataFrame with one row per user-day and the columns
        ``user``, ``day``, ``login_count``, ``shift`` and ``label``.
    """
    # The np.random module and a RandomState instance expose the same
    # choice/poisson/rand methods, so either can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    rows = []
    for u in range(n_users):
        # One shift per user, drawn from {0, 3, 6, 9}  (cultural proxy).
        base_shift = rng.choice([0, 3, 6, 9])
        for d in range(days):
            login_count = rng.poisson(5)
            # rand() is the left operand, so it is drawn for every row and
            # the random stream does not depend on login_count.
            malicious = 1 if (rng.rand() < 0.02 and login_count > 7) else 0
            rows.append([u, d, login_count, base_shift, malicious])
    return pd.DataFrame(
        rows, columns=['user', 'day', 'login_count', 'shift', 'label']
    )

# Build the demo dataset with the default size (50 users x 30 days).
df = generate_synthetic()
# Notebook cell output: preview the first rows.
df.head()

# Preprocessing & feature engineering

In [1]:
# src/preprocess.py

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib, os

SEED = 42

In [2]:
def load_logs(path, timestamp_col='timestamp'):
    """Read a raw log CSV into a DataFrame, parsing the timestamp column.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object).
        timestamp_col: Name of the column to parse as datetimes. Defaults to
            ``'timestamp'``; override it when the CSV schema names the
            column differently.

    Returns:
        pandas.DataFrame with ``timestamp_col`` parsed as datetimes.
    """
    # Adapt to CERT version csv schema
    return pd.read_csv(path, parse_dates=[timestamp_col])

In [4]:
def add_cultural_proxies(agg):
    """Attach placeholder "cultural proxy" columns to the aggregated features.

    Three columns are written to ``agg`` in place, and the same object is
    returned:

    * ``local_time_offset``: ``(avg_hour - 12) / 24``.
    * ``comm_density``: ``log1p(login_count)``.
    * ``team_id``: pseudo-random integer in ``[0, 6)`` drawn with ``SEED``.

    Args:
        agg: DataFrame that has ``avg_hour`` and ``login_count`` columns.

    Returns:
        The input DataFrame with the three columns added.
    """
    # simple proxies - TODO: REFINE (time-zone via IP/host map, communication
    # language via email metadata if available, but avoid PII)
    # time zone proxy (simulate from login hours)
    agg['local_time_offset'] = (agg['avg_hour'] - 12) / 24.0
    # communication density proxy (placeholder)
    agg['comm_density'] = np.log1p(agg['login_count'])
    # team tenure / group inferred: randomly assign group for demo.
    # A private RandomState seeded with SEED produces the same draws as
    # seeding the global RNG would, without resetting the global state that
    # other code may be relying on.
    rng = np.random.RandomState(SEED)
    agg['team_id'] = rng.randint(0, 6, size=len(agg))
    return agg

In [5]:
def build_feature_matrix(agg, out_path='data/processed/features.pkl'):
    """Standardize the model features and persist them with the fitted scaler.

    Args:
        agg: DataFrame containing ``login_count``, ``unique_hosts``,
            ``files_accessed``, ``avg_hour``, ``local_time_offset``,
            ``comm_density`` and ``team_id``.
        out_path: Destination handed to ``joblib.dump``. When it is a
            filesystem path, missing parent directories are created first.

    Returns:
        The standardized feature matrix returned by
        ``StandardScaler.fit_transform``.

    Side effects:
        Dumps the tuple ``(Xs, scaler, column_names)`` to ``out_path`` and
        prints a confirmation line.
    """
    feat_cols = ['login_count', 'unique_hosts', 'files_accessed', 'avg_hour',
                 'local_time_offset', 'comm_density', 'team_id']
    X = agg[feat_cols].copy()
    # One-hot encode team_id (first level dropped). The dummy columns are
    # standardized together with the numeric ones below.
    X = pd.get_dummies(X, columns=['team_id'], drop_first=True)
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    # joblib.dump does not create directories; make sure the target exists
    # so a fresh checkout without data/processed/ does not fail here.
    if isinstance(out_path, (str, os.PathLike)):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    joblib.dump((Xs, scaler, X.columns.tolist()), out_path)
    print('Saved features to', out_path)
    return Xs

In [7]:
if __name__=='__main__':
    # End-to-end preprocessing run: load raw logs, aggregate them, add the
    # proxy columns, then scale and save the feature matrix.
    df = load_logs('data/raw/logs.csv')
    # NOTE(review): window_features is not defined in the visible part of
    # this file - confirm it is defined in a cell that was not exported.
    agg = window_features(df)
    agg = add_cultural_proxies(agg)
    Xs = build_feature_matrix(agg)

NameError: name 'df' is not defined

# Evaluating Baseline Models

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [None]:
# NOTE(security): joblib.load unpickles the file - only load artifacts that
# this pipeline produced itself.
X, scaler, cols = joblib.load('data/processed/features.pkl')
# Suppose you have binary labels y (1=malicious event, 0=benign)
# If CERT dataset, load labels accordingly. For now synthetic:
# (placeholder: every row is labelled benign)
y = np.zeros(X.shape[0], dtype=int)
# split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=42)

# Global IsolationForest
clf = IsolationForest(n_estimators=200, contamination=0.01, random_state=42)
clf.fit(Xtr)
scores = -clf.decision_function(Xte)   # higher = anomalous
# metrics
# ROC AUC is undefined when the test labels contain a single class, which is
# always the case with the all-zero placeholder labels above. Report NaN
# instead of handing roc_auc_score an input it cannot score.
if np.unique(yte).size < 2:
    auc = float('nan')
else:
    auc = roc_auc_score(yte, scores)
# Flag the top 1% of test scores as alerts and score them against the labels.
prec,rec,f1,_ = precision_recall_fscore_support(yte, (scores>np.percentile(scores,99)).astype(int), average='binary', zero_division=0)
print('IF AUC',auc,'F1',f1)

# Implementing CITE Pipeline

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a 6-component, full-covariance mixture on the training split
# (fit() returns the estimator itself, so the call can be chained).
gmm = GaussianMixture(
    n_components=6,
    covariance_type='full',
    random_state=42,
).fit(Xtr)
# Soft cluster memberships of the test rows  (π_u,t)
cluster_probs = gmm.predict_proba(Xte)
# Hard assignment: the component with the highest membership probability.
cluster_assign = np.argmax(cluster_probs, axis=1)

**Per-cluster IsolationForest + thresholds**

For each cluster k:

Train an IsolationForest on training data where cluster_assign==k (or high membership).

Compute anomaly scores for the validation set and set a threshold τ_k that achieves the target cluster alert rate or the target FPR on the validation set.

In [None]:
from collections import defaultdict

# Hard cluster assignment of every training row. This does not depend on k,
# so it is computed once instead of once per cluster inside the loop.
train_assign = gmm.predict_proba(Xtr).argmax(axis=1)

cluster_if = {}   # cluster id -> IsolationForest fitted on that cluster's rows
thresholds = {}   # cluster id -> anomaly-score threshold for that cluster
for k in range(gmm.n_components):
    idx = (train_assign == k)
    if idx.sum() < 50:
        # Too few training rows: this cluster gets no model and no
        # threshold, so neither dict has an entry for k.
        continue
    clf_k = IsolationForest(n_estimators=200, random_state=42).fit(Xtr[idx])
    cluster_if[k] = clf_k
    # Negate so that higher means more anomalous.
    scores_k = -clf_k.decision_function(Xtr[idx])
    # threshold as 99th percentile
    thresholds[k] = np.percentile(scores_k, 99)

**RL / Simple bandit for threshold adaptation (practical approach)**

Full RL is heavy. For an evaluation, use a contextual bandit / epsilon-greedy to adjust τ_k to control Δ-FPR. This is simpler, reproducible, and measurable.

Algorithm (per evaluation window):

For each cluster k, maintain current τ_k.

Sample small adjustments ±δ.

Use validation set or delayed labels to compute ΔFPR_k.

If ΔFPR_k > target, increase τ_k (make less sensitive), else decrease.

(You can implement a Q-table over discretized thresholds or keep it greedy.)

In [None]:
# for each epoch:
# NOTE(review): this cell is a sketch. E, clusters, delta, new_delta_fpr and
# old_delta_fpr are not defined anywhere in the visible file, and the
# evaluation step is only a comment - confirm where they are meant to come from.
for epoch in range(E):
    for k in clusters:
        # Propose the current threshold shifted by -delta, 0 or +delta,
        # picked at random.
        candidate = thresholds[k] + np.random.choice([-delta,0,delta])
        # evaluate on held-out val window -> compute new FPR_k
        # Greedy acceptance: keep the candidate only if it lowers delta-FPR.
        if new_delta_fpr < old_delta_fpr: thresholds[k]=candidate

**Supervised Stream**

Construct supervised labels y. Train XGBoost on labeled events with class weighting. Include cluster probs as features and per-cluster anomaly scores.

In [None]:
import xgboost as xgb

# Supervised stream: gradient-boosted trees on the labelled training split.
dtrain = xgb.DMatrix(data=Xtr, label=ytr)
# scale_pos_weight is the negative/positive count ratio, floored at 1; the
# inner max() keeps the division defined when there are no positives.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    scale_pos_weight=max(1, (len(ytr) - ytr.sum()) / max(1, ytr.sum())),
    seed=42,
)
bst = xgb.train(params, dtrain, num_boost_round=200)
# Predicted probabilities for the test split.
dtest = xgb.DMatrix(data=Xte)
p = bst.predict(dtest)


In [None]:
# shap is used by this cell but is not imported anywhere in the visible file.
import shap

# Explain the boosted model trained in the supervised-stream cell.
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(Xte)
# Pass the real test matrix and the saved column names; the original
# `features=...` handed a literal Ellipsis to the plot.
shap.summary_plot(shap_values, features=Xte, feature_names=cols)

Evaluation & metrics (exact formulas)

Compute and report:

AUC-ROC: roc_auc_score(y_true, score)

Macro F1: precision_recall_fscore_support(..., average='macro')

Per-cluster FPR: for each k, FPR_k = FP_k / (FP_k + TN_k)

Δ-FPR: max_k |FPR_k - FPR_global|

Cultural Robustness Index (CRI), as proposed in the paper: give a concrete definition here before reporting it (the example that should follow "e.g." is missing from these notes).

Statistical test: use paired bootstrap to compare F1 between baseline and CITE; report p-value.

Plotting:

Boxplots of anomaly scores by cluster

Line chart: thresholds over epochs

Bar chart: FPR_k for baseline vs CITE

SHAP summary

Use matplotlib/seaborn.

Ablation studies (must-have)

Run these and report results in a table:

Baseline global IF

GMM + per-cluster IF (no RL adaptation)

GMM + per-cluster IF + threshold adaptation (bandit/RL)

Full CITE: add supervised XGBoost stream (combine logic described in paper)

CITE without cultural proxies (ablate cultural features)

CITE with noisy/different proxy sets (robustness)

Report Δ-FPR and macro-F1 for each.

Evasion & robustness experiment (recommended for impact)

Implement an attack where an insider “cluster-hops”: they modify behavior to look like cluster j to evade detection (simulate by swapping feature distributions).

Measure detection rate for baseline vs CITE.

Show that CITE either resists the attack, or state the limits within which it does.