# Preprocessing & feature engineering

In [1]:
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support
from sklearn.mixture import GaussianMixture
import xgboost as xgb
import joblib, os
import matplotlib.pyplot as plt
import shap
from tqdm import tqdm
# Sanity message: reaching this line means every import above resolved.
print('Notebook environment ready.')

# Seed constant for reproducibility.
# NOTE(review): the cells below pass the literal 42 instead of SEED — consider wiring SEED through.
SEED = 42

Notebook environment ready.


In [3]:
# Colab-only step: mount Google Drive so the dataset paths under /content/drive resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ---------------------------------------------
# Helper function: compute Δ-FPR fairness metric
# ---------------------------------------------
def compute_delta_fpr(y_true, y_pred, clusters):
    """Return the largest per-cluster deviation in false-positive rate (Δ-FPR).

    For every distinct value in ``clusters`` the false-positive rate
    ``FP / (FP + TN)`` is computed over the samples of that cluster. The
    reference rate is the unweighted mean of those per-cluster rates, and the
    result is the maximum absolute distance of any cluster from the reference.

    Clusters that contain no true negatives have an undefined FPR and are
    skipped, so they neither count as "FPR = 0" nor pull the reference down.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth labels (0 = negative, 1 = positive).
    y_pred : array-like of {0, 1}
        Binary predictions, aligned with ``y_true``.
    clusters : array-like
        Cluster / group id of each sample, aligned with ``y_true``.

    Returns
    -------
    float
        Maximum absolute deviation of a cluster FPR from the mean cluster FPR,
        or ``np.nan`` when no cluster has a defined FPR.
    """
    # Coerce once so that plain lists and pandas Series behave like arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    clusters = np.asarray(clusters)

    cluster_fprs = []
    for k in np.unique(clusters):
        negatives = (clusters == k) & (y_true == 0)
        fp = int((y_pred[negatives] == 1).sum())
        tn = int((y_pred[negatives] == 0).sum())
        if fp + tn == 0:
            # No true negatives in this cluster: FPR is undefined, skip it.
            continue
        cluster_fprs.append(fp / (fp + tn))

    if not cluster_fprs:
        return np.nan

    reference_fpr = float(np.mean(cluster_fprs))
    return max(abs(fpr - reference_fpr) for fpr in cluster_fprs)


# ---------------------------------------------
# Helper function: evaluate any anomaly score
# ---------------------------------------------
def eval_model(y_true, scores, percentile=99):
    """Score an anomaly detector with ROC-AUC and a percentile-thresholded F1.

    Parameters
    ----------
    y_true : array-like of {0, 1}
        Ground-truth labels.
    scores : array-like of float
        Anomaly scores aligned with ``y_true``; samples whose score is
        strictly above the threshold are predicted as positive (1).
    percentile : float, default 99
        Percentile of ``scores`` used as the decision threshold. The threshold
        is derived from the very scores being evaluated, not from a held-out
        set.

    Returns
    -------
    auc : float
        ROC-AUC of ``scores`` against ``y_true``.
    f1 : float
        Binary F1 of the thresholded predictions (0 when undefined).
    pred : numpy.ndarray of int
        The thresholded 0/1 predictions.
    """
    scores = np.asarray(scores)
    auc = roc_auc_score(y_true, scores)
    thresh = np.percentile(scores, percentile)  # anomaly threshold
    pred = (scores > thresh).astype(int)
    # zero_division=0: report F1 = 0 instead of warning when nothing is flagged.
    _, _, f1, _ = precision_recall_fscore_support(
        y_true, pred, average='binary', zero_division=0
    )
    return auc, f1, pred

In [None]:
# Chunk loader function
def load_chunked(path, usecols, chunksize=200_000, compression='gzip'):
    """Read a compressed CSV log chunk by chunk and return a single DataFrame.

    Parameters
    ----------
    path : str or path-like
        Location of the dataset (CERT 6.2) CSV log file.
    usecols : list of str
        File-specific columns to keep; all other columns are not loaded.
    chunksize : int, default 200_000
        Number of rows parsed per chunk.
    compression : str or None, default 'gzip'
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order (row index is not reset).
    """
    # The context manager closes the underlying file handle even if parsing
    # fails part-way; the local name also avoids shadowing the builtin `iter`.
    with pd.read_csv(
        path, compression=compression, chunksize=chunksize, usecols=usecols
    ) as reader:
        chunks = list(reader)
    return pd.concat(chunks)

In [None]:
# Read the LOGON log, keeping only the columns needed for the daily features
logon_cols = ['user', 'pc', 'time', 'activity']
logon = load_chunked(
    '/content/drive/MyDrive/BS-Nishika/cert6.2/logon.csv.gz',
    logon_cols,
)

# Parse the timestamps, then derive the calendar day used as the grouping key
logon['time'] = pd.to_datetime(logon['time'])
logon['day'] = logon['time'].dt.date

logon.head()

In [None]:
# Extract LOGON daily features: one row per (user, day) with
#   logins          - number of logon-log events
#   unique_hosts    - number of distinct PCs touched
#   avg_login_hour  - mean hour-of-day of those events
# The hour is materialised as a column first so the mean runs as a built-in
# groupby aggregation instead of a Python lambda invoked once per group;
# the resulting values are the same.
logon_daily = (
    logon.assign(login_hour=logon['time'].dt.hour)
    .groupby(['user', 'day'])
    .agg(
        logins=('activity', 'count'),
        unique_hosts=('pc', 'nunique'),
        avg_login_hour=('login_hour', 'mean'),
    )
    .reset_index()
)

In [None]:
# Read the FILE log and reduce it to one row per (user, day)
file_cols = ['user', 'path', 'time']
file = load_chunked(
    '/content/drive/MyDrive/BS-Nishika/cert6.2/file.csv.gz',
    file_cols,
)

# Parse the timestamps and derive the calendar day used as the grouping key
file['time'] = pd.to_datetime(file['time'])
file['day'] = file['time'].dt.date

# files_accessed = number of distinct paths the user touched that day
file_daily = (
    file.groupby(['user', 'day'])['path']
    .nunique()
    .rename('files_accessed')
    .reset_index()
)

In [None]:
# Load DEVICE data and extract daily features
device_cols = ['user','time','activity']
# Path aligned with the LOGON / FILE / LABELS cells, which all read from
# MyDrive/BS-Nishika/cert6.2/ (the original string omitted the BS-Nishika folder).
device = load_chunked('/content/drive/MyDrive/BS-Nishika/cert6.2/device.csv.gz', device_cols)

# Parse the timestamps and derive the calendar day used as the grouping key
device['time'] = pd.to_datetime(device['time'])
device['day'] = device['time'].dt.date

# device_events = number of device-log events per user per day
device_daily = device.groupby(['user','day']).agg(
    device_events=('activity','count')
).reset_index()

In [None]:
# Combine the behavioural aggregates. LOGON is the left side of both joins, so
# only (user, day) pairs present in logon_daily are kept; days without FILE or
# DEVICE activity come out of the join as NaN and are turned into 0.
features = (
    logon_daily
    .merge(file_daily, on=['user', 'day'], how='left')
    .merge(device_daily, on=['user', 'day'], how='left')
    .fillna(0)
)

features.head()

In [None]:
# Cultural proxy features. They are derived from behavioural aggregates and an
# arbitrary user grouping only, not from identity attributes.
np.random.seed(42)

# Proxy 1: work-shift bucket - the average login hour split into 6-hour bands (0-3)
features['shift_bucket'] = features['avg_login_hour'].floordiv(6).astype(int)

# Proxy 2: communication-density proxy - log(1 + number of logins)
features['comm_density'] = np.log1p(features['logins'])

# Proxy 3: synthetic team id - the user's category code (its position among the
# sorted distinct user ids in this frame) folded into 6 groups. This is not a
# hash: the assignment changes if the set of users in the frame changes.
features['team_id'] = pd.Categorical(features['user']).codes % 6


In [None]:
# Read the ground-truth label file
labels = pd.read_csv(
    '/content/drive/MyDrive/BS-Nishika/cert6.2/insider-threat6.2_label.csv.gz',
    compression='gzip',
)

# Evaluation simplification: a user carrying any malicious tag is treated as
# malicious on every one of their days.
mal_users = set(labels.loc[labels['label'] == 1, 'user'])
features['label'] = features['user'].isin(mal_users).astype(int)


In [None]:
# FINAL ML FEATURE MATRIX
feat_cols = [
    'logins', 'unique_hosts', 'files_accessed', 'device_events',
    'avg_login_hour', 'shift_bucket', 'comm_density', 'team_id'
]

X = features[feat_cols]
y = features['label'].values

# One-hot team encoding (except first); dtype=float keeps the matrix purely
# numeric regardless of the pandas version's default dummy dtype.
X = pd.get_dummies(X, columns=['team_id'], drop_first=True, dtype=float)

# Split BEFORE scaling so that the scaler's mean/variance are learned from the
# training rows only; fitting on all rows leaks test-set statistics into
# training. Same test_size / random_state as before, so the row partition is
# unchanged.
X_train_raw, X_test_raw, ytr, yte = train_test_split(
    X, y, test_size=0.3, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)
Xtr = scaler.transform(X_train_raw)
Xte = scaler.transform(X_test_raw)

# Fully scaled matrix, kept under its original name for any cell that uses it.
Xs = scaler.transform(X)

# Evaluating Baseline Model
Global IsolationForest

In [None]:
# Baseline: a single IsolationForest fitted on the whole training split
baseline = IsolationForest(n_estimators=200, contamination=0.01, random_state=42).fit(Xtr)

# Negate decision_function so that the largest values are the ones flagged below
scores = -baseline.decision_function(Xte)

# Threshold-free metric
auc_base = roc_auc_score(yte, scores)

# Flag test rows scoring strictly above the 99th percentile of the test scores
pred_base = np.where(scores > np.percentile(scores, 99), 1, 0)
prec, rec, f1, _ = precision_recall_fscore_support(yte, pred_base, average='binary')

print("Baseline IF — AUC:", auc_base, "F1:", f1)

# Implementing CITE Pipeline
GMM Clustering + Per-Cluster IsolationForest (RL STREAM)

In [None]:
# Fit a 6-component Gaussian Mixture Model on the training split only
gmm = GaussianMixture(n_components=6, random_state=42).fit(Xtr)

# Component assignment for every training and test row
cluster_tr, cluster_te = gmm.predict(Xtr), gmm.predict(Xte)

In [None]:
# One IsolationForest per GMM cluster, keyed by cluster id.
cluster_models = {}
# 99th-percentile training score per cluster.
# NOTE(review): no later cell visible here reads cluster_thresh; predictions
# are thresholded on a pooled percentile instead. Confirm that is intended.
cluster_thresh = {}

for k in range(6):
    sub = Xtr[cluster_tr == k]
    # Clusters with fewer than 40 training rows get no dedicated model
    if len(sub) < 40:
        continue

    clf_k = IsolationForest(n_estimators=200, random_state=42)
    clf_k.fit(sub)
    scores_k = -clf_k.decision_function(sub)

    cluster_thresh[k] = np.percentile(scores_k, 99)
    cluster_models[k] = clf_k

In [None]:
# Infer cluster-model anomaly scores.
# Every test row is scored by the IsolationForest of the cluster it was
# assigned to. Rows whose cluster has no trained model keep the default score
# of 0. Scoring is done with one batched decision_function call per cluster
# rather than one call per row; the per-row results are the same because each
# row is scored independently of the others.
scores_cite = np.zeros(len(Xte), dtype=float)

for k, clf_k in cluster_models.items():
    in_cluster = (cluster_te == k)
    if in_cluster.any():
        scores_cite[in_cluster] = -clf_k.decision_function(Xte[in_cluster])

In [None]:
# Evaluate the CITE scores pooled across all cluster models
scores_cite = np.asarray(scores_cite)
auc_cite = roc_auc_score(yte, scores_cite)

# Flag test rows scoring strictly above the 99th percentile of the pooled scores
pred_cite = np.where(scores_cite > np.percentile(scores_cite, 99), 1, 0)
prec, rec, f1_cite, _ = precision_recall_fscore_support(yte, pred_cite, average='binary')

print("CITE (GMM + per-cluster IF) — AUC:", auc_cite, "F1:", f1_cite)

Supervised XGBoost Stream (SUPERVISED STREAM)

In [None]:
# Supervised stream: gradient-boosted trees trained on the labelled training split
dtrain = xgb.DMatrix(Xtr, label=ytr)
dtest = xgb.DMatrix(Xte)

params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    seed=42,
    eta=0.05,
    max_depth=6,
)

bst = xgb.train(params, dtrain, num_boost_round=200)

# Model output for each test row, scored against the held-out labels
p = bst.predict(dtest)
auc_xgb = roc_auc_score(yte, p)
print("XGBoost stream — AUC:", auc_xgb)


# Metrics

In [None]:
# SHAP Interpretability
explainer = shap.TreeExplainer(bst)
shap_values = explainer.shap_values(Xte)

# Xte is a bare ndarray, so the column names of the encoded feature matrix are
# passed explicitly; otherwise the plot labels rows as "Feature 0", "Feature 1", ...
plt.figure(figsize=(9,5))
shap.summary_plot(shap_values, Xte, feature_names=list(X.columns), show=False)
plt.title("SHAP Feature Importance")
plt.show()

In [None]:
# Build the summary table: one row per model.
# No thresholded prediction is computed for XGBoost, hence its F1 is NaN.
table = pd.DataFrame(
    [
        ('Baseline IF', auc_base, f1),
        ('CITE-GMM-IF', auc_cite, f1_cite),
        ('XGBoost', auc_xgb, np.nan),
    ],
    columns=['Model', 'AUC', 'F1'],
)
table

In [None]:
# Plot Per-Cluster false-positive rate of the CITE predictions
results = []
for k in range(6):
    idx = (cluster_te == k)
    if idx.sum() == 0: continue

    true = yte[idx]
    preds = pred_cite[idx]

    tn = ((true==0)&(preds==0)).sum()
    fp = ((true==0)&(preds==1)).sum()
    # The epsilon avoids division by zero when a cluster has no true negatives
    fpr = fp / (fp+tn+1e-9)

    results.append([k, fpr])

cluster_fprs = pd.DataFrame(results, columns=['cluster','FPR'])

# Drawn with matplotlib, which this notebook imports; the former sns.barplot
# call raised NameError because seaborn is never imported.
fig, ax = plt.subplots()
ax.bar(cluster_fprs['cluster'].astype(str), cluster_fprs['FPR'])
ax.set_xlabel('cluster')
ax.set_ylabel('FPR')
ax.set_title("Per-Cluster FPR (Fairness)")
plt.show()


In [None]:
# Save Processed Feature File to CSV
# Written relative to the current working directory, without the row index.
# The frame at this point also holds the proxy features and the label column.
features.to_csv("processed_features.csv", index=False)