In [1]:
import numpy as np

from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import roc_auc_score as roc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import pairwise_kernels as apply_kernel
from shape_dd import shape

import pandas as pd
from copy import deepcopy

import multiprocessing
from multiprocessing.pool import Pool
from tqdm import tqdm
import os
import random
import time


In [5]:
def gen_window_matrix(l1,l2, n_perm, cache=dict()):
    if (l1,l2, n_perm) not in cache.keys():
        w = np.array(l1*[1./l1]+(l2)*[-1./(l2)])
        W = np.array([w] + [np.random.permutation(w) for _ in range(n_perm)])
        cache[(l1,l2,n_perm)] = W
    return cache[(l1,l2,n_perm)]
def mmd(X, s=None, n_perm=2500):
    K = apply_kernel(X, metric="rbf")
    if s is None:
        s = int(X.shape[0]/2)
    
    W = gen_window_matrix(s,K.shape[0]-s, n_perm)
    s = np.einsum('ij,ij->i', np.dot(W, K), W)
    p = (s[0] < s).sum() / n_perm
    
    return s[0], p


In [6]:
def shape(X, l1, l2, n_perm):
    w = np.array(l1*[1.]+l1*[-1.]) / float(l1)
    
    n = X.shape[0]
    K = apply_kernel(X, metric="rbf")
    W = np.zeros( (n-2*l1,n) )
    
    for i in range(n-2*l1):
        W[i,i:i+2*l1] = w    
    stat = np.einsum('ij,ij->i', np.dot(W, K), W)

    shape = np.convolve(stat,w)
    shape_prime = shape[1:]*shape[:-1] 
    
    res = np.zeros((n,3))
    res[:,2] = 1
    for pos in np.where(shape_prime < 0)[0]:
        if shape[pos] > 0:
            res[pos,0] = shape[pos]
            a,b = max(0,pos-int(l2/2)),min(n,pos+int(l2/2))
            res[pos,1:] = mmd(X[a:b], pos-a, n_perm)
    return res


Installing the dataset

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("dnkumars/cybersecurity-intrusion-detection-dataset")

print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/chaos/.cache/kagglehub/datasets/dnkumars/cybersecurity-intrusion-detection-dataset/versions/2


In [3]:
data = pd.read_csv("cybersecurity_intrusion_data.csv")


In [4]:
data


Unnamed: 0,session_id,network_packet_size,protocol_type,login_attempts,session_duration,encryption_used,ip_reputation_score,failed_logins,browser_type,unusual_time_access,attack_detected
0,SID_00001,599,TCP,4,492.983263,DES,0.606818,1,Edge,0,1
1,SID_00002,472,TCP,3,1557.996461,DES,0.301569,0,Firefox,0,0
2,SID_00003,629,TCP,3,75.044262,DES,0.739164,2,Chrome,0,1
3,SID_00004,804,UDP,4,601.248835,DES,0.123267,0,Unknown,0,1
4,SID_00005,453,TCP,5,532.540888,AES,0.054874,1,Firefox,0,0
...,...,...,...,...,...,...,...,...,...,...,...
9532,SID_09533,194,ICMP,3,226.049889,AES,0.517737,3,Chrome,0,1
9533,SID_09534,380,TCP,3,182.848475,,0.408485,0,Chrome,0,0
9534,SID_09535,664,TCP,5,35.170248,AES,0.359200,1,Firefox,0,0
9535,SID_09536,406,TCP,4,86.664703,AES,0.537417,1,Chrome,1,0


In [6]:
# X = data.drop(columns=["session_id", "attack_detected"])

# Convert categorical columns to numeric (one-hot encoding)
X = pd.get_dummies(data)

# Convert to NumPy array
X = X.values


In [7]:
X


array([[599, 4, 492.9832634426563, ..., False, False, False],
       [472, 3, 1557.9964611204384, ..., True, False, False],
       [629, 3, 75.04426166420741, ..., False, False, False],
       ...,
       [664, 5, 35.1702481662574, ..., True, False, False],
       [406, 4, 86.66470253868668, ..., False, False, False],
       [340, 6, 86.87674403656241, ..., False, False, False]],
      dtype=object)

In [8]:
def batch(X, chunk_size, overlap=100):
    shift = chunk_size-overlap
    res = []
    for i in range(int(X.shape[0]/(shift))-int(chunk_size/shift)+1):
        res.append( X[i*shift : min(chunk_size + i*shift, X.shape[0])] )
    return res


In [9]:
def run_shape_on_real_data(X, chunk_size):
    import time
    t0 = time.time()
    shp = shape(X, 50, chunk_size, 2500)[:,2]
    t1 = time.time()
    res = []
    batches = batch(np.arange(X.shape[0]), chunk_size=chunk_size)
    batch_count = len(batches)
    for b in batches:
        res.append({
            "method": "shape",
            "estimate": float("%.5f" % shp[b].min()),
            "chunk_size": chunk_size,
            "comp_time": (t1 - t0) / batch_count
        })
    return res


In [14]:
def run_exp(X, chunk_size):
    # X,y=gen_random(**dict([("dist",dist),("alt",alt),(task,value)  ,  ("length",750),("min_dist",10)]))
    
    t0 = time.time()
    shp = shape(X, 50,chunk_size, 2500)[:,2]
    t1 = time.time()
    
    res = []
    batches = batch(np.arange(X.shape[0]), chunk_size=chunk_size)
    batch_count = len(batches)
    for b in batches:
        x_ = X[b]
        # drift_score = (y[b][None,:] != y[b][:,None]).sum()/( b.shape[0]*(b.shape[0]-1) )
        res += [{"method":method, "estimate":estimate, "chunk_size": chunk_size, "comp_time": comp_time} for method,(comp_time,estimate) in 
               {
                   "shape":        ((t1-t0)/batch_count,float("%.5f"%shp[b].min()))
               }.items()]
    return res


In [10]:
res = []
res = run_shape_on_real_data(X, 1000)


In [28]:
res


[{'method': 'shape',
  'estimate': 0.0328,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.3552,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.0732,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.0608,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.1676,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.0604,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.0696,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.0572,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.1828,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817},
 {'method': 'shape',
  'estimate': 0.0636,
  'chunk_size': 1000,
  'comp_time': 5.103003883361817}]

In [13]:
res2 = run_exp(X, 1000)
res2


[{'method': 'shape',
  'estimate': 0.0936,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.2764,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.1868,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.0168,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.4384,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.1652,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.2252,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.092,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.192,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823},
 {'method': 'shape',
  'estimate': 0.1748,
  'chunk_size': 1000,
  'comp_time': 8.784769010543823}]

In [36]:
# Extracting drift_score values
drift_scores = [entry['drift_score'] for entry in res]
print("Drift scores:", drift_scores)


Drift scores: [np.float64(0.4977597597597598), np.float64(0.48893693693693696), np.float64(0.49255455455455455), np.float64(0.4979059059059059), np.float64(0.49376576576576575), np.float64(0.49255455455455455), np.float64(0.49845045045045044), np.float64(0.4932932932932933), np.float64(0.4928048048048048), np.float64(0.4906906906906907)]
