In [None]:
# Mount Google Drive at /content/gdrive (Colab-only; the commented call below
# is kept for manually unmounting before a re-mount).
from google.colab import drive
#drive.flush_and_unmount()
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Make the dataset folder the working directory for the cells below.
%cd /content/gdrive/MyDrive/DATASETS/CSE_CIC_IDS_ALL_DATA/CSE-CIC-IDS2018/

/content/gdrive/MyDrive/DATASETS/CSE_CIC_IDS_ALL_DATA/CSE-CIC-IDS2018


In [None]:
# ResAD — MICRO PILOT (minutes to first results)
# -----------------------------------------------
# Design goals
#   • Use **all families** but with a **small, fixed sample per class per split** (CLI default: --min_per_class 1000 for normal and anomaly each; pass e.g. 40 for a truly tiny run).
#   • No nearest-neighbor search → **centroid residuals** per family.
#   • No flow → **diagonal Gaussian** on constrained residuals.
#   • **AE pretraining is optional** (--epochs_ae; CLI default is 3, set 0 to skip it entirely).
#   • Optional **feature cap**: keep top-K-variance numeric columns (--feat_topk; CLI default K=76, 0 disables the cap).
#   • Prints overall + per-family metrics at VAL and TEST.
#
# Expectation: Should finish in a couple of minutes even on CPU.

import os, re, argparse, random, hashlib, math
from dataclasses import dataclass
from typing import Dict, Tuple, List, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

# ---------------- File discovery ----------------

def resolve_data_dir(path: str) -> str:
    """Return an existing directory for ``path``, tolerating a known naming mix-up.

    Trailing slashes/spaces are dropped and ``~`` is expanded. If the result is
    not a directory, the same path is retried with the
    ``CSE-CIC-IDS_ALL_DATA`` / ``CSE_CIC_IDS_ALL_DATA`` spellings swapped.

    Args:
        path: Directory path as typed by the user.

    Returns:
        The first candidate that exists as a directory.

    Raises:
        FileNotFoundError: If no candidate exists; the message lists every
            path that was actually probed.
    """
    trimmed = path.rstrip("/ ")
    if not trimmed and path.strip().startswith("/"):
        # A bare root ("/", "//") must not collapse to the empty string.
        trimmed = "/"
    cand = os.path.expanduser(trimmed)

    # Ordered, de-duplicated list of spellings to probe.
    tried = [cand]
    swaps = (("CSE-CIC-IDS_ALL_DATA", "CSE_CIC_IDS_ALL_DATA"),
             ("CSE_CIC_IDS_ALL_DATA", "CSE-CIC-IDS_ALL_DATA"))
    for old, new in swaps:
        variant = cand.replace(old, new)
        if variant not in tried:
            tried.append(variant)

    for candidate in tried:
        if os.path.isdir(candidate):
            return candidate
    raise FileNotFoundError(f"Directory not found: {path}\nTried: {', '.join(tried)}")

def _clean_display_name(name: str) -> str:
    s = name.strip();
    if len(s)>=2 and s[0]==s[-1] and s[0] in ("'",'"'): s = s[1:-1]
    return s

def _parse_cat_subcat_from_display_name(clean_name: str) -> Tuple[str,str]:
    parts = re.split(r"[-–—_]", clean_name, maxsplit=1)
    if len(parts)==2: return parts[0].strip(), parts[1].strip()
    return clean_name.strip(), ""

def discover_files_by_category(data_dir: str) -> Dict[str,str]:
    """Map family keys to CSV paths found directly inside ``data_dir``.

    The key is derived from each file's stem: ``"cat::sub"`` when the stem
    splits into category and subcategory, otherwise just the category. Files
    that yield the same key overwrite one another (last one scanned wins).

    Raises:
        RuntimeError: If the resolved directory holds no ``.csv`` files.
    """
    root = resolve_data_dir(data_dir)
    families = {}
    with os.scandir(root) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            if not entry.name.lower().endswith('.csv'):
                continue
            stem, _ext = os.path.splitext(entry.name)
            category, subcategory = _parse_cat_subcat_from_display_name(_clean_display_name(stem))
            key = category if not subcategory else f"{category}::{subcategory}"
            families[key] = entry.path
    if len(families) == 0:
        raise RuntimeError(f"No CSVs found in {root}")
    return families

# ---------------- Labels & split policy ----------------

# Exact column names probed, in this order, when locating the label column.
LABEL_CANDIDATES = ["Label","label","Attack","attack","is_anomaly","anomaly","Anomaly","benign_label"]
# Label values mapped to class 0. Consumers compare against str(token).
# NOTE(review): 0 and 0.0 are equal in Python, so this set literal keeps a single
# numeric zero whose str() is "0" — a textual "0.0" label is therefore not matched here.
BENIGN_TOKENS = {"BENIGN","Benign","benign","Normal","normal","0",0,0.0}
# Label values mapped to class 1 (same str(token) comparison; 1 and 1.0 likewise collapse).
ATTACK_TOKENS = {"ATTACK","Attack","attack","1",1,1.0}

def infer_label_column(df: pd.DataFrame) -> str:
    """Pick the label column of ``df``.

    Exact names from ``LABEL_CANDIDATES`` win (in list order). Otherwise the
    first column whose lower-cased name ends in ``label``, ``attack`` or
    ``anomaly`` is returned.

    Raises:
        KeyError: If neither rule matches any column.
    """
    exact = next((name for name in LABEL_CANDIDATES if name in df.columns), None)
    if exact is not None:
        return exact
    suffixes = ('label', 'attack', 'anomaly')
    for column in df.columns:
        if column.lower().endswith(suffixes):
            return column
    raise KeyError("Could not infer label column")

def to_binary_series(raw: pd.Series) -> pd.Series:
    """Convert a raw label column to 0 (benign) / 1 (attack).

    Numeric and boolean columns (including pandas nullable dtypes) are
    thresholded at ``> 0``. Every other column is compared as text: an exact
    match against ``BENIGN_TOKENS`` gives 0, an exact match against
    ``ATTACK_TOKENS`` gives 1, and anything else is 1 unless its stripped,
    lower-cased form is ``benign``, ``normal`` or ``0``.

    Args:
        raw: Label column as read from the CSV.

    Returns:
        Integer Series of 0/1 with the same index as ``raw``.
    """
    # pandas' own dtype check also understands extension dtypes (Int64, boolean,
    # category, string), on which np.issubdtype raises TypeError. It treats bool
    # as numeric, so False/True become 0/1 instead of being matched as the
    # strings "False"/"True".
    if pd.api.types.is_numeric_dtype(raw.dtype):
        return (raw.astype(float) > 0.0).astype(int)

    # Build the lookup sets once per call rather than once per element.
    benign = {str(t) for t in BENIGN_TOKENS}
    attack = {str(t) for t in ATTACK_TOKENS}
    benign_normalized = ("benign", "normal", "0")

    def _encode(value: str) -> int:
        if value in benign:
            return 0
        if value in attack:
            return 1
        # Unknown tokens are attack names unless they normalise to a benign word.
        return 0 if value.strip().lower() in benign_normalized else 1

    return raw.astype(str).apply(_encode)

@dataclass
class SplitPolicy:
    """Deterministic, hash-based assignment of rows to train/val/test.

    A row's split depends only on ``seed``, the file key and the row index,
    so the same row always lands in the same split across runs.
    """
    train_frac: float = 2/3
    val_frac: float = 1/6
    seed: int = 42

    def assign(self, file_key: str, row_idx: int) -> str:
        """Return ``'train'``, ``'val'`` or ``'test'`` for one row."""
        payload = b"".join((
            str(self.seed).encode(),
            file_key.encode(),
            str(row_idx).encode(),
        ))
        digest = hashlib.blake2b(payload, digest_size=8).digest()
        # Map the 64-bit digest onto the unit interval.
        u = int.from_bytes(digest, 'little') / 2**64
        boundaries = (
            ('train', self.train_frac),
            ('val', self.train_frac + self.val_frac),
        )
        for split_name, upper in boundaries:
            if u < upper:
                return split_name
        return 'test'

# ---------------- Online scaler ----------------

class OnlineStandardizer:
    """Streaming per-feature mean/std estimator (Welford-style updates).

    Call ``partial_fit`` any number of times, then ``finalize`` once to freeze
    ``mean``/``std`` as float32, then ``transform``.
    """

    def __init__(self, dim:int):
        self.n = 0
        self.mean = np.zeros(dim, np.float64)
        self.M2 = np.zeros(dim, np.float64)
        self.std = np.ones(dim, np.float64)

    def partial_fit(self, X: np.ndarray):
        """Fold the rows of ``X`` (1-D input counts as one row) into the running stats."""
        rows = np.atleast_2d(X).astype(np.float64)
        for row in rows:
            self.n += 1
            delta_old = row - self.mean
            self.mean += delta_old / self.n
            delta_new = row - self.mean
            self.M2 += delta_old * delta_new

    def finalize(self):
        """Compute the sample std (floored at sqrt(1e-8)) and cast stats to float32."""
        denom = max(1, self.n - 1)
        variance = self.M2 / denom
        self.std = np.sqrt(np.maximum(variance, 1e-8)).astype(np.float32)
        self.mean = self.mean.astype(np.float32)
        return self

    def transform(self, X: np.ndarray)->np.ndarray:
        """Return ``(X - mean) / std`` computed on a float32 copy of ``X``."""
        centered = X.astype(np.float32) - self.mean
        return centered / self.std

# ---------------- Feature capping (top-K variance) ----------------

def topk_variance_columns(df: pd.DataFrame, exclude: List[str], k: int) -> List[str]:
    """Select numeric feature columns, optionally capped to the ``k`` most variable.

    Args:
        df: Frame to inspect.
        exclude: Column names never returned (e.g. the label).
        k: Cap on the number of columns. When ``k <= 0`` or ``k`` is not
            smaller than the number of candidates, all candidates are
            returned in frame order.

    Returns:
        Column names; when capped, ordered by descending variance.
    """
    numeric = df.select_dtypes(include=[np.number])
    candidates = [name for name in numeric.columns if name not in exclude]
    if k <= 0 or k >= len(candidates):
        return candidates
    ranked = numeric[candidates].var().sort_values(ascending=False)
    top = ranked.index[:k]
    return top.tolist()

# ---------------- Pilot sampler (tiny quotas) ----------------

def build_micro_pilot(files, fam_names, numeric_cols, label_col, policy: SplitPolicy, chunk_rows:int,
                      per_fam_norm:int, per_fam_anom:int):
    """Stream each family's CSV and collect a fixed quota of rows per split and class.

    For every family (in ``fam_names`` order) the file is read in chunks; rows
    with NaN/inf in ``numeric_cols`` are dropped, each remaining row is hashed
    to a split by ``policy`` and kept only while that family's
    (split, class) quota is unfilled. Reading of a family stops after the
    first chunk that leaves all six quotas filled.

    The standardizer is fitted on kept *train normal* rows only and then
    applied to all three splits.

    Args:
        files: Mapping family name -> CSV path.
        fam_names: Family names; the position in this list is the family id.
        numeric_cols: Feature columns to extract.
        label_col: Preferred label column; inferred per chunk when absent.
        policy: Split-assignment policy.
        chunk_rows: ``chunksize`` passed to ``pd.read_csv``.
        per_fam_norm: Quota of normal rows per family per split.
        per_fam_anom: Quota of anomalous rows per family per split.

    Returns:
        ``(train, val, test, scaler)`` where each split is a dict with tensor
        entries ``'X'`` (standardized features), ``'y'`` and ``'family'``.
    """
    splits = ('train', 'val', 'test')
    kinds = ('norm', 'anom')
    quotas = {s: {'norm': per_fam_norm, 'anom': per_fam_anom} for s in splits}
    buffers = {s: {'X': [], 'y': [], 'fam': []} for s in splits}
    scaler = OnlineStandardizer(dim=len(numeric_cols))

    # Go family by family, fill tiny quotas and move on.
    for fid, fname in enumerate(fam_names):
        path = files[fname]
        # Running tallies for this family only. Each family id is visited once,
        # so these replace a full rescan of the buffers on every row.
        filled = {s: {k: 0 for k in kinds} for s in splits}

        for chunk in pd.read_csv(path, chunksize=chunk_rows):
            lbl = label_col if label_col in chunk.columns else infer_label_column(chunk)
            y = to_binary_series(chunk[lbl])
            X = chunk[numeric_cols].copy().replace([np.inf, -np.inf], np.nan).dropna(axis=0)
            if len(X) == 0:
                continue
            # Convert once per chunk instead of one .iloc/.loc lookup per row.
            labels = y.loc[X.index].to_numpy()
            values = X.to_numpy(dtype=np.float32)

            for local_idx, row_idx in enumerate(X.index):
                split = policy.assign(fname, int(row_idx))
                yi = int(labels[local_idx])
                kind = 'anom' if yi == 1 else 'norm'
                if filled[split][kind] >= quotas[split][kind]:
                    continue
                # Copy so the kept row does not pin the whole chunk array in memory.
                xi = values[local_idx].copy()
                if split == 'train' and yi == 0:
                    scaler.partial_fit(xi[None, :])
                buffers[split]['X'].append(xi)
                buffers[split]['y'].append(yi)
                buffers[split]['fam'].append(fid)
                filled[split][kind] += 1

            # Stop early if this family's quotas are full for all splits
            if all(filled[s][k] >= quotas[s][k] for s in splits for k in kinds):
                break
    scaler.finalize()

    def pack(split):
        X = np.array(buffers[split]['X'], np.float32)
        if X.size:
            X = scaler.transform(X)
        y = np.array(buffers[split]['y'], np.int64)
        f = np.array(buffers[split]['fam'], np.int64)
        return {'X': torch.tensor(X), 'y': torch.tensor(y), 'family': torch.tensor(f)}

    return pack('train'), pack('val'), pack('test'), scaler

# ---------------- Models (your encoder/decoder + small constraintor) ----------------

class ResBlock(nn.Module):
    """Residual MLP block: ``x + 0.5 * fc2(gelu(ln2(fc1(gelu(ln1(x))))))``.

    With ``drop_path > 0`` and the module in training mode, the whole branch
    is skipped (output is ``x``) with probability ``drop_path``, decided once
    per forward call.
    """

    def __init__(self, d, drop_path=0.0):
        super().__init__()
        self.ln1 = nn.LayerNorm(d)
        self.fc1 = nn.Linear(d, d)
        self.ln2 = nn.LayerNorm(d)
        self.fc2 = nn.Linear(d, d)
        self.drop_path = float(drop_path)

    def forward(self, x):
        branch = self.fc1(F.gelu(self.ln1(x)))
        branch = self.fc2(F.gelu(self.ln2(branch)))
        # The random draw only happens when stochastic depth is active.
        skip_branch = self.training and self.drop_path > 0 and torch.rand(()) < self.drop_path
        if skip_branch:
            return x
        return x + 0.5 * branch

class Encoder(nn.Module):
    """MLP encoder: input projection -> ``depth`` residual blocks -> dropout -> bottleneck.

    ``feat_drop`` (training only) zeroes each input feature independently with
    that probability before the projection; the surviving features are not
    rescaled.
    """

    def __init__(self, in_dim, hid=128, depth=1, p_drop=0.1, bottleneck=64, feat_drop=0.0):
        super().__init__()
        self.inp = nn.Linear(in_dim, hid)
        self.feat_drop = feat_drop
        self.blocks = nn.ModuleList([ResBlock(hid, drop_path=0.00) for _ in range(depth)])
        self.dropout = nn.Dropout(p_drop)
        self.out = nn.Linear(hid, bottleneck)

    def forward(self, x):
        if self.training:
            rate = getattr(self, 'feat_drop', 0.0)
            if rate > 0:
                keep = torch.rand_like(x) > self.feat_drop
                x = x * keep
        hidden = F.gelu(self.inp(x))
        for block in self.blocks:
            hidden = block(hidden)
        hidden = self.dropout(hidden)
        return self.out(hidden)

class Decoder(nn.Module):
    """MLP decoder: bottleneck projection -> ``depth`` residual blocks -> output layer."""

    def __init__(self, bottleneck=64, hid=128, out_dim=32, depth=0):
        super().__init__()
        self.fc = nn.Linear(bottleneck, hid)
        self.blocks = nn.ModuleList([ResBlock(hid) for _ in range(depth)])
        self.out = nn.Linear(hid, out_dim)

    def forward(self, z):
        hidden = F.gelu(self.fc(z))
        for block in self.blocks:
            hidden = block(hidden)
        return self.out(hidden)

class FeatureConstraintor(nn.Module):
    """Two-layer MLP (Linear -> BatchNorm1d -> ReLU -> Linear) mapping ``in_dim`` back to ``in_dim``."""

    def __init__(self, in_dim, hid=128):
        super().__init__()
        layers = [
            nn.Linear(in_dim, hid),
            nn.BatchNorm1d(hid),
            nn.ReLU(True),
            nn.Linear(hid, in_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

# ---------------- Losses & simple Gaussian scorer ----------------

def occ_loss(res_c: torch.Tensor, res_init: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Per-sample loss on constrained residuals, averaged over the batch.

    Rows with ``y == 0`` are charged ``sqrt(1 + ||res_c||^2) - 1``; rows with
    ``y == 1`` are charged the squared distance between ``res_c`` and
    ``res_init``.
    """
    is_abnormal = y.float()
    magnitude = torch.linalg.norm(res_c, dim=1)
    normal_term = torch.sqrt(1.0 + magnitude * magnitude) - 1.0
    abnormal_term = torch.sum((res_c - res_init) ** 2, dim=1)
    per_sample = (1 - is_abnormal) * normal_term + is_abnormal * abnormal_term
    return per_sample.mean()

class RunningDiagGaussian:
    """Diagonal Gaussian whose mean/variance are accumulated online from batches.

    Attributes:
        dim: Feature dimension.
        eps: Added to the variance inside ``logp``.
        n: Number of samples seen so far.
        mean: Running per-dimension mean.
        M2: Running per-dimension sum of squared deviations from the mean.
    """

    def __init__(self, dim:int, eps:float=1e-5):
        self.dim = dim
        self.eps = eps
        self.n = 0
        self.mean = torch.zeros(dim)
        self.M2 = torch.zeros(dim)

    def to(self, device):
        """Move the accumulated statistics to ``device`` and return ``self``."""
        self.mean = self.mean.to(device)
        self.M2 = self.M2.to(device)
        return self

    @torch.no_grad()
    def update(self, x: torch.Tensor):
        """Fold a batch of rows into the running statistics.

        The batch's own mean and squared deviations are merged into the
        running ones in one step (pairwise moment combination) instead of
        looping over rows in Python. An empty batch is a no-op.
        """
        m = x.size(0)
        if m == 0:
            return
        x = x.detach().to(self.mean.dtype)
        batch_mean = x.mean(dim=0)
        batch_M2 = ((x - batch_mean) ** 2).sum(dim=0)
        total = self.n + m
        delta = batch_mean - self.mean
        self.mean = self.mean + delta * (m / total)
        # The cross term accounts for the shift between the old and the batch mean.
        self.M2 = self.M2 + batch_M2 + delta * delta * (self.n * m / total)
        self.n = total

    def var(self):
        """Unbiased per-dimension variance; all ones until two samples were seen."""
        if self.n < 2:
            return torch.ones_like(self.mean)
        return self.M2 / (self.n - 1)

    def logp(self, x: torch.Tensor) -> torch.Tensor:
        """Log-density of each row of ``x`` under the current diagonal Gaussian."""
        var = self.var() + self.eps
        const = 0.5 * self.dim * math.log(2 * math.pi)
        mahalanobis = ((x - self.mean) ** 2 / var).sum(dim=1)
        return -(const + 0.5 * (mahalanobis + torch.log(var).sum()))

def compute_thr_at_fixed_fpr(y_true: np.ndarray, scores: np.ndarray, fpr: float = 0.05) -> float:
    """Score threshold at the ``1 - fpr`` quantile of the normal (``y_true == 0``) scores.

    When there are no normal samples, the same quantile is taken over all
    scores instead.
    """
    benign_scores = scores[y_true == 0]
    if len(benign_scores) > 0:
        return float(np.quantile(benign_scores, 1 - fpr))
    return float(np.percentile(scores, 100 * (1 - fpr)))

def eval_block(y_true: np.ndarray, scores: np.ndarray, thr: Optional[float]=None):
    """Compute ranking and thresholded metrics for anomaly scores (higher = more anomalous).

    AUC-ROC and AP are NaN when ``y_true`` holds a single class. When ``thr``
    is ``None`` the threshold maximising F1 along the precision-recall curve
    is used; samples with ``scores >= thr`` are predicted positive.

    Returns:
        Dict with float entries ``threshold``, ``auc_roc``, ``ap``,
        ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    has_both_classes = len(np.unique(y_true)) > 1
    auc = roc_auc_score(y_true, scores) if has_both_classes else float('nan')
    ap = average_precision_score(y_true, scores) if has_both_classes else float('nan')

    if thr is None:
        precision_curve, recall_curve, cutoffs = precision_recall_curve(y_true, scores)
        f1_curve = 2 * precision_curve * recall_curve / (precision_curve + recall_curve + 1e-9)
        best = np.nanargmax(f1_curve)
        # The curve has one more point than there are cutoffs.
        thr = cutoffs[best] if best < len(cutoffs) else np.inf

    y_pred = (scores >= thr).astype(int)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    true_pos, true_neg = (y_true == 1), (y_true == 0)
    tp = int((pred_pos & true_pos).sum())
    fp = int((pred_pos & true_neg).sum())
    fn = int((pred_neg & true_pos).sum())
    tn = int((pred_neg & true_neg).sum())

    prec = tp / (tp + fp + 1e-9)
    rec = tp / (tp + fn + 1e-9)
    f1 = 2 * prec * rec / (prec + rec + 1e-9)
    acc = (tp + tn) / max(1, (tp + tn + fp + fn))
    return {
        'threshold': float(thr),
        'auc_roc': float(auc),
        'ap': float(ap),
        'precision': float(prec),
        'recall': float(rec),
        'f1': float(f1),
        'accuracy': float(acc),
    }

def per_family_report(y: np.ndarray, s: np.ndarray, fam: np.ndarray, fam_names: List[str], thr: float) -> List[str]:
    """Format one metrics line per family that has samples, all at the shared threshold ``thr``.

    Families are identified by their position in ``fam_names``; families with
    no rows in ``fam`` are omitted.
    """
    report = []
    for fid, fam_name in enumerate(fam_names):
        selected = fam == fid
        if not selected.any():
            continue
        y_f = y[selected]
        s_f = s[selected]
        metrics = eval_block(y_f, s_f, thr)
        total = y_f.size
        n_pos = int((y_f == 1).sum())
        n_neg = total - n_pos
        report.append(
            f"  [{fid:02d}] {fam_name} | N={total} (+:{n_pos}/-:{n_neg}) | AUC={metrics['auc_roc']:.3f} AP={metrics['ap']:.3f} F1={metrics['f1']:.3f} P={metrics['precision']:.3f} R={metrics['recall']:.3f} Acc={metrics['accuracy']:.3f}"
        )
    return report

# ---------------- Train ----------------

def train(args):
    """Run the micro pilot end to end and print validation/test metrics.

    Steps: discover family CSVs, probe the schema on the first family, build
    the quota-limited train/val/test packs, optionally pretrain the
    autoencoder on train normals, build per-family centroids of encoded train
    normals, train the constraintor (and the encoder when
    ``args.finetune_encoder``), validate after every epoch, then test.

    The decision threshold reported at TEST time is the one fitted on the
    validation split in the last epoch; it is only derived from the test split
    itself when no validation threshold exists (and that is announced).

    Args:
        args: Namespace as produced by ``get_args``.
    """
    device='cuda' if torch.cuda.is_available() and not args.cpu else 'cpu'

    files=discover_files_by_category(args.data_dir)
    fam_names=sorted(files.keys())
    print("Discovered families:")
    for k in fam_names: print("  ",k)

    # probe schema on first file
    sample_df=pd.read_csv(files[fam_names[0]], nrows=2000)
    label_col=infer_label_column(sample_df)
    numeric_cols = topk_variance_columns(sample_df, exclude=[label_col], k=args.feat_topk)
    print(f"Numeric dim (capped): {len(numeric_cols)} | Label: {label_col}")

    policy=SplitPolicy(train_frac=args.train_frac, val_frac=args.val_frac, seed=args.seed)

    # build tiny, fixed-size datasets covering all families
    train_pack, val_pack, test_pack, scaler = build_micro_pilot(
        files, fam_names, numeric_cols, label_col, policy, args.chunk_rows,
        per_fam_norm=args.min_per_class, per_fam_anom=args.min_per_class)

    class PackDS(Dataset):
        """Dataset view over one pack returned by ``build_micro_pilot``."""
        def __init__(self,p): self.X=p['X']; self.y=p['y']; self.f=p['family']
        def __len__(self): return self.X.size(0)
        def __getitem__(self,i): return {'x':self.X[i],'y':self.y[i],'fam':self.f[i]}

    ds_tr, ds_va, ds_te = PackDS(train_pack), PackDS(val_pack), PackDS(test_pack)
    loader_tr=DataLoader(ds_tr, batch_size=args.batch, shuffle=True, num_workers=0)
    loader_va=DataLoader(ds_va, batch_size=args.batch, shuffle=False, num_workers=0)
    loader_te=DataLoader(ds_te, batch_size=args.batch, shuffle=False, num_workers=0)

    D=len(numeric_cols)
    enc=Encoder(D, hid=args.hid, depth=args.depth, p_drop=args.p_drop, bottleneck=args.bottleneck, feat_drop=args.feat_drop).to(device)
    dec=Decoder(bottleneck=args.bottleneck, hid=args.hid, out_dim=D, depth=args.dec_depth).to(device)
    con=FeatureConstraintor(in_dim=args.bottleneck, hid=max(64,args.bottleneck)).to(device)

    # optional AE pretraining on train normals only
    if args.epochs_ae>0:
        opt_ae=torch.optim.Adam(list(enc.parameters())+list(dec.parameters()), lr=args.lr_ae, weight_decay=5e-4)
        for ep in range(1,args.epochs_ae+1):
            enc.train(); dec.train(); tot=0.0; n=0
            for b in tqdm(loader_tr, desc=f"[AE] {ep}/{args.epochs_ae}"):
                x=b['x'].to(device); y=b['y'].to(device); m=(y==0)
                if not m.any(): continue
                xn=x[m]; z=enc(xn); xhat=dec(z); loss=F.mse_loss(xhat,xn)
                opt_ae.zero_grad(set_to_none=True); loss.backward(); opt_ae.step(); tot+=loss.item()*xn.size(0); n+=xn.size(0)
            print(f"[AE] mean MSE={tot/max(1,n):.6f}")
    else:
        print("[AE] Skipped (set --epochs_ae 1 if you want a quick pretrain)")

    if not args.finetune_encoder:
        for p in enc.parameters(): p.requires_grad_(False)
        enc.eval(); print("[Enc] Frozen.")

    # Build centroids from train normals (running mean of encodings per family).
    # A family with no train normals keeps an all-zero centroid.
    centroids=[torch.zeros(args.bottleneck, device=device) for _ in fam_names]
    counts=[0 for _ in fam_names]
    enc.eval()
    with torch.no_grad():
        for b in DataLoader(ds_tr, batch_size=args.batch, shuffle=False):
            x=b['x'].to(device); y=b['y'].to(device); f=b['fam'].to(device); m=(y==0)
            if not m.any(): continue
            z=enc(x[m])
            for zi,fi in zip(z, f[m].tolist()):
                counts[fi]+=1; centroids[fi] = centroids[fi] + (zi - centroids[fi]) / counts[fi]
    print("Centroids built.")

    def z_ref_for_batch(z, fam):
        """Return, for every row of ``z``, the centroid of that row's family."""
        out=torch.zeros_like(z)
        for fid in fam.unique().tolist():
            out[fam==fid]=centroids[fid]
        return out

    gauss=RunningDiagGaussian(dim=args.bottleneck).to(device)

    def score_loader(loader):
        """Score one split in eval mode; returns (labels, scores, families) or None if empty."""
        enc.eval(); con.eval(); scores=[]; labels=[]; fams=[]
        with torch.no_grad():
            for b in loader:
                x=b['x'].to(device); y=b['y'].to(device); f=b['fam'].to(device)
                z=enc(x); zc=z - z_ref_for_batch(z,f); rc=con(zc); s = -gauss.logp(rc)
                scores.append(s.cpu().numpy()); labels.append(y.cpu().numpy()); fams.append(f.cpu().numpy())
        if not scores:
            return None
        return np.concatenate(labels), np.concatenate(scores), np.concatenate(fams)

    # Quick training loop (few epochs on tiny data)
    params=list(con.parameters()) + (list(enc.parameters()) if args.finetune_encoder else [])
    opt=torch.optim.Adam(params, lr=args.lr, weight_decay=5e-4)

    val_thr=None  # most recent validation-derived threshold, reused at test time

    for ep in range(1, args.epochs+1):
        enc.train(args.finetune_encoder); con.train(); tot={'occ':0.0,'tot':0.0,'n':0}
        for b in tqdm(loader_tr, desc=f"[Mini-ResAD] {ep}/{args.epochs}"):
            x=b['x'].to(device); y=b['y'].to(device); f=b['fam'].to(device)
            z=enc(x); zc=z - z_ref_for_batch(z,f); rc=con(zc)
            normal=(y==0)
            with torch.no_grad(): gauss.update(rc[normal])
            logp = gauss.logp(rc)
            L_occ=occ_loss(rc, zc, y)
            # tiny boundary push (optional); using mean-std at each iter
            if normal.any():
                logp_n=logp[normal]
                bn = logp_n.mean() - args.bn_k*logp_n.std(unbiased=False)
                L_bg = F.relu(logp - (bn - args.tau)).mean()
            else:
                # No normals in this batch: the boundary is undefined (mean of an
                # empty tensor is NaN), so skip the term instead of poisoning the loss.
                L_bg = logp.new_zeros(())
            loss = L_occ + args.lambda_bg*L_bg
            opt.zero_grad(set_to_none=True); loss.backward(); opt.step()
            B=x.size(0); tot['occ']+=L_occ.item()*B; tot['tot']+=loss.item()*B; tot['n']+=B
        seen=max(1,tot['n'])  # avoid ZeroDivisionError when the loader yielded nothing
        print(f"[Train] ep={ep} L={tot['tot']/seen:.5f} (occ={tot['occ']/seen:.5f})")

        # Validation
        val=score_loader(loader_va)
        if val is not None:
            yv,sv,fv=val
            thr=compute_thr_at_fixed_fpr(yv, sv, fpr=args.fixed_fpr)
            val_thr=thr
            mtr=eval_block(yv, sv, thr)
            print(f"[VAL] AUC={mtr['auc_roc']:.3f} AP={mtr['ap']:.3f} F1={mtr['f1']:.3f} Thr@FPR{args.fixed_fpr:.2f}={thr:.6f}")
            print("[VAL][Per-class] (global thr) →")
            for line in per_family_report(yv, sv, fv, fam_names, thr): print(line)

    # Test (overall + per-class)
    test=score_loader(loader_te)
    if test is not None:
        yt,st,ft=test
        if val_thr is not None:
            # Fitting the threshold on test labels would leak them into the reported metrics.
            thr=val_thr; thr_source="validation split"
        else:
            thr=compute_thr_at_fixed_fpr(yt, st, fpr=args.fixed_fpr); thr_source="test split (no validation threshold available)"
        mtr=eval_block(yt, st, thr)
        print(f"\n[TEST] Thr@FPR{args.fixed_fpr:.2f}={thr:.6f} | AUC={mtr['auc_roc']:.3f} AP={mtr['ap']:.3f} F1={mtr['f1']:.3f} Prec={mtr['precision']:.3f} Rec={mtr['recall']:.3f} Acc={mtr['accuracy']:.3f}")
        print(f"[TEST] threshold fitted on: {thr_source}")
        print("[TEST][Per-class] (global thr) →")
        for line in per_family_report(yt, st, ft, fam_names, thr): print(line)

# ---------------- CLI ----------------

def get_args(argv=None):
    """Parse CLI options and make sure ``data_dir`` is set.

    With ``argv=None`` unknown flags are ignored, so the kernel's own
    arguments do not break parsing inside Jupyter; with an explicit ``argv``
    parsing is strict. When ``--data_dir`` is missing, the ``DATA_DIR`` /
    ``CSE_CIC_IDS_DATA_DIR`` environment variables and then a hard-coded
    Colab path are tried.

    Raises:
        SystemExit: If no data directory can be determined.
    """
    parser = argparse.ArgumentParser(description="ResAD micro pilot (all families, tiny per-class sample)")

    option_table = (
        ('--data_dir', dict(type=str, default=None)),
        ('--seed', dict(type=int, default=42)),
        ('--cpu', dict(action='store_true')),
        # split / loading
        ('--train_frac', dict(type=float, default=2/3)),
        ('--val_frac', dict(type=float, default=1/6)),
        ('--chunk_rows', dict(type=int, default=10000)),
        ('--batch', dict(type=int, default=512)),
        # tiny sample size per family per split
        ('--min_per_class', dict(type=int, default=1000, help='per family per split for normal and anomaly each')),
        # feature cap
        ('--feat_topk', dict(type=int, default=76)),
        # model
        ('--bottleneck', dict(type=int, default=64)),
        ('--hid', dict(type=int, default=128)),
        ('--depth', dict(type=int, default=1)),
        ('--dec_depth', dict(type=int, default=0)),
        ('--p_drop', dict(type=float, default=0.05)),
        ('--feat_drop', dict(type=float, default=0.0)),
        # training
        ('--epochs_ae', dict(type=int, default=3)),
        ('--epochs', dict(type=int, default=5)),
        ('--lr_ae', dict(type=float, default=1e-3)),
        ('--lr', dict(type=float, default=1e-3)),
        ('--finetune_encoder', dict(action='store_true')),
        ('--lambda_bg', dict(type=float, default=0.5)),
        ('--tau', dict(type=float, default=0.1)),
        ('--bn_k', dict(type=float, default=1.0)),
        ('--fixed_fpr', dict(type=float, default=0.05)),
    )
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    # Swallow unknown Jupyter flags unless argv is provided
    if argv is not None:
        args = parser.parse_args(argv)
    else:
        args, _unknown = parser.parse_known_args()

    if args.data_dir is not None:
        return args

    env = os.getenv('DATA_DIR') or os.getenv('CSE_CIC_IDS_DATA_DIR')
    guess = "/content/gdrive/MyDrive/DATASETS/CSE_CIC_IDS_ALL_DATA/CSE-CIC-IDS2018/Processed/"
    if env and os.path.isdir(env):
        path = env
    elif os.path.isdir(guess):
        path = guess
    else:
        raise SystemExit("Please provide --data_dir or set DATA_DIR env var")
    args.data_dir = path
    print(f"[INFO] auto-detected data_dir={path}")
    return args

if __name__=='__main__':
    # Seed every RNG in use before any model or data work starts.
    args = get_args()
    seed = args.seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    train(args)


[INFO] auto-detected data_dir=/content/gdrive/MyDrive/DATASETS/CSE_CIC_IDS_ALL_DATA/CSE-CIC-IDS2018/Processed/
Discovered families:
   Bot
   Brute Force::Web
   Brute Force::XSS
   DDOS attack::HOIC
   DDOS attack::LOIC-UDP
   DDoS attacks::LOIC-HTTP
   DoS attacks::GoldenEye
   DoS attacks::Hulk
   DoS attacks::SlowHTTPTest
   DoS attacks::Slowloris
   FTP::BruteForce
   Infilteration
   SQL Injection
   SSH::Bruteforce
Numeric dim (capped): 76 | Label: Label


[AE] 1/3: 100%|██████████| 47/47 [00:00<00:00, 136.35it/s]


[AE] mean MSE=0.605230


[AE] 2/3: 100%|██████████| 47/47 [00:00<00:00, 143.93it/s]


[AE] mean MSE=0.271079


[AE] 3/3: 100%|██████████| 47/47 [00:00<00:00, 144.06it/s]


[AE] mean MSE=0.171057
[Enc] Frozen.
Centroids built.


[Mini-ResAD] 1/5: 100%|██████████| 47/47 [00:01<00:00, 43.35it/s]


[Train] ep=1 L=39.44205 (occ=27.21350)
[VAL] AUC=0.852 AP=0.872 F1=0.801 Thr@FPR0.05=15.789808
[VAL][Per-class] (global thr) →
  [00] Bot | N=2000 (+:1000/-:1000) | AUC=0.659 AP=0.560 F1=0.000 P=0.000 R=0.000 Acc=0.484
  [01] Brute Force::Web | N=366 (+:114/-:252) | AUC=0.736 AP=0.560 F1=0.373 P=0.778 R=0.246 Acc=0.743
  [02] Brute Force::XSS | N=131 (+:51/-:80) | AUC=0.717 AP=0.501 F1=0.000 P=0.000 R=0.000 Acc=0.588
  [03] DDOS attack::HOIC | N=2000 (+:1000/-:1000) | AUC=0.950 AP=0.919 F1=0.637 P=0.934 R=0.483 Acc=0.725
  [04] DDOS attack::LOIC-UDP | N=1002 (+:309/-:693) | AUC=0.991 AP=0.942 F1=0.775 P=0.636 R=0.994 Acc=0.822
  [05] DDoS attacks::LOIC-HTTP | N=2000 (+:1000/-:1000) | AUC=0.987 AP=0.947 F1=0.988 P=0.977 R=1.000 Acc=0.988
  [06] DoS attacks::GoldenEye | N=2000 (+:1000/-:1000) | AUC=0.964 AP=0.921 F1=0.964 P=0.967 R=0.962 Acc=0.965
  [07] DoS attacks::Hulk | N=2000 (+:1000/-:1000) | AUC=0.966 AP=0.918 F1=0.958 P=0.921 R=0.999 Acc=0.957
  [08] DoS attacks::SlowHTTPTest | N

[Mini-ResAD] 2/5: 100%|██████████| 47/47 [00:01<00:00, 43.07it/s]


[Train] ep=2 L=21.30604 (occ=17.50270)
[VAL] AUC=0.887 AP=0.910 F1=0.810 Thr@FPR0.05=16.589989
[VAL][Per-class] (global thr) →
  [00] Bot | N=2000 (+:1000/-:1000) | AUC=0.845 AP=0.697 F1=0.000 P=0.000 R=0.000 Acc=0.480
  [01] Brute Force::Web | N=366 (+:114/-:252) | AUC=0.837 AP=0.647 F1=0.588 P=0.893 R=0.439 Acc=0.809
  [02] Brute Force::XSS | N=131 (+:51/-:80) | AUC=0.669 AP=0.472 F1=0.000 P=0.000 R=0.000 Acc=0.588
  [03] DDOS attack::HOIC | N=2000 (+:1000/-:1000) | AUC=0.970 AP=0.950 F1=0.636 P=0.931 R=0.483 Acc=0.724
  [04] DDOS attack::LOIC-UDP | N=1002 (+:309/-:693) | AUC=0.998 AP=0.982 F1=0.820 P=0.694 R=1.000 Acc=0.864
  [05] DDoS attacks::LOIC-HTTP | N=2000 (+:1000/-:1000) | AUC=0.999 AP=0.994 F1=0.980 P=0.962 R=1.000 Acc=0.980
  [06] DoS attacks::GoldenEye | N=2000 (+:1000/-:1000) | AUC=0.960 AP=0.934 F1=0.960 P=0.960 R=0.960 Acc=0.960
  [07] DoS attacks::Hulk | N=2000 (+:1000/-:1000) | AUC=0.964 AP=0.933 F1=0.967 P=0.936 R=1.000 Acc=0.966
  [08] DoS attacks::SlowHTTPTest | N

[Mini-ResAD] 3/5: 100%|██████████| 47/47 [00:01<00:00, 39.10it/s]


[Train] ep=3 L=16.68660 (occ=13.12713)
[VAL] AUC=0.890 AP=0.920 F1=0.843 Thr@FPR0.05=22.339529
[VAL][Per-class] (global thr) →
  [00] Bot | N=2000 (+:1000/-:1000) | AUC=0.899 AP=0.797 F1=0.638 P=0.905 R=0.493 Acc=0.721
  [01] Brute Force::Web | N=366 (+:114/-:252) | AUC=0.842 AP=0.679 F1=0.588 P=0.893 R=0.439 Acc=0.809
  [02] Brute Force::XSS | N=131 (+:51/-:80) | AUC=0.779 AP=0.679 F1=0.286 P=0.750 R=0.176 Acc=0.656
  [03] DDOS attack::HOIC | N=2000 (+:1000/-:1000) | AUC=0.977 AP=0.969 F1=0.639 P=0.945 R=0.483 Acc=0.728
  [04] DDOS attack::LOIC-UDP | N=1002 (+:309/-:693) | AUC=1.000 AP=1.000 F1=0.826 P=0.704 R=1.000 Acc=0.870
  [05] DDoS attacks::LOIC-HTTP | N=2000 (+:1000/-:1000) | AUC=0.999 AP=0.994 F1=0.982 P=0.965 R=1.000 Acc=0.982
  [06] DoS attacks::GoldenEye | N=2000 (+:1000/-:1000) | AUC=0.957 AP=0.943 F1=0.955 P=0.950 R=0.960 Acc=0.955
  [07] DoS attacks::Hulk | N=2000 (+:1000/-:1000) | AUC=0.986 AP=0.976 F1=0.970 P=0.943 R=1.000 Acc=0.970
  [08] DoS attacks::SlowHTTPTest | N

[Mini-ResAD] 4/5: 100%|██████████| 47/47 [00:01<00:00, 30.67it/s]


[Train] ep=4 L=14.20088 (occ=10.84755)
[VAL] AUC=0.887 AP=0.923 F1=0.844 Thr@FPR0.05=24.865425
[VAL][Per-class] (global thr) →
  [00] Bot | N=2000 (+:1000/-:1000) | AUC=0.932 AP=0.910 F1=0.646 P=0.927 R=0.496 Acc=0.729
  [01] Brute Force::Web | N=366 (+:114/-:252) | AUC=0.863 AP=0.706 F1=0.578 P=0.847 R=0.439 Acc=0.801
  [02] Brute Force::XSS | N=131 (+:51/-:80) | AUC=0.697 AP=0.479 F1=0.000 P=0.000 R=0.000 Acc=0.565
  [03] DDOS attack::HOIC | N=2000 (+:1000/-:1000) | AUC=0.981 AP=0.975 F1=0.637 P=0.936 R=0.483 Acc=0.725
  [04] DDOS attack::LOIC-UDP | N=1002 (+:309/-:693) | AUC=1.000 AP=0.999 F1=0.801 P=0.667 R=1.000 Acc=0.846
  [05] DDoS attacks::LOIC-HTTP | N=2000 (+:1000/-:1000) | AUC=0.999 AP=0.994 F1=0.978 P=0.957 R=1.000 Acc=0.978
  [06] DoS attacks::GoldenEye | N=2000 (+:1000/-:1000) | AUC=0.958 AP=0.959 F1=0.954 P=0.949 R=0.960 Acc=0.954
  [07] DoS attacks::Hulk | N=2000 (+:1000/-:1000) | AUC=0.996 AP=0.989 F1=0.979 P=0.959 R=1.000 Acc=0.979
  [08] DoS attacks::SlowHTTPTest | N

[Mini-ResAD] 5/5: 100%|██████████| 47/47 [00:01<00:00, 35.74it/s]


[Train] ep=5 L=12.47085 (occ=9.32526)
[VAL] AUC=0.884 AP=0.922 F1=0.846 Thr@FPR0.05=26.342413
[VAL][Per-class] (global thr) →
  [00] Bot | N=2000 (+:1000/-:1000) | AUC=0.907 AP=0.891 F1=0.644 P=0.919 R=0.496 Acc=0.726
  [01] Brute Force::Web | N=366 (+:114/-:252) | AUC=0.811 AP=0.734 F1=0.754 P=0.882 R=0.658 Acc=0.866
  [02] Brute Force::XSS | N=131 (+:51/-:80) | AUC=0.743 AP=0.644 F1=0.000 P=0.000 R=0.000 Acc=0.580
  [03] DDOS attack::HOIC | N=2000 (+:1000/-:1000) | AUC=0.949 AP=0.938 F1=0.634 P=0.924 R=0.483 Acc=0.722
  [04] DDOS attack::LOIC-UDP | N=1002 (+:309/-:693) | AUC=1.000 AP=0.999 F1=0.843 P=0.729 R=1.000 Acc=0.885
  [05] DDoS attacks::LOIC-HTTP | N=2000 (+:1000/-:1000) | AUC=0.999 AP=0.994 F1=0.973 P=0.948 R=1.000 Acc=0.973
  [06] DoS attacks::GoldenEye | N=2000 (+:1000/-:1000) | AUC=0.961 AP=0.968 F1=0.949 P=0.938 R=0.960 Acc=0.949
  [07] DoS attacks::Hulk | N=2000 (+:1000/-:1000) | AUC=0.993 AP=0.991 F1=0.964 P=0.930 R=1.000 Acc=0.963
  [08] DoS attacks::SlowHTTPTest | N=

In [None]:
# Runs the pilot as a standalone script in a subprocess.
# NOTE(review): the recorded output of this cell shows resad_mini.py was not found in the
# working directory — the script has to be saved there first. "$DATA_DIR" is expanded by
# the shell, so it presumably needs to be set in the environment — confirm.
!python resad_mini.py --data_dir "$DATA_DIR" \
  --min_per_class 5000 --feat_topk 76 --bottleneck 32 --hid 96 --epochs 5


python3: can't open file '/content/gdrive/MyDrive/DATASETS/CSE_CIC_IDS_ALL_DATA/CSE-CIC-IDS2018/resad_mini.py': [Errno 2] No such file or directory


In [None]:
# Run the full tabular script in a subprocess. IPython has no `%python` line magic
# (only the `%%python` cell magic), so the `!` shell escape is used instead.
# NOTE(review): resad_tabular.py must exist in the working directory — confirm.
!python resad_tabular.py \
  --data_dir "/content/gdrive/MyDrive/DATASETS/CSE_CIC_IDS_ALL_DATA/CSE-CIC-IDS2018/Processed/" \
  --epochs_ae 10 --epochs 50 --batch 2048 \
  --bottleneck 128 --hid 256 --n_ref 256 --fixed_fpr 0.05
