In [2]:
# Unpack the project archive into ./ai-soc-assistant.
# NOTE: the zip must already be present in the current working directory; otherwise unzip reports it cannot find the file.
!unzip ai-soc-assistant-with-sample.zip -d ai-soc-assistant

unzip:  cannot find or open ai-soc-assistant-with-sample.zip, ai-soc-assistant-with-sample.zip.zip or ai-soc-assistant-with-sample.zip.ZIP.


In [3]:
# Switch the working directory to the project folder so the relative paths used by the following cells resolve against it.
%cd ai-soc-assistant

/content/ai-soc-assistant


In [4]:
# Install the project's dependencies; requirements.txt is looked up in the current working directory.
!pip install -r requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

In [5]:
# Run the pipeline script; run_pipeline.py is looked up in the current working directory.
!python run_pipeline.py


python3: can't open file '/content/ai-soc-assistant/run_pipeline.py': [Errno 2] No such file or directory


In [6]:
import pandas as pd

# Load the exported alert table (path is relative to the current working
# directory) and show its first five rows.
ALERTS_CSV = "artifacts/alerts_mitre.csv"
df = pd.read_csv(ALERTS_CSV)
df.head(n=5)

Unnamed: 0,Source Port,Destination Port,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,Flow Packets/s,Flow Bytes/s,y_true,y_pred,y_prob,...,dst_ip,src_port,dst_port,protocol,flow_duration_ms,total_packets,approx_packets_per_s,approx_bytes_per_s,mitre_tags,llm_explanation
0,51525,23,2446,269,23,119.365408,15359.013865,1,1,0.999799,...,192.168.100.221,51525,23,TCP,2446,292.0,119.365408,15359.013865,['T1110 Brute Force'],Flow 191.206.251.79 -> 192.168.100.221 proto=T...
1,42112,80,3895,842,29,223.599835,117059.04712,1,1,0.999552,...,192.168.38.25,42112,80,UDP,3895,871.0,223.599835,117059.04712,[],Flow 181.6.245.37 -> 192.168.38.25 proto=UDP d...
2,41941,8080,3044,1488,39,350.0,294113.828035,1,1,0.999544,...,192.168.0.232,41941,8080,UDP,3044,1527.0,350.0,294113.828035,[],Flow 213.143.172.2 -> 192.168.0.232 proto=UDP ...
3,17324,443,4385,592,6,136.36246,73183.947539,1,1,0.999513,...,192.168.148.139,17324,443,TCP,4385,598.0,136.36246,73183.947539,[],Flow 68.196.240.141 -> 192.168.148.139 proto=T...
4,59572,443,4223,323,23,81.924098,57869.60375,1,1,0.999504,...,192.168.44.161,59572,443,UDP,4223,346.0,81.924098,57869.60375,[],Flow 131.193.248.8 -> 192.168.44.161 proto=UDP...


In [1]:
# ====== AI-Powered SOC Assistant (Colab) ======
# This notebook cell:
# 1) Installs deps
# 2) Downloads a small real subset of CICIDS2017 from Kaggle if kaggle.json is provided
#    Fallback: creates a realistic synthetic sample
# 3) Trains a model with group-based split to avoid leakage
# 4) Exports alerts with MITRE tags and optional LLM explanations
# 5) Shows outputs and packs artifacts into a zip for download

import os, sys, json, random, zipfile, glob
from pathlib import Path
import numpy as np
import pandas as pd

# ---------- Config ----------
# Root folder for everything this notebook reads and writes (Colab path).
PROJECT_DIR = Path("/content/ai-soc-assistant")
# Input CSVs (downloaded or synthetic) live here.
DATA_DIR = PROJECT_DIR / "data"
# Metrics, scored rows, alerts and explanations are written here.
ARTIFACTS = PROJECT_DIR / "artifacts"
USE_OPENAI = False  # toggled later if OPENAI_API_KEY set

# Which CICIDS2017 CSVs to fetch from Kaggle (names as published there)
CIC_PARTS = [
    # A small mix. You can add more once this works end to end.
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
]

# ---------- Install deps ----------
# Versions are not pinned here, and stdout is discarded by the redirect
# (errors on stderr still show).
!pip -q install pandas numpy scikit-learn xgboost fastapi uvicorn streamlit pydantic openai kaggle requests > /dev/null

# ---------- Check OpenAI key ----------
# LLM explanations are enabled only when the openai package imports cleanly
# AND the OPENAI_API_KEY environment variable is set to a non-empty value.
try:
    from openai import OpenAI
    if os.getenv("OPENAI_API_KEY"):
        USE_OPENAI = True
except Exception:
    USE_OPENAI = False

# ---------- Create folders ----------
# PROJECT_DIR is created without parents=True, so its parent (/content) must already exist.
PROJECT_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# ---------- Helper: safe display ----------
def peek_csv(path, n=5):
    """Display the first ``n`` rows of the CSV at ``path`` and print its shape.

    Best-effort helper: any exception (unreadable file, display failure, ...)
    is reported with a printed message instead of being raised. Always
    returns ``None``.
    """
    try:
        frame = pd.read_csv(path)
        preview = frame.head(n)
        display(preview)
        print(f"{path} -> shape:", frame.shape)
    except Exception as err:
        print("Peek failed for", path, err)

# ---------- Step 1: Try to download real CICIDS2017 subset from Kaggle ----------
# To enable this: upload kaggle.json to /content (Colab left sidebar > Files > Upload)
# Credentials file the user uploads to the Colab workspace; its presence switches on the Kaggle path.
KAGGLE_JSON = Path("/content/kaggle.json")
# Flipped to True once the downloaded archive has been found and processed below.
download_ok = False

if KAGGLE_JSON.exists():
    # Set up Kaggle API
    KAGGLE_DIR = Path("/root/.kaggle")
    KAGGLE_DIR.mkdir(parents=True, exist_ok=True)
    # The credentials file is moved (not copied), so on a re-run of this cell
    # KAGGLE_JSON.exists() is False and the else-branch message is printed.
    !mv /content/kaggle.json /root/.kaggle/kaggle.json
    !chmod 600 /root/.kaggle/kaggle.json

    # CICIDS2017 dataset on Kaggle
    ds = "cicdataset/cicids2017"
    # NOTE(review): `!` shell commands do not raise Python exceptions when they
    # fail, so this try presumably only guards the zip/rename handling; a failed
    # download is detected by the os.path.exists(zip_path) check — confirm.
    try:
        print("Downloading CICIDS2017 subset from Kaggle...")
        # Download only once; unzip specific files
        !kaggle datasets download -d {ds} -p /content --force
        zip_path = "/content/cicids2017.zip"
        if os.path.exists(zip_path):
            import zipfile as zf
            with zf.ZipFile(zip_path, "r") as z:
                names = z.namelist()
                # Keep only archive members whose path ends with one of the wanted file names.
                selected = [n for n in names if any(n.endswith(p) for p in CIC_PARTS)]
                if not selected:
                    # Fallback: extract all, then we will pick
                    selected = names
                for n in selected:
                    # Non-CSV members are skipped even in the extract-all fallback.
                    if n.endswith(".csv"):
                        z.extract(n, "/content/cicids_raw")
            # Move chosen parts into DATA_DIR
            raw_dir = Path("/content/cicids_raw")
            moved = 0
            for part in CIC_PARTS:
                # rglob finds the file at any depth under raw_dir; only the first hit is used.
                cand = list(raw_dir.rglob(part))
                if cand:
                    dest = DATA_DIR / cand[0].name
                    os.rename(str(cand[0]), str(dest))
                    print("Added:", dest.name)
                    moved += 1
            # If none matched directly, try to move some small CSVs instead
            if moved == 0:
                # Sorted by path, not by size: the first three CSVs in path order are taken.
                smalls = sorted([p for p in raw_dir.rglob("*.csv")])
                for p in smalls[:3]:
                    dest = DATA_DIR / p.name
                    os.rename(str(p), str(dest))
                    print("Added:", dest.name)
            download_ok = True
    except Exception as e:
        print("Kaggle download failed:", e)
else:
    print("No kaggle.json found. Will use a synthetic sample for a quick run.")

# ---------- Step 2: If no real data, build a realistic synthetic sample ----------
def build_synthetic_sample(out_csv: Path, n_benign=200, n_dos=80, n_brute=50, n_exfil=40, seed=None):
    """Generate a small labelled flow table and write it to ``out_csv``.

    Four kinds of rows are produced: ``BENIGN``, ``DoS``, ``Brute Force`` and
    ``Exfiltration``. Benign and exfiltration rows go from a 192.168.x.x
    source to an arbitrary destination; DoS and brute-force rows go the other
    way. Rows are shuffled before being written.

    Args:
        out_csv: Destination CSV path. Missing parent folders are created.
        n_benign: Number of ``BENIGN`` rows.
        n_dos: Number of ``DoS`` rows.
        n_brute: Number of ``Brute Force`` rows.
        n_exfil: Number of ``Exfiltration`` rows.
        seed: Optional integer seed. When given, all draws come from a private
            ``np.random.RandomState(seed)`` so the file is reproducible and
            NumPy's global random state is left untouched. When ``None``
            (default) the global ``np.random`` stream is used, as before.

    Returns:
        None. The table is written to disk and a one-line summary is printed.
    """
    # np.random (module) and RandomState expose the same legacy method names
    # (randint, normal, uniform, choice, shuffle), so either can back `rng`.
    rng = np.random if seed is None else np.random.RandomState(seed)

    cols = [
        "Source IP","Destination IP","Source Port","Destination Port","Protocol",
        "Flow Duration","Tot Fwd Pkts","Tot Bwd Pkts","Flow Packets/s","Flow Bytes/s",
        "Label"
    ]

    def rand_ip(internal=False):
        # internal=True -> 192.168.x.x; otherwise a random address with first octet 1..222.
        if internal:
            return f"192.168.{rng.randint(0,255)}.{rng.randint(1,255)}"
        return f"{rng.randint(1,223)}.{rng.randint(0,255)}.{rng.randint(0,255)}.{rng.randint(1,255)}"

    def row(kind="BENIGN"):
        # Durations are treated as milliseconds: packets/s is computed as packets / (dur/1000).
        proto = rng.choice(["TCP","UDP","TCP","TCP","TCP","UDP"])
        if kind=="BENIGN":
            dur = abs(rng.normal(5000, 4000)) + 1
            fwd = max(1, int(abs(rng.normal(45, 20))))
            bwd = max(1, int(abs(rng.normal(40, 18))))
            pps = (fwd+bwd) / (dur/1000)
            bps = pps * rng.uniform(200, 900)
            dst_port = int(rng.choice([80,443,8080,53,123,22,3389]))
            return [rand_ip(True), rand_ip(False), rng.randint(1024,65535), dst_port, proto,
                    int(dur), int(fwd), int(bwd), float(pps), float(bps), "BENIGN"]
        if kind=="DOS":
            dur = abs(rng.normal(2500, 1200)) + 1
            fwd = max(30, int(abs(rng.normal(600, 250))))
            bwd = max(1, int(abs(rng.normal(40, 25))))
            pps = (fwd+bwd) / (dur/1000)
            pps = np.clip(pps, 80, 350)  # overlap with benign to avoid trivial separation
            bps = pps * rng.uniform(300, 900)
            dst_port = int(rng.choice([80,443,8080]))
            return [rand_ip(False), rand_ip(True), rng.randint(1024,65535), dst_port, proto,
                    int(dur), int(fwd), int(bwd), float(pps), float(bps), "DoS"]
        if kind=="BRUTE":
            dur = abs(rng.normal(3500, 1500)) + 1
            attempts = max(30, int(abs(rng.normal(120, 60))))
            fwd = attempts
            bwd = max(1, int(abs(rng.normal(40, 20))))
            pps = (fwd+bwd) / (dur/1000)
            bps = pps * rng.uniform(80, 400)
            dst_port = int(rng.choice([22,21,23,3389]))
            # Brute-force rows are always TCP, regardless of the drawn `proto`.
            return [rand_ip(False), rand_ip(True), rng.randint(1024,65535), dst_port, "TCP",
                    int(dur), int(fwd), int(bwd), float(pps), float(bps), "Brute Force"]
        # EXFIL (any other kind falls through to here)
        dur = abs(rng.normal(9000, 4500)) + 1
        fwd = max(120, int(abs(rng.normal(900, 300))))
        bwd = max(60, int(abs(rng.normal(400, 120))))
        pps = (fwd+bwd) / (dur/1000)
        bps = pps * rng.uniform(1500, 6000)
        # allow 80/443 so it is not trivial
        dst_port = int(rng.choice([80,443,8081,8443,9001,53]))
        return [rand_ip(True), rand_ip(False), rng.randint(1024,65535), dst_port, proto,
                int(dur), int(fwd), int(bwd), float(pps), float(bps), "Exfiltration"]

    rows = []
    for kind, count in (("BENIGN", n_benign), ("DOS", n_dos), ("BRUTE", n_brute), ("EXFIL", n_exfil)):
        for _ in range(count):
            rows.append(row(kind))
    rng.shuffle(rows)
    df = pd.DataFrame(rows, columns=cols)
    # Create the target folder so the function also works before DATA_DIR exists.
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_csv, index=False)
    print("Synthetic sample created ->", out_csv, "shape:", df.shape)

# Create synthetic only if no real files landed
# An empty DATA_DIR means nothing was downloaded: fall back to the generator.
has_any_csv = next(DATA_DIR.glob("*.csv"), None) is not None
if not has_any_csv:
    build_synthetic_sample(DATA_DIR / "sample_cicids2017_small.csv")

# List whatever the training step is going to read.
print("Data files present:")
for csv_path in DATA_DIR.glob("*.csv"):
    print(" -", csv_path.name)

# ---------- Step 3: Load data and train with group split ----------
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

def pick_col(df: pd.DataFrame, candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given; ``None`` is returned when no
    candidate matches (including when ``candidates`` is empty).
    """
    return next((name for name in candidates if name in df.columns), None)

def load_all_csvs():
    """Load every ``*.csv`` in ``DATA_DIR`` into one frame with a binary label.

    Each file is read, tagged with its file name in ``__source_file`` and
    concatenated (files that fail to read are reported and skipped). The label
    column (``Label``, else ``label``, else ``Attack``) is mapped to
    ``__label_bin``: 0 for ``BENIGN`` / ``NORMAL`` / ``0`` (case-insensitive,
    surrounding whitespace ignored), 1 for anything else.

    Returns:
        pd.DataFrame: the concatenated rows plus ``__source_file`` and ``__label_bin``.

    Raises:
        RuntimeError: if no CSV could be loaded.
        ValueError: if none of the known label columns is present.
    """
    csvs = sorted(glob.glob(str(DATA_DIR / "*.csv")))
    dfs = []
    for p in csvs:
        try:
            df = pd.read_csv(p, low_memory=False)
        except Exception as e:
            # Best effort: one unreadable file should not abort the whole load.
            print("Failed to read", p, e)
            continue
        # Some CICIDS2017 exports pad header names with spaces (e.g. " Label");
        # strip them so the column lookups here and downstream can match.
        df.columns = df.columns.astype(str).str.strip()
        df["__source_file"] = Path(p).name
        dfs.append(df)
    if not dfs:
        raise RuntimeError("No CSVs to load")
    df = pd.concat(dfs, ignore_index=True)

    label_col = "Label" if "Label" in df.columns else pick_col(df, ["label","Attack"])
    if not label_col:
        raise ValueError("Label column not found")
    # Normalise before comparing so that e.g. "benign " is not counted as an attack.
    labels = df[label_col].astype(str).str.strip().str.upper()
    df["__label_bin"] = (~labels.isin(["BENIGN","NORMAL","0"])).astype(int)
    return df

df = load_all_csvs()

# Keep numeric feature set only
# Every numeric column except the label columns becomes a feature; this
# includes numeric identifiers such as the port columns when they are present.
NUM_EXCLUDE = {"Label","__label_bin"}
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in NUM_EXCLUDE]
# Infinite values are turned into NaN and then every NaN is replaced by 0.0.
X_all = df[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
y_all = df["__label_bin"].astype(int)

# Use groups to avoid leakage. Prefer IP if present else file name.
# NOTE(review): with the file-name fallback the number of groups equals the
# number of CSV files; a single file presumably cannot be split by
# GroupShuffleSplit — confirm before running on data without an IP column.
if "Source IP" in df.columns:
    groups = df["Source IP"].astype(str)
elif "Src IP" in df.columns:
    groups = df["Src IP"].astype(str)
else:
    groups = df["__source_file"].astype(str)

# One 80/20 split in which all rows of a group land on the same side.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_all, y_all, groups=groups))
X_train, X_test = X_all.iloc[train_idx], X_all.iloc[test_idx]
y_train, y_test = y_all.iloc[train_idx], y_all.iloc[test_idx]

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# Train model
# XGBoost is used when its import succeeded earlier (HAS_XGB); otherwise a random forest.
if HAS_XGB:
    model = XGBClassifier(n_estimators=300, max_depth=8, learning_rate=0.1,
                          subsample=0.9, colsample_bytree=0.9, n_jobs=4,
                          eval_metric="logloss")
else:
    model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=4)
model.fit(X_train_s, y_train)

# Evaluate
# Column 1 of predict_proba is the score for class 1 (non-benign); 0.5 is the alert threshold.
y_prob = model.predict_proba(X_test_s)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("Classification report:")
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# The AUC metrics are wrapped so that a failure there is reported without stopping the cell.
try:
    print("ROC AUC:", roc_auc_score(y_test, y_prob))
    print("PR  AUC:", average_precision_score(y_test, y_prob))
except Exception as e:
    print("AUC computation issue:", e)

# Save metrics
# Only the classification report is persisted; the confusion matrix and AUC values are printed only.
ARTIFACTS.mkdir(exist_ok=True)
with open(ARTIFACTS / "metrics.json", "w") as f:
    json.dump({"classification_report": classification_report(y_test, y_pred, output_dict=True)}, f, indent=2)

# ---------- Step 4: Build scored frame with context ----------
# Recover which original rows ended up in the test split
# test_rows keeps the original row labels of df, so `scored` is indexed by
# those labels rather than by 0..n-1.
test_rows = df.iloc[test_idx].copy()
# Start from the numeric feature columns (unscaled values) of the test rows.
scored = test_rows[num_cols].copy()
# y_test.values / y_pred / y_prob are positional arrays in test_idx order, matching the row order of `scored`.
scored["y_true"] = y_test.values
scored["y_pred"] = y_pred
scored["y_prob"] = y_prob

def safe_get(source, cands, default=""):
    """Return the first column of ``source`` named in ``cands``, else a constant Series.

    Args:
        source: DataFrame to look in.
        cands: Column names to try, in priority order.
        default: Value used for every row when none of the names exists.

    Returns:
        pd.Series: ``source[c]`` for the first matching name, or a Series of
        ``default`` with one entry per row of ``source``.
    """
    for c in cands:
        if c in source.columns:
            return source[c]
    # Build the fallback on source's own index. A default 0..n-1 index would be
    # re-aligned by label when the result is assigned into a frame whose index
    # is a subset of the original rows, leaving NaN or values on the wrong rows.
    return pd.Series([default] * len(source), index=source.index)

# Copy identifying columns under normalised names; each lookup tries several
# spellings of the source column and falls back to safe_get's default.
scored["src_ip"]   = safe_get(test_rows, ["Source IP","Src IP","src_ip"])
scored["dst_ip"]   = safe_get(test_rows, ["Destination IP","Dst IP","dst_ip"])
scored["src_port"] = safe_get(test_rows, ["Source Port","Src Port","src_port"])
scored["dst_port"] = safe_get(test_rows, ["Destination Port","Dst Port","dst_port"])
scored["protocol"] = safe_get(test_rows, ["Protocol","protocol"])

# NOTE(review): the column is named *_ms and later divided by 1000 to get
# seconds; this assumes "Flow Duration" is in milliseconds (true for the
# synthetic sample) — confirm the unit for real CICIDS2017 files.
scored["flow_duration_ms"] = safe_get(test_rows, ["Flow Duration","Flow_Duration","flow_duration"], np.nan)
fwd = safe_get(test_rows, ["Tot Fwd Pkts","Total Fwd Packets","Fwd Pkts"], 0).astype(float)
bwd = safe_get(test_rows, ["Tot Bwd Pkts","Total Bwd Packets","Bwd Pkts"], 0).astype(float)
scored["total_packets"] = fwd.fillna(0) + bwd.fillna(0)

flow_p_s = safe_get(test_rows, ["Flow Packets/s","Flow_Packets/s"], np.nan).astype(float)
flow_b_s = safe_get(test_rows, ["Flow Bytes/s","Flow_Bytes/s"], np.nan).astype(float)
# Missing/unparsable durations become 1.0 and the result is floored at 1e-6 s to avoid division by zero.
dur_ms = pd.to_numeric(scored["flow_duration_ms"], errors="coerce").fillna(1.0)
dur_s = np.maximum(dur_ms/1000.0, 1e-6)

# Prefer the rates reported in the data; only where they are NaN, estimate them from packet count and duration.
approx_pps = flow_p_s.copy()
approx_pps = approx_pps.fillna(scored["total_packets"] / dur_s)
approx_bps = flow_b_s.copy()
# 350.0 is presumably an assumed average packet size in bytes — TODO confirm.
approx_bps = approx_bps.fillna(scored["total_packets"] * 350.0 / dur_s)

scored["approx_packets_per_s"] = approx_pps
scored["approx_bytes_per_s"] = approx_bps

scored.to_csv(ARTIFACTS / "test_scored.csv", index=False)

# ---------- Step 5: MITRE mapping ----------
def mitre_tags_for_row(r: pd.Series):
    """Derive heuristic MITRE ATT&CK tags for one scored flow.

    Reads ``approx_packets_per_s``, ``approx_bytes_per_s``, ``dst_port``,
    ``protocol`` and ``flow_duration_ms`` from ``r`` (missing keys count as 0 /
    empty). Each numeric field is parsed on its own, so one unparsable value
    (for example a NaN destination port) no longer discards the other fields.

    Args:
        r: A row exposing ``.get(key, default)``, e.g. a ``pd.Series``.

    Returns:
        list[str]: Sorted, de-duplicated tag strings; empty when no rule fires.
    """
    def _num(value, fallback=0.0):
        # Same coercion as `float(value or 0)`, but a failure only affects this field.
        try:
            return float(value or 0)
        except (TypeError, ValueError):
            return fallback

    pps = _num(r.get("approx_packets_per_s", 0))
    bps = _num(r.get("approx_bytes_per_s", 0))
    # An unparsable duration becomes NaN so the "short flow" comparison below is simply False.
    duration_ms = _num(r.get("flow_duration_ms", 0), fallback=float("nan"))
    try:
        dst_port = int(_num(r.get("dst_port", 0)))
    except (ValueError, OverflowError):
        # int() rejects NaN and infinity; treat the port as unknown (0).
        dst_port = 0
    proto = str(r.get("protocol","")).upper()

    tags = []
    # Very high packet rate, or a high rate packed into a short flow.
    if pps > 1000 or (pps > 300 and duration_ms < 3000):
        tags.append("T1499 Endpoint Denial of Service")
    # Sustained traffic to a login service (FTP/SSH/Telnet/RDP ports) at modest byte volume.
    if dst_port in [21,22,23,3389] and pps > 50 and bps < 5e5:
        tags.append("T1110 Brute Force")
    # Large byte rate to anything other than the standard web ports.
    if bps > 5e6 and dst_port not in [80,443]:
        tags.append("T1041 Exfiltration Over C2 Channel")
    if proto in ["FTP","TELNET"]:
        tags.append("T1071 Application Layer Protocol")
    return sorted(set(tags))

# One list of tags per scored row, in row order.
mitre_list = [mitre_tags_for_row(flow) for _, flow in scored.iterrows()]
scored["mitre_tags"] = mitre_list

# ---------- Step 6: AI explanations (optional) ----------
def explain_flow(row: pd.Series, mitre_tags):
    """Produce a short analyst-style explanation for one scored flow.

    A one-line summary of the flow is always built first. When ``USE_OPENAI``
    is true, that context is sent to the chat-completions API and the model's
    reply is returned; if the call fails for any reason, the summary plus the
    error text and generic next steps is returned instead. When ``USE_OPENAI``
    is false, a templated explanation containing the heuristic tags is returned.

    Args:
        row: Scored flow; read through ``row.get`` for the src/dst, protocol,
            port, rate, duration, packet-count and score fields.
        mitre_tags: Iterable of tag strings for this flow (may be empty).

    Returns:
        str: The explanation text.
    """
    summary = (
        f"Flow {row.get('src_ip','?')} -> {row.get('dst_ip','?')} "
        f"proto={row.get('protocol','?')} dst_port={row.get('dst_port','?')} "
        f"pps~{row.get('approx_packets_per_s',0):.1f} bps~{row.get('approx_bytes_per_s',0):.0f} "
        f"model_score={row.get('y_prob',0):.3f}."
    )

    if USE_OPENAI:
        try:
            # A fresh client is created per call; it picks up its configuration from the environment.
            llm = OpenAI()
            system_msg = "You are a senior SOC analyst. Explain suspicious network flows concisely and suggest next steps."
            user_msg = f"""
Flow details:
- Source IP: {row.get('src_ip','?')}
- Destination IP: {row.get('dst_ip','?')}
- Protocol: {row.get('protocol','?')}
- Dst Port: {row.get('dst_port','?')}
- Duration ms: {row.get('flow_duration_ms','?')}
- Packets: {row.get('total_packets','?')} (~{row.get('approx_packets_per_s',0):.1f} pkt/s)
- Approx bytes/s: {row.get('approx_bytes_per_s',0):.0f}
- Model score: {row.get('y_prob',0):.3f}
Seed MITRE tags: {', '.join(mitre_tags) if mitre_tags else '(none)'}
Tasks:
1) Explain why this may be suspicious in plain English.
2) Confirm or adjust the MITRE technique list.
3) Suggest 2-3 next investigative steps.
"""
            conversation = [
                {"role":"system","content":system_msg},
                {"role":"user","content":user_msg},
            ]
            completion = llm.chat.completions.create(
                model="gpt-4o-mini",
                messages=conversation,
                temperature=0.2,
            )
            return completion.choices[0].message.content.strip()
        except Exception as err:
            # Any failure degrades to the summary with generic advice.
            return f"{summary} (LLM unavailable: {err}). Suggested steps: check reputation, correlate with auth logs, check server errors."

    # Offline path: template built from the heuristic tags only.
    if mitre_tags:
        hint = "; ".join(mitre_tags)
    else:
        hint = "No clear MITRE tag from heuristics"
    return f"{summary} Suspicious due to traffic pattern. MITRE: {hint}. Next: check repeated flows from source, firewall logs, and authentication logs."

# Only explain top N alerts to save time and tokens
# Take the 100 highest-scoring rows (fewer if the test split is smaller) as a separate copy.
top = scored.sort_values("y_prob", ascending=False).head(100).copy()
# One explanation per selected row; when USE_OPENAI is on, this is one API call per row.
top["llm_explanation"] = [
    explain_flow(r, r["mitre_tags"]) for _, r in top.iterrows()
]

# Merge explanations back where available
# Index-on-index left join: rows that were not in `top` get NaN in llm_explanation.
scored = scored.merge(
    top[["llm_explanation"]],
    left_index=True, right_index=True, how="left"
)

# Save outputs
# alerts_mitre.csv holds every scored row, highest score first.
scored.sort_values("y_prob", ascending=False).to_csv(ARTIFACTS / "alerts_mitre.csv", index=False)
# The JSONL file holds the same rows in their original (unsorted) order, one JSON object per line.
# NOTE(review): rows without an explanation presumably carry NaN, which
# json.dumps writes as the non-standard token NaN by default — confirm that
# downstream readers accept it.
with open(ARTIFACTS / "explained_alerts.jsonl", "w") as f:
    for _, r in scored.iterrows():
        rec = r.to_dict()
        # ensure mitre tags become simple list
        rec["mitre_tags"] = list(rec.get("mitre_tags", []))
        f.write(json.dumps(rec) + "\n")

print("\nArtifacts written to:", ARTIFACTS)
print(" - metrics.json")
print(" - test_scored.csv")
print(" - alerts_mitre.csv")
print(" - explained_alerts.jsonl")

# Preview a few alerts
# Re-read the CSV that was just written; after the round trip, mitre_tags is
# the text form of the list rather than a list object.
df_alerts = pd.read_csv(ARTIFACTS / "alerts_mitre.csv")
display(df_alerts.head(10)[[
    "src_ip","dst_ip","dst_port","protocol","approx_packets_per_s","approx_bytes_per_s","y_prob","mitre_tags","llm_explanation"
]])

# Pack artifacts so you can download
ZIP_OUT = "/content/soc_artifacts.zip"
with zipfile.ZipFile(ZIP_OUT, "w", zipfile.ZIP_DEFLATED) as z:
    # Entries are stored under their bare file name (no folder prefix); glob("*") is not recursive.
    for p in ARTIFACTS.glob("*"):
        z.write(str(p), p.name)

print("Download zipped outputs at:", ZIP_OUT)


No kaggle.json found. Will use a synthetic sample for a quick run.
Synthetic sample created -> /content/ai-soc-assistant/data/sample_cicids2017_small.csv shape: (370, 11)
Data files present:
 - sample_cicids2017_small.csv
Classification report:
              precision    recall  f1-score   support

           0     0.8936    1.0000    0.9438        42
           1     1.0000    0.8438    0.9153        32

    accuracy                         0.9324        74
   macro avg     0.9468    0.9219    0.9295        74
weighted avg     0.9396    0.9324    0.9315        74

Confusion matrix:
 [[42  0]
 [ 5 27]]
ROC AUC: 0.9970238095238095
PR  AUC: 0.9965277777777778

Artifacts written to: /content/ai-soc-assistant/artifacts
 - metrics.json
 - test_scored.csv
 - alerts_mitre.csv
 - explained_alerts.jsonl


Unnamed: 0,src_ip,dst_ip,dst_port,protocol,approx_packets_per_s,approx_bytes_per_s,y_prob,mitre_tags,llm_explanation
0,191.206.251.79,192.168.100.221,23,TCP,119.365408,15359.013865,0.999799,['T1110 Brute Force'],Flow 191.206.251.79 -> 192.168.100.221 proto=T...
1,181.6.245.37,192.168.38.25,80,UDP,223.599835,117059.04712,0.999552,[],Flow 181.6.245.37 -> 192.168.38.25 proto=UDP d...
2,213.143.172.2,192.168.0.232,8080,UDP,350.0,294113.828035,0.999544,[],Flow 213.143.172.2 -> 192.168.0.232 proto=UDP ...
3,68.196.240.141,192.168.148.139,443,TCP,136.36246,73183.947539,0.999513,[],Flow 68.196.240.141 -> 192.168.148.139 proto=T...
4,131.193.248.8,192.168.44.161,443,UDP,81.924098,57869.60375,0.999504,[],Flow 131.193.248.8 -> 192.168.44.161 proto=UDP...
5,34.2.44.234,192.168.243.196,8080,TCP,295.040498,261450.510018,0.999483,[],Flow 34.2.44.234 -> 192.168.243.196 proto=TCP ...
6,192.30.1.82,192.168.136.113,443,TCP,249.181734,160376.494291,0.999455,[],Flow 192.30.1.82 -> 192.168.136.113 proto=TCP ...
7,42.39.198.191,192.168.65.114,8080,TCP,350.0,201000.002906,0.999412,[],Flow 42.39.198.191 -> 192.168.65.114 proto=TCP...
8,113.201.69.236,192.168.181.36,23,TCP,62.542008,14338.761741,0.999381,['T1110 Brute Force'],Flow 113.201.69.236 -> 192.168.181.36 proto=TC...
9,214.28.15.234,192.168.5.160,80,UDP,286.511576,169115.080842,0.999189,[],Flow 214.28.15.234 -> 192.168.5.160 proto=UDP ...


Download zipped outputs at: /content/soc_artifacts.zip


# SETUP

In [7]:
!pip -q install --upgrade pip
!pip -q install \
"numpy>=2.0,<2.2" \
"pandas==2.2.2" \
"scikit-learn==1.6.1" \
"xgboost==3.1.1" \
"requests==2.32.4" \
"pydantic<2.12" \
kaggle openai fastapi uvicorn streamlit

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.1.3 which is incompatible.[0m[31m
[0m

In [7]:
import numpy, pandas, sklearn, xgboost, requests, pydantic

# Report the versions that were actually imported, one "name: version" line each.
for pkg_label, pkg in (
    ("numpy", numpy),
    ("pandas", pandas),
    ("scikit-learn", sklearn),
    ("xgboost", xgboost),
    ("requests", requests),
    ("pydantic", pydantic),
):
    print(f"{pkg_label}:", pkg.__version__)

numpy: 2.1.3
pandas: 2.2.2
scikit-learn: 1.6.1
xgboost: 3.1.1
requests: 2.32.4
pydantic: 2.11.10


In [5]:
# ===== Setup =====
import os, json, glob, zipfile, random
from pathlib import Path
import numpy as np
import pandas as pd

PROJECT_DIR = Path("/content/ai-soc-assistant")
DATA_DIR = PROJECT_DIR / "data"
ARTIFACTS = PROJECT_DIR / "artifacts"
USE_OPENAI = False  # auto-enabled later if OPENAI_API_KEY set

CIC_PARTS = [
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Monday-WorkingHours.pcap_ISCX.csv",
]

# Install deps (pin a few for reproducibility)
!pip -q install "pandas==2.2.2" "numpy==1.26.4" "scikit-learn==1.5.1" "xgboost==2.0.3" fastapi uvicorn streamlit pydantic openai kaggle requests > /dev/null

# OpenAI toggle
try:
    from openai import OpenAI
    if os.getenv("OPENAI_API_KEY"):
        USE_OPENAI = True
except Exception:
    USE_OPENAI = False

# Folders
PROJECT_DIR.mkdir(exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS.mkdir(parents=True, exist_ok=True)


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
umap-learn 0.5.9.post2 requires scikit-learn>=1.6, but you have scikit-learn 1.5.1 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
opencv-contrib-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is

# DATA ACQUISITION

In [6]:
# ===== Data acquisition =====

from pathlib import Path

def build_synthetic_sample(out_csv: Path, n_benign=200, n_dos=80, n_brute=50, n_exfil=40, seed=None):
    """Generate a small labelled flow table and write it to ``out_csv``.

    Four kinds of rows are produced: ``BENIGN``, ``DoS``, ``Brute Force`` and
    ``Exfiltration``. Benign and exfiltration rows go from a 192.168.x.x
    source to an arbitrary destination; DoS and brute-force rows go the other
    way. Rows are shuffled before being written.

    Args:
        out_csv: Destination CSV path. Missing parent folders are created.
        n_benign: Number of ``BENIGN`` rows.
        n_dos: Number of ``DoS`` rows.
        n_brute: Number of ``Brute Force`` rows.
        n_exfil: Number of ``Exfiltration`` rows.
        seed: Optional integer seed. When given, all draws come from a private
            ``np.random.RandomState(seed)`` so the file is reproducible and
            NumPy's global random state is left untouched. When ``None``
            (default) the global ``np.random`` stream is used, as before.

    Returns:
        None. The table is written to disk and a one-line summary is printed.
    """
    # np.random (module) and RandomState expose the same legacy method names
    # (randint, normal, uniform, choice, shuffle), so either can back `rng`.
    rng = np.random if seed is None else np.random.RandomState(seed)

    cols = [
        "Source IP","Destination IP","Source Port","Destination Port","Protocol",
        "Flow Duration","Tot Fwd Pkts","Tot Bwd Pkts","Flow Packets/s","Flow Bytes/s","Label"
    ]

    def rand_ip(internal=False):
        # internal=True -> 192.168.x.x; otherwise a random address with first octet 1..222.
        if internal:
            return f"192.168.{rng.randint(0,255)}.{rng.randint(1,255)}"
        return f"{rng.randint(1,223)}.{rng.randint(0,255)}.{rng.randint(0,255)}.{rng.randint(1,255)}"

    def row(kind="BENIGN"):
        # Durations are treated as milliseconds: packets/s is computed as packets / (dur/1000).
        proto = rng.choice(["TCP","UDP","TCP","TCP","TCP","UDP"])
        if kind=="BENIGN":
            dur = abs(rng.normal(5000, 4000)) + 1
            fwd = max(1, int(abs(rng.normal(45, 20))))
            bwd = max(1, int(abs(rng.normal(40, 18))))
            pps = (fwd+bwd) / (dur/1000)
            bps = pps * rng.uniform(200, 900)
            dst_port = int(rng.choice([80,443,8080,53,123,22,3389]))
            return [rand_ip(True), rand_ip(False), rng.randint(1024,65535), dst_port, proto,
                    int(dur), int(fwd), int(bwd), float(pps), float(bps), "BENIGN"]
        if kind=="DOS":
            dur = abs(rng.normal(2500, 1200)) + 1
            fwd = max(30, int(abs(rng.normal(600, 250))))
            bwd = max(1, int(abs(rng.normal(40, 25))))
            pps = (fwd+bwd) / (dur/1000)
            # Clamp the rate into 80..350 pkt/s.
            pps = np.clip(pps, 80, 350)
            bps = pps * rng.uniform(300, 900)
            dst_port = int(rng.choice([80,443,8080]))
            return [rand_ip(False), rand_ip(True), rng.randint(1024,65535), dst_port, proto,
                    int(dur), int(fwd), int(bwd), float(pps), float(bps), "DoS"]
        if kind=="BRUTE":
            dur = abs(rng.normal(3500, 1500)) + 1
            attempts = max(30, int(abs(rng.normal(120, 60))))
            fwd = attempts
            bwd = max(1, int(abs(rng.normal(40, 20))))
            pps = (fwd+bwd) / (dur/1000)
            bps = pps * rng.uniform(80, 400)
            dst_port = int(rng.choice([22,21,23,3389]))
            # Brute-force rows are always TCP, regardless of the drawn `proto`.
            return [rand_ip(False), rand_ip(True), rng.randint(1024,65535), dst_port, "TCP",
                    int(dur), int(fwd), int(bwd), float(pps), float(bps), "Brute Force"]
        # Exfiltration (any other kind falls through to here)
        dur = abs(rng.normal(9000, 4500)) + 1
        fwd = max(120, int(abs(rng.normal(900, 300))))
        bwd = max(60, int(abs(rng.normal(400, 120))))
        pps = (fwd+bwd) / (dur/1000)
        bps = pps * rng.uniform(1500, 6000)
        dst_port = int(rng.choice([80,443,8081,8443,9001,53]))
        return [rand_ip(True), rand_ip(False), rng.randint(1024,65535), dst_port, proto,
                int(dur), int(fwd), int(bwd), float(pps), float(bps), "Exfiltration"]

    rows = []
    for kind, count in (("BENIGN", n_benign), ("DOS", n_dos), ("BRUTE", n_brute), ("EXFIL", n_exfil)):
        for _ in range(count):
            rows.append(row(kind))
    rng.shuffle(rows)
    df = pd.DataFrame(rows, columns=cols)
    Path(out_csv).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_csv, index=False)
    print("Synthetic sample created ->", out_csv, "shape:", df.shape)

# Build only if nothing is present
# Generate the synthetic sample only when DATA_DIR holds no CSV yet.
has_any_csv = next(DATA_DIR.glob("*.csv"), None) is not None
if not has_any_csv:
    build_synthetic_sample(DATA_DIR / "sample_cicids2017_small.csv")

# List whatever the training step is going to read.
print("Data files present:")
for csv_path in DATA_DIR.glob("*.csv"):
    print(" -", csv_path.name)


Data files present:
 - sample_cicids2017_small.csv


In [7]:
import pandas as pd
from pathlib import Path

# Sanity-check the sample file: overall shape, class balance, first rows.
sample_csv = Path("/content/ai-soc-assistant/data") / "sample_cicids2017_small.csv"
df = pd.read_csv(sample_csv)
print(df.shape)
label_counts = df["Label"].value_counts()
print(label_counts)
df.head()


(370, 11)
Label
BENIGN          200
DoS              80
Brute Force      50
Exfiltration     40
Name: count, dtype: int64


Unnamed: 0,Source IP,Destination IP,Source Port,Destination Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,Flow Packets/s,Flow Bytes/s,Label
0,192.168.171.36,110.10.11.149,7127,3389,TCP,10504,14,38,4.950329,3890.185333,BENIGN
1,192.168.137.27,94.189.49.141,57682,443,TCP,2309,17,46,27.276188,21741.206374,BENIGN
2,192.168.139.117,111.197.202.4,13187,8080,TCP,682,35,38,107.029992,22283.845748,BENIGN
3,149.49.14.7,192.168.158.103,17486,23,TCP,2221,30,66,43.213134,15668.894708,Brute Force
4,192.168.204.141,71.92.245.40,6393,8080,TCP,1263,19,34,41.953691,27683.015189,BENIGN


# Training with group split

In [14]:
!pip -q uninstall -y scikit-learn scipy numpy

!pip -q install --no-cache-dir --force-reinstall \
numpy==2.0.2 scipy==1.14.1 scikit-learn==1.6.1 \
xgboost==3.1.1 joblib>=1.3 threadpoolctl>=3.2


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.8.0+cu126 requires nvidia-nccl-cu12==2.27.3; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-nccl-cu12 2.28.7 which is incompatible.[0m[31m
[0m

In [15]:
# Ask the running kernel to shut down with restart=True.
# Presumably done so the packages reinstalled above are imported fresh — confirm.
# All in-memory variables are lost, so the earlier setup cells must be re-run afterwards.
import IPython
IPython.Application.instance().kernel.do_shutdown(True)


{'status': 'ok', 'restart': True}

In [2]:
import numpy, scipy, sklearn, xgboost
from sklearn.ensemble import RandomForestClassifier

# Importing an estimator doubles as a smoke test of the scikit-learn install;
# then report the versions that were actually loaded.
for pkg_label, pkg in (
    ("numpy", numpy),
    ("scipy", scipy),
    ("scikit-learn", sklearn),
    ("xgboost", xgboost),
):
    print(f"{pkg_label}:", pkg.__version__)

numpy: 2.0.2
scipy: 1.14.1
scikit-learn: 1.6.1
xgboost: 3.1.1


In [8]:
# ===== Training with group split =====
import glob
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib

def pick_col(df, candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given; ``None`` is returned when no
    candidate matches (including when ``candidates`` is empty).
    """
    return next((name for name in candidates if name in df.columns), None)

def load_all_csvs():
    """Load every ``*.csv`` in ``DATA_DIR`` into one frame with a binary label.

    Each file is read, tagged with its file name in ``__source_file`` and
    concatenated (files that fail to read are reported and skipped). The label
    column (``Label``, else ``label``, else ``Attack``) is mapped to
    ``__label_bin``: 0 for ``BENIGN`` / ``NORMAL`` / ``0`` (case-insensitive,
    surrounding whitespace ignored), 1 for anything else.

    Returns:
        pd.DataFrame: the concatenated rows plus ``__source_file`` and ``__label_bin``.

    Raises:
        RuntimeError: if no CSV could be loaded.
        ValueError: if none of the known label columns is present.
    """
    csvs = sorted(glob.glob(str(DATA_DIR / "*.csv")))
    dfs = []
    for p in csvs:
        try:
            df = pd.read_csv(p, low_memory=False)
        except Exception as e:
            # Best effort: one unreadable file should not abort the whole load.
            print("Failed to read", p, e)
            continue
        # Some CICIDS2017 exports pad header names with spaces (e.g. " Label");
        # strip them so the column lookups here and downstream can match.
        df.columns = df.columns.astype(str).str.strip()
        df["__source_file"] = Path(p).name
        dfs.append(df)
    if not dfs:
        raise RuntimeError("No CSVs to load")
    df = pd.concat(dfs, ignore_index=True)
    label_col = "Label" if "Label" in df.columns else pick_col(df, ["label","Attack"])
    if not label_col:
        # Without this check, df[None] below would fail with an unhelpful KeyError.
        raise ValueError("Label column not found")
    # Normalise before comparing so that e.g. "benign " is not counted as an attack.
    labels = df[label_col].astype(str).str.strip().str.upper()
    df["__label_bin"] = (~labels.isin(["BENIGN","NORMAL","0"])).astype(int)
    return df

df = load_all_csvs()

# Every numeric column except the label columns becomes a feature.
NUM_EXCLUDE = {"Label","__label_bin"}
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in NUM_EXCLUDE]
# Infinite values are turned into NaN and then every NaN is replaced by 0.0.
X_all = df[num_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
y_all = df["__label_bin"].astype(int)

# Group rows by source IP when available, otherwise by originating file, so
# that all rows of one group land on the same side of the split.
if "Source IP" in df.columns:
    groups = df["Source IP"].astype(str)
elif "Src IP" in df.columns:
    groups = df["Src IP"].astype(str)
else:
    groups = df["__source_file"].astype(str)

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(X_all, y_all, groups=groups))
X_train, X_test = X_all.iloc[train_idx], X_all.iloc[test_idx]
y_train, y_test = y_all.iloc[train_idx], y_all.iloc[test_idx]

# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# NOTE(review): XGBClassifier is imported unconditionally at the top of this
# cell, so a missing xgboost fails there, not here; the random-forest fallback
# is only reached if the constructor itself raises — confirm this is intended.
try:
    model = XGBClassifier(n_estimators=300, max_depth=8, learning_rate=0.1,
                          subsample=0.9, colsample_bytree=0.9, n_jobs=4,
                          eval_metric="logloss")
except Exception:
    model = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=4)

model.fit(X_train_s, y_train)

# Save artifacts for later demo
# The fitted scaler is saved next to the model because the model was trained on scaled features.
joblib.dump(model, ARTIFACTS / "model.joblib")
joblib.dump(scaler, ARTIFACTS / "scaler.joblib")


['/content/ai-soc-assistant/artifacts/scaler.joblib']

In [9]:
import os

# Show what is currently in the artifacts directory, one name per line.
artifact_dir = "/content/ai-soc-assistant/artifacts"
for entry_name in os.listdir(artifact_dir):
    print(entry_name)


explained_alerts.jsonl
metrics.json
model.joblib
scaler.joblib
alerts_mitre.csv
test_scored.csv


# Evaluation and metrics

In [10]:
# ===== Evaluation and metrics =====
# Probability of class 1 (non-benign) for each held-out row, thresholded at 0.5.
y_prob = model.predict_proba(X_test_s)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)

print("Classification report:")
print(classification_report(y_test, y_pred, digits=4))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
# Threshold-free ranking metrics computed on the raw probabilities.
for metric_label, metric_fn in (("ROC AUC:", roc_auc_score), ("PR  AUC:", average_precision_score)):
    print(metric_label, metric_fn(y_test, y_prob))

# Save metrics for the zip
ARTIFACTS.mkdir(exist_ok=True)
report_dict = classification_report(y_test, y_pred, output_dict=True)
with open(ARTIFACTS / "metrics.json", "w") as metrics_file:
    metrics_file.write(json.dumps({"classification_report": report_dict}, indent=2))


Classification report:
              precision    recall  f1-score   support

           0     0.8936    1.0000    0.9438        42
           1     1.0000    0.8438    0.9153        32

    accuracy                         0.9324        74
   macro avg     0.9468    0.9219    0.9295        74
weighted avg     0.9396    0.9324    0.9315        74

Confusion matrix:
 [[42  0]
 [ 5 27]]
ROC AUC: 0.9970238095238095
PR  AUC: 0.9965277777777778


# Alert scoring with MITRE tags

In [11]:
# ===== Alert scoring with MITRE tags =====

# Take the held-out rows, keep their unscaled numeric features, and attach the
# true label, the predicted class and the predicted probability.
test_rows = df.iloc[test_idx].copy()
scored = test_rows[num_cols].copy().assign(
    y_true=y_test.values,
    y_pred=y_pred,
    y_prob=y_prob,
)

def safe_get(source, cands, default=""):
    """Return the first column of ``source`` whose name appears in ``cands``.

    Args:
        source: DataFrame to look the column up in.
        cands: candidate column names, in priority order.
        default: value used for every row when none of ``cands`` exists.

    Returns:
        pandas.Series: the matching column, or a constant Series of
        ``default`` that shares ``source``'s index.
    """
    for c in cands:
        if c in source.columns:
            return source[c]
    # Build the fallback on source.index. A fresh 0..n-1 index would not line
    # up with a frame that kept its original row labels, and the default would
    # turn into NaN when assigned to (or combined with) such a frame.
    return pd.Series(default, index=source.index)

# Normalised flow descriptors; for each one the first matching source column wins.
scored["src_ip"]   = safe_get(test_rows, ["Source IP","Src IP","src_ip"])
scored["dst_ip"]   = safe_get(test_rows, ["Destination IP","Dst IP","dst_ip"])
scored["src_port"] = safe_get(test_rows, ["Source Port","Src Port","src_port"])
scored["dst_port"] = safe_get(test_rows, ["Destination Port","Dst Port","dst_port"])
scored["protocol"] = safe_get(test_rows, ["Protocol","protocol"])
scored["flow_duration_ms"] = safe_get(test_rows, ["Flow Duration","Flow_Duration","flow_duration"], np.nan)

fwd = safe_get(test_rows, ["Tot Fwd Pkts","Total Fwd Packets","Fwd Pkts"], 0).astype(float)
bwd = safe_get(test_rows, ["Tot Bwd Pkts","Total Bwd Packets","Bwd Pkts"], 0).astype(float)
scored["total_packets"] = fwd.fillna(0) + bwd.fillna(0)

# Reported rates. The feature matrix treats +/-inf as missing; do the same here
# so an infinite rate is replaced by the estimate below instead of flowing
# straight into the tagging thresholds.
flow_p_s = safe_get(test_rows, ["Flow Packets/s","Flow_Packets/s"], np.nan).astype(float).replace([np.inf, -np.inf], np.nan)
flow_b_s = safe_get(test_rows, ["Flow Bytes/s","Flow_Bytes/s"], np.nan).astype(float).replace([np.inf, -np.inf], np.nan)
# An unknown duration is taken as 1 ms; the 1e-6 s floor keeps the divisions finite.
dur_ms = pd.to_numeric(scored["flow_duration_ms"], errors="coerce").fillna(1.0)
dur_s = np.maximum(dur_ms/1000.0, 1e-6)

# Fill gaps in the reported rates with estimates from packet count and
# duration; the byte rate uses 350.0 bytes per packet.
approx_pps = flow_p_s.fillna(scored["total_packets"] / dur_s)
approx_bps = flow_b_s.fillna(scored["total_packets"] * 350.0 / dur_s)
scored["approx_packets_per_s"] = approx_pps
scored["approx_bytes_per_s"] = approx_bps

def _as_float(value, default=0.0):
    """Convert ``value`` to float; falsy values count as 0, unparseable ones give ``default``."""
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return default


def mitre_tags_for_row(r):
    """Map one scored flow to a sorted list of heuristic MITRE ATT&CK tags.

    Args:
        r: mapping-like row (anything with ``.get``) that may provide
            ``approx_packets_per_s``, ``approx_bytes_per_s``, ``dst_port``,
            ``protocol`` and ``flow_duration_ms``.

    Returns:
        list[str]: de-duplicated tags in sorted order; empty when no rule fires.

    Each field is parsed on its own, so one bad value (for example a NaN
    destination port) no longer wipes out the packet and byte rates, and an
    unparseable duration no longer raises.
    """
    pps = _as_float(r.get("approx_packets_per_s", 0))
    bps = _as_float(r.get("approx_bytes_per_s", 0))
    duration_ms = _as_float(r.get("flow_duration_ms", 0))
    try:
        dst_port = int(_as_float(r.get("dst_port", 0)))
    except (ValueError, OverflowError):
        # NaN or infinite port: treat it as unknown.
        dst_port = 0
    proto = str(r.get("protocol","")).upper()

    tags = []
    # Very high packet rate, or a high rate within a flow shorter than 3 s.
    if pps > 1000 or (pps > 300 and duration_ms < 3000):
        tags.append("T1499 Endpoint Denial of Service")
    # Steady, low-volume traffic against FTP/SSH/Telnet/RDP ports.
    if dst_port in [21,22,23,3389] and pps > 50 and bps < 5e5:
        tags.append("T1110 Brute Force")
    # Large byte rate to a port other than 80/443.
    if bps > 5e6 and dst_port not in [80,443]:
        tags.append("T1041 Exfiltration Over C2 Channel")
    if proto in ["FTP","TELNET"]:
        tags.append("T1071 Application Layer Protocol")
    return sorted(set(tags))

# Tag every scored flow, one row at a time.
row_iter = (row for _, row in scored.iterrows())
scored["mitre_tags"] = list(map(mitre_tags_for_row, row_iter))


In [12]:
# Ten highest-probability flows, trimmed to a handful of summary columns.
top_alert_cols = [
    "src_ip", "dst_ip", "dst_port", "protocol",
    "approx_packets_per_s", "approx_bytes_per_s", "y_prob", "mitre_tags",
]
scored.sort_values("y_prob", ascending=False).head(10)[top_alert_cols]


Unnamed: 0,src_ip,dst_ip,dst_port,protocol,approx_packets_per_s,approx_bytes_per_s,y_prob,mitre_tags
363,191.206.251.79,192.168.100.221,23,TCP,119.365408,15359.013865,0.999799,[T1110 Brute Force]
170,181.6.245.37,192.168.38.25,80,UDP,223.599835,117059.04712,0.999551,[]
46,213.143.172.2,192.168.0.232,8080,UDP,350.0,294113.828035,0.999544,[]
249,68.196.240.141,192.168.148.139,443,TCP,136.36246,73183.947539,0.999513,[]
213,131.193.248.8,192.168.44.161,443,UDP,81.924098,57869.60375,0.999504,[]
284,34.2.44.234,192.168.243.196,8080,TCP,295.040498,261450.510018,0.999483,[]
265,192.30.1.82,192.168.136.113,443,TCP,249.181734,160376.494291,0.999455,[]
141,42.39.198.191,192.168.65.114,8080,TCP,350.0,201000.002906,0.999412,[]
257,113.201.69.236,192.168.181.36,23,TCP,62.542008,14338.761741,0.999381,[T1110 Brute Force]
150,214.28.15.234,192.168.5.160,80,UDP,286.511576,169115.080842,0.999188,[]


# Artifacts and download

In [13]:
# ===== Save artifacts and zip for download =====
ARTIFACTS.mkdir(exist_ok=True)
scored.sort_values("y_prob", ascending=False).to_csv(ARTIFACTS / "alerts_mitre.csv", index=False)
scored.to_csv(ARTIFACTS / "test_scored.csv", index=False)

# One JSON object per line. Non-finite floats are written as null: json.dumps
# would otherwise emit the bare tokens NaN / Infinity, which are not valid JSON
# and are rejected by strict JSONL readers.
_NON_FINITE = (float("inf"), float("-inf"))
with open(ARTIFACTS / "explained_alerts.jsonl", "w") as f:
    for _, r in scored.iterrows():
        rec = {
            key: (None if isinstance(value, float) and (value != value or value in _NON_FINITE) else value)
            for key, value in r.to_dict().items()
        }
        rec["mitre_tags"] = list(rec.get("mitre_tags", []))
        f.write(json.dumps(rec) + "\n")

# Bundle every file in the artifacts directory under its bare file name.
ZIP_OUT = "/content/soc_artifacts.zip"
with zipfile.ZipFile(ZIP_OUT, "w", zipfile.ZIP_DEFLATED) as z:
    for p in ARTIFACTS.glob("*"):
        z.write(str(p), p.name)

print("Artifacts written to:", ARTIFACTS)
print("Download zipped outputs at:", ZIP_OUT)


Artifacts written to: /content/ai-soc-assistant/artifacts
Download zipped outputs at: /content/soc_artifacts.zip


# Streamlit UI

In [14]:
%%writefile /content/ui.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

st.title("AI-Powered SOC Assistant")
# NOTE: joblib.load unpickles these files; only load artifacts you produced yourself.
model = joblib.load("/content/ai-soc-assistant/artifacts/model.joblib")
scaler = joblib.load("/content/ai-soc-assistant/artifacts/scaler.joblib")

uploaded = st.file_uploader("Upload CSV", type=["csv"])
if uploaded is not None:
    df = pd.read_csv(uploaded, low_memory=False)
    st.write("Preview:", df.head())

    # Score with exactly the columns (and column order) the scaler was fitted
    # on. Re-deriving "all numeric columns" from the upload breaks, or silently
    # shifts features, as soon as the CSV has an extra, missing or reordered
    # numeric column.
    trained_cols = getattr(scaler, "feature_names_in_", None)
    if trained_cols is not None:
        trained_cols = list(trained_cols)
        missing = [c for c in trained_cols if c not in df.columns]
        if missing:
            st.warning(f"{len(missing)} training feature(s) missing from the upload were filled with 0: {missing}")
        features = df.reindex(columns=trained_cols).apply(pd.to_numeric, errors="coerce")
    else:
        # Scaler carries no feature names: keep the original numeric-column heuristic.
        NUM_EXCLUDE = {"Label","__label_bin"}
        num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in NUM_EXCLUDE]
        features = df[num_cols]
    Xs = scaler.transform(features.replace([np.inf,-np.inf], np.nan).fillna(0.0))
    prob = model.predict_proba(Xs)[:,1]
    pred = (prob>=0.5).astype(int)

    out = df.copy()
    out["soc_prob"] = prob
    out["soc_pred"] = pred
    st.write("Top 20 alerts:", out.sort_values("soc_prob", ascending=False).head(20))
    st.download_button("Download scored CSV", out.to_csv(index=False).encode("utf-8"), "scored.csv", "text/csv")

# Run:
# !streamlit run /content/ui.py --server.port 8501 --server.headless true


Writing /content/ui.py


# FastAPI scoring service

In [15]:
%%writefile /content/app.py
import joblib, json
import pandas as pd
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List, Dict, Any

# Columns that are never used as model inputs.
NUM_EXCLUDE = {"Label", "__label_bin"}

# Fitted scaler and classifier, loaded once when the module is imported.
scaler = joblib.load("/content/ai-soc-assistant/artifacts/scaler.joblib")
model = joblib.load("/content/ai-soc-assistant/artifacts/model.joblib")

app = FastAPI()

class Flow(BaseModel):
    """Request item for ``/score``: one flow as a feature-name -> value mapping."""
    # Values are left untyped (Any); score() decides which keys are used.
    features: Dict[str, Any]

@app.post("/score")
def score(flows: List[Flow]):
    """Score a batch of flows with the loaded scaler and model.

    Args:
        flows: request body; each item carries a ``features`` mapping.

    Returns:
        dict: ``{"pred": [...], "prob": [...]}`` with one 0/1 prediction and
        one class-1 probability per input flow, in request order.
    """
    if not flows:
        # Nothing to score; an empty frame would only fail inside the scaler.
        return {"pred": [], "prob": []}
    df = pd.DataFrame([f.features for f in flows])
    # Use exactly the columns (and order) the scaler was fitted on. Picking
    # "whatever is numeric in this request" fails or misaligns features when a
    # client sends extra, missing or differently ordered keys.
    trained_cols = getattr(scaler, "feature_names_in_", None)
    if trained_cols is not None:
        features = df.reindex(columns=list(trained_cols)).apply(pd.to_numeric, errors="coerce")
    else:
        # Scaler carries no feature names: keep the original numeric-column heuristic.
        num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and c not in NUM_EXCLUDE]
        features = df[num_cols]
    # Treat +/-inf and missing values as 0, as the training pipeline does.
    features = features.replace([float("inf"), float("-inf")], float("nan")).fillna(0.0)
    Xs = scaler.transform(features)
    prob = model.predict_proba(Xs)[:,1].tolist()
    pred = [int(p>=0.5) for p in prob]
    return {"pred": pred, "prob": prob}

# Run (app.py is written to /content, so point uvicorn at that directory):
# !uvicorn app:app --app-dir /content --host 0.0.0.0 --port 8000


Writing /content/app.py


In [16]:
from google.colab import files
import shutil

# Step 1: Compress the folder into a single zip file
# (the archive is written next to the folder, as artifacts.zip)
artifacts_dir = "/content/ai-soc-assistant/artifacts"
shutil.make_archive(base_name=artifacts_dir, format="zip", root_dir=artifacts_dir)

# Step 2: Download it to your computer
files.download("/content/ai-soc-assistant/artifacts.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>