# SIEVE → QLoRA (Colab + Kaggle Compatible)

This notebook cleans SIEVE-style CSV logs, builds stratified splits, exports JSONL, fine-tunes an **Instruct LLM** with **QLoRA**, evaluates macro-F1, and (optionally) pushes your adapter/dataset to the **Hugging Face Hub**.

**Works on**: Google Colab or Kaggle (or any GPU Jupyter).

## 0. Environment Check & Installs
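The cells below detect Colab/Kaggle automatically and install (or upgrade to) the latest releases of the required libraries. Versions are not pinned, so pin them in the install cell if you need reproducible runs.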

In [None]:
import os, sys, platform, subprocess, textwrap

def in_colab():
    """Report whether the ``google.colab`` package is importable here.

    Returns:
        bool: True when the import succeeds, False on any import failure.
    """
    try:
        import google.colab  # type: ignore
    except Exception:
        # Any failure while importing is treated as "not running in Colab".
        importable = False
    else:
        importable = True
    return importable

def in_kaggle():
    """Report whether a Kaggle-specific environment variable is present.

    Returns:
        bool: True if ``KAGGLE_URL_BASE`` or ``KAGGLE_KERNEL_RUN_TYPE`` is set.
    """
    kaggle_markers = ("KAGGLE_URL_BASE", "KAGGLE_KERNEL_RUN_TYPE")
    return any(marker in os.environ for marker in kaggle_markers)

# Report the detected runtime so platform/version problems are easy to spot.
print("Colab:", in_colab())
print("Kaggle:", in_kaggle())
print("Python:", sys.version)

# Install or upgrade the core dependencies.
# NOTE: no versions are pinned here; `-U` pulls the latest release of every
# package listed. Add explicit pins if you need a reproducible environment.
%pip -q install -U datasets transformers accelerate peft trl bitsandbytes scikit-learn huggingface_hub

In [None]:
# Report the PyTorch build and which CUDA device (if any) is visible.
import torch

has_cuda = torch.cuda.is_available()
print("Torch:", torch.__version__)
print("CUDA available:", has_cuda)
print("CUDA device count:", torch.cuda.device_count())
if has_cuda:
    # Only the first device is named.
    print("CUDA device:", torch.cuda.get_device_name(0))

## 1. Hugging Face Login (Optional but recommended)
Login to access gated models, push adapters, and host private datasets.

In [None]:
# Optional interactive Hugging Face login.
from huggingface_hub import login
DO_HF_LOGIN = True  # set False if you don't want to login right now
if DO_HF_LOGIN:
    try:
        login()  # paste your token from https://huggingface.co/settings/tokens
    except Exception as e:
        # Best-effort: a failed or cancelled login is only reported, so the
        # notebook keeps running. Later cells that need Hub access will fail
        # at that point instead.
        print("HF login skipped or failed:", e)

## 2. Data Ingestion Options

You have **three** ways to provide data:
- **A. Upload files** (Colab: left sidebar "Files" → Upload, Kaggle: Add data).
- **B. Mount Google Drive** (Colab only).
- **C. Use Kaggle Dataset path** (Kaggle only).

Set `DATA_MODE` and paths in the next cell.

In [None]:
from pathlib import Path

# ==== Configure here ====
# Selects which of the three paths below resolve_data_path() returns;
# the other two are ignored.
DATA_MODE = "UPLOAD"  # "UPLOAD" | "GDRIVE" | "KAGGLE"

# If using UPLOAD, place your CSV(s) in the working directory. Example:
CSV_PATH = Path("2-cat-sample.csv")   # change to your file name

# If using Google Drive in Colab:
GDRIVE_PATH = "/content/drive/MyDrive/sieve_data/2-cat-sample.csv"

# If using Kaggle, attach dataset and set the path:
KAGGLE_DATA_PATH = "/kaggle/input/your-dataset-folder/2-cat-sample.csv"

# Output directory for all artifacts (JSONL, labels, checkpoints)
# NOTE: the export cell re-assigns OUT_DIR and the training/eval cells build
# their paths from the literal "sieve_prepped", so changing the name here
# alone does not redirect those cells.
OUT_DIR = Path("sieve_prepped")
OUT_DIR.mkdir(exist_ok=True)

def resolve_data_path():
    """Return the CSV location selected by the ``DATA_MODE`` setting.

    Returns:
        Path: ``CSV_PATH`` for "UPLOAD", ``Path(GDRIVE_PATH)`` for "GDRIVE",
        ``Path(KAGGLE_DATA_PATH)`` for "KAGGLE".

    Raises:
        ValueError: if ``DATA_MODE`` is none of the three supported modes.
    """
    # Each entry pairs a mode name with a lazy builder for its path.
    candidates = (
        ("UPLOAD", lambda: CSV_PATH),
        ("GDRIVE", lambda: Path(GDRIVE_PATH)),
        ("KAGGLE", lambda: Path(KAGGLE_DATA_PATH)),
    )
    for mode_name, build_path in candidates:
        if DATA_MODE == mode_name:
            return build_path()
    raise ValueError("Unknown DATA_MODE")
        
# Resolve the input path once; the loader cell reads `data_path`.
data_path = resolve_data_path()
# Prints the absolute form of the path. Nothing here checks that the file
# exists.
print("Using data:", data_path.resolve())

In [None]:
# (Optional) Mount Google Drive in Colab.
# Only DATA_MODE == "GDRIVE" needs the mount; every other mode skips this cell.
if DATA_MODE == "GDRIVE":
    # Use the notebook's own in_colab() helper rather than inspecting
    # get_ipython(), which is undefined outside an IPython session.
    if in_colab():
        # Imported lazily: google.colab exists only inside a Colab runtime.
        from google.colab import drive
        drive.mount('/content/drive')
        print("Drive mounted. Ensure GDRIVE_PATH points to your CSV.")
    else:
        # Surface the misconfiguration here instead of failing later with a
        # missing-file error when the CSV is opened.
        print("DATA_MODE is 'GDRIVE' but this is not a Colab runtime; Drive was not mounted.")

## 3. Robust CSV Loader for Mixed-Format Logs

This loader:
- Handles Apple Numbers "Table 1" wrappers
- Joins extra columns back into the `log` text (keeps all data)
- Produces a clean 2-column dataframe: `category` (label), `log` (text)

In [None]:
import csv
import pandas as pd
import numpy as np

def load_sieve_csv(csv_path):
    """Load a SIEVE-style CSV log export into a clean two-column DataFrame.

    Parsing rules:
      * Leading blank rows and single-cell "Table <n>" title rows (as written
        by Apple Numbers exports) are skipped before header detection.
      * If the first remaining row names both a "category" and a "log" column
        (case-insensitive substring match), it is used as the header.
        Otherwise that row is treated as data, with column 0 as the category
        and columns 1.. as the log.
      * All cells from the log column onward are re-joined with "," so log
        text containing unquoted commas is preserved. The category cell is
        never included in the log text.
      * Rows too short for the detected layout fall back to "first cell is
        the category, the rest is the log".

    Args:
        csv_path: Path (str or path-like) of the CSV file to read.

    Returns:
        pandas.DataFrame with string columns ``category`` and ``log``.
        Whitespace runs in ``log`` are collapsed to single spaces, rows with
        an empty category or empty log are dropped, exact duplicates are
        removed, and the index is reset.
    """
    def norm(x):
        return str(x).strip().lower()

    def is_wrapper(row):
        # Blank row, or a lone "Table <n>" title cell.
        filled = [norm(cell) for cell in row if str(cell).strip()]
        if not filled:
            return True
        if len(filled) != 1:
            return False
        words = filled[0].split()
        return len(words) == 2 and words[0] == "table" and words[1].isdigit()

    def split_row(row, category_idx, log_start):
        # Return [category, log] for one non-empty parsed row.
        if category_idx < len(row) and log_start < len(row):
            category = row[category_idx]
            # Skip the category cell so the label never leaks into the text
            # when the log column precedes the category column.
            cells = [
                cell for i, cell in enumerate(row)
                if i >= log_start and i != category_idx
            ]
        else:
            # Row is shorter than the header layout.
            category, cells = row[0], row[1:]
        return [category, ",".join(cells).strip()]

    rows = []
    # utf-8-sig drops a leading BOM; newline="" is what the csv module
    # expects so quoted fields containing line breaks parse correctly.
    with open(csv_path, "r", encoding="utf-8-sig", errors="ignore", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        while header is not None and is_wrapper(header):
            header = next(reader, None)

        # Default layout when no usable header is found.
        category_idx, log_start = 0, 1
        has_named_header = (
            header is not None
            and any('category' in norm(h) for h in header)
            and any('log' in norm(h) for h in header)
        )
        if has_named_header:
            category_idx = next(i for i, h in enumerate(header) if 'category' in norm(h))
            log_start = next(i for i, h in enumerate(header) if 'log' in norm(h))
        elif header is not None:
            # The first row is real data, not a header: keep it.
            rows.append(split_row(header, category_idx, log_start))

        for row in reader:
            if not row:
                continue
            rows.append(split_row(row, category_idx, log_start))

    df = pd.DataFrame(rows, columns=["category", "log"])
    # Cleanup
    df = df.dropna(subset=["category", "log"])
    df["category"] = df["category"].astype(str).str.strip()
    df["log"] = df["log"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
    # An empty label or empty text is unusable as a training example.
    df = df[(df["category"] != "") & (df["log"] != "")]
    df = df.drop_duplicates(subset=["category", "log"]).reset_index(drop=True)
    return df

# Parse the configured CSV into a (category, log) frame.
df = load_sieve_csv(str(data_path))
print(df.shape)  # (rows, columns)
# Last expression in the cell, so the notebook renders the first rows.
df.head()

## 4. Normalize → `label`, `text` and Inspect Balance

In [None]:
# Switch to the column names every later cell uses: `label` and `text`.
df = df.rename(columns={"category":"label", "log":"text"})
# Per-class row counts, to eyeball class balance before splitting.
print(df["label"].value_counts())
df.head()

## 5. Stratified Train/Val/Test Split
Keeps the class ratio consistent across splits.

In [None]:
from sklearn.model_selection import train_test_split

# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into
# validation and test. Both steps stratify on `label` and use a fixed seed.
# NOTE(review): stratified splitting presumably needs enough rows per class
# to populate every split; very rare classes may make these calls raise.
# Confirm against the class counts printed above.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

## 6. Baseline: TF-IDF + LinearSVC (Sanity Check)
If this is reasonably good (e.g., macro-F1 ≥ 0.7), your data is learnable.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score

# Classical baseline: word uni/bi-gram TF-IDF features feeding a linear SVM.
# Case and accents are left untouched (lowercase=False, strip_accents=None)
# and min_df=1 keeps terms that occur in only one document.
baseline = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=False, strip_accents=None, ngram_range=(1,2), min_df=1)),
    ("clf", LinearSVC())
])

# Fit on the training split only; validation and test rows are only predicted.
baseline.fit(train_df["text"], train_df["label"])
val_pred = baseline.predict(val_df["text"])
test_pred = baseline.predict(test_df["text"])

# Per-class report for validation, then macro-averaged F1 for both splits.
print("Validation Report:\n", classification_report(val_df["label"], val_pred, digits=4))
print("Macro-F1 (Val):", f1_score(val_df["label"], val_pred, average="macro"))
print("Macro-F1 (Test):", f1_score(test_df["label"], test_pred, average="macro"))

## 7. Export JSONL (Instruction + Classification) and labels.txt

In [None]:
import json, os
from pathlib import Path

# Same directory as in the data-ingestion cell; re-assigned here
# (presumably so this cell can be re-run on its own — TODO confirm).
OUT_DIR = Path("sieve_prepped")
OUT_DIR.mkdir(exist_ok=True)

def write_jsonl(df_in, path, mode="instruction"):
    """Write the ``text``/``label`` columns of ``df_in`` as JSON Lines.

    Args:
        df_in: DataFrame with ``text`` and ``label`` columns.
        path: Destination file; overwritten if it already exists.
        mode: "instruction" emits ``instruction``/``input``/``output``
            records; any other value emits ``input``/``label`` records.
    """
    as_instruction = mode == "instruction"
    with open(path, "w", encoding="utf-8") as sink:
        for text, label in zip(df_in["text"], df_in["label"]):
            if as_instruction:
                record = {
                    "instruction": "Classify the SIEM event type for this log line.",
                    "input": text,
                    "output": label,
                }
            else:
                record = {"input": text, "label": label}
            # ensure_ascii=False keeps non-ASCII log content readable on disk.
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

# Destination of every exported split, keyed as "<split>_<format tag>".
files = {
    "train_ins": OUT_DIR / "train.instruction.jsonl",
    "val_ins":   OUT_DIR / "val.instruction.jsonl",
    "test_ins":  OUT_DIR / "test.instruction.jsonl",
    "train_cls": OUT_DIR / "train.classification.jsonl",
    "val_cls":   OUT_DIR / "val.classification.jsonl",
    "test_cls":  OUT_DIR / "test.classification.jsonl",
}

# Write all instruction files first, then all classification files.
for fmt_tag, fmt_mode in (("ins", "instruction"), ("cls", "classification")):
    for split_name, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
        write_jsonl(split_frame, files[split_name + "_" + fmt_tag], fmt_mode)

# labels.txt: one label per line, sorted, taken from the full dataset.
labels = sorted(df["label"].unique())
with open(OUT_DIR / "labels.txt", "w", encoding="utf-8") as label_file:
    label_file.writelines(lab + "\n" for lab in labels)

print("Exported to:", OUT_DIR.resolve())
print("Labels:", labels[:10], "...")

## 8. QLoRA Fine-Tuning (HF TRL + PEFT)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig

# ---- Fine-tuning hyperparameters ----
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"  # change if needed
MAX_SEQ_LEN = 1024
EPOCHS = 3
LR = 1e-4
# Samples per optimizer step on each device = PER_DEVICE_BATCH * GRAD_ACCUM
# (2 * 32 = 64 with the values below).
GRAD_ACCUM = 32
PER_DEVICE_BATCH = 2
# LoRA adapter settings passed to LoraConfig below.
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
SEED = 42

# Inputs produced by the export cell. The directory name is hard-coded here
# rather than read from OUT_DIR, so keep the two in sync.
train_jsonl = str(Path("sieve_prepped") / "train.instruction.jsonl")
val_jsonl   = str(Path("sieve_prepped") / "val.instruction.jsonl")
test_jsonl  = str(Path("sieve_prepped") / "test.instruction.jsonl")
labels_txt  = str(Path("sieve_prepped") / "labels.txt")

# One DatasetDict with "train" / "validation" / "test" splits.
raw_ds = load_dataset("json", data_files={"train": train_jsonl, "validation": val_jsonl, "test": test_jsonl})

def to_text(example):
    """Render one instruction record as a single training string.

    Args:
        example: Mapping with ``instruction``, ``input`` and ``output`` keys.

    Returns:
        dict: ``{"text": ...}`` holding the three fields on separate lines,
        prefixed "Instruction: ", "Input: " and "Answer: ".
    """
    instruction = example['instruction']
    log_line = example['input']
    answer = example['output']
    rendered = f"Instruction: {instruction}\nInput: {log_line}\nAnswer: {answer}"
    return {"text": rendered}

import inspect

import torch
from peft import prepare_model_for_kbit_training

# Collapse each record into one "text" column; the original fields are dropped.
ds = raw_ds.map(to_text, remove_columns=raw_ds["train"].column_names)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.pad_token is None:
    # Reuse EOS for padding when the tokenizer ships without a pad token.
    tokenizer.pad_token = tokenizer.eos_token

# Native bfloat16 needs compute capability >= 8.0 (Ampere or newer). Older
# GPUs such as the T4 or P100 must fall back to float16, otherwise training
# setup fails on them.
HAS_CUDA = torch.cuda.is_available()
USE_BF16 = HAS_CUDA and torch.cuda.get_device_capability(0)[0] >= 8

bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if USE_BF16 else torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=bnb_cfg, device_map="auto")
model.config.use_cache = False  # KV cache is not needed while training
# Standard QLoRA preparation step from the PEFT docs; must run before the
# LoRA wrappers are attached.
model = prepare_model_for_kbit_training(model)

peft_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_cfg)
model.print_trainable_parameters()

# The install cell pulls unpinned TRL, and TRL renamed two arguments across
# releases (`max_seq_length` -> `max_length`, `tokenizer` ->
# `processing_class`). Pick whichever name the installed version accepts.
_sft_cfg_params = inspect.signature(SFTConfig.__init__).parameters
_length_key = "max_length" if "max_length" in _sft_cfg_params else "max_seq_length"
_trainer_params = inspect.signature(SFTTrainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

train_cfg = SFTConfig(
    output_dir="sieve-llm-qlora",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    bf16=USE_BF16,
    fp16=HAS_CUDA and not USE_BF16,
    seed=SEED,
    dataset_text_field="text",
    **{_length_key: MAX_SEQ_LEN},
)

trainer = SFTTrainer(
    model=model,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    args=train_cfg,
    **{_tokenizer_key: tokenizer},
)

trainer.train()
trainer.save_model("sieve-llm-qlora")
print("Saved QLoRA adapter to sieve-llm-qlora/")

## 9. Validation: Constrained Mapping to Labels + Macro-F1

In [None]:
import re, json
from transformers import pipeline
from sklearn.metrics import classification_report, f1_score

# Label vocabulary, one per line. Blank lines are skipped: an empty label
# would be a substring of every model output and win every match.
with open(labels_txt, "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]

# When training ran in this session, reuse the in-memory fine-tuned model.
# Loading from "sieve-llm-qlora" would instantiate a second copy of the base
# model next to the one already on the GPU. The path is kept as a fallback
# for sessions where only the saved adapter exists.
if "trainer" in globals():
    eval_model = trainer.model
    eval_model.eval()  # turn off LoRA dropout for inference
else:
    eval_model = "sieve-llm-qlora"

# Greedy decoding (do_sample=False); a temperature has no effect without
# sampling, so none is passed.
gen = pipeline(
    "text-generation",
    model=eval_model,
    tokenizer=tokenizer,
    max_new_tokens=8,
    do_sample=False
)

def classify_one(logline: str) -> str:
    """Classify one log line and map the model output onto a known label.

    The prompt uses the same "Instruction / Input / Answer" layout as the
    training text. The generated continuation is mapped to ``labels`` in
    three stages:

      1. Substring match, longest label first, so a short label such as
         "dns" cannot shadow a longer one such as "dns-tunnel".
      2. Closest fuzzy match (difflib, cutoff 0.0) on the first generated
         token, which always yields some label when a token exists.
      3. ``labels[0]`` when the model produced nothing usable.

    Args:
        logline: Raw log text to classify.

    Returns:
        One entry of the module-level ``labels`` list.
    """
    import difflib

    prompt = f"Instruction: Classify the SIEM event type for this log line.\nInput: {logline}\nAnswer:"
    out = gen(prompt)[0]["generated_text"]

    # The pipeline echoes the prompt by default. Strip it by prefix rather
    # than splitting on the first "Answer:", since the log line itself may
    # contain that text. Fall back to the last "Answer:" marker otherwise.
    if out.startswith(prompt):
        tail = out[len(prompt):]
    else:
        tail = out.rsplit("Answer:", 1)[-1]
    tail = tail.strip().lower()
    # Only the first generated line is the answer; anything after a line
    # break is continuation text.
    answer_lines = tail.splitlines()
    tail = answer_lines[0].strip() if answer_lines else ""

    # Substring match, longest label first (stable for equal lengths).
    for lab in sorted(labels, key=len, reverse=True):
        if lab and lab.lower() in tail:
            return lab

    # Token cleanup + fuzzy match on the first token.
    words = re.sub(r"[^a-z0-9\-]+", " ", tail).split()
    if words:
        lowered = [lab.lower() for lab in labels]
        guess = difflib.get_close_matches(words[0], lowered, n=1, cutoff=0.0)
        if guess:
            return labels[lowered.index(guess[0])]
    return labels[0]  # fallback

# Read the validation records line by line inside a `with`, so the file is
# closed afterwards. Blank lines are skipped instead of crashing json.loads.
with open(val_jsonl, "r", encoding="utf-8") as f:
    val_rows = [json.loads(line) for line in f if line.strip()]
y_true = [r["output"] for r in val_rows]
# One generation call per validation row.
y_pred = [classify_one(r["input"]) for r in val_rows]

print(classification_report(y_true, y_pred, digits=4))
print("Macro-F1:", f1_score(y_true, y_pred, average="macro"))

## 10. (Optional) Push Adapter & Dataset to Hugging Face Hub

In [None]:
from huggingface_hub import create_repo, upload_folder, HfApi

# Both pushes are off by default; nothing below touches the network unless
# a flag is set to True.
PUSH_ADAPTER = False         # set True to push LoRA adapter
PUSH_DATASET = False         # set True to push JSONL as a dataset
HF_USERNAME = "your-username"  # change this

if PUSH_ADAPTER:
    # Create (or reuse) a private model repo, then upload the adapter folder
    # written by the training cell.
    adapter_repo = f"{HF_USERNAME}/sieve-llm-qlora-adapter"
    create_repo(adapter_repo, private=True, exist_ok=True)
    upload_folder(repo_id=adapter_repo, folder_path="sieve-llm-qlora", path_in_repo=".")
    print("Adapter pushed:", f"https://huggingface.co/{adapter_repo}")

if PUSH_DATASET:
    # Create (or reuse) a private dataset repo, then upload everything in
    # OUT_DIR (the JSONL splits and labels.txt written by the export cell).
    ds_repo = f"{HF_USERNAME}/sieve-2cat-jsonl"
    api = HfApi()
    api.create_repo(ds_repo, repo_type="dataset", private=True, exist_ok=True)
    upload_folder(repo_id=ds_repo, repo_type="dataset", folder_path=str(OUT_DIR), path_in_repo=".")
    print("Dataset pushed:", f"https://huggingface.co/datasets/{ds_repo}")