# SIEVE → QLoRA (Colab + Kaggle) with Google Drive Sync
This notebook cleans SIEVE-style CSV logs, builds stratified splits, exports JSONL, fine-tunes an **Instruct LLM** with **QLoRA**, evaluates macro-F1, and supports Google Drive sync (Colab) and ZIP export (Kaggle).

## Verify the environment and install packages

In [None]:
import os, sys

def IN_COLAB():
    """Report whether this kernel can import ``google.colab``.

    Returns:
        True if the import succeeds, False if it raises any ``Exception``.
    """
    try:
        import google.colab  # type: ignore
    except Exception:
        running_in_colab = False
    else:
        running_in_colab = True
    return running_in_colab

def IN_KAGGLE():
    """Report whether a Kaggle marker variable is present in the environment.

    Returns:
        True if ``KAGGLE_URL_BASE`` or ``KAGGLE_KERNEL_RUN_TYPE`` is a key of
        ``os.environ``, otherwise False.
    """
    kaggle_markers = ("KAGGLE_URL_BASE", "KAGGLE_KERNEL_RUN_TYPE")
    return any(marker in os.environ for marker in kaggle_markers)

# Report which hosted environment was detected and the interpreter version.
print("Colab:", IN_COLAB())
print("Kaggle:", IN_KAGGLE())
print("Python:", sys.version)

# IPython magic (not plain Python): quietly install/upgrade the third-party
# packages listed below into the running kernel's environment.
%pip -q install -U datasets transformers accelerate peft trl bitsandbytes scikit-learn huggingface_hub

## CUDA Status (Optional) — may be removed; not required for testing or production

In [None]:
# CUDA status
# Diagnostic only: prints the torch version and what torch reports about CUDA.
import torch
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA device count:", torch.cuda.device_count())
# Only query the device name when CUDA is reported as available.
if torch.cuda.is_available():
    print("CUDA device:", torch.cuda.get_device_name(0))

## Set Up Paths (currently data storage and checkpoints)

In [None]:
# Google Drive sync (Colab) and paths
from pathlib import Path

# Defaults when neither Colab nor Kaggle is detected: relative directories.
DATA_DIR = Path("sieve_prepped")
CKPT_DIR = Path("sieve-llm-qlora")

if IN_COLAB():
    # Colab: mount Google Drive and keep prepared data and checkpoints under it.
    from google.colab import drive
    drive.mount('/content/drive')
    DATA_DIR = Path("/content/drive/MyDrive/Capstone/Training/Classification/sieve_prepped")
    CKPT_DIR = Path("/content/drive/MyDrive/Capstone/Training/Classification/sieve_checkpoints/sieve-llm-qlora")
elif IN_KAGGLE():
    # Kaggle: keep both directories under /kaggle/working.
    DATA_DIR = Path("/kaggle/working/sieve_prepped")
    CKPT_DIR = Path("/kaggle/working/sieve-llm-qlora")

# Create both directories, including missing parents; existing ones are fine.
DATA_DIR.mkdir(parents=True, exist_ok=True)
CKPT_DIR.mkdir(parents=True, exist_ok=True)
print("DATA_DIR:", DATA_DIR.resolve())
print("CKPT_DIR:", CKPT_DIR.resolve())

## Pass the Hugging Face token to the login interface

In [None]:
# (Optional) Hugging Face login
from huggingface_hub import login
# Set to False to skip the login attempt entirely.
DO_HF_LOGIN = True
if DO_HF_LOGIN:
    try:
        # Called without arguments — presumably prompts for a token; confirm
        # against the huggingface_hub documentation.
        login()
    except Exception as e:
        # Best effort: report the failure and let the notebook continue.
        print("HF login skipped or failed:", e)

## Load and verify CSV formatting

In [None]:
# Robust CSV loader
import csv
import pandas as pd

def load_sieve_csv(csv_path):
    """Load a SIEVE-style CSV into a clean two-column DataFrame.

    The first row is treated as a header when one of its cells contains
    "category" and one contains "log" (case-insensitive substring match); the
    first such cells give the category column and the start of the log text.
    Otherwise the first row is kept as data, column 0 is the category and the
    remaining columns are the log.

    All cells from the log column onward are re-joined with "," so log lines
    containing unquoted commas are preserved.

    Args:
        csv_path: Path of the CSV file to read (UTF-8, undecodable bytes are
            ignored).

    Returns:
        DataFrame with string columns ``category`` and ``log``. Values are
        stripped, whitespace runs in ``log`` are collapsed to a single space,
        exact duplicate (category, log) pairs are dropped and the index is
        reset.
    """
    def norm(x):
        return str(x).strip().lower()

    rows = []
    # newline="" is what the csv module expects so it can handle line endings
    # (including ones embedded in quoted fields) itself.
    with open(csv_path, "r", encoding="utf-8", errors="ignore", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        names = [norm(h) for h in header] if header else []
        category_cols = [i for i, h in enumerate(names) if "category" in h]
        log_cols = [i for i, h in enumerate(names) if "log" in h]

        if category_cols and log_cols:
            category_idx, log_start = category_cols[0], log_cols[0]
        else:
            # No recognisable header: positional layout, first row is data.
            category_idx, log_start = 0, 1
            if header:
                if len(header) > 1:
                    rows.append([header[0], ",".join(header[1:]).strip()])
                else:
                    # A lone cell has no category; dropna() below discards it.
                    rows.append([None, ",".join(header).strip()])

        for row in reader:
            if not row:
                continue
            if log_start < len(row):
                if category_idx >= len(row):
                    # The row ends before its category column (only possible
                    # when category follows log). Indexing it used to raise
                    # IndexError; an unlabeled row is skipped instead.
                    continue
                category = row[category_idx]
                # Exclude the category cell itself so the label never leaks
                # into the text when the category column follows the log.
                log = ",".join(
                    cell
                    for i, cell in enumerate(row[log_start:], start=log_start)
                    if i != category_idx
                ).strip()
            else:
                # Row too short to reach the log column: positional fallback.
                category = row[0]
                log = ",".join(row[1:]).strip() if len(row) > 1 else ""
            rows.append([category, log])

    df = pd.DataFrame(rows, columns=["category", "log"]).dropna(subset=["category", "log"])
    df["category"] = df["category"].astype(str).str.strip()
    df["log"] = df["log"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
    df = df.drop_duplicates(subset=["category", "log"]).reset_index(drop=True)
    return df

# Parse the raw CSV into a cleaned DataFrame with "category" and "log" columns.
# NOTE(review): CSV_PATH is not assigned in any cell visible in this notebook —
# it must be defined before this cell runs, or the call raises NameError.
df = load_sieve_csv(str(CSV_PATH))
print(df.shape)
df.head()

## Verify and normalize CSV data

In [None]:
# Rename to the "label"/"text" column names used by the cells that follow.
df = df.rename(columns={"category":"label", "log":"text"})
# Per-label row counts, to check class balance.
print(df["label"].value_counts())
df.head()

## Configurable Training Data Subset

In [None]:
# Stratified subset, then stratified train / val / test split
from sklearn.model_selection import train_test_split

# Keep a stratified 10 % sample of the cleaned data; the other 90 % is discarded.
controlled_subset_df, _ = train_test_split(
    df,
    test_size=0.9,             # 90 % goes to the discarded part
    stratify=df["label"],      # preserve label distribution
    random_state=42,
)

print(f"Subset size: {len(controlled_subset_df)} ({len(controlled_subset_df)/len(df)*100:.1f}% of total)")
print(controlled_subset_df["label"].value_counts(normalize=True).round(3))

train_df, temp_df = train_test_split(
    controlled_subset_df,
    test_size=0.2,          # 80/20 split for train vs temp
    random_state=42,
    # Stratify on the labels of the frame actually being split. Passing the
    # labels of the full `df` here gave an array of a different length than
    # `controlled_subset_df`, which makes the split fail.
    stratify=controlled_subset_df["label"],
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,              # half of temp → 10 % val, 10 % test (of subset)
    random_state=42,
    stratify=temp_df["label"],
)

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")

## Check whether a TF-IDF + LinearSVC baseline reaches the minimum macro-F1 for training (> 0.7)

In [None]:
# Baseline
# A classical text classifier fitted on the training split and scored on the
# validation and test splits, as a reference point before fine-tuning.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score

# TF-IDF over unigrams and bigrams with case and accents left untouched,
# feeding a linear SVM with default settings.
baseline = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=False, strip_accents=None, ngram_range=(1,2), min_df=1)),
    ("clf", LinearSVC())
])
baseline.fit(train_df["text"], train_df["label"])
val_pred = baseline.predict(val_df["text"])
test_pred = baseline.predict(test_df["text"])
# NOTE(review): the scores are only printed; nothing in this cell compares
# them with the 0.7 threshold mentioned in the section heading.
print("Validation Report:\n", classification_report(val_df["label"], val_pred, digits=4))
print("Macro-F1 (Val):", f1_score(val_df["label"], val_pred, average="macro"))
print("Macro-F1 (Test):", f1_score(test_df["label"], test_pred, average="macro"))

## Export JSONL + labels.txt into DATA_DIR

In [None]:
import json
def write_jsonl(df_in, path, mode="instruction"):
    """Serialize each row of ``df_in`` as one JSON object per line at ``path``.

    Args:
        df_in: DataFrame providing ``text`` and ``label`` columns.
        path: Output file; created or overwritten, written as UTF-8.
        mode: ``"instruction"`` writes instruction/input/output records; any
            other value writes input/label records.
    """
    as_instruction = mode == "instruction"

    def to_record(row):
        # Shape one DataFrame row into the record layout selected by `mode`.
        if as_instruction:
            return {"instruction":"Classify the SIEM event type for this log line.", "input": row["text"], "output": row["label"]}
        return {"input": row["text"], "label": row["label"]}

    with open(path, "w", encoding="utf-8") as sink:
        # ensure_ascii=False keeps non-ASCII log text readable in the file.
        sink.writelines(
            json.dumps(to_record(row), ensure_ascii=False) + "\n"
            for _, row in df_in.iterrows()
        )

# Output locations, keyed "<split>_<format>" (ins = instruction records,
# cls = classification records). A plain assignment replaces the former
# parenthesised walrus expression; the resulting `files` dict is the same.
files = {
    "train_ins": DATA_DIR / "train.instruction.jsonl",
    "val_ins":   DATA_DIR / "val.instruction.jsonl",
    "test_ins":  DATA_DIR / "test.instruction.jsonl",
    "train_cls": DATA_DIR / "train.classification.jsonl",
    "val_cls":   DATA_DIR / "val.classification.jsonl",
    "test_cls":  DATA_DIR / "test.classification.jsonl",
}

# Write every split in both record formats.
for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
    write_jsonl(split_df, files[f"{split_name}_ins"], "instruction")
    write_jsonl(split_df, files[f"{split_name}_cls"], "classification")

# Label vocabulary: every distinct label of the full `df` (not only the
# sampled subset), sorted, one per line.
labels = sorted(df["label"].unique())
with open(DATA_DIR / "labels.txt", "w", encoding="utf-8") as f:
    f.writelines(lab + "\n" for lab in labels)

print("Exported to:", DATA_DIR.resolve())
print("Labels:", labels)

## QLoRA fine-tuning (saved to CKPT_DIR)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig

# Run configuration: base model, training hyperparameters, LoRA settings.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
# NOTE(review): MAX_SEQ_LEN is only applied to `tok.model_max_length` below;
# it is not passed to the SFTConfig used for training.
MAX_SEQ_LEN = 512
EPOCHS = 3
LR = 1e-4
GRAD_ACCUM = 64
PER_DEVICE_BATCH = 1
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
SEED = 42

# Files written by the export cell; val_jsonl and labels_txt are also read by
# the evaluation cell.
train_jsonl = str(DATA_DIR / "train.instruction.jsonl")
val_jsonl   = str(DATA_DIR / "val.instruction.jsonl")
test_jsonl  = str(DATA_DIR / "test.instruction.jsonl")
labels_txt  = str(DATA_DIR / "labels.txt")

raw_ds = load_dataset("json", data_files={"train": train_jsonl, "validation": val_jsonl, "test": test_jsonl})
# Flatten each record into a single "text" field holding instruction, input
# and answer; the original columns are removed by the map call below.
def to_text(ex): return {"text": f"Instruction: {ex['instruction']}\nInput: {ex['input']}\nAnswer: {ex['output']}"}
ds = raw_ds.map(to_text, remove_columns=raw_ds["train"].column_names)

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# Reuse the EOS token for padding when the tokenizer defines no pad token.
if tok.pad_token is None: tok.pad_token = tok.eos_token
tok.model_max_length = MAX_SEQ_LEN

# 4-bit NF4 quantization with double quantization.
# NOTE(review): the compute dtype here is bfloat16 while the trainer config
# below sets fp16=True / bf16=False — confirm the mismatch is intended.
bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype="bfloat16")
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=bnb, device_map="auto")
# Presumably disabled because the KV cache is not needed while training —
# TODO confirm.
model.config.use_cache = False

# LoRA adapters on the listed attention and MLP projection modules.
peft_cfg = LoraConfig(r=LORA_R, lora_alpha=LORA_ALPHA, target_modules=["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"], lora_dropout=LORA_DROPOUT, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, peft_cfg)
model.print_trainable_parameters()

# Earlier configuration (bf16, evaluation every 200 steps), kept for reference:
#cfg = SFTConfig(output_dir=str(CKPT_DIR), num_train_epochs=EPOCHS, per_device_train_batch_size=PER_DEVICE_BATCH, gradient_accumulation_steps=GRAD_ACCUM, learning_rate=LR, logging_steps=50, eval_strategy="steps", eval_steps=200, save_steps=200, bf16=True, seed=SEED, dataset_text_field="text")

# Memory-friendly config
# fp16 instead of bf16, paged 8-bit AdamW, gradient checkpointing, no
# evaluation during training, checkpoints every 800 steps (save_total_limit=2).
mf_cfg = SFTConfig(
    output_dir=str(CKPT_DIR), num_train_epochs=EPOCHS, per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM, learning_rate=LR,
    
    fp16=True, bf16=False, optim="paged_adamw_8bit", gradient_checkpointing=True, dataloader_num_workers=0, 
    dataloader_pin_memory=False,
    
    eval_strategy="no", save_steps=800, save_total_limit=2, logging_steps=50, seed=SEED, dataset_text_field="text"
)

# Earlier trainer call (with tokenizer and validation set), kept for reference:
# trainer = SFTTrainer(model=model, tokenizer=tok, train_dataset=ds["train"], eval_dataset=ds["validation"], args=cfg)
# NOTE(review): `tok` is not passed to this trainer, so the pad-token and
# max-length settings made on `tok` above do not reach it — confirm intended.
trainer = SFTTrainer(model=model, train_dataset=ds["train"], eval_dataset=None, args=mf_cfg)

# To resume from a Drive checkpoint:
# trainer.train(resume_from_checkpoint=str(CKPT_DIR / "checkpoint-XXXX"))
trainer.train()
trainer.save_model(str(CKPT_DIR))
print("Saved QLoRA adapter to:", CKPT_DIR)

## Validation (macro-F1) with constrained mapping

In [None]:
import json
import re
from transformers import pipeline
from sklearn.metrics import classification_report, f1_score

# Read the label vocabulary with a context manager so the file handle is
# closed. Blank entries are dropped: an empty label is a substring of every
# generated answer and would always match first.
with open(labels_txt, "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]

# Greedy decoding of a short answer. `temperature` is a sampling-only option,
# so it is not passed together with do_sample=False.
gen = pipeline("text-generation", model=str(CKPT_DIR), tokenizer=tok, max_new_tokens=8, do_sample=False)

def classify_one(s: str) -> str:
    """Classify one log line with the generator and map the answer to a label.

    Builds an "Instruction / Input / Answer:" prompt, calls the module-level
    ``gen`` pipeline and maps the generated answer onto one of the
    module-level ``labels``:

    1. the longest label that occurs as a substring of the answer;
    2. otherwise the label closest (``difflib``) to the answer's first token;
    3. otherwise ``labels[0]``.

    Args:
        s: Raw log text to classify.

    Returns:
        One entry of ``labels``, in its original casing.
    """
    import difflib

    prompt = f"Instruction: Classify the SIEM event type for this log line.\nInput: {s}\nAnswer:"
    out = gen(prompt)[0]["generated_text"]
    # Keep only the completion. Cutting off the echoed prompt by length stays
    # correct when the log text itself contains "Answer:"; splitting on the
    # first "Answer:" is only the fallback when the prompt is not echoed.
    if out.startswith(prompt):
        tail = out[len(prompt):]
    else:
        tail = out.split("Answer:", 1)[-1]
    tail = tail.strip().lower()

    # Longest labels first, so a short label cannot shadow a longer label that
    # contains it. Empty labels are skipped: "" is a substring of everything.
    for lab in sorted((l for l in labels if l), key=len, reverse=True):
        if lab.lower() in tail:
            return lab

    tokens = re.sub(r"[^a-z0-9\-]+", " ", tail).split()
    if tokens:
        lowered = [l.lower() for l in labels]
        # cutoff=0.0 accepts any similarity, so the closest label is returned
        # whenever there is at least one token and one label.
        guess = difflib.get_close_matches(tokens[0], lowered, n=1, cutoff=0.0)
        if guess:
            return labels[lowered.index(guess[0])]
    return labels[0]

# Read the validation records line by line inside a context manager. Iterating
# the file splits on real line endings only, whereas str.splitlines() also
# breaks on characters such as U+2028 that may appear unescaped inside a JSON
# string. Blank lines are skipped instead of crashing json.loads.
with open(val_jsonl, "r", encoding="utf-8") as f:
    val_rows = [json.loads(line) for line in f if line.strip()]

y_true = [r["output"] for r in val_rows]
y_pred = [classify_one(r["input"]) for r in val_rows]
print(classification_report(y_true, y_pred, digits=4))
print("Macro-F1:", f1_score(y_true, y_pred, average="macro"))

## Kaggle: ZIP checkpoints for download; Colab: persists to Drive

In [None]:
import os
import shutil

# The archive sits beside the checkpoint directory: "<CKPT_DIR>.zip".
zip_path = f"{CKPT_DIR}.zip"

# Remove an archive left by a previous run before rebuilding it.
if os.path.exists(zip_path):
    os.remove(zip_path)

# Zip the contents of CKPT_DIR; make_archive appends ".zip" to base_name.
shutil.make_archive(base_name=str(CKPT_DIR), format="zip", root_dir=str(CKPT_DIR))
print("Created:", zip_path)