# Colab Training for SOC-LLM (QLoRA)
Train in short sessions, push LoRA adapters to Hugging Face Hub, and resume later.
**Remember:** Use the same `config/axo.yaml` everywhere.

## 1) Setup environment

In [None]:
# Quietly (-q) install or upgrade (-U) the third-party packages used by the
# training and evaluation cells. No versions are pinned, so the installed
# releases can differ from one session to the next.
!pip -q install -U datasets transformers accelerate peft trl bitsandbytes scikit-learn

## 1.1) Check environment

In [None]:
import torch, platform

# Report the library, accelerator, and interpreter versions for this session.
gpu_present = torch.cuda.is_available()

print("Torch:", torch.__version__)
print("CUDA available:", gpu_present)
print("Device count:", torch.cuda.device_count())
if gpu_present:
    # Only the first visible GPU (index 0) is named.
    print("CUDA device:", torch.cuda.get_device_name(0))
print("Python:", platform.python_version())

## 2) Authenticate to Hugging Face Hub

In [None]:
from getpass import getpass
import os
# Ask for the token without echoing it, then expose the token and the target
# repo id through environment variables for later cells and shell commands.
HF_TOKEN = getpass("Enter your Hugging Face write token: ")
HF_REPO = "com-otu/soc-llm-lora"  # change me

os.environ.update({"HF_TOKEN": HF_TOKEN, "HF_REPO": HF_REPO})
print("HF env set.")


## 3) Paths

In [None]:
from pathlib import Path
# Canonical locations of the prepared dataset files; every later cell should
# derive its paths from these constants.
DATA_DIR = Path("data/training/classification/sieve_prepped")  # adjust if needed

TRAIN_JSONL = DATA_DIR / "train.instruction.jsonl"
VAL_JSONL   = DATA_DIR / "val.instruction.jsonl"
TEST_JSONL  = DATA_DIR / "test.instruction.jsonl"
LABELS_TXT  = DATA_DIR / "labels.txt"

# Fail fast, and by name, for every file that later cells read: the test split
# and the label list are needed too, not just the train/validation splits.
missing_files = [
    str(path)
    for path in (TRAIN_JSONL, VAL_JSONL, TEST_JSONL, LABELS_TXT)
    if not path.exists()
]
if missing_files:
    raise FileNotFoundError(f"Missing required data files: {missing_files}")

# One label per line. read_text() closes the file for us, and blank lines are
# dropped so an empty string can never become a "label".
labels = [
    line.strip()
    for line in LABELS_TXT.read_text(encoding="utf-8").splitlines()
    if line.strip()
]
if not labels:
    raise ValueError(f"No labels found in {LABELS_TXT}")
labels

## 4) Train (QLoRA)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig

# Base model and training hyperparameters used by the cells below.
MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"  # change if needed
MAX_SEQ_LEN = 1024
EPOCHS = 3
LR = 1e-4
GRAD_ACCUM = 32
PER_DEVICE_BATCH = 2
LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
SEED = 42

# String forms of the dataset paths, derived from the canonical Path constants
# defined in the "Paths" cell so both spellings always name the same files
# (they previously pointed at a separate relative "sieve_prepped/" directory).
train_jsonl = str(TRAIN_JSONL)
val_jsonl   = str(VAL_JSONL)
test_jsonl  = str(TEST_JSONL)
labels_txt  = str(LABELS_TXT)

# Load the three JSONL splits into one DatasetDict.
# load_dataset documents `data_files` as str paths (or lists/dicts of them),
# so the pathlib.Path constants are converted explicitly.
raw_ds = load_dataset(
    "json",
    data_files={
        "train": str(TRAIN_JSONL),
        "validation": str(VAL_JSONL),
        "test": str(TEST_JSONL),
    },
)

def to_text(example):
    """Render one instruction record as a single training string.

    Args:
        example: Mapping with the keys 'instruction', 'input' and 'output'.

    Returns:
        A dict with one key, 'text', holding the three fields on separate
        lines, prefixed "Instruction: ", "Input: " and "Answer: ".
    """
    prompt_lines = (
        f"Instruction: {example['instruction']}",
        f"Input: {example['input']}",
        f"Answer: {example['output']}",
    )
    return {"text": "\n".join(prompt_lines)}

# Replace every original column with the single rendered "text" column.
source_columns = raw_ds["train"].column_names
ds = raw_ds.map(to_text, remove_columns=source_columns)

# Tokenizer for the base model. When it defines no padding token, the
# end-of-sequence token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token

# Local import so this cell does not rely on the environment-check cell
# having been run in the same session.
import torch

# bfloat16 is only native on GPUs with compute capability >= 8.0 (Ampere and
# newer). On older Colab GPUs such as the T4 (7.5), fall back to float16 for
# both the 4-bit compute dtype and the trainer's mixed precision. When no GPU
# is visible the original bfloat16 settings are kept unchanged.
_pre_ampere_gpu = (
    torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] < 8
)
USE_BF16 = not _pre_ampere_gpu
COMPUTE_DTYPE = "bfloat16" if USE_BF16 else "float16"

# 4-bit NF4 quantization with double quantization for the frozen base weights.
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=COMPUTE_DTYPE,
)

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, quantization_config=bnb_cfg, device_map="auto")
# The generation KV cache is switched off on the model config for training.
model.config.use_cache = False

# Add LoRA adapters (via PEFT) on the attention and MLP projection layers.
# NOTE(review): the PEFT quantization guide calls
# peft.prepare_model_for_kbit_training(model) before get_peft_model - confirm
# whether that step is wanted here.
peft_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_cfg)
model.print_trainable_parameters()

# Trainer config: evaluate and checkpoint every 200 steps, log every 50.
# NOTE(review): package versions are unpinned; newer TRL releases may expect
# `max_length` instead of `max_seq_length` - confirm against the installed TRL.
train_cfg = SFTConfig(
    output_dir="sieve-llm-qlora",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    max_seq_length=MAX_SEQ_LEN,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    bf16=USE_BF16,
    fp16=not USE_BF16,  # exactly one mixed-precision mode is enabled
    seed=SEED,
    dataset_text_field="text",
)

# Fine-tune the LoRA-wrapped model on the train split, evaluating on the
# validation split according to train_cfg.
# NOTE(review): package versions are unpinned; newer TRL releases may expect
# `processing_class=` instead of `tokenizer=` - confirm against the installed TRL.
train_split = ds["train"]
validation_split = ds["validation"]

trainer = SFTTrainer(
    args=train_cfg,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=validation_split,
)
trainer.train()

# Write the trained model to the same directory used as the trainer output_dir.
trainer.save_model("sieve-llm-qlora")
print("Saved QLoRA adapter to sieve-llm-qlora/")


## 5) Validation: Constrained Mapping to Labels + Macro-F1

In [None]:
import re
from transformers import pipeline
from sklearn.metrics import classification_report, f1_score

# Evaluate the model that the training cell just fine-tuned. The in-memory
# `model` object is passed directly, because no cell in this notebook writes
# a "sieve-llm-2cat" directory (training saves to "sieve-llm-qlora").
model.eval()  # disable dropout (including LoRA dropout) during generation

gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=8,
    do_sample=False,   # greedy decoding => deterministic; temperature is unused
)

import difflib


def map_answer_to_label(answer: str, label_set: list) -> str:
    """Map free-form model output onto exactly one of the known labels.

    Resolution order:
      1. The longest label that occurs (case-insensitively) inside `answer`.
      2. The closest fuzzy match for the first word of `answer`.
      3. The first label, when `answer` contains no usable word.

    Args:
        answer: Text produced by the model after the "Answer:" marker.
        label_set: Candidate labels in their original casing.

    Returns:
        One element of `label_set`.

    Raises:
        ValueError: If `label_set` is empty.
    """
    if not label_set:
        raise ValueError("label_set must contain at least one label")

    tail = answer.strip().lower()
    lowered = [lab.lower() for lab in label_set]

    # Try longer labels first so that a label which is a substring of another
    # (e.g. "auth" vs "auth-failure") cannot shadow the more specific one.
    # sorted() is stable, so equally long labels keep their original order.
    by_length = sorted(range(len(lowered)), key=lambda i: -len(lowered[i]))
    for idx in by_length:
        if lowered[idx] and lowered[idx] in tail:
            return label_set[idx]

    # Fuzzy match on the first word. Underscores and hyphens are kept so
    # labels written like "login_failed" or "auth-failure" stay one token.
    words = re.sub(r"[^a-z0-9_\-]+", " ", tail).split()
    if words:
        # cutoff=0.0 forces a match: the output is always constrained to a label.
        guess = difflib.get_close_matches(words[0], lowered, n=1, cutoff=0.0)
        if guess:
            # map back to original label casing
            return label_set[lowered.index(guess[0])]

    # final fallback: first label
    return label_set[0]


def classify_one(logline: str, *, generator=None, label_set=None) -> str:
    """Classify one log line with the text-generation pipeline.

    Args:
        logline: Raw log line inserted into the prompt's "Input:" field.
        generator: Callable taking a prompt and returning a list whose first
            item is a dict with a "generated_text" key. Defaults to the
            notebook-level `gen` pipeline.
        label_set: Candidate labels. Defaults to the notebook-level `labels`.

    Returns:
        The label chosen by `map_answer_to_label` for the model's answer.
    """
    generator = gen if generator is None else generator
    label_set = labels if label_set is None else label_set

    prompt = f"Instruction: Classify the SIEM event type for this log line.\nInput: {logline}\nAnswer:"
    out = generator(prompt)[0]["generated_text"]

    # Strip the echoed prompt by prefix rather than by searching for
    # "Answer:", which may also appear inside the log line itself.
    if out.startswith(prompt):
        completion = out[len(prompt):]
    else:
        completion = out.split("Answer:", 1)[-1]

    return map_answer_to_label(completion, label_set)

# Build X/y from the *original* validation jsonl (avoid leakage via 'text')
import json

with open(VAL_JSONL, "r", encoding="utf-8") as val_file:
    # One JSON object per line; blank lines (e.g. a trailing empty line) are
    # skipped because json.loads would raise on them.
    val_rows = [json.loads(line) for line in val_file if line.strip()]

y_true = [r["output"] for r in val_rows]
y_pred = [classify_one(r["input"]) for r in val_rows]

print(classification_report(y_true, y_pred, digits=4))
print("Macro-F1:", f1_score(y_true, y_pred, average="macro"))

#!python scripts/eval_soc_json.py --gold data/val.jsonl


## 6) Push LoRA checkpoint to HF Hub

In [None]:
from huggingface_hub import create_repo, upload_folder, HfApi

# Opt-in switches: nothing is uploaded unless a flag is set to True.
PUSH_ADAPTER = False         # set True to push LoRA adapter
PUSH_DATASET = False         # set True to push JSONL as a dataset
HF_USERNAME = "com-otu"

if PUSH_ADAPTER:
    # Private model repo that receives the contents of the training output folder.
    adapter_repo = f"{HF_USERNAME}/sieve-llm-qlora-adapter"
    create_repo(adapter_repo, private=True, exist_ok=True)
    upload_folder(
        repo_id=adapter_repo,
        folder_path="sieve-llm-qlora",
        path_in_repo=".",
    )
    print("Adapter pushed:", f"https://huggingface.co/{adapter_repo}")

if PUSH_DATASET:
    # Private dataset repo that receives everything under DATA_DIR.
    ds_repo = f"{HF_USERNAME}/sieve-2cat-jsonl"
    api = HfApi()
    api.create_repo(ds_repo, repo_type="dataset", private=True, exist_ok=True)
    upload_folder(
        repo_id=ds_repo,
        repo_type="dataset",
        folder_path=str(DATA_DIR),
        path_in_repo=".",
    )
    print("Dataset pushed:", f"https://huggingface.co/datasets/{ds_repo}")
#!axolotl push-lora config/axo.yaml --repo $HF_REPO --token $HF_TOKEN


## 7) Resume training from last checkpoint (optional)

In [None]:
# Resume:
# Shells out to the axolotl CLI with config/axo.yaml, passing the value of
# $HF_REPO (set in the authentication cell) as the checkpoint to resume from.
# NOTE(review): axolotl is not among the packages installed by the setup cell,
# and $HF_REPO holds a Hub repo id rather than a local checkpoint directory -
# confirm that axolotl is available and accepts a repo id here.
!axolotl train config/axo.yaml --resume_from_checkpoint $HF_REPO
