## 1) Setup environment

!pip -q install pandas 

## 2) Merge extra columns into column 2

In [1]:
import csv
import pandas as pd

rows = []
with open("../data/test/sieve.csv", "r", encoding="utf-8", errors="ignore") as f:
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        if len(row) > 2:
            # first cell = category, join everything else as log
            category = row[0]
            log = ",".join(row[1:]).strip()
            rows.append([category, log])
        elif len(row) == 2:
            rows.append(row)
        else:
            # empty or malformed line; keep placeholder for inspection
            rows.append([None, ",".join(row)])

df = pd.DataFrame(rows, columns=["category", "log"])
print(df.shape)
df.head()


(100000, 2)


Unnamed: 0,category,log
0,authentication-failed,[Thu Dec 17 02:47:06 1992] [error] [client 42....
1,authentication-failed,[Sun Mar 05 13:11:21 2017] [error] [client 15....
2,authentication-failed,[Tue Oct 06 14:38:18 1987] [error] [client 178...
3,authentication-failed,[Fri Mar 23 00:47:56 1979] [error] [client 91....
4,authentication-failed,[Fri Jun 03 16:48:40 1994] [error] [client 174...


## 3)Load and Inspect SIEVE dataset

In [2]:
import pandas as pd

csv_path = "../data/test/sieve.csv"
df = pd.read_csv(csv_path, on_bad_lines='skip')

print(df.shape)
df.head()

(81251, 2)


Unnamed: 0,category,log
0,authentication-failed,[Thu Dec 17 02:47:06 1992] [error] [client 42....
1,authentication-failed,[Sun Mar 05 13:11:21 2017] [error] [client 15....
2,authentication-failed,[Tue Oct 06 14:38:18 1987] [error] [client 178...
3,authentication-failed,[Fri Mar 23 00:47:56 1979] [error] [client 91....
4,authentication-failed,[Fri Jun 03 16:48:40 1994] [error] [client 174...


## 4) Pre-processing of Dataset

In [3]:
# Relabel columns
df = df.rename(columns={"category": "label", "log": "text"})

#Eliminate NaN rows or Empty rows
df = df.dropna(subset=["label", "text"])
df = df.drop_duplicates(subset=["label", "text"]).reset_index(drop=True)

#Remove whitespaces
df["label"] = df["label"].astype(str).str.strip()
df["text"] = df["text"].astype(str).str.strip()

print(df["label"].value_counts())
df.head()                               

label
directory-changed               3334
file-action-failure             3334
directory-deleted               3334
directory-created               3334
file-deleted                    3333
file-read                       3333
file-modification               3333
network-traffic                 3333
user-session-open               3333
process-shutdown                3333
process-error                   3333
http-request-failure            3333
system-configuration-changed    3182
http-request-success            3163
ids-alert                       3154
process-started                 3110
connection-opened               2917
process-info                    2857
connection-failed               2821
authentication-success          2793
process-ended                   2777
authentication-failed           2757
connection-closed               2420
file-write                      2222
user-logout                     2106
user-creation                   1665
hardware-monitoring             

Unnamed: 0,label,text
0,authentication-failed,[Thu Dec 17 02:47:06 1992] [error] [client 42....
1,authentication-failed,[Sun Mar 05 13:11:21 2017] [error] [client 15....
2,authentication-failed,[Tue Oct 06 14:38:18 1987] [error] [client 178...
3,authentication-failed,[Fri Mar 23 00:47:56 1979] [error] [client 91....
4,authentication-failed,[Fri Jun 03 16:48:40 1994] [error] [client 174...


## 5) Stratified Split

In [6]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df["label"])
print("Train set size:", train_df.shape)
print("Validation set size:", val_df.shape)
print("Test set size:", test_df.shape)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train set size: (64980, 2)
Validation set size: (8123, 2)
Test set size: (8123, 2)
Train: 64980, Val: 8123, Test: 8123


## 6) Check if data is adequate for training

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(lowercase=False, strip_accents=None, ngram_range=(1,2), min_df=1)),
    ("clf", LinearSVC())
])

pipeline.fit(train_df["text"], train_df["label"])
val_pred = pipeline.predict(val_df["text"])
test_pred = pipeline.predict(test_df["text"])

print("Validation Report:")
print(classification_report(val_df["label"], val_pred))
print(f"Macro-F1 (Val): {f1_score(val_df['label'], val_pred, average='macro'):.4f}")
print(f"Macro-F1 (Test): {f1_score(test_df['label'], test_pred, average='macro'):.4f}")


Validation Report:
                              precision    recall  f1-score   support

       authentication-failed       1.00      0.93      0.96       276
      authentication-success       0.94      1.00      0.97       279
           connection-closed       1.00      1.00      1.00       242
           connection-failed       0.95      1.00      0.97       282
           connection-opened       1.00      0.95      0.97       292
           directory-changed       0.95      0.76      0.84       333
           directory-created       1.00      1.00      1.00       334
           directory-deleted       0.84      0.90      0.87       334
         file-action-failure       1.00      1.00      1.00       334
                file-deleted       0.89      0.83      0.86       333
           file-modification       0.80      0.96      0.87       333
                   file-read       1.00      1.00      1.00       334
                  file-write       1.00      1.00      1.00       222


## 7) Generate JsonL files for finetuning

In [8]:
import json
from pathlib import Path

out_dir = Path("../data/training/classification/sieve_prepped")
out_dir.mkdir(exist_ok=True)

def write_jsonl(df, path, mode="instruction"):
    with open(path, "w", encoding="utf-8") as f:
        for _, r in df.iterrows():
            if mode == "instruction":
                obj = {
                    "instruction": "Classify the SIEM event type for this log line.",
                    "input": r["text"],
                    "output": r["label"]
                }
            else:
                obj = {"input": r["text"], "label": r["label"]}
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")

# Save both formats
write_jsonl(train_df, out_dir / "train.instruction.jsonl", "instruction")
write_jsonl(val_df,   out_dir / "val.instruction.jsonl", "instruction")
write_jsonl(test_df,  out_dir / "test.instruction.jsonl", "instruction")

write_jsonl(train_df, out_dir / "train.classification.jsonl", "classification")
write_jsonl(val_df,   out_dir / "val.classification.jsonl", "classification")
write_jsonl(test_df,  out_dir / "test.classification.jsonl", "classification")

print("Files saved to:", out_dir)


Files saved to: ..\data\training\classification\sieve_prepped


## 8) Verify formating of JsonL

In [9]:
import json
pd.read_json(out_dir / "val.instruction.jsonl", lines=True).head()

Unnamed: 0,instruction,input,output
0,Classify the SIEM event type for this log line.,"date=1995-05-20 time=23:09:45 logid=""075851808...",network-traffic
1,Classify the SIEM event type for this log line.,2008-05-24T07:03:19 84.177.136.115 - informati...,http-request-success
2,Classify the SIEM event type for this log line.,Jan 29 17:11:29 trade su[51084]: + ??? dlindse...,authentication-success
3,Classify the SIEM event type for this log line.,Jul 16 20:14:35 bastion snort: [1:485:4] ICMP ...,ids-alert
4,Classify the SIEM event type for this log line.,"85.36.119.73 ""OPTIONS /half/ago/million/radio/...",http-request-success


## 9) Create Labels File for consistency

In [10]:
labels = sorted(df["label"].unique())
with open(out_dir / "labels.txt", "w", encoding="utf-8") as f:
    for label in labels:
        f.write(label + "\n")

print("Labels saved to:", out_dir / "labels.txt")

Labels saved to: ..\data\training\classification\sieve_prepped\labels.txt
