## 1) Set up environment

!pip -q install pandas 

## 2) Merge extra columns into column 2

In [11]:
import csv
import pandas as pd

# Parse the raw file with the csv module rather than pandas: some rows split
# into more than two fields, and for those every field after the first is
# joined back together with "," to form the log text.
rows = []
# newline="" is what the csv module documentation asks for, so the reader
# (not the file object) decides how line breaks are interpreted.
# NOTE(review): errors="ignore" silently drops undecodable bytes — confirm
# that this loss is acceptable for the dataset.
with open("../data/test/sieve.csv", "r", encoding="utf-8", errors="ignore", newline="") as f:
    reader = csv.reader(f)
    # Consume the header row; the default avoids StopIteration on an empty file.
    header = next(reader, None)
    for row in reader:
        if len(row) >= 2:
            # first cell = category, everything else (re-joined) = log;
            # strip in both the 2-field and the >2-field case for consistency
            rows.append([row[0], ",".join(row[1:]).strip()])
        else:
            # empty or malformed line; keep placeholder for inspection
            rows.append([None, ",".join(row)])

df = pd.DataFrame(rows, columns=["category", "log"])
print(df.shape)
df.head()


(100000, 2)


Unnamed: 0,category,log
0,authentication-failed,[Thu Dec 17 02:47:06 1992] [error] [client 42....
1,authentication-failed,[Sun Mar 05 13:11:21 2017] [error] [client 15....
2,authentication-failed,[Tue Oct 06 14:38:18 1987] [error] [client 178...
3,authentication-failed,[Fri Mar 23 00:47:56 1979] [error] [client 91....
4,authentication-failed,[Fri Jun 03 16:48:40 1994] [error] [client 174...


## 3) Load and inspect SIEVE dataset

In [9]:
import pandas as pd

# Diagnostic load only: with on_bad_lines='skip' pandas drops every line it
# cannot parse into the expected number of fields. The result is kept under
# its own name so that running the notebook top to bottom does not overwrite
# the merged `df` built in section 2 with this smaller frame.
csv_path = "../data/test/sieve.csv"
df_strict = pd.read_csv(csv_path, on_bad_lines='skip')

print(df_strict.shape)
# How many lines the strict parse lost compared with the merged frame.
print("Rows skipped by strict parse:", len(df) - len(df_strict))
df_strict.head()

(81251, 2)


Unnamed: 0,category,log
0,authentication-failed,[Thu Dec 17 02:47:06 1992] [error] [client 42....
1,authentication-failed,[Sun Mar 05 13:11:21 2017] [error] [client 15....
2,authentication-failed,[Tue Oct 06 14:38:18 1987] [error] [client 178...
3,authentication-failed,[Fri Mar 23 00:47:56 1979] [error] [client 91....
4,authentication-failed,[Fri Jun 03 16:48:40 1994] [error] [client 174...


## 4) Pre-processing of Dataset

In [13]:
# Clean the frame in one chained expression so the cell can be re-run safely.
# Order matters: whitespace is stripped BEFORE empties and duplicates are
# removed, otherwise rows that differ only by padding survive de-duplication
# and whitespace-only values are never dropped.
df = (
    # Relabel columns
    df.rename(columns={"category": "label", "log": "text"})
    # Eliminate NaN rows first so astype(str) never produces "nan"/"None" text
    .dropna(subset=["label", "text"])
    # Remove surrounding whitespace
    .assign(
        label=lambda d: d["label"].astype(str).str.strip(),
        text=lambda d: d["text"].astype(str).str.strip(),
    )
    # Eliminate rows that are empty once stripped
    .loc[lambda d: d["label"].ne("") & d["text"].ne("")]
    .drop_duplicates(subset=["label", "text"])
    .reset_index(drop=True)
)

print(df["label"].value_counts())
df.head()

label
authentication-failed           3334
authentication-success          3334
connection-failed               3334
connection-opened               3334
database-operation              3334
directory-changed               3334
directory-deleted               3334
directory-created               3334
file-action-failure             3334
file-deleted                    3333
file-read                       3333
file-modification               3333
file-write                      3333
hardware-monitoring             3333
process-info                    3333
http-request-failure            3333
http-request-success            3333
ids-alert                       3333
network-filtered                3333
network-traffic                 3333
process-ended                   3333
process-error                   3333
user-creation                   3333
process-shutdown                3333
process-started                 3333
system-configuration-changed    3333
user-logout                     

Unnamed: 0,label,text
0,authentication-failed,[Thu Dec 17 02:47:06 1992] [error] [client 42....
1,authentication-failed,[Sun Mar 05 13:11:21 2017] [error] [client 15....
2,authentication-failed,[Tue Oct 06 14:38:18 1987] [error] [client 178...
3,authentication-failed,[Fri Mar 23 00:47:56 1979] [error] [client 91....
4,authentication-failed,[Fri Jun 03 16:48:40 1994] [error] [client 174...


## 5) Stratified Split

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split giving 80% train / 10% validation / 10% test.
# Stage 1: hold out 20% of the rows, stratified on the label column.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
# Stage 2: halve the held-out rows into validation and test, again stratified,
# so each of the three parts is stratified on the label.
val_df, test_df = train_test_split(
    test_df,
    test_size=0.5,
    random_state=42,
    stratify=test_df["label"],
)

# Report the (rows, columns) shape of each part.
print("Train set size:", train_df.shape)
print("Validation set size:", val_df.shape)
print("Test set size:", test_df.shape)

# One-line summary of the row counts.
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

SyntaxError: invalid syntax (973304174.py, line 1)