In [None]:
!pip install transformers datasets -q

In [None]:
# Packages whose installed versions are pinned into the requirements file.
deps = [
    "pandas",
    "numpy",
    "torch",
    "scikit-learn",
    "transformers",
    "datasets"
]

# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API.
from importlib import metadata

# NOTE(review): this path is on Google Drive, but Drive is only mounted in a
# later cell; on a fresh runtime the mount cell has to run before this one.
with open("/content/drive/MyDrive/Datathon 2025 (HidupJ0kow1)/requirements_1.txt", "w") as f:
    for pkg in deps:
        try:
            v = metadata.version(pkg)
        except metadata.PackageNotFoundError:
            # Still best effort: a package that is not installed is left out of
            # the file, but report it instead of dropping it silently.
            print(f"Skipping {pkg}: not installed")
            continue
        f.write(f"{pkg}=={v}\n")

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset
import torch.serialization
from torch.serialization import add_safe_globals

In [None]:
# Presumably a smoke test: build a TrainingArguments object with only an output
# directory and echo that directory back.
test_args = TrainingArguments(output_dir="./test")
print(test_args.output_dir)

In [None]:
# Print the installed transformers version for the record.
import transformers
print(transformers.__version__)

In [None]:
# Mount Google Drive at /content/drive; every data/model path below lives under it.
# NOTE(review): the requirements-export cell near the top already writes to
# /content/drive, so on a fresh runtime this mount needs to run before it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training table from Drive; later cells split it on `attack_cat_enc`.
data = pd.read_csv('/content/drive/MyDrive/datathon_test/train_df.csv')

In [None]:
# Columns the rest of the notebook relies on. 'sport', 'dbytes' and 'jit_ratio'
# are read by explain_normal / explain_attack / generate_xai_text further down,
# so they are checked here too; otherwise this check can pass and those
# functions still fail with KeyError. 'sjit' and 'sload' are kept from the
# original list.
required_columns = [
    'attack_cat_enc', 'dur', 'sport', 'dsport', 'sbytes', 'dbytes', 'spkts',
    'pkt_ratio', 'jit_ratio', 'sjit', 'sload'
]

missing_columns = [col for col in required_columns if col not in data.columns]

# Report only (no exception), as before.
if missing_columns:
    print("Kolom berikut TIDAK ditemukan dalam dataset:", missing_columns)
else:
    print("Semua kolom yang dibutuhkan ADA dalam dataset.")

In [None]:
# Rows with attack_cat_enc == 5 go to the "normal" frame, all others to the
# "attack" frame. Copies are taken so later column assignments do not touch `data`.
is_normal = data['attack_cat_enc'] == 5
df_normal = data[is_normal].copy()
df_attack = data[~is_normal].copy()

In [None]:
def explain_normal(row):
    """Build the Indonesian template sentence for one normal-traffic row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'dur', 'sbytes', 'dbytes' and 'sport'.

    Returns:
        A sentence stating the duration, outgoing/incoming byte counts and
        source port, each formatted with its default string representation.
    """
    duration = row['dur']
    bytes_out = row['sbytes']
    bytes_in = row['dbytes']
    source_port = row['sport']
    return f"Trafik normal terdeteksi dengan durasi {duration} detik, data keluar masuk sebesar {bytes_out}B/{bytes_in}B, dan port sumber {source_port}."

In [None]:
# Encoded attack category -> name used in the generated sentence.
# Code 5 is absent: rows with that code are routed to the normal-traffic frame.
attack_mapping = {
    0: 'analysis',
    1: 'backdoors',
    2: 'dos exploits',
    3: 'fuzzers',
    4: 'generic',
    6: 'reconnaissance',
    7: 'shellcode',
    8: 'worms',
}


def explain_attack(row):
    """Build the Indonesian template sentence for one attack row.

    Args:
        row: Mapping-like record providing 'attack_cat_enc', 'dur', 'sport',
            'dsport', 'sbytes', 'spkts' and 'pkt_ratio'.

    Returns:
        A sentence naming the attack type ('unknown' when the code is not in
        attack_mapping) followed by the listed fields; 'pkt_ratio' is shown
        with two decimals.
    """
    code = row['attack_cat_enc']
    attack_type = attack_mapping[code] if code in attack_mapping else 'unknown'

    duration = row['dur']
    source_port = row['sport']
    dest_port = row['dsport']
    size_bytes = row['sbytes']
    packet_count = row['spkts']
    packet_ratio = row['pkt_ratio']
    return f"Serangan {attack_type} terdeteksi dengan durasi {duration} detik, dari port sumber {source_port} ke port tujuan {dest_port}, ukuran data {size_bytes}B, jumlah paket {packet_count}, dan rasio paket {packet_ratio:.2f}."

In [None]:
def generate_xai_text(row):
    """Pick a rule-based Indonesian explanation sentence for one traffic row.

    Rows whose 'attack_cat_enc' is 5 are described as normal traffic using
    'dur' and 'sbytes'; all other rows are described as attacks using 'dur',
    'sbytes', 'dbytes', 'pkt_ratio' and 'jit_ratio'. Rules are checked in order
    and the first match wins. Fields are read only when a rule needs them.

    Args:
        row: Mapping-like record (e.g. a DataFrame row).

    Returns:
        The explanation sentence for the first matching rule, or the branch's
        fallback sentence when no rule matches.
    """
    if row['attack_cat_enc'] != 5:
        # Attack rules.
        if row['dur'] > 5 and row['sbytes'] > 10000:
            return "Durasi panjang dan jumlah byte besar mengindikasikan potensi serangan."
        if row['dbytes'] > 10000 and row['pkt_ratio'] > 2:
            return "Perbandingan paket yang tidak seimbang dan byte besar mengindikasikan serangan."
        if row['jit_ratio'] > 5:
            return "Variasi jeda paket tinggi terdeteksi, indikasi lalu lintas serangan."
        return "Trafik mencurigakan dengan indikasi serangan ringan."

    # Normal-traffic rules.
    if row['dur'] < 0.05 and row['sbytes'] < 1000:
        return "Trafik normal berdurasi pendek dengan jumlah byte kecil."
    if row['dur'] > 5 and row['sbytes'] > 10000:
        return "Trafik normal berdurasi panjang dengan jumlah byte cukup besar."
    return "Trafik normal dengan karakteristik umum."

In [None]:
# For each frame, add the template sentence (class-specific builder) and then
# the rule-based sentence (shared builder), one row at a time.
for frame, template_builder in ((df_normal, explain_normal), (df_attack, explain_attack)):
    frame['xai_template'] = frame.apply(template_builder, axis=1)
    frame['xai_logic'] = frame.apply(generate_xai_text, axis=1)

In [None]:
# Preview the first attack rows, including the two generated text columns.
df_attack.head()

In [None]:
# Training target = template sentence + "Penjelasan tambahan: " + rule-based sentence.
# The template sentences already end with a period, so the connector must not
# start with another one (the previous ". Penjelasan ..." produced "..").
df_normal['target_text'] = df_normal['xai_template'] + " Penjelasan tambahan: " + df_normal['xai_logic']

df_attack['target_text'] = df_attack['xai_template'] + " Penjelasan tambahan: " + df_attack['xai_logic']

In [None]:
# Persist both annotated frames to Drive (without the index column).
df_normal.to_csv('/content/drive/MyDrive/datathon_test/df_normal.csv', index=False)
df_attack.to_csv('/content/drive/MyDrive/datathon_test/df_attack.csv', index=False)

In [None]:
# Reload the annotated frames from the CSVs written above; presumably a restart
# point so the text-generation cells do not have to be rerun.
df_normal = pd.read_csv('/content/drive/MyDrive/datathon_test/df_normal.csv')
df_attack = pd.read_csv('/content/drive/MyDrive/datathon_test/df_attack.csv')

In [None]:
# Columns that hold the label or generated text and therefore are not features.
non_feature_columns = ['target_text', 'attack_cat_enc', 'xai_template', 'xai_logic']

# Features are every remaining column; the target is the combined explanation text.
X_normal, y_normal = df_normal.drop(columns=non_feature_columns), df_normal['target_text']
X_attack, y_attack = df_attack.drop(columns=non_feature_columns), df_attack['target_text']

In [None]:
# Standardize the two feature tables independently: each class gets a scaler
# fitted on its own rows.
# NOTE(review): neither scaler is saved in the cells shown here, so new inputs
# cannot be scaled the same way at inference time unless the scalers are refit
# or persisted elsewhere.
scaler_normal = StandardScaler()
X_normal_scaled = scaler_normal.fit_transform(X_normal)

scaler_attack = StandardScaler()
X_attack_scaled = scaler_attack.fit_transform(X_attack)

# The same tokenizer is loaded again in the tokenization cell further down.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [None]:
# Turn each scaled feature row into a text prompt: "fitur: v1,v2,...".
# Values use the same fixed 10-decimal format as vector_to_prompt in the
# inference cell, so training prompts and inference prompts look alike
# (plain str() gave variable-length, full-precision numbers).
input_normal = ["fitur: " + ",".join(f"{value:.10f}" for value in row) for row in X_normal_scaled]
input_attack = ["fitur: " + ",".join(f"{value:.10f}" for value in row) for row in X_attack_scaled]

In [None]:
# Draw 1,000,000 (prompt, target) pairs from the normal class with a fixed seed.
# NOTE(review): resample presumably samples with replacement by default, so the
# result can contain repeated rows — confirm that is intended.
# NOTE(review): this rebinds input_normal / y_normal, so running the cell twice
# resamples the already-resampled data.
from sklearn.utils import resample
input_normal, y_normal = resample(input_normal, y_normal, n_samples=1000000, random_state=42)

In [None]:
def tokenize_in_batches(input_texts, label_texts, tokenizer, batch_size=10000, max_len=128):
    """Tokenize paired input/label texts chunk by chunk and concatenate the results.

    Args:
        input_texts: Sequence of source strings.
        label_texts: Sequence of target strings, aligned with ``input_texts``.
        tokenizer: Callable invoked as ``tokenizer(texts, padding="max_length",
            truncation=True, max_length=max_len, return_tensors="pt")`` and
            returning a mapping of tensors that contains ``"input_ids"``.
        batch_size: Number of texts passed to the tokenizer per call.
        max_len: Length every input and label sequence is padded/truncated to.

    Returns:
        ``(input_ids, label_ids)`` where ``input_ids`` is a dict mapping each
        tokenizer output field to one tensor concatenated along dim 0, and
        ``label_ids`` is the concatenated ``"input_ids"`` tensor of the labels.
        Padded label positions keep whatever id the tokenizer pads with; they
        are not masked here.

    Raises:
        ValueError: If ``batch_size`` is not positive, the inputs are empty, or
            the two sequences differ in length.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if len(input_texts) != len(label_texts):
        # Without this check the pairs would be silently misaligned.
        raise ValueError(
            f"input_texts and label_texts differ in length: "
            f"{len(input_texts)} vs {len(label_texts)}"
        )
    if len(input_texts) == 0:
        # Nothing to concatenate; fail with a clear message instead of an IndexError below.
        raise ValueError("input_texts is empty; nothing to tokenize")

    input_batches = []
    label_batches = []

    total_batches = (len(input_texts) + batch_size - 1) // batch_size

    for i in range(0, len(input_texts), batch_size):
        batch_idx = i // batch_size + 1
        print(f"Tokenizing batch {batch_idx}/{total_batches} ...")

        # list(...) so tuples or other sliceable sequences are handed over as plain lists.
        input_batch = list(input_texts[i:i + batch_size])
        label_batch = list(label_texts[i:i + batch_size])

        tokenized_inputs = tokenizer(
            input_batch,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )

        # Only the token ids of the labels are kept.
        tokenized_labels = tokenizer(
            label_batch,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        )["input_ids"]

        input_batches.append(tokenized_inputs)
        label_batches.append(tokenized_labels)

    # Concatenate all batches into one large tensor per field.
    input_ids = {
        key: torch.cat([batch[key] for batch in input_batches], dim=0)
        for key in input_batches[0]
    }
    label_ids = torch.cat(label_batches, dim=0)

    print("Tokenization selesai.")
    return input_ids, label_ids

In [None]:
# (Re)load the t5-small tokenizer and tokenize both classes in chunks of 10,000.
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Normal class: the resampled prompts and their targets. y_normal is converted
# with .tolist() because the tokenizer is given plain lists of strings.
# NOTE(review): with 1,000,000 rows padded to the default max_len of 128, each
# returned tensor is 1,000,000 x 128 — check that this fits in RAM.
tokenized_normal, labels_normal = tokenize_in_batches(
    input_normal,
    y_normal.tolist(),
    tokenizer,
    batch_size=10000
)

# Attack class: all rows, not resampled.
tokenized_attack, labels_attack = tokenize_in_batches(
    input_attack,
    y_attack.tolist(),
    tokenizer,
    batch_size=10000
)

In [None]:
class TextGenDataset(Dataset):
    """Map-style dataset pairing pre-tokenized encoder fields with label ids.

    Args:
        encodings: Mapping of field name -> indexable container with one entry
            per example.
        labels: Indexable container with one label entry per example; its
            length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the labels container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example: every encoding field at ``idx`` plus ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

In [None]:
import random

subset_size = 200_000

# Never request more rows than exist: random.sample raises ValueError when the
# sample is larger than the population.
subset_size = min(subset_size, len(labels_normal))

# A dedicated seeded generator makes the subset reproducible without touching
# the global `random` state.
indices = random.Random(42).sample(range(len(labels_normal)), subset_size)

# tokenize_in_batches returns tensors, so select all rows in one indexing
# operation instead of picking them one by one in Python.
index_tensor = torch.tensor(indices, dtype=torch.long)
tokenized_normal_subset = {k: v[index_tensor] for k, v in tokenized_normal.items()}
labels_normal_subset = labels_normal[index_tensor]

# NOTE(review): the next cell rebinds dataset_normal to the full tensors, so
# this subset is only used if that assignment is skipped.
dataset_normal = TextGenDataset(tokenized_normal_subset, labels_normal_subset)

In [None]:
# Wrap the full tokenized tensors of each class in a dataset.
# NOTE(review): this rebinds dataset_normal, replacing the 200,000-row subset
# built in the previous cell — confirm which of the two is meant to be trained on.
dataset_normal = TextGenDataset(tokenized_normal, labels_normal)
dataset_attack = TextGenDataset(tokenized_attack, labels_attack)

In [None]:
# Serialize both dataset objects to Drive.
# NOTE(review): this stores whole TextGenDataset instances, so presumably the
# class must be defined again before the files can be loaded.
torch.save(dataset_normal, '/content/drive/MyDrive/datathon_test/dataset_normal_resamp.pt')
torch.save(dataset_attack, '/content/drive/MyDrive/datathon_test/dataset_attack.pt')

In [None]:
# Register TextGenDataset as an allowed class for restricted loading.
# NOTE(review): the allow-list presumably only matters for weights_only=True
# loads; both calls below pass weights_only=False, which unpickles without
# restriction — confirm against the torch.load documentation.
add_safe_globals([TextGenDataset])

# SECURITY NOTE(review): weights_only=False can execute arbitrary code embedded
# in the file. Only load .pt files produced by this notebook, or switch to
# weights_only=True once confirmed to work with the allow-list above.
dataset_normal = torch.load("/content/drive/MyDrive/datathon_test/dataset_normal_resamp.pt", weights_only=False)
dataset_attack = torch.load("/content/drive/MyDrive/datathon_test/dataset_attack.pt", weights_only=False)

In [None]:
from transformers import default_data_collator

# One independent t5-small model per traffic class.
model_normal = T5ForConditionalGeneration.from_pretrained("t5-small")
model_attack = T5ForConditionalGeneration.from_pretrained("t5-small")


def _build_training_args(run_name):
    """Return the shared hyperparameters with per-run checkpoint and log directories.

    Each trainer needs its own output_dir: with a shared directory and
    save_total_limit=2, the second run would write into, and rotate away, the
    first run's checkpoints.
    """
    return TrainingArguments(
        output_dir=f"./results/{run_name}",
        num_train_epochs=10,                          # original note: "focus on limiting steps"
        per_device_train_batch_size=192,
        gradient_accumulation_steps=1,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f"./logs/{run_name}",
        logging_steps=500,
        save_steps=1000,
        save_total_limit=2,
        save_strategy="steps",
        # evaluation_strategy="no",
        # NOTE(review): fp16 needs a CUDA device; if the loss turns NaN with T5,
        # try bf16 or full precision.
        fp16=True,
        dataloader_num_workers=2,
        report_to="tensorboard"
    )


# Id used to pad the label sequences (taken from the model config).
label_pad_id = model_normal.config.pad_token_id


def _collate_ignoring_label_padding(features):
    """Stack examples into a batch and set padded label positions to -100.

    The datasets store labels padded with the pad token id; -100 is the value
    the loss ignores, so padding no longer contributes to the training loss.
    The stored datasets themselves are left untouched.
    """
    batch = default_data_collator(features)
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == label_pad_id, -100)
    return batch


training_args_normal = _build_training_args("normal")
training_args_attack = _build_training_args("attack")
# Kept so code that refers to the old shared name still resolves.
training_args = training_args_normal

trainer_normal = Trainer(
    model=model_normal,
    args=training_args_normal,
    train_dataset=dataset_normal,
    data_collator=_collate_ignoring_label_padding
)

trainer_attack = Trainer(
    model=model_attack,
    args=training_args_attack,
    train_dataset=dataset_attack,
    data_collator=_collate_ignoring_label_padding
)

# Train sequentially: normal-traffic model first, then the attack model.
trainer_normal.train()
trainer_attack.train()

In [None]:
from transformers import T5Tokenizer

# Destination directory of each fine-tuned explainer.
normal_model_dir = "/content/drive/MyDrive/datathon_test/saved_model/t5-normal-explainer-v1"
attack_model_dir = "/content/drive/MyDrive/datathon_test/saved_model/t5-attack-explainer-v1"

# Write the model weights/config first ...
model_normal.save_pretrained(normal_model_dir)
model_attack.save_pretrained(attack_model_dir)

# ... then place a copy of the stock t5-small tokenizer next to each model so
# either directory can be loaded on its own.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenizer.save_pretrained(normal_model_dir)
tokenizer.save_pretrained(attack_model_dir)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the saved normal-traffic explainer and its tokenizer from Drive.
# This rebinds the global `tokenizer` used by the cells below.
model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/datathon_test/saved_model/t5-normal-explainer-v1")
tokenizer = T5Tokenizer.from_pretrained("/content/drive/MyDrive/datathon_test/saved_model/t5-normal-explainer-v1")

# Switch to evaluation mode for inference.
model.eval()

In [None]:
# Quick check with a made-up 5-value prompt.
# NOTE(review): the number of values here may not match the number of feature
# columns used in training — confirm.
dummy_input = "fitur: 0.1,0.2,0.3,0.4,0.5"
input_ids = tokenizer.encode(dummy_input, return_tensors="pt")

# No gradients are needed for generation.
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=50)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

In [None]:
def vector_to_prompt(vec):
    """Format a feature vector as a model prompt.

    Args:
        vec: Iterable of numbers.

    Returns:
        The string "fitur: " followed by the values, each rendered with ten
        decimal places and joined by commas.
    """
    formatted_values = [f"{x:.10f}" for x in vec]
    return "fitur: " + ",".join(formatted_values)


# Two hand-entered feature vectors (14 values each) used as inference samples.
# NOTE(review): presumably already standardized — confirm which scaler they came from.
test_vectors = [
    [1.3100519691, -0.5522472851, -0.0294114342, -0.3108599332, -0.2576223264,
     0.9559007926, 8.5694000790, -0.1012008205, -0.6827390346, -0.4971613772,
     0.6366064686, -0.5344575700, 1.6566076941, 0.2286318827],  # Sample 1

    [0.2851886442, -0.3681769346, -0.0268378593, -0.2612596601, -0.2483472394,
     -0.1363363572, -0.0915017137, -0.1009390248, -0.6743034715, -0.3486130093,
     0.6366064686, -0.5536035882, 0.9660000000, 0.1800000000]  # Sample 2
]

# Decoding settings shared by every sample.
generation_options = dict(
    max_length=500,  # allow longer outputs
    num_beams=2,     # beam search
    early_stopping=True,
    no_repeat_ngram_size=2,
)

for idx, vec in enumerate(test_vectors):
    prompt = vector_to_prompt(vec)
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    output_ids = model.generate(input_ids, **generation_options)

    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"🧪 Input #{idx+1}:\n{prompt}\n🧠 Output:\n{output}\n{'-'*50}")

In [None]:
# Example: inspect the contents of dataset_normal by decoding the first three
# examples back to text (special tokens such as padding are dropped).
for i in range(3):
    print("INPUT:", tokenizer.decode(dataset_normal[i]['input_ids'], skip_special_tokens=True))
    print("LABEL:", tokenizer.decode(dataset_normal[i]['labels'], skip_special_tokens=True))
    print("---")