# In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import torch
import os

# Directory that holds the trained model outputs
output_dir = "./output"

# Model categories
categories = [
    "attackComplexity",
    "attackVector",
    "availability",
    "confidentiality",
    "integrity",
    "privilegeReq",
    "scope",
    "userInteraction",
]

# Test data (e.g. a CSV or JSON file that has already been pre-processed)
test_data_path = "./data/test.csv"  # Change this to the path of your test data

# Load the test data
import pandas as pd

test_data = pd.read_csv(test_data_path)
# Adjust the two column names below to match the text / label columns of
# the test data.
texts, labels = test_data['text'], test_data['label']

# Model evaluation function
def evaluate_model(model_path, tokenizer, texts, labels, batch_size=32):
    """Evaluate a sequence-classification model on labelled texts.

    Args:
        model_path: Location handed to ``from_pretrained`` to load the model
            (and the tokenizer, when none is supplied).
        tokenizer: Tokenizer to use. When ``None``, the tokenizer saved at
            ``model_path`` is loaded instead.
        texts: Iterable of input strings.
        labels: Iterable of integer class ids, one per entry of ``texts``.
        batch_size: Number of texts sent through the model per forward
            pass; bounds peak memory use on large test sets.

    Returns:
        Tuple ``(accuracy, f1, recall, precision)``. F1, recall and
        precision use the ``"weighted"`` average.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, if ``texts`` is
            empty, or if ``texts`` and ``labels`` differ in length.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise both inputs positionally so that e.g. a pandas Series with
    # a non-default index is handled the same way as a plain list.
    texts = list(texts)
    # Labels are only consumed by scikit-learn, so they never need to be
    # moved to the accelerator.
    y_true = torch.as_tensor(list(labels)).numpy()

    # Fail before the expensive model load / inference rather than after it.
    if not texts:
        raise ValueError("texts must not be empty")
    if len(texts) != len(y_true):
        raise ValueError(
            f"texts and labels differ in length: {len(texts)} != {len(y_true)}"
        )

    # Load the model, and the tokenizer only if the caller did not pass one
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Evaluate on the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()  # make sure dropout etc. are disabled during inference

    # Predict batch by batch; padding is applied per batch
    batch_preds = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            inputs = {key: val.to(device) for key, val in inputs.items()}
            logits = model(**inputs).logits
            batch_preds.append(torch.argmax(logits, dim=-1).cpu())
    preds = torch.cat(batch_preds).numpy()

    # Compute the metrics
    acc = accuracy_score(y_true, preds)
    f1 = f1_score(y_true, preds, average="weighted")
    recall = recall_score(y_true, preds, average="weighted")
    precision = precision_score(y_true, preds, average="weighted")

    return acc, f1, recall, precision

# Evaluate the model of every category and collect its metrics.
# The names are listed in the order evaluate_model returns its values.
metric_names = ("Accuracy", "F1-Score", "Recall", "Precision")
results = {}
for category in categories:
    print(f"Evaluating model for category: {category}")
    model_path = os.path.join(output_dir, category)

    # NOTE(review): every category model is scored against the same
    # `labels` column -- confirm that this is intended.
    scores = evaluate_model(model_path, None, texts, labels)
    results[category] = dict(zip(metric_names, scores))

# Print the results: a blank line, the category header, then one line per
# metric rounded to four decimals.
for category, metrics in results.items():
    report = [f"\nCategory: {category}"]
    report.extend(f"{metric}: {value:.4f}" for metric, value in metrics.items())
    print("\n".join(report))
