In [None]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Checkpoint directory of the fine-tuned classifier for each CVSS metric
model_folders = dict(
    attack_vector="output/attackVector",
    attack_complexity="output/attackComplexity",
    privileges_required="output/privilegeReq",
    user_interaction="output/userinteraction",
    scope="output/scope",
    integrity_impact="output/integrity",
    confidentiality_impact="output/confidentiality",
    availability_impact="output/availability",
)

# A single tokenizer is shared by every classifier (all are DistilBERT-based).
# NOTE(review): this loads the "uncased" vocabulary, while load_tokenizer()
# later in this notebook loads 'distilbert-base-cased' for 'distilbert' —
# confirm which one the checkpoints were actually trained with.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Load one sequence classifier per metric, keyed by metric name
models = {
    category: DistilBertForSequenceClassification.from_pretrained(folder)
    for category, folder in model_folders.items()
}

# Report which classifiers were loaded
print("Semua model berhasil dimuat:")
print(list(models))

Semua model berhasil dimuat:
['attack_vector', 'attack_complexity', 'privileges_required', 'user_interaction', 'scope', 'integrity_impact', 'confidentiality_impact', 'availability_impact']


In [None]:
# Vulnerability description to classify
input_text = "Improper conditions check in some Intel(R) Ethernet Controllers 800 series Linux drivers before version 1.4.11 may allow an authenticated user to potentially enable information disclosure or denial of service via local access."

# Encode the text once; the same encoding is fed to every classifier
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Run each classifier without gradient tracking and keep the arg-max class index
outputs = {}
with torch.no_grad():
    for category, model in models.items():
        logits = model(**inputs).logits
        outputs[category] = logits.argmax(dim=-1).item()

# Show the predicted class index for every metric
print("Hasil prediksi:")
for category, prediction in outputs.items():
    print(category, prediction, sep=": ")

Hasil prediksi:
attack_vector: 2
attack_complexity: 0
privileges_required: 0
user_interaction: 0
scope: 0
integrity_impact: 2
confidentiality_impact: 0
availability_impact: 0


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, balanced_accuracy_score

# -------------------------------------- MODEL -------------------------------------

def load_model(model_path):
    """
    Return the sequence-classification model stored at ``model_path``.

    The path is passed unchanged to
    ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(model_path)

def load_tokenizer(model_name, extra_tokens, token_file, model_path, config_path):
    """
    Build the fast tokenizer that corresponds to ``model_name``.

    Supported names are 'distilbert', 'bert', 'deberta' and 'roberta'; each one
    maps to a fixed pretrained checkpoint. When ``extra_tokens`` is truthy, the
    tokens listed in ``token_file`` are added through ``add_tokens_from_file``.
    ``model_path`` and ``config_path`` are accepted but not used by this function.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
    """
    # (model name, tokenizer class exported by transformers, checkpoint id)
    known_tokenizers = (
        ('distilbert', 'DistilBertTokenizerFast', 'distilbert-base-cased'),
        ('bert', 'BertTokenizerFast', 'bert-base-uncased'),
        ('deberta', 'DebertaTokenizerFast', 'microsoft/deberta-base'),
        ('roberta', 'RobertaTokenizerFast', 'roberta-base'),
    )

    for name, class_name, checkpoint in known_tokenizers:
        if model_name == name:
            break
    else:
        raise ValueError(f"Unknown model type {model_name}")

    # Imported lazily, only once a supported name has been recognised
    import transformers
    tokenizer = getattr(transformers, class_name).from_pretrained(checkpoint)

    # Optionally extend the vocabulary with custom tokens
    if extra_tokens:
        add_tokens_from_file(token_file, tokenizer)

    return tokenizer

def add_tokens_from_file(token_file, tokenizer):
    """
    Add the tokens listed in a text file to the tokenizer.

    The file is read as UTF-8 with one token per line. Surrounding whitespace
    is stripped from every line, and lines that are empty after stripping are
    skipped so that no empty-string token is handed to the tokenizer.

    Args:
        token_file: path of the vocabulary file to read.
        tokenizer: object exposing ``add_tokens(list_of_str)``.
    """
    # Explicit encoding: the default depends on the platform locale otherwise
    with open(token_file, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        token_list = [token for token in stripped_lines if token]
    tokenizer.add_tokens(token_list)
# -------------------------------------- PREDICTION --------------------------------

def predict(input_description, model, tokenizer, device):
    """
    Classify one description and return ``(predicted_class, probs)``.

    ``predicted_class`` is the index of the highest-probability class as a
    Python number; ``probs`` is the softmax of the model's logits along dim 1.
    The encoded input is moved to ``device`` before the forward pass.
    """
    # Encode the text (truncated to at most 512 tokens) and move it to the device
    encoded = tokenizer(
        input_description,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    )
    encoded = encoded.to(device)

    # Forward pass without gradient tracking, then normalise logits to probabilities
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=1)

    return probs.argmax(dim=1).item(), probs

# -------------------------------------- MAIN -----------------------------------

def main():
    """
    Demo: load one classifier and its tokenizer, classify a sample description,
    and print the predicted class index together with the class probabilities.
    """
    # Use CUDA when torch reports it as available, otherwise the CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    # Settings for this run
    checkpoint_dir = "output/privilegeReq//"
    config_file = "output/config.json"
    tokenizer_kind = 'distilbert'  # one of the names load_tokenizer accepts
    use_extra_tokens = False  # True -> also add the tokens from vocab_file
    vocab_file = "vocab/CVSS_5k.vocab"

    # Model first (moved to the device), then the matching tokenizer
    model = load_model(checkpoint_dir).to(device)
    tokenizer = load_tokenizer(
        tokenizer_kind,
        use_extra_tokens,
        vocab_file,
        checkpoint_dir,
        config_file,
    )

    # Sample vulnerability description
    input_description = "A vulnerability in Cisco Intercloud Fabric for Business and Cisco Intercloud Fabric for Providers could allow an unauthenticated, remote attacker to connect to the database used by these products. More Information: CSCus99394. Known Affected Releases: 7.3(0)ZN(0.99)."

    # Classify and report
    predicted_class, probs = predict(input_description, model, tokenizer, device)
    print(f"Predicted class: {predicted_class}")
    print(f"Predicted probabilities: {probs}")

# Entry-point guard: main() is called only when __name__ is '__main__'.
if __name__ == '__main__':
    main()


Device: cpu
Predicted class: 0
Predicted probabilities: tensor([[0.9554, 0.0376, 0.0070]])
