In [1]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
import csv
from transformers import Trainer, TrainingArguments, AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import argparse
import os
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import nltk
# Fetch the NLTK data packages this notebook relies on for tokenising, POS-tagging and
# WordNet lemmatisation (word_tokenize, pos_tag, WordNetLemmatizer, wordnet are imported above).
# As the cell output shows, packages that are already present are reported as up to date.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\O.Midiyanto\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\O.Midiyanto\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\O.Midiyanto\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\O.Midiyanto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\O.Midiyanto\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
class CVSSDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    ``encodings`` is a mapping from feature name to an indexable holding one
    entry per sample; ``labels`` is an indexable holding one label per sample.
    Indexing returns a dict with every feature converted to a tensor plus a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence determines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for feature_name, feature_values in self.encodings.items():
            sample[feature_name] = torch.tensor(feature_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def read_cvss_txt(split_dir, list_classes):
    """
    Read a one-file-per-sample directory tree and return texts and labels.

    ``split_dir`` is expected to contain one sub-directory per class, named
    exactly like the entries of ``list_classes`` (e.g. ``LOW``, ``HIGH``).
    Every regular file inside such a directory becomes one sample whose label
    is the index of the directory name in ``list_classes``.

    Args:
        split_dir: Path (str or Path) of the split to read.
        list_classes: Ordered class names; the position is the integer label.

    Returns:
        ``(texts, labels)`` - two lists of equal length.

    Class directories that do not exist are skipped. Files are visited in
    sorted order so the result is deterministic across platforms.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    # Walk the directories named by list_classes (not a fixed LOW/HIGH pair) so
    # that every text read is guaranteed to receive exactly one label.
    for class_index, class_name in enumerate(list_classes):
        class_dir = split_dir / class_name
        if not class_dir.is_dir():
            continue
        for text_file in sorted(class_dir.iterdir()):
            if not text_file.is_file():
                continue
            # Explicit UTF-8, matching read_cvss_csv, instead of the locale default.
            texts.append(text_file.read_text(encoding='utf-8'))
            labels.append(class_index)

    return texts, labels

def read_cvss_csv(file_name, num_label, list_classes):
    """
    Read texts and integer labels from a CSV file.

    The first row is treated as a header and skipped. For every following row
    the text is taken from column 0 and the class name from column
    ``num_label``; the label is the index of that class name in
    ``list_classes``. The file is read as UTF-8.

    Rows that are too short (including blank lines) or whose class name is not
    in ``list_classes`` are skipped as a whole, so the two returned lists stay
    aligned. A warning reports how many rows were dropped.

    Args:
        file_name: Path of the CSV file.
        num_label: Column index holding the class name.
        list_classes: Ordered class names; the position is the integer label.

    Returns:
        ``(texts, labels)`` - two lists of equal length.
    """
    import warnings

    texts = []
    labels = []

    # First occurrence wins, mirroring a left-to-right search through list_classes.
    class_to_index = {}
    for index, class_name in enumerate(list_classes):
        class_to_index.setdefault(class_name, index)

    skipped_rows = 0
    # newline='' is what the csv module expects so quoted line breaks survive.
    with open(file_name, 'r', encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')

        # Skip the header row, if there is one.
        next(csv_reader, None)

        for row in csv_reader:
            try:
                text, class_name = row[0], row[num_label]
            except IndexError:
                skipped_rows += 1
                continue
            label = class_to_index.get(class_name)
            if label is None:
                skipped_rows += 1
                continue
            # Append text and label together so they can never drift apart.
            texts.append(text)
            labels.append(label)

    if skipped_rows:
        warnings.warn(
            f"{file_name}: skipped {skipped_rows} row(s) with a missing or unknown "
            f"label in column {num_label}"
        )

    return texts, labels


In [3]:
def select_tokenizer_model(model_name, extra_tokens, token_file, num_labels):
    """
    Build the tokenizer and sequence-classification model for ``model_name``.

    Args:
        model_name: One of 'distilbert', 'bert', 'deberta', 'albert', 'roberta'.
        extra_tokens: When truthy, tokens listed in ``token_file`` are added to
            the tokenizer via ``add_tokens_from_file``.
        token_file: Path of the extra-vocabulary file (one token per line).
        num_labels: Number of output classes written into the model config.

    Returns:
        ``(tokenizer, model)``; the model's token embeddings are resized to
        ``len(tokenizer)``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.

    The module-level flag ``lemmatization`` (set by ``main``) controls whether
    the extra tokens are lemmatised; it is treated as False when undefined.

    NOTE(review): each model is constructed from its config only, not with
    ``from_pretrained`` - so the weights appear to be freshly initialised
    rather than loaded from the checkpoint. Confirm this is intended.
    """
    print("### Selecting Model and Tokenizer")

    if model_name == 'distilbert':
        from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertConfig
        config = DistilBertConfig.from_pretrained('distilbert-base-cased')
        config.num_labels = num_labels
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
        model = DistilBertForSequenceClassification(config)

    elif model_name == 'bert':
        from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
        config = BertConfig.from_pretrained('bert-base-uncased')
        config.num_labels = num_labels
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification(config)

    elif model_name == 'deberta':
        from transformers import DebertaConfig, DebertaTokenizerFast, DebertaForSequenceClassification
        config = DebertaConfig.from_pretrained('microsoft/deberta-base')
        config.num_labels = num_labels
        tokenizer = DebertaTokenizerFast.from_pretrained('microsoft/deberta-base')
        model = DebertaForSequenceClassification(config)

    elif model_name == 'albert':
        from transformers import AlbertConfig, AlbertTokenizerFast, AlbertForSequenceClassification
        config = AlbertConfig.from_pretrained('albert-base-v1')
        config.num_labels = num_labels
        tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v1')
        model = AlbertForSequenceClassification(config)

    elif model_name == 'roberta':
        from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForSequenceClassification
        config = RobertaConfig.from_pretrained('roberta-base')
        config.num_labels = num_labels
        tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
        model = RobertaForSequenceClassification(config)

    else:
        # Without this branch an unknown name only surfaced later as an
        # UnboundLocalError on `tokenizer`.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of: "
            "distilbert, bert, deberta, albert, roberta"
        )

    ### Add Tokens
    if extra_tokens:
        # Read the flag defensively so the function also works before main() set it.
        lemmatize_tokens = globals().get('lemmatization', False)
        add_tokens_from_file(token_file, tokenizer, lemmatize_tokens)
    number_tokens = len(tokenizer)

    print("### Number of tokens in Tokenizer")
    print(number_tokens)

    # Keep the embedding matrix in step with the (possibly extended) vocabulary.
    model.resize_token_embeddings(number_tokens)

    return tokenizer, model

def add_tokens_from_file(token_file, tokenizer, lemmatize=False):
    """
    Add the tokens listed in ``token_file`` (one per line) to ``tokenizer``.

    Args:
        token_file: Path of a UTF-8 text file with one token per line.
        tokenizer: Object exposing ``add_tokens(list_of_str)``.
        lemmatize: When truthy, each token is passed through ``lemmatize_noun``
            before being added.

    Blank lines are ignored so no empty token is submitted. The file is closed
    even if reading or lemmatising raises.
    """
    print("### Adding Tokens")

    token_list = []
    with open(token_file, 'r', encoding='utf-8') as file_:
        for line in file_:
            token = line.rstrip("\n")
            if not token.strip():
                continue
            token_list.append(lemmatize_noun(token) if lemmatize else token)

    tokenizer.add_tokens(token_list)

In [4]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tagger's tag for ``word``.

    The word is tagged on its own with ``pos_tag``; the upper-cased first
    letter of that tag selects adjective, noun, verb or adverb. Any other
    letter falls back to noun.
    """
    tagged = pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

def lemmatize_sentence(sentence):
    """Tokenise ``sentence`` and return its noun-lemmatised tokens joined by single spaces.

    Every token goes through ``lemmatize_noun``; the POS-aware ``lemmatize_word``
    is available in this notebook but is not used here.
    """
    lemmas = []
    for token in word_tokenize(sentence):
        lemmas.append(lemmatize_noun(token))
    return ' '.join(lemmas)

def lemmatize(train_texts, test_texts=None):
    """Lemmatise every sentence of the train split and, if given, the test split.

    Returns ``(lemmatized_train, lemmatized_test)``; the second list is empty
    when ``test_texts`` is None.
    """
    lemmatized_train = [lemmatize_sentence(sentence) for sentence in train_texts]

    if test_texts is None:
        lemmatized_test = []
    else:
        lemmatized_test = [lemmatize_sentence(sentence) for sentence in test_texts]

    return lemmatized_train, lemmatized_test

def lemmatize_word(word):
    """Lemmatise ``word`` using the POS returned by ``get_wordnet_pos``.

    For adverbs the first pertainym of the ``<word>.r.1`` synset is returned
    when WordNet provides one; if that lookup fails for any reason the plain
    lemma is returned instead.
    """
    lemmatizer = WordNetLemmatizer()
    wn_pos = get_wordnet_pos(word)
    lemma = lemmatizer.lemmatize(word, wn_pos)

    if wn_pos not in ("r", "R"):
        return lemma

    try:
        adverb_lemmas = wordnet.synset(word + '.r.1').lemmas()
        return adverb_lemmas[0].pertainyms()[0].name()
    except Exception:
        # Best effort: no synset / no pertainym -> keep the ordinary lemma.
        return lemma

def lemmatize_noun(word):
    """Return ``WordNetLemmatizer().lemmatize(word)``, i.e. with the lemmatizer's default POS."""
    return WordNetLemmatizer().lemmatize(word)



In [5]:
def get_pred_accuracy(target, output):
    """Return the percentage (0-100) of samples whose arg-max class equals the target.

    ``output`` is reduced with ``argmax(axis=1)`` before it is compared
    element-wise with ``target``.
    """
    predicted_classes = output.argmax(axis=1)
    hits = target == predicted_classes
    n_correct = np.sum(hits)
    return (n_correct / target.size) * 100

def get_binary_mean_accuracy(target, output):
    """Return the mean of the recall on class 1 and the recall on class 0.

    ``output`` is reduced with ``argmax(axis=1)`` first. Only the label values
    0 and 1 are counted. A tiny epsilon in the denominators avoids division
    by zero when a class is absent from ``target``.
    """
    eps = 1e-20
    predicted = output.argmax(axis=1)

    is_pos = target == 1
    is_neg = target == 0

    # Ground-truth counts: positives (TP + FN) and negatives (TN + FP).
    n_pos = np.sum(is_pos, axis=0).astype(float)
    n_neg = np.sum(is_neg, axis=0).astype(float)

    # Correct predictions per class: TP and TN.
    n_true_pos = np.sum(is_pos & (predicted == 1), axis=0).astype(float)
    n_true_neg = np.sum(is_neg & (predicted == 0), axis=0).astype(float)

    pos_recall = n_true_pos / (n_pos + eps)
    neg_recall = n_true_neg / (n_neg + eps)

    return (pos_recall + neg_recall) / 2

def get_evaluation_metrics(target, output, num_labels):
    """
    Compute accuracy, precision, recall and F1 for a batch of predictions.

    Args:
        target: Array of true class ids.
        output: Per-class scores of shape (n_samples, n_classes); it is reduced
            with ``argmax(axis=1)`` to obtain the predicted class ids.
        num_labels: Unused; kept so existing call sites keep working.

    Returns:
        ``(accuracy, precision, recall, f1_score)``. ``accuracy`` comes from
        ``get_pred_accuracy`` and is a percentage (0-100); the other three are
        sklearn scores computed with ``average='weighted'``.
    """
    # Local import: sklearn.metrics is not imported in the training part of the notebook.
    from sklearn.metrics import f1_score as sk_f1_score
    from sklearn.metrics import precision_score as sk_precision_score
    from sklearn.metrics import recall_score as sk_recall_score

    # get_pred_accuracy takes exactly (target, output) and does its own argmax.
    accuracy = get_pred_accuracy(target, output)

    # The sklearn metrics need class ids, not per-class scores.
    predicted = output.argmax(axis=1)
    precision = sk_precision_score(target, predicted, average='weighted')
    recall = sk_recall_score(target, predicted, average='weighted')
    f1_score = sk_f1_score(target, predicted, average='weighted')

    return accuracy, precision, recall, f1_score

def infer(trainer, test_loader, num_labels):
    """Run ``trainer.predict`` on ``test_loader`` and return the accuracy in percent.

    The predictions are turned into probabilities with a softmax over
    dimension 1 and scored against ``label_ids`` by ``get_pred_accuracy``.
    ``num_labels`` is accepted but not used.
    """
    prediction_output = trainer.predict(test_loader)
    logits = torch.from_numpy(prediction_output.predictions)
    class_probs = torch.softmax(logits, dim=1).numpy()
    true_labels = prediction_output.label_ids

    return get_pred_accuracy(true_labels, class_probs)

In [8]:
# One entry per CVSS metric to train a model for.
#   name           - metric name, also the last path component of output_dir
#   num_labels     - number of classes (length of classes_names)
#   classes_names  - class names; the position is the integer label
#   label_position - column index of this metric in the CSV files
#   output_dir     - directory used for this metric's model
categories = [
    {
        "name": metric_name,
        "num_labels": len(class_names),
        "classes_names": class_names,
        "label_position": column,
        "output_dir": 'output1/' + metric_name,
    }
    for column, (metric_name, class_names) in enumerate(
        [
            ("attackVector", ['NETWORK', 'LOCAL', 'PHYSICAL', 'ADJACENT_NETWORK']),
            ("attackComplexity", ['LOW', 'HIGH']),
            ("privilegeReq", ['NONE', 'LOW', 'HIGH']),
            ("userInteraction", ['NONE', 'REQUIRED']),
            ("scope", ['UNCHANGED', 'CHANGED']),
            ("confidentiality", ['NONE', 'LOW', 'HIGH']),
            ("integrity", ['NONE', 'LOW', 'HIGH']),
            ("availability", ['NONE', 'LOW', 'HIGH']),
        ],
        start=1,
    )
]

In [None]:
def main():
    """Train, save and score one classifier per entry of ``categories``.

    For every category the function builds a tokenizer/model pair, reads
    ``data/train.csv`` and ``data/test.csv``, optionally lemmatises the texts,
    tokenises them, trains with the Hugging Face ``Trainer``, saves the model
    and prints the accuracy returned by ``infer`` on the test split.

    Side effect: sets the module-level flag ``lemmatization``, which
    ``select_tokenizer_model`` reads.
    """
    global lemmatization

    # --- Run configuration ---
    model_name = 'distilbert'
    extra_tokens = True                 # extend the tokenizer with the extra vocabulary
    token_file = 'vocab/CVSS_5k.vocab'  # extra-vocabulary file
    lemmatization = True                # lemmatise texts (and extra tokens)

    # --- Tunable hyper-parameters ---
    train_batch_size = 8
    test_batch_size = 4
    epochs = 3
    learning_rate = 5e-5
    weight_decay = 0
    warmup_steps = 0
    warmup_ratio = 0

    # Report which device is available.
    cuda_available = torch.cuda.is_available()
    device = torch.device('cuda' if cuda_available else 'cpu')
    print("### Device: ", device)
    if cuda_available:
        print(f"GPU name is {torch.cuda.get_device_name(0)}")

    # One full train / evaluate cycle per category.
    for category in categories:
        print(f"\n### Training model for {category['name']}")

        # Settings of the current category
        label_position = category["label_position"]
        classes_names = category["classes_names"]
        num_labels = category["num_labels"]
        output_dir = category["output_dir"]

        # Create the output directory if it does not exist yet.
        os.makedirs(output_dir, exist_ok=True)

        # Select Model
        tokenizer, model = select_tokenizer_model(
            model_name,
            extra_tokens=extra_tokens,
            token_file=token_file,
            num_labels=num_labels,
        )

        # Load both splits for this category's label column.
        print("### Splitting Dataset")
        train_texts, train_labels = read_cvss_csv('data/train.csv', label_position, classes_names)
        test_texts, test_labels = read_cvss_csv('data/test.csv', label_position, classes_names)

        # Lemmatize Sentences (optional)
        if lemmatization:
            print("### Lemmatizing Sentences")
            train_inputs, test_inputs = lemmatize(train_texts, test_texts)
        else:
            train_inputs, test_inputs = train_texts, test_texts

        # Tokenize Sentences
        print("### Tokenizing Sentences")
        train_encodings = tokenizer(train_inputs, truncation=True, padding=True)
        test_encodings = tokenizer(test_inputs, truncation=True, padding=True)

        # Dataset Encodings
        print("### Encoding Dataset")
        train_dataset = CVSSDataset(train_encodings, train_labels)
        test_dataset = CVSSDataset(test_encodings, test_labels)

        # Training
        print("### Training")
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=test_batch_size,
            learning_rate=learning_rate,
            save_strategy="epoch",
            weight_decay=weight_decay,
            warmup_steps=warmup_steps,
            warmup_ratio=warmup_ratio,
        )
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
        )

        trainer.train()
        trainer.save_model()

        acc = infer(trainer, test_dataset, num_labels)
        print(f"Accuracy for {category['name']} = {acc:.6f}")


if __name__ == '__main__':
    main()

### Device:  cuda
GPU name is NVIDIA GeForce RTX 4050 Laptop GPU

### Training model for attackVector
### Selecting Model and Tokenizer
### Adding Tokens
### Number of tokens in Tokenizer
33867
### Splitting Dataset
### Lemmatizing Sentences
### Tokenizing Sentences
### Encoding Dataset
### Training


  3%|▎         | 500/18120 [02:39<1:32:05,  3.19it/s]

{'loss': 0.52, 'grad_norm': 2.7700371742248535, 'learning_rate': 4.862030905077263e-05, 'epoch': 0.08}


  6%|▌         | 1000/18120 [05:18<1:30:49,  3.14it/s]

{'loss': 0.4279, 'grad_norm': 2.299497127532959, 'learning_rate': 4.7240618101545256e-05, 'epoch': 0.17}


  8%|▊         | 1500/18120 [07:55<1:28:26,  3.13it/s]

{'loss': 0.3959, 'grad_norm': 2.6241769790649414, 'learning_rate': 4.586092715231788e-05, 'epoch': 0.25}


 11%|█         | 2000/18120 [10:31<1:24:38,  3.17it/s]

{'loss': 0.3743, 'grad_norm': 7.912014484405518, 'learning_rate': 4.448123620309051e-05, 'epoch': 0.33}


 14%|█▍        | 2500/18120 [13:08<1:22:52,  3.14it/s]

{'loss': 0.3288, 'grad_norm': 0.3752608001232147, 'learning_rate': 4.310154525386314e-05, 'epoch': 0.41}


 17%|█▋        | 3000/18120 [15:44<1:19:20,  3.18it/s]

{'loss': 0.3559, 'grad_norm': 0.49020126461982727, 'learning_rate': 4.1721854304635764e-05, 'epoch': 0.5}


 19%|█▉        | 3500/18120 [18:21<1:17:32,  3.14it/s]

{'loss': 0.342, 'grad_norm': 0.27111145853996277, 'learning_rate': 4.034216335540839e-05, 'epoch': 0.58}


 22%|██▏       | 4000/18120 [20:58<1:13:16,  3.21it/s]

{'loss': 0.319, 'grad_norm': 0.8238117098808289, 'learning_rate': 3.896247240618102e-05, 'epoch': 0.66}


 25%|██▍       | 4500/18120 [23:34<1:11:08,  3.19it/s]

{'loss': 0.342, 'grad_norm': 4.729596138000488, 'learning_rate': 3.7582781456953645e-05, 'epoch': 0.75}


 28%|██▊       | 5000/18120 [26:11<1:08:13,  3.21it/s]

{'loss': 0.3278, 'grad_norm': 4.74317741394043, 'learning_rate': 3.620309050772627e-05, 'epoch': 0.83}


 30%|███       | 5500/18120 [28:48<1:05:36,  3.21it/s]

{'loss': 0.2978, 'grad_norm': 6.090857028961182, 'learning_rate': 3.48233995584989e-05, 'epoch': 0.91}


 33%|███▎      | 6000/18120 [31:24<1:03:04,  3.20it/s]

{'loss': 0.3152, 'grad_norm': 6.858245372772217, 'learning_rate': 3.3443708609271526e-05, 'epoch': 0.99}


 36%|███▌      | 6500/18120 [34:02<59:53,  3.23it/s]  

{'loss': 0.2872, 'grad_norm': 0.21792072057724, 'learning_rate': 3.206401766004415e-05, 'epoch': 1.08}


 39%|███▊      | 7000/18120 [36:38<58:01,  3.19it/s]  

{'loss': 0.2575, 'grad_norm': 14.522615432739258, 'learning_rate': 3.068432671081678e-05, 'epoch': 1.16}


 41%|████▏     | 7500/18120 [39:15<55:02,  3.22it/s]

{'loss': 0.2391, 'grad_norm': 0.05159483477473259, 'learning_rate': 2.9304635761589406e-05, 'epoch': 1.24}


 44%|████▍     | 8000/18120 [41:51<52:27,  3.22it/s]

{'loss': 0.277, 'grad_norm': 4.436601161956787, 'learning_rate': 2.792494481236203e-05, 'epoch': 1.32}


 47%|████▋     | 8500/18120 [44:27<50:50,  3.15it/s]

{'loss': 0.2488, 'grad_norm': 6.297393321990967, 'learning_rate': 2.654525386313466e-05, 'epoch': 1.41}


 50%|████▉     | 9000/18120 [47:04<47:41,  3.19it/s]

{'loss': 0.2567, 'grad_norm': 7.166050434112549, 'learning_rate': 2.5165562913907287e-05, 'epoch': 1.49}


 52%|█████▏    | 9500/18120 [49:40<45:41,  3.14it/s]

{'loss': 0.2541, 'grad_norm': 0.4552445411682129, 'learning_rate': 2.378587196467991e-05, 'epoch': 1.57}


 55%|█████▌    | 10000/18120 [52:16<42:13,  3.20it/s]

{'loss': 0.2368, 'grad_norm': 1.3384819030761719, 'learning_rate': 2.240618101545254e-05, 'epoch': 1.66}


 58%|█████▊    | 10500/18120 [54:53<39:48,  3.19it/s]

{'loss': 0.2342, 'grad_norm': 0.27500709891319275, 'learning_rate': 2.1026490066225165e-05, 'epoch': 1.74}


 61%|██████    | 11000/18120 [57:30<37:01,  3.20it/s]

{'loss': 0.2516, 'grad_norm': 15.332067489624023, 'learning_rate': 1.9646799116997795e-05, 'epoch': 1.82}


 63%|██████▎   | 11500/18120 [1:00:06<35:06,  3.14it/s]

{'loss': 0.2448, 'grad_norm': 0.1511230617761612, 'learning_rate': 1.826710816777042e-05, 'epoch': 1.9}


 66%|██████▌   | 12000/18120 [1:02:42<31:44,  3.21it/s]

{'loss': 0.2256, 'grad_norm': 2.8520750999450684, 'learning_rate': 1.688741721854305e-05, 'epoch': 1.99}


 69%|██████▉   | 12500/18120 [1:05:20<29:21,  3.19it/s]

{'loss': 0.234, 'grad_norm': 5.18375825881958, 'learning_rate': 1.5507726269315672e-05, 'epoch': 2.07}


 72%|███████▏  | 13000/18120 [1:07:57<26:45,  3.19it/s]

{'loss': 0.1899, 'grad_norm': 76.68836212158203, 'learning_rate': 1.4128035320088301e-05, 'epoch': 2.15}


 75%|███████▍  | 13500/18120 [1:10:33<24:26,  3.15it/s]

{'loss': 0.1922, 'grad_norm': 7.9875688552856445, 'learning_rate': 1.274834437086093e-05, 'epoch': 2.24}


 77%|███████▋  | 14000/18120 [1:13:09<21:19,  3.22it/s]

{'loss': 0.1946, 'grad_norm': 5.972846031188965, 'learning_rate': 1.1368653421633555e-05, 'epoch': 2.32}


 80%|████████  | 14500/18120 [1:15:46<19:01,  3.17it/s]

{'loss': 0.1842, 'grad_norm': 7.217455863952637, 'learning_rate': 9.988962472406182e-06, 'epoch': 2.4}


 83%|████████▎ | 15000/18120 [1:18:22<16:15,  3.20it/s]

{'loss': 0.1948, 'grad_norm': 0.5416224002838135, 'learning_rate': 8.609271523178809e-06, 'epoch': 2.48}


 86%|████████▌ | 15500/18120 [1:20:58<13:33,  3.22it/s]

{'loss': 0.1729, 'grad_norm': 0.7456077337265015, 'learning_rate': 7.229580573951435e-06, 'epoch': 2.57}


 88%|████████▊ | 16000/18120 [1:23:35<11:01,  3.21it/s]

{'loss': 0.1983, 'grad_norm': 34.45521926879883, 'learning_rate': 5.8498896247240626e-06, 'epoch': 2.65}


 91%|█████████ | 16500/18120 [1:26:11<08:35,  3.14it/s]

{'loss': 0.1796, 'grad_norm': 0.24986600875854492, 'learning_rate': 4.470198675496689e-06, 'epoch': 2.73}


 94%|█████████▍| 17000/18120 [1:28:47<05:49,  3.21it/s]

{'loss': 0.1848, 'grad_norm': 6.009553909301758, 'learning_rate': 3.090507726269316e-06, 'epoch': 2.81}


 97%|█████████▋| 17500/18120 [1:31:23<03:14,  3.19it/s]

{'loss': 0.1833, 'grad_norm': 9.270381927490234, 'learning_rate': 1.7108167770419427e-06, 'epoch': 2.9}


 99%|█████████▉| 18000/18120 [1:33:59<00:37,  3.22it/s]

{'loss': 0.2027, 'grad_norm': 0.039846546947956085, 'learning_rate': 3.3112582781456954e-07, 'epoch': 2.98}


100%|██████████| 18120/18120 [1:34:39<00:00,  3.19it/s]


{'train_runtime': 5679.1806, 'train_samples_per_second': 25.525, 'train_steps_per_second': 3.191, 'train_loss': 0.2681960056422825, 'epoch': 3.0}


100%|██████████| 3020/3020 [02:37<00:00, 19.20it/s]


Accuracy for attackVector = 93.576159

### Training model for attackComplexity
### Selecting Model and Tokenizer
### Adding Tokens
### Number of tokens in Tokenizer
33867
### Splitting Dataset
### Lemmatizing Sentences
### Tokenizing Sentences
### Encoding Dataset
### Training


  3%|▎         | 500/18120 [02:37<1:31:55,  3.19it/s]

{'loss': 0.1974, 'grad_norm': 0.083956778049469, 'learning_rate': 4.862030905077263e-05, 'epoch': 0.08}


  6%|▌         | 1000/18120 [05:16<1:27:35,  3.26it/s]

{'loss': 0.169, 'grad_norm': 0.25719785690307617, 'learning_rate': 4.7240618101545256e-05, 'epoch': 0.17}


  8%|▊         | 1500/18120 [07:55<1:26:09,  3.22it/s]

{'loss': 0.1485, 'grad_norm': 0.060893166810274124, 'learning_rate': 4.586092715231788e-05, 'epoch': 0.25}


 11%|█         | 2000/18120 [10:33<1:26:04,  3.12it/s]

{'loss': 0.1641, 'grad_norm': 0.21197843551635742, 'learning_rate': 4.448123620309051e-05, 'epoch': 0.33}


 14%|█▍        | 2500/18120 [13:08<1:21:13,  3.20it/s]

{'loss': 0.143, 'grad_norm': 0.22025230526924133, 'learning_rate': 4.310154525386314e-05, 'epoch': 0.41}


 17%|█▋        | 3000/18120 [15:45<1:18:49,  3.20it/s]

{'loss': 0.1476, 'grad_norm': 0.20624740421772003, 'learning_rate': 4.1721854304635764e-05, 'epoch': 0.5}


 19%|█▉        | 3500/18120 [18:21<1:16:31,  3.18it/s]

{'loss': 0.1428, 'grad_norm': 0.1841527372598648, 'learning_rate': 4.034216335540839e-05, 'epoch': 0.58}


 22%|██▏       | 4000/18120 [20:57<1:13:23,  3.21it/s]

{'loss': 0.1517, 'grad_norm': 0.1653788685798645, 'learning_rate': 3.896247240618102e-05, 'epoch': 0.66}


 25%|██▍       | 4500/18120 [23:33<1:10:07,  3.24it/s]

{'loss': 0.1404, 'grad_norm': 0.2682121992111206, 'learning_rate': 3.7582781456953645e-05, 'epoch': 0.75}


 28%|██▊       | 5000/18120 [26:09<1:09:22,  3.15it/s]

{'loss': 0.1567, 'grad_norm': 0.2100558876991272, 'learning_rate': 3.620309050772627e-05, 'epoch': 0.83}


 30%|███       | 5500/18120 [28:45<1:06:01,  3.19it/s]

{'loss': 0.1407, 'grad_norm': 0.15729781985282898, 'learning_rate': 3.48233995584989e-05, 'epoch': 0.91}


 33%|███▎      | 6000/18120 [31:21<1:03:13,  3.20it/s]

{'loss': 0.1389, 'grad_norm': 0.8957111239433289, 'learning_rate': 3.3443708609271526e-05, 'epoch': 0.99}


 36%|███▌      | 6500/18120 [33:59<1:02:14,  3.11it/s]

{'loss': 0.1473, 'grad_norm': 0.14737693965435028, 'learning_rate': 3.206401766004415e-05, 'epoch': 1.08}


 39%|███▊      | 7000/18120 [36:37<57:12,  3.24it/s]  

{'loss': 0.1377, 'grad_norm': 0.14239954948425293, 'learning_rate': 3.068432671081678e-05, 'epoch': 1.16}


 39%|███▉      | 7069/18120 [36:59<58:58,  3.12it/s]  

# INFER

In [7]:
from pathlib import Path
from nltk import tokenize
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import argparse
from sklearn.metrics import f1_score, precision_score, recall_score, balanced_accuracy_score, accuracy_score

In [8]:

# -------------------------------------- MODEL -------------------------------------

def load_model(model_path):
    """Return ``AutoModelForSequenceClassification.from_pretrained(model_path)``."""
    from transformers import AutoModelForSequenceClassification as auto_classifier
    return auto_classifier.from_pretrained(model_path)


def select_tokenizer_model(model_name, extra_tokens, token_file, model_path, config_path):
    """
    Build the tokenizer and a config-initialised model for inference.

    Args:
        model_name: One of 'distilbert', 'bert', 'deberta', 'albert', 'roberta'.
        extra_tokens: When truthy, tokens listed in ``token_file`` are added to
            the tokenizer via ``add_tokens_from_file``.
        token_file: Path of the extra-vocabulary file (one token per line).
        model_path: Unused here; kept for call-site compatibility.
        config_path: Path of the saved model's ``config.json``.

    Returns:
        ``(tokenizer, model)``; the model is constructed from the config and
        its token embeddings are resized to ``len(tokenizer)``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.

    The module-level flag ``lemmatization`` (set by ``main``) controls whether
    the extra tokens are lemmatised; it is treated as False when undefined.

    NOTE(review): this definition replaces the four-argument function of the
    same name defined earlier in the notebook once this cell has run.
    """
    if model_name == 'distilbert':
        from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertConfig
        config = DistilBertConfig.from_pretrained(config_path)
        tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
        model = DistilBertForSequenceClassification(config)

    elif model_name == 'bert':
        from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
        config = BertConfig.from_pretrained(config_path)
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        model = BertForSequenceClassification(config)

    elif model_name == 'deberta':
        from transformers import DebertaConfig, DebertaTokenizerFast, DebertaForSequenceClassification
        config = DebertaConfig.from_pretrained(config_path)
        tokenizer = DebertaTokenizerFast.from_pretrained('microsoft/deberta-base')
        model = DebertaForSequenceClassification(config)

    elif model_name == 'albert':
        from transformers import AlbertConfig, AlbertTokenizerFast, AlbertForSequenceClassification
        config = AlbertConfig.from_pretrained(config_path)
        tokenizer = AlbertTokenizerFast.from_pretrained('albert-base-v1')
        model = AlbertForSequenceClassification(config)

    elif model_name == 'roberta':
        from transformers import RobertaConfig, RobertaTokenizerFast, RobertaForSequenceClassification
        config = RobertaConfig.from_pretrained(config_path)
        tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
        model = RobertaForSequenceClassification(config)

    else:
        # Without this branch an unknown name only surfaced later as an
        # UnboundLocalError on `tokenizer`.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of: "
            "distilbert, bert, deberta, albert, roberta"
        )

    ### Add Tokens
    if extra_tokens:
        # Read the flag defensively so the function also works before main() set it.
        lemmatize_tokens = globals().get('lemmatization', False)
        add_tokens_from_file(token_file, tokenizer, lemmatize_tokens)
    number_tokens = len(tokenizer)

    print("### Number of tokens in Tokenizer")
    print(number_tokens)

    # Keep the embedding matrix in step with the (possibly extended) vocabulary.
    model.resize_token_embeddings(number_tokens)

    return tokenizer, model

def add_tokens_from_file(token_file, tokenizer, lemmatize=False):
    """
    Add the tokens listed in ``token_file`` (one per line) to ``tokenizer``.

    Args:
        token_file: Path of a UTF-8 text file with one token per line.
        tokenizer: Object exposing ``add_tokens(list_of_str)``.
        lemmatize: When truthy, each token is passed through ``lemmatize_noun``
            before being added.

    Blank lines are ignored so no empty token is submitted. The file is closed
    even if reading or lemmatising raises.
    """
    print("### Adding Tokens")

    token_list = []
    with open(token_file, 'r', encoding='utf-8') as file_:
        for line in file_:
            token = line.rstrip("\n")
            if not token.strip():
                continue
            token_list.append(lemmatize_noun(token) if lemmatize else token)

    tokenizer.add_tokens(token_list)

# -------------------------------------- METRICS -----------------------------------

def get_pred_accuracy(target, output):
    """Return the percentage (0-100) of samples whose arg-max class equals the target.

    ``output`` is reduced with ``argmax(axis=1)`` before it is compared
    element-wise with ``target``.
    """
    predicted_classes = output.argmax(axis=1)
    hits = target == predicted_classes
    n_correct = np.sum(hits)
    return (n_correct / target.size) * 100

def get_accuracy_score(target, output):
    """Return sklearn's ``accuracy_score`` for true labels ``target`` and predictions ``output``.

    ``output`` is forwarded unchanged (no argmax is applied here, unlike
    ``get_pred_accuracy``), so callers pass predicted class ids.
    """
    return accuracy_score(target, output)

def get_f1_score(target, output):
    """Return sklearn's ``f1_score`` for ``target`` vs. ``output`` using ``average='weighted'``."""
    return f1_score(target, output, average='weighted')

def get_precision_score(target, output):
    """Return sklearn's ``precision_score`` for ``target`` vs. ``output`` using ``average='weighted'``."""
    return precision_score(target, output, average='weighted')

def get_recall_score(target, output):
    """Return sklearn's ``recall_score`` for ``target`` vs. ``output`` using ``average='weighted'``."""
    return recall_score(target, output, average='weighted')

def get_mean_accuracy(target, output):
    """Return the mean of the recall on class 1 and the recall on class 0.

    ``output`` is reduced with ``argmax(axis=1)`` first. Only the label values
    0 and 1 are counted. A tiny epsilon in the denominators avoids division
    by zero when a class is absent from ``target``.
    """
    eps = 1e-20
    predicted = output.argmax(axis=1)

    is_pos = target == 1
    is_neg = target == 0

    # Ground-truth counts: positives (TP + FN) and negatives (TN + FP).
    n_pos = np.sum(is_pos, axis=0).astype(float)
    n_neg = np.sum(is_neg, axis=0).astype(float)

    # Correct predictions per class: TP and TN.
    n_true_pos = np.sum(is_pos & (predicted == 1), axis=0).astype(float)
    n_true_neg = np.sum(is_neg & (predicted == 0), axis=0).astype(float)

    pos_recall = n_true_pos / (n_pos + eps)
    neg_recall = n_true_neg / (n_neg + eps)

    return (pos_recall + neg_recall) / 2

def get_balanced_accuracy(target, output):
    """Return sklearn's ``balanced_accuracy_score`` for true labels ``target`` and predictions ``output``."""
    return balanced_accuracy_score(target, output)

In [9]:
# One entry per CVSS metric to evaluate.
#   name           - metric name, also the last path component of output_dir
#   num_labels     - number of classes (length of classes_names)
#   classes_names  - class names; the position is the integer label
#   label_position - column index of this metric in the CSV files
#   output_dir     - directory the saved model is loaded from
# NOTE(review): the training cell earlier in this notebook writes to 'output1/...',
# while this list points at 'output/...' - confirm which directory is intended.
categories = [
    {
        "name": metric_name,
        "num_labels": len(class_names),
        "classes_names": class_names,
        "label_position": column,
        "output_dir": 'output/' + metric_name,
    }
    for column, (metric_name, class_names) in enumerate(
        [
            ("attackVector", ['NETWORK', 'LOCAL', 'PHYSICAL', 'ADJACENT_NETWORK']),
            ("attackComplexity", ['LOW', 'HIGH']),
            ("privilegeReq", ['NONE', 'LOW', 'HIGH']),
            ("userInteraction", ['NONE', 'REQUIRED']),
            ("scope", ['UNCHANGED', 'CHANGED']),
            ("confidentiality", ['NONE', 'LOW', 'HIGH']),
            ("integrity", ['NONE', 'LOW', 'HIGH']),
            ("availability", ['NONE', 'LOW', 'HIGH']),
        ],
        start=1,
    )
]

In [None]:
# -------------------------------------- MAIN -----------------------------------

def main():
    """Evaluate the saved classifier of every entry in ``categories`` on the test split.

    For each category the function rebuilds the tokenizer, reads
    ``data/test.csv``, optionally lemmatises and then tokenises the texts,
    loads the saved model from the category's ``output_dir`` and prints
    accuracy, weighted F1 / precision / recall and balanced accuracy.

    Relies on ``CVSSDataset``, ``read_cvss_csv`` and ``lemmatize`` defined in
    the training part of this notebook.

    Side effect: sets the module-level flag ``lemmatization``, which
    ``select_tokenizer_model`` reads.
    """
    global lemmatization

    # --- Run configuration (must mirror the training run) ---
    model_name = 'distilbert'
    extra_tokens = True                 # extend the tokenizer with the extra vocabulary
    token_file = 'vocab/CVSS_5k.vocab'  # extra-vocabulary file
    lemmatization = True                # lemmatise texts (and extra tokens)

    batch_size = 2

    # Report which device is used.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("### Device: ", device)
    if torch.cuda.is_available():
        devName = torch.cuda.get_device_name(0)
        print(f"GPU name is {devName}")

    # One evaluation pass per category.
    for category in categories:
        # This cell evaluates; the message previously said "Training model".
        print(f"\n### Evaluating model for {category['name']}")

        # Directories and settings of the current category
        root_dir = category["output_dir"]
        list_classes = category["classes_names"]
        label_position = category["label_position"]
        model_path = root_dir
        config_path = root_dir + '/' + 'config.json'

        ### Select Model
        # Only the tokenizer is needed here; the trained weights are loaded
        # from disk by load_model() further down.
        tokenizer, _ = select_tokenizer_model(model_name, extra_tokens, token_file, model_path, config_path)

        ### Load Dataset
        print("### Loading Dataset")

        # Score on the held-out split. The original read 'data/train.csv' into
        # these test_* variables, which reports training-set fit instead.
        test_texts, test_labels = read_cvss_csv('data/test.csv', label_position, list_classes)

        ### Lemmatize Sentences
        if lemmatization:
            print("### Lemmatizing Sentences")
            eval_texts, _ = lemmatize(test_texts)
        else:
            eval_texts = test_texts

        ### Tokenize Sentences
        print("### Tokenizing Sentences")
        test_encodings = tokenizer(eval_texts, truncation=True, padding=True)

        ### Dataset Encodings
        test_dataset = CVSSDataset(test_encodings, test_labels)

        print("### Dataset Encodings")

        model = load_model(model_path)
        model.to(device)
        model.eval()

        # Evaluation does not need shuffling; a fixed order keeps runs comparable.
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        gt_batches = []
        pred_batches = []

        # no_grad: skip building autograd graphs during inference.
        with torch.no_grad():
            for batch in tqdm(test_loader):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                # Labels are not passed to the model: no loss is needed here.
                logits = model(input_ids, attention_mask=attention_mask).logits

                gt_batches.append(batch['labels'].numpy())
                # argmax of the logits equals argmax of their softmax.
                pred_batches.append(logits.argmax(dim=1).cpu().numpy())

        gt_list = np.concatenate(gt_batches, axis=0)
        predictions = np.concatenate(pred_batches, axis=0)

        print("Accuracy = {:.6f}   F1-score = {:.6f}   Precision = {:.6f}   Recall = {:.6f}   mean Accuracy = {:.6f}".format(
            get_accuracy_score(gt_list, predictions),
            get_f1_score(gt_list, predictions),
            get_precision_score(gt_list, predictions),
            get_recall_score(gt_list, predictions),
            get_balanced_accuracy(gt_list, predictions),
        ))


if __name__ == '__main__':
    main()

# LOAD AND PREDICT

In [15]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch

# Model folders, keyed by category name
model_folders = dict(
    attack_vector="output/attackVector",
    attack_complexity="output/attackComplexity",
    privileges_required="output/privilegeReq",
    user_interaction="output/userinteraction",
    scope="output/scope",
    integrity_impact="output/integrity",
    confidentiality_impact="output/confidentiality",
    availability_impact="output/availability",
)

# Load the tokenizer (a single tokenizer is used because all models are DistilBERT-based)
# NOTE(review): this is the *uncased* checkpoint, whereas load_tokenizer() further down in
# this notebook uses 'distilbert-base-cased' for DistilBERT - confirm which vocabulary the
# models in output/ were trained with.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Load one sequence-classification model per category
models = {}
for category in model_folders:
    folder = model_folders[category]
    model = DistilBertForSequenceClassification.from_pretrained(folder)
    models[category] = model

# Report which models were loaded
print("Semua model berhasil dimuat:")
print(list(models))

Semua model berhasil dimuat:
['attack_vector', 'attack_complexity', 'privileges_required', 'user_interaction', 'scope', 'integrity_impact', 'confidentiality_impact', 'availability_impact']


In [20]:
# Input text
input_text = "Improper conditions check in some Intel(R) Ethernet Controllers 800 series Linux drivers before version 1.4.11 may allow an authenticated user to potentially enable information disclosure or denial of service via local access."

# Tokenize the input
inputs = tokenizer(
    input_text,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)

# Predict with every model; gradient tracking is disabled for the whole loop
outputs = {}
with torch.no_grad():
    for category, model in models.items():
        logits = model(**inputs).logits
        # Index of the highest-scoring class, as a plain Python int
        prediction = logits.argmax(dim=-1).item()
        outputs[category] = prediction

# Show the prediction results
print("Hasil prediksi:")
for category, prediction in outputs.items():
    print(f"{category}: {prediction}")

Hasil prediksi:
attack_vector: 2
attack_complexity: 0
privileges_required: 0
user_interaction: 0
scope: 0
integrity_impact: 2
confidentiality_impact: 0
availability_impact: 0


In [40]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, balanced_accuracy_score

# -------------------------------------- MODEL -------------------------------------

def load_model(model_path):
    """
    Return the sequence-classification model stored at `model_path`.

    The path is passed unchanged to
    ``AutoModelForSequenceClassification.from_pretrained``.
    """
    return AutoModelForSequenceClassification.from_pretrained(model_path)

def load_tokenizer(model_name, extra_tokens, token_file, model_path, config_path):
    """
    Build the fast tokenizer that corresponds to `model_name`.

    Supported names are 'distilbert', 'bert', 'deberta' and 'roberta'; any
    other value raises ValueError. When `extra_tokens` is truthy, the tokens
    listed in `token_file` are added to the tokenizer via
    add_tokens_from_file(). `model_path` and `config_path` are accepted but
    not used by this function.
    """
    # (model name, tokenizer class name in transformers, pretrained checkpoint)
    supported = (
        ('distilbert', 'DistilBertTokenizerFast', 'distilbert-base-cased'),
        ('bert', 'BertTokenizerFast', 'bert-base-uncased'),
        ('deberta', 'DebertaTokenizerFast', 'microsoft/deberta-base'),
        ('roberta', 'RobertaTokenizerFast', 'roberta-base'),
    )
    for name, class_name, checkpoint in supported:
        if model_name == name:
            break
    else:
        raise ValueError(f"Unknown model type {model_name}")

    # Imported only after validation so an unknown name fails without touching transformers
    import transformers
    tokenizer = getattr(transformers, class_name).from_pretrained(checkpoint)

    # Optionally extend the vocabulary with custom tokens
    if extra_tokens:
        add_tokens_from_file(token_file, tokenizer)

    return tokenizer

def add_tokens_from_file(token_file, tokenizer):
    """
    Add the tokens listed in a text file to the tokenizer.

    The file is read line by line. Surrounding whitespace is stripped from
    each line, and lines that are empty after stripping are skipped so that
    no empty-string token is handed to the tokenizer.

    Args:
        token_file: Path of the token list file; it is decoded as UTF-8.
        tokenizer: Object exposing ``add_tokens(list_of_str)``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Explicit encoding: open()'s default depends on the platform locale, so the
    # same vocab file could otherwise decode differently on different machines.
    with open(token_file, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        token_list = [token for token in stripped_lines if token]
    tokenizer.add_tokens(token_list)

# -------------------------------------- PREDICTION --------------------------------

def predict(input_description, model, tokenizer, device):
    """
    Classify one description and return the winning class with its probabilities.

    Returns a tuple ``(predicted_class, probs)``: `predicted_class` is the
    index of the highest-probability class as a Python int, and `probs` is
    the softmax of the model's logits taken along dimension 1.
    """
    # Encode the text as PyTorch tensors and move them to the target device
    encoded = tokenizer(
        input_description,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=512,
    ).to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=1)

    # .item() requires exactly one prediction, i.e. a single input description
    return probs.argmax(dim=1).item(), probs

# -------------------------------------- MAIN -----------------------------------

def main():
    """
    Run one example prediction with the model stored under output/privilegeReq.

    Picks the device, loads the model and a DistilBERT tokenizer, classifies a
    hard-coded vulnerability description and prints the predicted class index
    together with the class probabilities.
    """
    # Use the GPU when CUDA is available, otherwise the CPU
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print("Device:", device)

    # Model location (config_path is only forwarded to load_tokenizer)
    model_path = "output/privilegeReq//"
    config_path = "output/config.json"

    # Tokenizer settings
    tokenizer_model_name = 'distilbert'  # Change to your model type (e.g., bert, roberta, etc.)
    extra_tokens = False  # Set to True to add the tokens listed in token_file
    token_file = "vocab/CVSS_5k.vocab"

    # Example input description
    input_description = "A vulnerability in Cisco Intercloud Fabric for Business and Cisco Intercloud Fabric for Providers could allow an unauthenticated, remote attacker to connect to the database used by these products. More Information: CSCus99394. Known Affected Releases: 7.3(0)ZN(0.99)."

    # Load the model first, then the tokenizer
    model = load_model(model_path).to(device)
    tokenizer = load_tokenizer(
        tokenizer_model_name,
        extra_tokens,
        token_file,
        model_path,
        config_path,
    )

    # Classify the description and report the result
    predicted_class, probs = predict(input_description, model, tokenizer, device)
    print(f"Predicted class: {predicted_class}")
    print(f"Predicted probabilities: {probs}")

# Run the example prediction only when this code is the top-level module, not when it is imported.
if __name__ == '__main__':
    main()


Device: cuda
Predicted class: 0
Predicted probabilities: tensor([[0.9554, 0.0376, 0.0070]], device='cuda:0')


In [None]:
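# Scratch note (not executable): NETWORK,LOW,LOW,NONE,UNCHANGED,HIGH,HIGH,HIGH
# NOTE(review): looks like CVSS base-metric labels, one per category predicted above - confirm order and source.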