<h1><b>DOWNLOAD DATASET + EDA</b></h1>

In [1]:
import os
import json
import gzip
import requests
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

def download_nvd_feed(year, timeout=60):
    """Download the gzipped NVD JSON 1.1 feed for ``year`` into the working directory.

    Args:
        year: Year used to build the feed file name (``nvdcve-1.1-{year}.json.gz``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The local file name the archive was written to.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    archive_name = f'nvdcve-1.1-{year}.json.gz'
    feed_url = f'https://nvd.nist.gov/feeds/json/cve/1.1/{archive_name}'
    response = requests.get(feed_url, timeout=timeout)

    # Fail before writing anything: process_cve_data reuses any file that already
    # exists on disk, so saving an error page here would poison every later run.
    response.raise_for_status()

    with open(archive_name, 'wb') as f:
        f.write(response.content)

    return archive_name


def extract_cve_from_item(item):
    """Flatten one feed entry into a row: English description plus CVSS v3 base metrics.

    Returns ``None`` when the entry has no ``baseMetricV3`` section under ``impact``
    or when none of its descriptions is tagged ``lang == 'en'``.
    """
    if 'baseMetricV3' not in item['impact']:
        return None

    # Take the first English description, if there is one.
    description = None
    for entry in item['cve']['description']['description_data']:
        if entry['lang'] == 'en':
            description = entry['value']
            break
    if description is None:
        return None

    single_line = description.replace('\n', ' ')
    metrics = item['impact']['baseMetricV3']['cvssV3']

    # (output column, key inside the cvssV3 object) — order defines the column order.
    column_to_metric = (
        ('cvssv3_attack_vector', 'attackVector'),
        ('cvssv3_attack_complexity', 'attackComplexity'),
        ('cvssv3_privileges_required', 'privilegesRequired'),
        ('cvssv3_user_interaction', 'userInteraction'),
        ('cvssv3_scope', 'scope'),
        ('cvssv3_confidentiality_impact', 'confidentialityImpact'),
        ('cvssv3_integrity_impact', 'integrityImpact'),
        ('cvssv3_availability_impact', 'availabilityImpact'),
    )

    row = {'english_description': single_line}
    for column, metric_key in column_to_metric:
        row[column] = metrics[metric_key]
    return row

def process_cve_data(years):
    """Load the feed archive of every year in ``years`` and collect the usable rows.

    An archive already present in the working directory is reused; otherwise it is
    downloaded first. Entries for which ``extract_cve_from_item`` returns ``None``
    are counted as skipped. Progress and totals are printed.

    Returns:
        A DataFrame with one row per extracted CVE.
    """
    rows = []
    n_processed = 0
    n_skipped = 0

    for year in years:
        archive = f'nvdcve-1.1-{year}.json.gz'

        if os.path.exists(archive):
            print(f"File for year {year} already exists, using the existing file.")
        else:
            print(f"Downloading data for year {year}...")
            archive = download_nvd_feed(year)

        with gzip.open(archive, 'rt', encoding='utf-8') as handle:
            feed = json.load(handle)

        items = feed['CVE_Items']
        print(f"Processing data for year {year}: {len(items)} total CVEs")

        extracted = [extract_cve_from_item(item) for item in items]
        kept = [row for row in extracted if row is not None]

        n_processed += len(kept)
        n_skipped += len(extracted) - len(kept)
        rows.extend(kept)

    print(f"Processed CVEs: {n_processed}\nSkipped CVEs: {n_skipped}")
    return pd.DataFrame(rows)


years = [2022, 2023, 2024]
cvss_data = process_cve_data(years)

# Wrap every description in literal double-quote characters.
# NOTE(review): these quotes become part of the text itself (they are written to the
# CSV files below and are counted by any later length statistics) — confirm this is
# intended rather than relying on the CSV writer's own field quoting.
cvss_data['english_description'] = cvss_data['english_description'].apply(lambda x: f'"{x}"')

# CLEANING: replace every run of newline / carriage-return characters with one space.
cvss_data['english_description'] = cvss_data['english_description'].str.replace(r'[\n\r]+', ' ', regex=True)

# Save the complete dataset as CSV.
cvss_data.to_csv('cvss_data.csv', index=False)

# Create the 'data' directory if needed; exist_ok avoids a separate existence check.
os.makedirs('data', exist_ok=True)

# Split into training and testing sets (80% train, 20% test), seeded for repeatability.
train_data, test_data = train_test_split(cvss_data, test_size=0.2, random_state=42)

# Write both splits to CSV.
train_data.to_csv('data/train.csv', index=False)
test_data.to_csv('data/test.csv', index=False)

print("Data berhasil dipisahkan menjadi 'data/train.csv' dan 'data/test.csv'.")

Downloading data for year 2022...
Processing data for year 2022: 25287 total CVEs
Downloading data for year 2023...
Processing data for year 2023: 29079 total CVEs
Downloading data for year 2024...
Processing data for year 2024: 32859 total CVEs
Processed CVEs: 60400
Skipped CVEs: 26825
Data berhasil dipisahkan menjadi 'data/train.csv' dan 'data/test.csv'.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60400 entries, 0 to 60399
Data columns (total 9 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   english_description            60400 non-null  object
 1   cvssv3_attack_vector           60400 non-null  object
 2   cvssv3_attack_complexity       60400 non-null  object
 3   cvssv3_privileges_required     60400 non-null  object
 4   cvssv3_user_interaction        60400 non-null  object
 5   cvssv3_scope                   60400 non-null  object
 6   cvssv3_confidentiality_impact  60400 non-null  object
 7

In [None]:
!mkdir -p vocab
!wget https://raw.githubusercontent.com/omidiyanto/distilbert-cvss-prediction/refs/heads/master/vocab/CVSS_5k.vocab -P vocab/

In [None]:
# Summary statistics for every column, whatever its dtype.
summary_table = cvss_data.describe(include='all')
print(summary_table)

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
cvss_data.info()

In [None]:
# Show the first rows. The original passed the bound method `cvss_data.head`
# (never called) to print(), which prints the method's repr instead of a preview.
cvss_data.head()

In [None]:
# Number of distinct values in each column.
unique_counts = cvss_data.nunique()
print("\nUnique Values per Column:", unique_counts, sep="\n")

In [None]:
# Boolean mask of missing cells, computed once and reused for both summaries.
null_mask = cvss_data.isnull()

# Missing values per column (absolute counts).
print("\nMissing Values per Column:")
print(null_mask.sum())

# Missing values per column as a percentage of all rows.
missing_percentage = null_mask.mean() * 100
print("\nPercentage of Missing Values:")
print(missing_percentage)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count plot for each CVSS v3 metric column, most frequent value first.
categories = [
    'cvssv3_attack_vector', 'cvssv3_attack_complexity', 'cvssv3_privileges_required',
    'cvssv3_user_interaction', 'cvssv3_scope', 'cvssv3_confidentiality_impact',
    'cvssv3_integrity_impact', 'cvssv3_availability_impact'
]

# 4 x 2 grid: one panel per metric column.
fig, axes = plt.subplots(4, 2, figsize=(15, 10))
for ax, category in zip(axes.flat, categories):
    pretty_name = category.replace('_', ' ').capitalize()
    by_frequency = cvss_data[category].value_counts().index
    sns.countplot(y=category, data=cvss_data, order=by_frequency, ax=ax)
    ax.set_title(f"Distribution of {pretty_name}")
    ax.set_xlabel('Count')
    ax.set_ylabel(pretty_name)

fig.tight_layout()
plt.show()

In [None]:
# (rows, columns) of the assembled dataset.
dataset_shape = cvss_data.shape
print(dataset_shape)

In [None]:
# Relative frequency (in percent) of every value, one metric column at a time.
for column in categories:
    print(f"\nDistribusi untuk {column}:")
    percentages = cvss_data[column].value_counts(normalize=True).mul(100)
    print(percentages)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Character length of every description, stored as a new column.
cvss_data['description_length'] = cvss_data['english_description'].map(len)

# Descriptive statistics of the lengths.
length_stats = cvss_data['description_length'].describe()
print(length_stats)

# Histogram (with KDE overlay) of the description lengths.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(cvss_data['description_length'], kde=True, bins=30, color='blue', ax=ax)
ax.set_title('Distribusi Panjang Deskripsi')
ax.set_xlabel('Panjang Deskripsi')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Whitespace-separated word count of every description, stored as a new column.
cvss_data['word_count'] = [len(text.split()) for text in cvss_data['english_description']]

# Descriptive statistics of the word counts.
length_stats = cvss_data['word_count'].describe()
print(length_stats)

# Histogram (with KDE overlay) of the word counts.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(cvss_data['word_count'], kde=True, bins=30, color='blue', ax=ax)
ax.set_title('Distribusi Jumlah Kata Deskripsi')
ax.set_xlabel('Jumlah Kata')
ax.set_ylabel('Frekuensi')
plt.show()

In [None]:
# Character lengths, computed once and used to locate both extremes.
text_lengths = cvss_data['english_description'].apply(len)
longest_index = text_lengths.idxmax()
shortest_index = text_lengths.idxmin()

# Fetch the longest and the shortest description by index label.
longest_text = cvss_data.loc[longest_index, 'english_description']
shortest_text = cvss_data.loc[shortest_index, 'english_description']

# Display both texts.
print("Teks Terpanjang:\n", longest_text)
print("\nTeks Terpendek:\n", shortest_text)

<h1><b>TRAINING MODEL</b></h1>

In [19]:
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
import csv
from transformers import Trainer, TrainingArguments, AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
import argparse
import os
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import nltk
# Fetch the NLTK resources used by the lemmatization helpers in this notebook.
# Presumably: the tagger backs pos_tag, wordnet / omw-1.4 back WordNetLemmatizer and
# the wordnet corpus, and punkt / punkt_tab back word_tokenize — confirm against NLTK.
# The last call is the cell's final expression, so its return value is what the
# cell displays.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:
class CVSSDataset(torch.utils.data.Dataset):
    """Dataset pairing per-sample encodings with integer class labels.

    ``encodings`` is a mapping whose values are indexable per sample; ``labels`` is an
    indexable sequence. Its length defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def read_cvss_txt(split_dir, list_classes):
    """Read texts and integer labels from ``LOW`` / ``HIGH`` sub-directories.

    Each file inside ``split_dir/LOW`` and ``split_dir/HIGH`` is one sample. Its label
    is the position of the directory name in ``list_classes``.

    A directory whose name is not in ``list_classes`` is skipped entirely. Previously
    its texts were still collected without a label, which left ``texts`` and
    ``labels`` with different lengths.

    Args:
        split_dir: Directory containing the class sub-directories.
        list_classes: Class names; the index of a name is its integer label.

    Returns:
        ``(texts, labels)`` — two lists of equal length.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["LOW", "HIGH"]:
        if label_dir not in list_classes:
            continue
        label_id = list_classes.index(label_dir)
        # Sorted so the sample order does not depend on the file system.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Explicit UTF-8, matching read_cvss_csv.
            texts.append(text_file.read_text(encoding='utf-8'))
            labels.append(label_id)

    return texts, labels

def read_cvss_csv(file_name, num_label, list_classes):
    """Read texts and integer labels from a CSV file with a header row.

    The text is taken from the first column and the class name from column
    ``num_label``; the label is the position of that class name in ``list_classes``.

    Rows that are blank, too short to contain column ``num_label``, or whose class
    name is not in ``list_classes`` are skipped as a whole. Previously an unknown
    class name still added the text without a label, which shifted every following
    label against its text.

    Args:
        file_name: Path of the UTF-8 encoded CSV file.
        num_label: Zero-based index of the label column.
        list_classes: Class names; the index of a name is its integer label.

    Returns:
        ``(texts, labels)`` — two lists of equal length.
    """
    texts = []
    labels = []

    # First occurrence wins, like the original linear search with ``break``.
    class_to_id = {}
    for index, class_name in enumerate(list_classes):
        class_to_id.setdefault(class_name, index)

    # newline='' lets the csv module handle line endings inside quoted fields itself.
    with open(file_name, 'r', encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',', quotechar='"')

        # Skip the header row, if there is one.
        next(csv_reader, None)

        for row in csv_reader:
            if not row or len(row) <= num_label:
                continue
            label_id = class_to_id.get(row[num_label])
            if label_id is None:
                continue
            texts.append(row[0])
            labels.append(label_id)

    return texts, labels


In [21]:
def select_tokenizer_model(model_name, extra_tokens, token_file, num_labels):
    """Build the tokenizer and sequence-classification model for ``model_name``.

    The tokenizer is loaded from the checkpoint listed for the model family. The model
    is constructed from that checkpoint's configuration with ``num_labels`` classes.
    NOTE(review): the model is created from the config alone rather than through
    ``from_pretrained`` — confirm that not loading pretrained weights is intended.

    When ``extra_tokens`` is true, the tokens in ``token_file`` are added to the
    tokenizer (lemmatized if the module-level ``lemmatization`` flag is set, and
    treated as off when that flag has not been defined yet). The model's embedding
    matrix is then resized to the tokenizer size.

    Args:
        model_name: One of 'distilbert', 'bert', 'deberta', 'albert', 'roberta'.
        extra_tokens: Whether to add the tokens from ``token_file``.
        token_file: Path of the extra-token file, one token per line.
        num_labels: Number of output classes.

    Returns:
        ``(tokenizer, model)``.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    print("### Selecting Model and Tokenizer")

    # model_name -> (config class, tokenizer class, model class, checkpoint name)
    supported = {
        'distilbert': ('DistilBertConfig', 'DistilBertTokenizerFast',
                       'DistilBertForSequenceClassification', 'distilbert-base-cased'),
        'bert': ('BertConfig', 'BertTokenizerFast',
                 'BertForSequenceClassification', 'bert-base-uncased'),
        'deberta': ('DebertaConfig', 'DebertaTokenizerFast',
                    'DebertaForSequenceClassification', 'microsoft/deberta-base'),
        'albert': ('AlbertConfig', 'AlbertTokenizerFast',
                   'AlbertForSequenceClassification', 'albert-base-v1'),
        'roberta': ('RobertaConfig', 'RobertaTokenizerFast',
                    'RobertaForSequenceClassification', 'roberta-base'),
    }
    if model_name not in supported:
        # Previously an unknown name fell through every branch and surfaced later as
        # an UnboundLocalError on `tokenizer`.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected one of {sorted(supported)}"
        )

    # Imported lazily, as in the original per-branch imports.
    import transformers

    config_name, tokenizer_name, model_class_name, checkpoint = supported[model_name]
    config = getattr(transformers, config_name).from_pretrained(checkpoint)
    config.num_labels = num_labels
    tokenizer = getattr(transformers, tokenizer_name).from_pretrained(checkpoint)
    model = getattr(transformers, model_class_name)(config)

    ### Add Tokens
    if extra_tokens:
        # Read the flag defensively: it is only assigned inside main().
        lemmatize_tokens = globals().get('lemmatization', False)
        add_tokens_from_file(token_file, tokenizer, lemmatize_tokens)
    number_tokens = len(tokenizer)

    print("### Number of tokens in Tokenizer")
    print(number_tokens)

    # The embedding matrix must cover any tokens that were just added.
    model.resize_token_embeddings(number_tokens)

    return tokenizer, model

def add_tokens_from_file(token_file, tokenizer, lemmatize=False):
    """Add every line of ``token_file`` to ``tokenizer`` as a new token.

    Args:
        token_file: Path of a UTF-8 text file with one token per line.
        tokenizer: Object exposing ``add_tokens(list_of_str)``.
        lemmatize: When true, each token is passed through ``lemmatize_noun`` first.
    """
    print("### Adding Tokens")

    # The context manager closes the file even if reading fails.
    with open(token_file, 'r', encoding='utf-8') as vocab_file:
        token_list = [line.rstrip("\n") for line in vocab_file]

    if lemmatize:
        token_list = [lemmatize_noun(token) for token in token_list]

    tokenizer.add_tokens(token_list)

In [22]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tag ``pos_tag`` gives ``word``.

    Only the first letter of the tag is used: J, N, V and R select adjective, noun,
    verb and adverb. Any other tag falls back to noun.
    """
    tagged = pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag.
    return wordnet.NOUN

def lemmatize_sentence(sentence):
    """Tokenize ``sentence`` and re-join its tokens, each lemmatized as a noun.

    ``lemmatize_word`` is the POS-aware alternative; this function deliberately
    uses ``lemmatize_noun``.
    """
    tokens = word_tokenize(sentence)
    return ' '.join(map(lemmatize_noun, tokens))

def lemmatize(train_texts, test_texts=None):
    """Lemmatize every sentence of the training texts and, if given, the test texts.

    Returns:
        ``(lemmatized_train, lemmatized_test)``; the second list is empty when
        ``test_texts`` is ``None``.
    """
    lemmatized_texts_train = [lemmatize_sentence(sentence) for sentence in train_texts]

    if test_texts is None:
        lemmatized_texts_test = []
    else:
        lemmatized_texts_test = [lemmatize_sentence(sentence) for sentence in test_texts]

    return lemmatized_texts_train, lemmatized_texts_test

def lemmatize_word(word):
    """Lemmatize ``word`` using the POS returned by ``get_wordnet_pos``.

    For adverbs, the name of the first pertainym of the word's first adverb synset is
    returned when it can be looked up; on any lookup failure the plain lemma is
    returned instead.
    """
    lemmatizer = WordNetLemmatizer()
    wordnet_tag = get_wordnet_pos(word)
    plain_lemma = lemmatizer.lemmatize(word, wordnet_tag)

    if wordnet_tag not in ("r", "R"):
        return plain_lemma

    try:
        adverb_lemmas = wordnet.synset(word + '.r.1').lemmas()
        related = adverb_lemmas[0].pertainyms()
        return related[0].name()
    except Exception:
        return plain_lemma

def lemmatize_noun(word):
    """Lemmatize ``word`` with ``WordNetLemmatizer`` using its default POS."""
    return WordNetLemmatizer().lemmatize(word)



In [23]:
def get_pred_accuracy(target, output):
    """Return the percentage of samples whose arg-max prediction equals ``target``.

    ``output`` holds one row of class scores per sample; the predicted class is the
    index of the largest score in each row.
    """
    predicted = output.argmax(axis=1)
    n_correct = (target == predicted).sum()
    return (n_correct / target.size) * 100

def get_binary_mean_accuracy(target, output):
    """Return the mean of positive-class recall and negative-class recall.

    Predictions are the arg-max of each row of ``output``. Class 1 is treated as
    positive and class 0 as negative. A tiny epsilon in the denominators avoids
    division by zero when a class is absent from ``target``.
    """
    eps = 1e-20
    predicted = output.argmax(axis=1)

    is_pos = (target == 1)
    is_neg = (target == 0)

    # Ground-truth counts: TP + FN and TN + FP.
    n_pos = np.sum(is_pos, axis=0).astype(float)
    n_neg = np.sum(is_neg, axis=0).astype(float)

    # Correct predictions per class: TP and TN.
    hit_pos = np.sum(is_pos & (predicted == 1), axis=0).astype(float)
    hit_neg = np.sum(is_neg & (predicted == 0), axis=0).astype(float)

    pos_recall = 1.0 * hit_pos / (n_pos + eps)
    neg_recall = 1.0 * hit_neg / (n_neg + eps)

    return (pos_recall + neg_recall) / 2

def get_evaluation_metrics(target, output, num_labels):
    """Return ``(accuracy, precision, recall, f1_score)`` for the given predictions.

    Args:
        target: Ground-truth class indices.
        output: Per-sample class scores.
        num_labels: Kept for interface compatibility; not used by the computation.

    NOTE(review): ``get_precision``, ``get_recall`` and ``get_f1_score`` are not
    defined in the visible part of this notebook — confirm they exist before relying
    on this function.
    """
    # get_pred_accuracy takes (target, output) only; passing num_labels as a third
    # positional argument raised TypeError.
    accuracy      = get_pred_accuracy(target, output)
    precision     = get_precision(target, output)
    recall        = get_recall(target, output)
    f1_score      = get_f1_score(target, output)

    return accuracy, precision, recall, f1_score

def infer(trainer, test_loader, num_labels):
    """Run ``trainer.predict`` on ``test_loader`` and return the accuracy in percent.

    The raw predictions are turned into per-class probabilities with a softmax over
    dimension 1 and scored against ``label_ids`` by ``get_pred_accuracy``.
    ``num_labels`` is accepted but not used.
    """
    prediction_output = trainer.predict(test_loader)
    logits = torch.from_numpy(prediction_output.predictions)
    class_probs = torch.softmax(logits, dim=1).numpy()

    return get_pred_accuracy(prediction_output.label_ids, class_probs)

In [24]:
# Training configuration for each CVSS v3 base metric, one entry per classifier.
#   name            - metric name, also the last component of output_dir
#   num_labels      - number of classes (length of classes_names)
#   classes_names   - class names; a name's index is its integer label
#   label_position  - CSV column index handed to read_cvss_csv (column 0 is the text)
#   output_dir      - directory given to TrainingArguments / os.makedirs
# NOTE: this rebinds `categories`, which the EDA cells earlier in the notebook use
# as a plain list of column names.
categories = [
    {
        "name": metric_name,
        "num_labels": len(class_names),
        "classes_names": class_names,
        "label_position": column_index,
        "output_dir": f'output/{metric_name}',
    }
    for column_index, (metric_name, class_names) in enumerate(
        [
            ("attackVector", ['NETWORK', 'LOCAL', 'PHYSICAL', 'ADJACENT_NETWORK']),
            ("attackComplexity", ['LOW', 'HIGH']),
            ("privilegeReq", ['NONE', 'LOW', 'HIGH']),
            ("userInteraction", ['NONE', 'REQUIRED']),
            ("scope", ['UNCHANGED', 'CHANGED']),
            ("confidentiality", ['NONE', 'LOW', 'HIGH']),
            ("integrity", ['NONE', 'LOW', 'HIGH']),
            ("availability", ['NONE', 'LOW', 'HIGH']),
        ],
        start=1,
    )
]

In [None]:
def main():
    """Train, save and evaluate one classifier per entry of ``categories``.

    For every category the function builds a fresh tokenizer/model pair, reads
    ``data/train.csv`` and ``data/test.csv`` using the category's label column,
    optionally lemmatizes the texts, tokenizes them, trains with ``Trainer``, saves
    the model to the category's output directory and prints the test accuracy.

    Sets the module-level ``lemmatization`` flag, which ``select_tokenizer_model``
    also reads.

    NOTE(review): ``TrainingArguments`` is created without ``report_to``; the recorded
    output of this cell shows an interactive wandb login prompt — confirm that is
    wanted for unattended runs.
    """
    global lemmatization

    # variables
    model_name = 'distilbert'
    extra_tokens = True  # add the extra vocabulary tokens to the tokenizer
    token_file = 'vocab/CVSS_5k.vocab'  # extra-token file
    lemmatization = True  # lemmatize texts (and extra tokens) before tokenizing

    # Tunable hyper-parameters
    train_batch_size = 8  # batch size for training
    test_batch_size = 4  # batch size for evaluation
    epochs = 3  # number of epochs
    learning_rate = 5e-5  # learning rate
    weight_decay = 0  # weight decay
    warmup_steps = 0  # number of warmup steps
    warmup_ratio = 0  # warmup ratio

    # Report which device is available
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("### Device: ", device)
    if torch.cuda.is_available():
        devName = torch.cuda.get_device_name(0)
        print(f"GPU name is {devName}")

    # Every category reads the same CSV files, so the same sentences come back on
    # each iteration. Remember each sentence's lemmatized form instead of running the
    # NLTK pipeline again for every category.
    lemma_cache = {}

    def lemmatize_cached(texts):
        """Return the lemmatized form of each text, computing each distinct text once."""
        lemmatized = []
        for text in texts:
            if text not in lemma_cache:
                lemma_cache[text] = lemmatize_sentence(text)
            lemmatized.append(lemma_cache[text])
        return lemmatized

    # Loop over every category
    for category in categories:
        print(f"\n### Training model for {category['name']}")

        # Directories and variables for the current category
        output_dir = category["output_dir"]
        num_labels = category["num_labels"]
        classes_names = category["classes_names"]
        label_position = category["label_position"]

        # Create the output directory if it does not exist yet
        os.makedirs(output_dir, exist_ok=True)

        # Select Model
        tokenizer, model = select_tokenizer_model(model_name, extra_tokens=extra_tokens, token_file=token_file, num_labels=num_labels)

        # Splitting Dataset
        print("### Splitting Dataset")

        train_texts, train_labels = read_cvss_csv('data/train.csv', label_position, classes_names)
        test_texts, test_labels = read_cvss_csv('data/test.csv', label_position, classes_names)

        # Lemmatize Sentences
        if lemmatization:
            print("### Lemmatizing Sentences")
            train_inputs = lemmatize_cached(train_texts)
            test_inputs = lemmatize_cached(test_texts)
        else:
            train_inputs = train_texts
            test_inputs = test_texts

        # Tokenize Sentences
        print("### Tokenizing Sentences")

        train_encodings = tokenizer(train_inputs, truncation=True, padding=True)
        test_encodings = tokenizer(test_inputs, truncation=True, padding=True)

        # Dataset Encodings
        print("### Encoding Dataset")

        train_dataset = CVSSDataset(train_encodings, train_labels)
        test_dataset = CVSSDataset(test_encodings, test_labels)

        # Training
        print("### Training")

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=epochs,
            per_device_train_batch_size=train_batch_size,
            per_device_eval_batch_size=test_batch_size,
            learning_rate=learning_rate,
            save_strategy="epoch",
            weight_decay=weight_decay,
            warmup_steps=warmup_steps,
            warmup_ratio=warmup_ratio,
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset,
        )

        trainer.train()
        trainer.save_model()
        acc = infer(trainer, test_dataset, num_labels)
        print(f"Accuracy for {category['name']} = {acc:.6f}")

if __name__ == '__main__':
    main()


### Device:  cuda
GPU name is Tesla T4

### Training model for attackVector
### Selecting Model and Tokenizer
### Adding Tokens
### Number of tokens in Tokenizer
33867
### Splitting Dataset
### Lemmatizing Sentences
### Tokenizing Sentences
### Encoding Dataset
### Training


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
