<h1><b>Import Libraries</b></h1>

In [1]:
import os
import json
import gzip
import requests
import numpy as np
import pandas as pd
from typing import Dict, Any, Optional
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm





<h1><b>Code Functions</b></h1>

In [2]:
# Step 1: Download CVE Data
def download_nvd_feed(year):
    feed_url = f'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{year}.json.gz'
    response = requests.get(feed_url)

    with open(f'nvdcve-1.1-{year}.json.gz', 'wb') as f:
        f.write(response.content)

    return f'nvdcve-1.1-{year}.json.gz'

def extract_cve_from_item(item: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    if 'baseMetricV3' not in item['impact']:
        return None

    en_text = next(
        (item['value'] for item in item['cve']['description']['description_data']
         if item['lang'] == 'en'), None)
    if en_text is None:
        return None

    # Mengganti karakter newline \n dengan spasi atau menghapusnya
    en_text = en_text.replace('\n', ' ')  # Mengganti newline dengan spasi
    
    return {
        'cve_id': item['cve']['CVE_data_meta']['ID'],
        'assigner': item['cve']['CVE_data_meta']['ASSIGNER'],
        'cvssv3_vector_string': item['impact']['baseMetricV3']['cvssV3']['vectorString'],
        'cvssv3_attack_vector': item['impact']['baseMetricV3']['cvssV3']['attackVector'],
        'cvssv3_attack_complexity': item['impact']['baseMetricV3']['cvssV3']['attackComplexity'],
        'cvssv3_privileges_required': item['impact']['baseMetricV3']['cvssV3']['privilegesRequired'],
        'cvssv3_user_interaction': item['impact']['baseMetricV3']['cvssV3']['userInteraction'],
        'cvssv3_scope': item['impact']['baseMetricV3']['cvssV3']['scope'],
        'cvssv3_confidentiality_impact': item['impact']['baseMetricV3']['cvssV3']['confidentialityImpact'],
        'cvssv3_integrity_impact': item['impact']['baseMetricV3']['cvssV3']['integrityImpact'],
        'cvssv3_availability_impact': item['impact']['baseMetricV3']['cvssV3']['availabilityImpact'],
        'cvssv3_base_score': item['impact']['baseMetricV3']['cvssV3']['baseScore'],
        'cvssv3_base_severity': item['impact']['baseMetricV3']['cvssV3']['baseSeverity'],
        'english_description': en_text,
    }

def process_cve_data(years):
    all_cves = []
    skipped = 0
    processed = 0

    for year in years:
        filename = download_nvd_feed(year)

        with gzip.open(filename, 'rt', encoding='utf-8') as f:
            nvd_data = json.load(f)

        print(f"Processing data for year {year}: {len(nvd_data['CVE_Items'])} total CVEs")

        year_cves = []
        for item in nvd_data['CVE_Items']:
            relevant_data = extract_cve_from_item(item)
            if relevant_data is None:
                skipped += 1
            else:
                year_cves.append(relevant_data)
                processed += 1

        all_cves.extend(year_cves)

    print(f"Processed CVEs: {processed}\nSkipped CVEs: {skipped}")
    return pd.DataFrame(all_cves)




# Step 2: Generate Embeddings
def generate_embeddings(df):
    # Using sentence-transformers instead of Vertex AI
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(df['english_description'].tolist())
    df['description_embeddings'] = embeddings.tolist()
    return df



# Step 3: Train Classifiers
def train_classifiers(df):
    categories = [
        'cvssv3_attack_vector',
        'cvssv3_attack_complexity',
        'cvssv3_privileges_required',
        'cvssv3_user_interaction',
        'cvssv3_scope',
        'cvssv3_confidentiality_impact',
        'cvssv3_integrity_impact',
        'cvssv3_availability_impact',
    ]

    classifiers = {}
    X = df['description_embeddings'].tolist()

    for category in categories:
        print(f"Training SVM classifier for {category}...")
        y = df[category].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        clf = SVC()
        clf.fit(X_train, y_train)

        # Make predictions
        y_pred = clf.predict(X_test)

        # Calculate accuracy, precision, recall, and F1-score
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        classifiers[category] = {
            'clf': clf,
            'acc': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }

    return classifiers

<h1><b>PROCESSING DATA</b></h1>

kode ini akan mendownload dataset CVE dari 2022-2024, lalu memprosesnya dari json ke csv dan di load sebagai dataframe. Setelahnya juga akan di save sebagai file csv baru

In [None]:
# Run ini pertama kali jika belum pernah

# Process, Download, and Save Data CVE 
cve_df = process_cve_data([2022, 2023, 2024])

In [None]:
# Load Data yang sudah ada atau sudah didownload
cve_df = pd.read_csv('data_train.csv')

In [7]:
print(cve_df.shape)
cve_df.head

(59104, 14)


<bound method NDFrame.head of               cve_id                  assigner  \
0      CVE-2022-0001          secure@intel.com   
1      CVE-2022-0002          secure@intel.com   
2      CVE-2022-0004          secure@intel.com   
3      CVE-2022-0005          secure@intel.com   
4      CVE-2022-0010  cybersecurity@ch.abb.com   
...              ...                       ...   
59099  CVE-2024-9985           cve@cert.org.tw   
59100  CVE-2024-9986             cna@vuldb.com   
59101  CVE-2024-9987   security@pandorafms.com   
59102  CVE-2024-9996        psirt@autodesk.com   
59103  CVE-2024-9997        psirt@autodesk.com   

                               cvssv3_vector_string cvssv3_attack_vector  \
0      CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:C/C:H/I:N/A:N                LOCAL   
1      CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:C/C:H/I:N/A:N                LOCAL   
2      CVSS:3.1/AV:P/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H             PHYSICAL   
3      CVSS:3.1/AV:P/AC:L/PR:N/UI:N/S:U/C:L/I:N/A:N             P

<h1><b>GENERATE EMBEDDINGS</b></h1>

In [8]:
# Generate embeddings
cve_df = generate_embeddings(cve_df)

<h1><b>TRAINING MODEL</b></h1>

In [5]:
# Main Execution
def main():
    # Train classifiers
    classifiers = train_classifiers(cve_df)

    # Print classifier accuracies, precision, recall, and f1-score
    for category, metrics in classifiers.items():
        print(f"### {category} ###")
        print(f"accuracy: {metrics['acc']:.3f}")
        print(f"precision: {metrics['precision']:.3f}")
        print(f"f1-score: {metrics['f1_score']:.3f}")
        print(f"recall: {metrics['recall']:.3f}")
        print("")
    return cve_df, classifiers

# Run the main function
cve_dataframe, trained_classifiers = main()

Processing data for year 2022: 25266 total CVEs
Processing data for year 2023: 29062 total CVEs
Processed CVEs: 45884
Skipped CVEs: 8444
Training SVM classifier for cvssv3_base_severity...
Training SVM classifier for cvssv3_attack_vector...
Training SVM classifier for cvssv3_attack_complexity...
Training SVM classifier for cvssv3_privileges_required...
Training SVM classifier for cvssv3_user_interaction...
Training SVM classifier for cvssv3_scope...
Training SVM classifier for cvssv3_confidentiality_impact...
Training SVM classifier for cvssv3_integrity_impact...
Training SVM classifier for cvssv3_availability_impact...
cvssv3_base_severity           - accuracy:  0.721
cvssv3_attack_vector           - accuracy:  0.920
cvssv3_attack_complexity       - accuracy:  0.969
cvssv3_privileges_required     - accuracy:  0.797
cvssv3_user_interaction        - accuracy:  0.941
cvssv3_scope                   - accuracy:  0.970
cvssv3_confidentiality_impact  - accuracy:  0.856
cvssv3_integrity_impac

<h1><b>SAVE MODEL</b></h1>

In [6]:
import joblib

# Save the classifier for each category
for category, clf in trained_classifiers.items():
    joblib.dump(clf, f"{category}_classifier.joblib")
    print(f"Model for {category} saved successfully!")


Model for cvssv3_base_severity saved successfully!
Model for cvssv3_attack_vector saved successfully!
Model for cvssv3_attack_complexity saved successfully!
Model for cvssv3_privileges_required saved successfully!
Model for cvssv3_user_interaction saved successfully!
Model for cvssv3_scope saved successfully!
Model for cvssv3_confidentiality_impact saved successfully!
Model for cvssv3_integrity_impact saved successfully!
Model for cvssv3_availability_impact saved successfully!
