<a href="https://colab.research.google.com/github/ogatash-lab/ICICS2023EvalData/blob/main/CVSS_MLR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [58]:
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from numpy.lib.function_base import vectorize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy.core.fromnumeric import size
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import tqdm
import time
import json
import re
import csv
import urllib.request
import shutil

1.2.2


# Data

In [59]:
# Base URL of the CSV files (hosted in the mus-shd/CVSS-BERT GitHub repository).
DATA_URL = "https://raw.githubusercontent.com/mus-shd/CVSS-BERT/main/data"

# Download every CSV exactly once. The URLs do not depend on the metric, so
# re-reading them on each loop iteration would only repeat the same downloads.
train_sentence = pd.read_csv(f"{DATA_URL}/cve_2018-2020_X_train.csv", header=0)
test_sentence = pd.read_csv(f"{DATA_URL}/cve_2018-2020_X_test.csv", header=0)
y_train_all = pd.read_csv(f"{DATA_URL}/cve_2018-2020_y_train.csv", header=0)
y_test_all = pd.read_csv(f"{DATA_URL}/cve_2018-2020_y_test.csv", header=0)

# For saving results: the true test labels plus the CVE identifier of each row.
# A copy is used so that adding columns never touches the label frame itself.
df_result = y_test_all.copy()
df_result["CVE-ID"] = test_sentence["CVE_ID"]
df_comparison = pd.DataFrame()


# Other metric columns that can be enabled here:
# metrics = ['cvssV3_attackVector','cvssV3_attackComplexity','cvssV3_privilegesRequired','cvssV3_userInteraction',
#            'cvssV3_scope','cvssV3_confidentialityImpact','cvssV3_integrityImpact','cvssV3_availabilityImpact']
# NOTE: `metric`, `y_train` and `y_test` keep the values of the LAST loop
# iteration once this cell finishes.
metrics = ['cvssV3_attackVector']


# Accuracy collected per metric.
acu = []

for metric in metrics:
  # Keep the targets as single-column DataFrames.
  y_train = y_train_all[[metric]]
  y_test  = y_test_all[[metric]]

  # Show data distribution
  print("----------y_train:", y_train.value_counts())
  print("----------y_test:", y_test.value_counts())


----------y_train: cvssV3_attackVector
NETWORK                16989
LOCAL                   5105
ADJACENT_NETWORK         566
PHYSICAL                 303
dtype: int64
----------y_test: cvssV3_attackVector
NETWORK                17101
LOCAL                   4988
ADJACENT_NETWORK         584
PHYSICAL                 290
dtype: int64



# Natural Language Processing (train)

In [60]:
# Bag-of-words features (train)
# Fit a CountVectorizer (English stop words removed) on the training
# descriptions and turn them into a document-term matrix.
train_descriptions = train_sentence['Description'].values

vectorizer = CountVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(train_descriptions)

print('dimensions:', X_train.shape)

dimensions: (22963, 36104)


# Multinomial Logistic Regression (train)

In [61]:
# Create a classification model for MLR using vectorized features.
# C is the inverse regularisation strength; random_state is fixed so that
# repeated runs use the same seed.
lr = LogisticRegression(C=0.1, random_state=1, multi_class='multinomial', n_jobs=-1)

# scikit-learn expects a 1-D target array. y_train is a single-column DataFrame,
# so flatten it explicitly instead of relying on the implicit conversion, which
# emits a DataConversionWarning.
lr.fit(X_train, y_train.to_numpy().ravel())

  y = column_or_1d(y, warn=True)


# Natural Language Processing (test)

In [62]:
# Bag-of-words features (test)
# Encode the test descriptions with the already-fitted vectorizer
# (transform only, no re-fitting).
test_descriptions = test_sentence['Description'].values
X_test = vectorizer.transform(test_descriptions)

# Multinomial Logistic Regression (test)

In [63]:
# Class labels for the confusion matrix, taken from the training targets
# in their order of first appearance.
class_labels = y_train[metric].unique().tolist()

# Predict on the test features and tabulate actual vs. predicted classes.
y_pred = lr.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=class_labels)

# Output results

In [64]:
# Prefix the labels so rows read as "A: <actual>" and columns as "P: <predicted>".
class_labels_actual = [f"A: {name}" for name in class_labels]
class_labels_predicted = [f"P: {name}" for name in class_labels]

# Confusion matrix as a labelled table.
table = pd.DataFrame(data=cm, index=class_labels_actual, columns=class_labels_predicted)

# Store the predictions next to the true labels and record per-row agreement.
pred_column = f'pred_{metric}'
df_result[pred_column] = y_pred
df_comparison[metric] = df_result[metric].eq(df_result[pred_column])

# Accuracy: remember it for the summary and print it with the detailed report.
accuracy = accuracy_score(y_test, y_pred)
acu.append(accuracy)
print("accuracy:", accuracy)
print(classification_report(y_test, y_pred, digits=4))
print(table)
print('-'*70)

accuracy: 0.9017114488525019
                  precision    recall  f1-score   support

ADJACENT_NETWORK     0.8289    0.4812    0.6089       584
           LOCAL     0.8527    0.7648    0.8064      4988
         NETWORK     0.9158    0.9638    0.9392     17101
        PHYSICAL     0.8366    0.4414    0.5779       290

        accuracy                         0.9017     22963
       macro avg     0.8585    0.6628    0.7331     22963
    weighted avg     0.8989    0.9017    0.8974     22963

                     P: PHYSICAL  P: NETWORK  P: LOCAL  P: ADJACENT_NETWORK
A: PHYSICAL                  128         102        56                    4
A: NETWORK                     7       16482       567                   45
A: LOCAL                      11        1153      3815                    9
A: ADJACENT_NETWORK            7         260        36                  281
----------------------------------------------------------------------


In [65]:
# Rank the vocabulary by logistic-regression coefficient to show which words
# pull a prediction towards (positive) or away from (negative) each class.
TOP_N = 10  # number of words listed per direction

# Word for each feature column. Fetched before branching so that BOTH the binary
# and the multi-class branch can use it (previously it was only assigned in the
# binary branch, so the multi-class branch raised NameError on a fresh kernel).
feature_names = vectorizer.get_feature_names_out()

if lr.coef_.shape[0] == 1:
  # Binary problem: coef_ holds a single row. Per the scikit-learn documentation
  # that row corresponds to the positive outcome (lr.classes_[1]).
  coef = lr.coef_[0]

  # Sort once; ascending order gives the most negative weights first,
  # the reversed view gives the most positive weights first.
  ascending_indices = coef.argsort()
  sorted_indices = ascending_indices[::-1]

  positive_words = [feature_names[j] for j in sorted_indices[:TOP_N]]
  positive_weights = [coef[j] for j in sorted_indices[:TOP_N]]
  negative_words = [feature_names[j] for j in ascending_indices[:TOP_N]]
  negative_weights = [coef[j] for j in ascending_indices[:TOP_N]]

  print('Positive words:', positive_words)
  print('Negative words:', negative_words)

else:
  # Two or more coefficient rows: one row per entry of lr.classes_.
  coef_matrix = lr.coef_

  for class_idx, class_name in enumerate(lr.classes_):
    coef = coef_matrix[class_idx]

    # Same ranking as in the binary case, repeated for every class.
    ascending_indices = coef.argsort()
    sorted_indices = ascending_indices[::-1]

    positive_words = [feature_names[j] for j in sorted_indices[:TOP_N]]
    positive_weights = [coef[j] for j in sorted_indices[:TOP_N]]
    negative_words = [feature_names[j] for j in ascending_indices[:TOP_N]]

    print("Class:", class_name)
    print("Positive words:", positive_words)
    print("Negative words:", negative_words)
    print('-' * 50)


print('-'*70)
print('-'*70)

# Summary: one accuracy value per evaluated metric.
for i in acu:
  print(i)

Class: ADJACENT_NETWORK
Positive words: ['adjacent', 'netgear', 'bluetooth', 'network', 'hyper', 'devices', 'authenticated', 'wi', 'fi', 'radio']
Negative words: ['local', 'discovered', 'function', 'users', 'file', 'crafted', 'memory', 'successful', 'php', 'fixed']
--------------------------------------------------
Class: LOCAL
Positive words: ['local', 'logon', 'artifex', 'dll', 'xpdf', 'crafted', '20063', '30452', '30102', 'executes']
Negative words: ['xss', 'unauthenticated', 'network', 'physical', 'safari', 'scripting', 'inclusion', 'traversal', 'devices', 'upload']
--------------------------------------------------
Class: NETWORK
Positive words: ['xss', 'remote', 'php', 'csrf', 'safari', 'inclusion', 'scripting', 'upload', 'traversal', 'edge']
Negative words: ['local', 'adjacent', 'physical', 'logon', 'netgear', 'artifex', 'xpdf', 'executes', 'usb', 'bluetooth']
--------------------------------------------------
Class: PHYSICAL
Positive words: ['physical', 'usb', 'physically', 'op

# Example

In [66]:
# Show the identifier, true label and predicted label of the first test row.
for field in ("CVE-ID", "cvssV3_attackVector", "pred_cvssV3_attackVector"):
  print(f"{field}:", df_result.loc[0, field])

CVE-ID: CVE-2019-4740
cvssV3_attackVector: NETWORK
pred_cvssV3_attackVector: NETWORK
