In [2]:
import pandas as pd

# Load the dataset (the path is relative to the notebook's working directory)
df = pd.read_csv('SecurityVulnerabilities.csv')

# Preview the first five rows
print(df.head())

# Summarise the columns, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly: wrapping it in print() would emit a stray "None" line.
df.info()

                                               Title                  Date  \
0  Ghost vulnerable to arbitrary file read via sy...  2023-08-15T20:35:20Z   
1  Yaklang Plugin's Fuzztag Component Allows Unau...  2023-08-15T20:08:17Z   
2  Scancode.io Reflected Cross-Site Scripting (XS...  2023-08-15T20:04:49Z   
3  When `ui.isAccessAllowed` is `undefined`, the ...  2023-08-15T20:04:14Z   
4    PandasAI vulnerable to arbitrary code execution  2023-08-15T18:31:32Z   

   Severity                                            Summary  \
0  Moderate  CVE-2023-40028was published\n                 ...   
1  Moderate  CVE-2023-40023was published\n                 ...   
2  Moderate  CVE-2023-40024was published\n                 ...   
3  Moderate  CVE-2023-40027was published\n                 ...   
4  Critical  CVE-2023-39661was published\n                 ...   

                                                Link  
0  https://github.com/advisories/GHSA-9c9v-w225-v5rg  
1  https://github.com/ad

In [3]:
# Discard every row where either required column is missing, then narrow the
# frame down to just those two columns.
required_columns = ['Summary', 'Severity']
df = df.dropna(subset=required_columns)[required_columns]

# Count how many rows fall under each Severity value
print(df['Severity'].value_counts())

Severity
Moderate    1576
High        1127
Critical     476
Low          171
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on Severity.
X_train, X_test, y_train, y_test = train_test_split(
    df['Summary'],
    df['Severity'],
    test_size=0.2,
    random_state=42,
    stratify=df['Severity'],
)

# Turn the summary text into TF-IDF features, capped at 1000 features.
# The vectorizer is fitted on the training text only; the test text is merely
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression model on the TF-IDF training features and their
# severity labels (max_iter is set to 200).
clf = LogisticRegression(max_iter=200)
clf.fit(X=X_train_tfidf, y=y_train)

In [6]:
from sklearn.metrics import classification_report, confusion_matrix

# Run the fitted classifier over the held-out TF-IDF features
y_pred = clf.predict(X_test_tfidf)

# Per-class precision / recall / F1 report
print(classification_report(y_true=y_test, y_pred=y_pred))

# Confusion matrix of true labels against predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    Critical       0.63      0.23      0.34        95
        High       0.44      0.36      0.40       226
         Low       1.00      0.03      0.06        34
    Moderate       0.54      0.77      0.63       315

    accuracy                           0.52       670
   macro avg       0.65      0.35      0.36       670
weighted avg       0.54      0.52      0.48       670

[[ 22  30   0  43]
 [  7  82   0 137]
 [  0   7   1  26]
 [  6  68   0 241]]


In [7]:
# Print the first five test summaries next to their true and predicted labels.
# X_test / y_test are indexed by position (.iloc) so they line up with y_pred.
for idx in range(5):
    summary_text, actual_label, predicted_label = X_test.iloc[idx], y_test.iloc[idx], y_pred[idx]
    print(f"Summary: {summary_text}")
    print(f"Actual Severity: {actual_label}, Predicted: {predicted_label}\n")

Summary: CVE-2023-26128was published
                        forkeep-module-latest(npm)May 27, 2023
Actual Severity: High, Predicted: Moderate

Summary: CVE-2022-39384was published
                        for@openzeppelin/contracts(npm)Dec 14, 2021
Actual Severity: Moderate, Predicted: Moderate

Summary: CVE-2018-19184was published
                        forgithub.com/ethereum/go-ethereum/cmd/evm(Go)Jun 29, 2021
Actual Severity: High, Predicted: Moderate

Summary: CVE-2021-41122was published
                        forvyper(pip)Oct 6, 2021
Actual Severity: Moderate, Predicted: High

Summary: GHSA-6w5f-5wgr-qjg5was published
                        forgithub.com/edgelesssys/constellation/v2(Go)Mar 9, 2023
Actual Severity: High, Predicted: High



In [8]:
from textwrap import shorten

# How many test examples to print
num_examples = 5

# Separator lines, built once instead of on every iteration
thin_rule = '-' * 40
thick_rule = '=' * 40

print("\n=== Sample Predictions ===\n")
for position in range(num_examples):
    # shorten() collapses runs of whitespace and truncates the text so the
    # result is at most 100 characters, ending in "..." when it was cut.
    summary = shorten(X_test.iloc[position], width=100, placeholder="...")
    print(f"Example {position + 1}")
    print(thin_rule)
    print(f"Summary         : {summary}")
    print(f"Actual Severity : {y_test.iloc[position]}")
    print(f"Predicted       : {y_pred[position]}")
    print(f"{thick_rule}\n")



=== Sample Predictions ===

Example 1
----------------------------------------
Summary         : CVE-2023-26128was published forkeep-module-latest(npm)May 27, 2023
Actual Severity : High
Predicted       : Moderate

Example 2
----------------------------------------
Summary         : CVE-2022-39384was published for@openzeppelin/contracts(npm)Dec 14, 2021
Actual Severity : Moderate
Predicted       : Moderate

Example 3
----------------------------------------
Summary         : CVE-2018-19184was published forgithub.com/ethereum/go-ethereum/cmd/evm(Go)Jun 29, 2021
Actual Severity : High
Predicted       : Moderate

Example 4
----------------------------------------
Summary         : CVE-2021-41122was published forvyper(pip)Oct 6, 2021
Actual Severity : Moderate
Predicted       : High

Example 5
----------------------------------------
Summary         : GHSA-6w5f-5wgr-qjg5was published forgithub.com/edgelesssys/constellation/v2(Go)Mar 9, 2023
Actual Severity : High
Predicted       : High



In [9]:
import pandas as pd
from textwrap import shorten

# Gather the first `num_examples` test rows as records: the summary shortened
# to at most 60 characters, plus the true and predicted severity.
examples = [
    {
        "Summary": shorten(X_test.iloc[row], width=60, placeholder="..."),
        "Actual": y_test.iloc[row],
        "Predicted": y_pred[row],
    }
    for row in range(num_examples)
]

# Build the table in one go and print it without the index column
df_examples = pd.DataFrame(examples)
print(df_examples.to_string(index=False))


                                                     Summary   Actual Predicted
CVE-2023-26128was published forkeep-module-latest(npm)May...     High  Moderate
                              CVE-2022-39384was published... Moderate  Moderate
   CVE-2018-19184was published forgithub.com/ethereum/go-...     High  Moderate
        CVE-2021-41122was published forvyper(pip)Oct 6, 2021 Moderate      High
                         GHSA-6w5f-5wgr-qjg5was published...     High      High
