<a href="https://colab.research.google.com/github/nafizzl/CS-6320-Final-Project/blob/main/CS_6320_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import requests
import json
import time
import pandas as pd
from collections import defaultdict, Counter
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from google.colab import userdata
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import optuna

In [None]:
# --- Configuration ---
# Endpoint that every CVE query in this notebook is sent to.
API_URL = "https://services.nvd.nist.gov/rest/json/cves/2.0"

# 2024 CWE Top 25 Most Dangerous Software Weaknesses, stored as "CWE-<number>"
# strings. Only the numeric parts are spelled out here; the prefix is added once.
_TOP_25_CWE_NUMBERS = (
    79, 787, 89, 352, 22,
    125, 78, 416, 862, 434,
    94, 20, 77, 287, 269,
    502, 200, 863, 918, 119,
    476, 798, 190, 400, 306,
)
TOP_25_CWES = [f"CWE-{number}" for number in _TOP_25_CWE_NUMBERS]

def _cve_to_row(cve_item):
    """Flatten one NVD ``cve`` object into the row dict stored in the DataFrame.

    Args:
        cve_item: The dict found under the ``"cve"`` key of one entry of the
            response's ``"vulnerabilities"`` list.

    Returns:
        dict with the keys ``CVE_ID``, ``Published``, ``Last_Modified``,
        ``Description``, ``CWEs`` (comma-separated string) and ``Top_25_Match``.
    """
    # First English description, or a placeholder when there is none. Entries
    # lacking "lang"/"value" are skipped instead of raising KeyError.
    description_text = next(
        (
            d["value"]
            for d in cve_item.get("descriptions", [])
            if d.get("lang") == "en" and "value" in d
        ),
        "No description",
    )

    # Every English weakness value attached to the CVE. Empty/missing values
    # are dropped so the join below cannot fail on None.
    cwe_ids = [
        desc["value"]
        for weakness in cve_item.get("weaknesses", [])
        for desc in weakness.get("description", [])
        if desc.get("lang") == "en" and desc.get("value")
    ]

    return {
        "CVE_ID": cve_item.get("id"),
        "Published": cve_item.get("published"),
        "Last_Modified": cve_item.get("lastModified"),
        "Description": description_text,
        "CWEs": ", ".join(cwe_ids),
        "Top_25_Match": True,
    }


def fetch_and_process_nvd_data(max_pages_per_cwe=1, request_delay=None):
    """Download CVEs for every CWE in ``TOP_25_CWES`` and return them as a DataFrame.

    One query per CWE is sent to ``API_URL`` (filtered by ``cweId`` only; no
    publication-date filter is applied). Rows from all CWEs are concatenated
    and then de-duplicated on ``CVE_ID``, keeping the first occurrence.

    Args:
        max_pages_per_cwe: How many result pages to fetch per CWE. The default
            of 1 keeps the original behaviour (a single page per CWE); pass a
            larger number, or ``None`` for "all pages", to follow
            ``startIndex`` pagination.
        request_delay: Seconds to sleep between consecutive requests. ``None``
            picks 0.6 s when an API key is available and 6 s otherwise, to stay
            under the NVD rate limits. Pass 0 to disable the pause.

    Returns:
        pandas.DataFrame with columns ``CVE_ID``, ``Published``,
        ``Last_Modified``, ``Description``, ``CWEs``, ``Top_25_Match``; empty
        when nothing could be fetched.
    """
    # NOTE(review): Colab's userdata.get is expected to raise (rather than
    # return None) when the secret is missing or access is denied, which would
    # make the keyless fallback below unreachable -- hence the guard.
    try:
        api_key = userdata.get('NVD_API_KEY')
    except Exception as exc:
        print(f"NVD_API_KEY not available ({exc}); continuing without an API key.")
        api_key = None
    headers = {"apiKey": api_key} if api_key else {}

    if request_delay is None:
        request_delay = 0.6 if api_key else 6.0

    filtered_cves = []
    is_first_request = True

    print(f"Fetching NVD data for each of the {len(TOP_25_CWES)} Top 25 CWEs...")

    for cwe_id_filter in TOP_25_CWES:
        start_index = 0
        pages_fetched = 0

        try:
            while max_pages_per_cwe is None or pages_fetched < max_pages_per_cwe:
                # Space requests out; the very first one goes out immediately.
                if not is_first_request and request_delay > 0:
                    time.sleep(request_delay)
                is_first_request = False

                params = {"cweId": cwe_id_filter, "startIndex": start_index}
                print(f"Requesting URL: {API_URL}?cweId={cwe_id_filter}&startIndex={start_index}")
                response = requests.get(API_URL, headers=headers, params=params, timeout=30)

                if response.status_code != 200:
                    print(f"Error for {cwe_id_filter}: API returned {response.status_code}")
                    # Give up on this CWE only. The remaining CWEs are still
                    # attempted, matching how exceptions are handled below.
                    break

                data = response.json()
                vulnerabilities = data.get("vulnerabilities", [])

                print(f"Processing {len(vulnerabilities)} vulnerabilities for {cwe_id_filter}")

                filtered_cves.extend(
                    _cve_to_row(item.get("cve", {})) for item in vulnerabilities
                )

                pages_fetched += 1
                start_index += len(vulnerabilities)
                # Stop when the page was empty or everything reported by the
                # API for this CWE has been consumed.
                if not vulnerabilities or start_index >= data.get("totalResults", 0):
                    break

        except Exception as e:
            # Best effort: report the failure and move on to the next CWE.
            print(f"Exception occurred for {cwe_id_filter}: {e}")
            continue

    print(f"\nCompleted. Found {len(filtered_cves)} vulnerabilities before deduplication.")

    df = pd.DataFrame(filtered_cves)
    if not df.empty:
        # A CVE tagged with several Top 25 CWEs is returned by several queries.
        df = df.drop_duplicates(subset=['CVE_ID']).reset_index(drop=True)
        print(f"Found {len(df)} unique vulnerabilities after deduplication.")
    else:
        print("No matching vulnerabilities found.")

    return df

# Run the download once and keep the de-duplicated result in `df`.
df = fetch_and_process_nvd_data()

# Show a small preview, or say so when nothing came back.
if df.empty:
    print("No matching vulnerabilities found.")
else:
    print("\nSample Data:")
    print(df.head())

Fetching NVD data for each of the 25 Top 25 CWEs...
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-79
Processing 2000 vulnerabilities for CWE-79
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-787
Processing 2000 vulnerabilities for CWE-787
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-89
Processing 2000 vulnerabilities for CWE-89
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-352
Processing 2000 vulnerabilities for CWE-352
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-22
Processing 2000 vulnerabilities for CWE-22
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-125
Processing 2000 vulnerabilities for CWE-125
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-78
Processing 2000 vulnerabilities for CWE-78
Requesting URL: https://services.nvd.nist.gov/rest/json/cves/2.0?cweId=CWE-416
Processing 2000 

In [None]:
# --- 1. Data Cleaning & Label Engineering ---

# Ensure we only look at the specific Top 25 CWEs we are interested in
TARGET_CWES = [
    "CWE-79",  "CWE-787", "CWE-89",  "CWE-352", "CWE-22",
    "CWE-125", "CWE-78",  "CWE-416", "CWE-862", "CWE-434",
    "CWE-94",  "CWE-20",  "CWE-77",  "CWE-287", "CWE-269",
    "CWE-502", "CWE-200", "CWE-863", "CWE-918", "CWE-119",
    "CWE-476", "CWE-798", "CWE-190", "CWE-400", "CWE-306"
]

def extract_primary_label(cwe_string):
    """Pick the label for one row from its comma-separated CWE string.

    Args:
        cwe_string: Value of the ``CWEs`` column, e.g. ``"CWE-79, CWE-89"``.
            Anything that is not a ``str`` is treated as "no label".

    Returns:
        The first entry (in the order it appears in ``cwe_string``, after
        stripping whitespace) that is listed in ``TARGET_CWES``, or ``None``
        when no entry matches.
    """
    if isinstance(cwe_string, str):
        candidates = (part.strip() for part in cwe_string.split(','))
        return next((cwe for cwe in candidates if cwe in TARGET_CWES), None)
    return None

# Apply the filter
df['label_text'] = df['CWEs'].apply(extract_primary_label)
# Remove rows that don't match our targets. The explicit .copy() detaches the
# filtered frame from the one it was selected from, so adding the 'label'
# column below writes to an independent DataFrame instead of triggering
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['label_text']).copy()

# Encode Labels (Text -> Integers)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label_text'])

# Create mappings for the model config later
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
num_labels = len(label_encoder.classes_)

print(f"Dataset prepared. Unique Classes: {num_labels}")
print(df[['Description', 'label_text', 'label']].head(3))

# --- 2. Train / Validation / Test Split ---

# Split: 70% Train, 15% Validation, 15% Test
# Step 1: hold out 30% of the rows (temporarily named "test").
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Description'].tolist(),
    df['label'].tolist(),
    test_size=0.3,
    stratify=df['label'], # Important: maintains class balance
    random_state=42
)

# Step 2: cut the held-out 30% in half -> validation and final test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts,
    test_labels,
    test_size=0.5,
    stratify=test_labels,
    random_state=42
)

print(f"Train size: {len(train_texts)}, Val size: {len(val_texts)}, Test size: {len(test_texts)}")

# --- 3. Tokenization ---

model_name = "markusbayer/CySecBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class CVEDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-sample sequence (it is
    indexed as ``encodings[field][idx]``); ``labels`` is indexed the same way
    and also defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, followed by the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def tokenize_data(texts):
    """Run the notebook-level ``tokenizer`` over ``texts``.

    Every sequence is padded to, and truncated at, 128 tokens.
    """
    fixed_length_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(texts, **fixed_length_options)

# Tokenize each split and wrap it together with its labels, one split at a time.
train_encodings = tokenize_data(train_texts)
train_dataset = CVEDataset(train_encodings, train_labels)

val_encodings = tokenize_data(val_texts)
val_dataset = CVEDataset(val_encodings, val_labels)

test_encodings = tokenize_data(test_texts)
test_dataset = CVEDataset(test_encodings, test_labels)

# # --- 4. Model Initialization ---

# model = AutoModelForSequenceClassification.from_pretrained(
#     model_name,
#     num_labels=num_labels,
#     id2label=id2label,
#     label2id=label2id
# )

# # --- 5. Training Setup ---

# def compute_metrics(pred):
#     labels = pred.label_ids
#     preds = pred.predictions.argmax(-1)
#     # precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
#     acc = accuracy_score(labels, preds)
#     return {
#         'accuracy': acc,
#         # 'f1': f1 # Uncomment if you want F1 score
#     }

# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=3,              # 3-5 epochs is usually sufficient for BERT
#     per_device_train_batch_size=16,  # Reduce to 8 if you run out of GPU memory
#     per_device_eval_batch_size=16,
#     warmup_steps=100,
#     weight_decay=0.01,
#     logging_dir='./logs',
#     logging_steps=10,
#     eval_strategy="epoch",     # Evaluate at the end of every epoch
#     save_strategy="epoch",           # Save model at the end of every epoch
#     load_best_model_at_end=True,     # Load the best model when finished
#     learning_rate=2e-5,              # Standard BERT learning rate
#     report_to="none",
#     optim='adamw_torch' # Explicitly set optimizer to disable fused optimizers
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     compute_metrics=compute_metrics,
# )

# # --- 6. Train the Model ---
# print("\nStarting Training...")
# trainer.train()

# # --- 7. Evaluation on Test Set ---
# print("\nEvaluating on Test Set...")
# test_results = trainer.predict(test_dataset)
# print(f"Test Set Metrics: {test_results.metrics}")

# # --- 8. Inference Example (Sanity Check) ---
# print("\n--- Inference Check ---")
# # Pick a random sample from test set
# sample_idx = 0
# sample_text = test_texts[sample_idx]
# true_label = id2label[test_labels[sample_idx]]

# # Predict
# inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
# with torch.no_grad():
#     logits = model(**inputs).logits

# predicted_class_id = logits.argmax().item()
# predicted_label = id2label[predicted_class_id]

# print(f"Description: {sample_text[:100]}...")
# print(f"True Label: {true_label}")
# print(f"Predicted:  {predicted_label}")

Dataset prepared. Unique Classes: 25
                                         Description label_text  label
0  Cross site scripting vulnerabilities in Apache...     CWE-79     18
1  Opera, when configured with the "Determine act...     CWE-79     18
2  Cross-site scripting (XSS) vulnerability in Ve...     CWE-79     18
Train size: 32252, Val size: 6911, Test size: 6912


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# --- 1. Data Cleaning & Label Engineering (Unchanged) ---

# `df` is expected to already be in memory from the NVD fetch cell above.
# To start from a saved file instead, load it here, e.g.:
# df = pd.read_csv("your_data.csv")

# Ensure we only look at the specific Top 25 CWEs
TARGET_CWES = [
    "CWE-79",  "CWE-787", "CWE-89",  "CWE-352", "CWE-22",
    "CWE-125", "CWE-78",  "CWE-416", "CWE-862", "CWE-434",
    "CWE-94",  "CWE-20",  "CWE-77",  "CWE-287", "CWE-269",
    "CWE-502", "CWE-200", "CWE-863", "CWE-918", "CWE-119",
    "CWE-476", "CWE-798", "CWE-190", "CWE-400", "CWE-306"
]

def extract_primary_label(cwe_string):
    """Return the first CWE in ``cwe_string`` that belongs to ``TARGET_CWES``.

    ``cwe_string`` is a comma-separated list of CWE identifiers; surrounding
    whitespace on each entry is ignored. Non-string input, or a string with no
    target CWE in it, yields ``None``.
    """
    if isinstance(cwe_string, str):
        stripped_entries = (entry.strip() for entry in cwe_string.split(','))
        return next((entry for entry in stripped_entries if entry in TARGET_CWES), None)
    return None

# Apply filter (Assuming df is already loaded)
df['label_text'] = df['CWEs'].apply(extract_primary_label)
# The explicit .copy() detaches the filtered frame from the one it was selected
# from, so the 'label' column assignment below writes to an independent
# DataFrame instead of triggering pandas' SettingWithCopyWarning.
df = df.dropna(subset=['label_text']).copy()

# Encode Labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label_text'])

# Integer id <-> CWE string lookups, plus the class count, for the model config.
id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in enumerate(label_encoder.classes_)}
num_labels = len(label_encoder.classes_)

# --- 2. Train / Validation / Test Split (Unchanged) ---

# Step 1: hold out 30% of the rows (temporarily named "test").
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['Description'].tolist(),
    df['label'].tolist(),
    test_size=0.3,
    stratify=df['label'],
    random_state=42
)

# Step 2: cut the held-out 30% in half -> validation and final test sets.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts,
    test_labels,
    test_size=0.5,
    stratify=test_labels,
    random_state=42
)

# --- 3. Tokenization (Unchanged) ---

model_name = "markusbayer/CySecBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class CVEDataset(torch.utils.data.Dataset):
    """Dataset whose items are dicts of tensors: one per encoding field plus 'labels'.

    ``encodings`` maps field names to per-sample sequences and ``labels`` is a
    parallel sequence; the dataset length is ``len(labels)``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Encoding fields first (in mapping order), label last.
        tensors = {}
        for name, column in self.encodings.items():
            tensors[name] = torch.tensor(column[idx])
        tensors['labels'] = torch.tensor(self.labels[idx])
        return tensors

def tokenize_data(texts):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Output sequences are padded to, and truncated at, 128 tokens.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(texts, **encode_options)

# Tokenize each split and wrap it together with its labels, one split at a time.
train_encodings = tokenize_data(train_texts)
train_dataset = CVEDataset(train_encodings, train_labels)

val_encodings = tokenize_data(val_texts)
val_dataset = CVEDataset(val_encodings, val_labels)

test_encodings = tokenize_data(test_texts)
test_dataset = CVEDataset(test_encodings, test_labels)

# Quick summary of the tokenizer that was loaded.
print(f"Tokenizer class: {tokenizer.__class__}")
print(f"Name: {tokenizer.name_or_path}")
print(f"Fast: {tokenizer.is_fast}")
print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.special_tokens_map}")

# --- 4. Model Initialization Wrapper (NEW) ---
# We wrap this in a function so the Trainer can instantiate a fresh model for every trial

def model_init():
    """Load a new sequence-classification model from ``model_name``.

    Takes no arguments so it can be handed to ``Trainer(model_init=...)``; the
    label count and the id/label mappings come from the notebook-level
    ``num_labels``, ``id2label`` and ``label2id``.
    """
    label_config = dict(num_labels=num_labels, id2label=id2label, label2id=label2id)
    return AutoModelForSequenceClassification.from_pretrained(model_name, **label_config)

# --- 5. Hyperparameter Search Setup (NEW) ---

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to get class ids) and ``label_ids`` (the true ids).

    Returns:
        ``{'accuracy': <accuracy_score of predicted vs. true ids>}``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_ids)}

# Define the search space
def hp_space(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Returns a dict keyed by the training-argument names being tuned.
    """
    search_space = {}
    # Learning rate between 1e-5 and 5e-5, sampled on a logarithmic scale.
    search_space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    # Per-device training batch size: 8 or 16.
    search_space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16]
    )
    # Number of training epochs: 3 or 4.
    search_space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 4)
    return search_space

# Baseline training arguments; the hyperparameter search overrides the values
# it samples (learning rate, batch size, epochs).
_base_training_kwargs = {
    "output_dir": './results',
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    "logging_steps": 10,
    "weight_decay": 0.01,
    "report_to": "none",
    "optim": 'adamw_torch',
}
training_args = TrainingArguments(**_base_training_kwargs)

# The Trainer receives the `model_init` factory (not a model instance) so a
# fresh model can be created whenever training starts.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# --- 6. Run Hyperparameter Search ---
print("\n--- Starting Hyperparameter Search ---")

def _validation_accuracy(metrics):
    """Objective for the search: the accuracy measured on the validation set."""
    return metrics['eval_accuracy']

best_run = trainer.hyperparameter_search(
    backend="optuna",
    hp_space=hp_space,
    compute_objective=_validation_accuracy,
    direction="maximize",
    n_trials=5,  # Number of configurations to try; raise (e.g. 10 or 20) for a wider search
)

print(f"\nBest Run ID: {best_run.run_id}")
print(f"Best Hyperparameters: {best_run.hyperparameters}")

# --- 7. Train Final Model with Best Parameters ---
print("\n--- Retraining with Best Parameters ---")

# Copy every winning hyperparameter onto the trainer's arguments, then retrain
# on the training set with those values.
for param_name, param_value in best_run.hyperparameters.items():
    setattr(trainer.args, param_name, param_value)

trainer.train()

# --- 8. Evaluation on Test Set ---
print("\n--- Evaluating on Test Set (Unseen Data) ---")
test_results = trainer.predict(test_dataset)
print(f"Final Test Set Metrics: {test_results.metrics}")

# --- 9. Inference Example ---
sample_idx = 0
sample_text = test_texts[sample_idx]
true_label = id2label[test_labels[sample_idx]]

# Inference uses the model held by the trainer after the retraining step.
model = trainer.model
model.eval()  # make sure dropout is off for a deterministic prediction

# Truncate at the same 128-token cap used by tokenize_data for the train /
# validation / test sets, so this check sees the same input the evaluated
# model would see for this sample.
inputs = tokenizer(
    sample_text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=128,
).to(model.device)

with torch.no_grad():
    logits = model(**inputs).logits

# Arg-max over the class axis; the batch holds exactly one sample.
predicted_class_id = logits.argmax(dim=-1).item()
predicted_label = id2label[predicted_class_id]

print(f"\nDescription: {sample_text[:100]}...")
print(f"True Label: {true_label}")
print(f"Predicted:  {predicted_label}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenizer class: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>
Name: markusbayer/CySecBERT
Fast: True
Vocab size: 30522
Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at markusbayer/CySecBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-12-03 06:29:20,162] A new study created in memory with name: no-name-dcc97f07-cf4d-49fd-99bf-7b98b68d4496



--- Starting Hyperparameter Search ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at markusbayer/CySecBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4354,0.630131,0.818984


[W 2025-12-03 06:44:15,744] Trial 0 failed with parameters: {'learning_rate': 2.6000519272829263e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 4} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/integrations/integration_utils.py", line 277, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2325, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 2790, in _inner_training_loop
    self._maybe_log_save_evaluate(
  File "/usr/local/lib/python3.12/dist-packages/transformers/trainer.py", line 3228, in _maybe_l

KeyboardInterrupt: 

In [None]:
from sklearn.linear_model import LogisticRegression
from transformers import AutoModel # Import AutoModel for base embeddings
import torch

# Define model_name explicitly in this cell
model_name = "markusbayer/CySecBERT"

# Initialize the base model to extract embeddings
# NOTE(review): this rebinds the notebook-level name `model`, which the
# fine-tuning cell also assigns (`model = trainer.model`); after this line
# `model` is the bare encoder loaded via AutoModel.
model = AutoModel.from_pretrained(model_name)

def tokenizedTextToVector(encodings, batch_size=64, encoder=None):
  """Embed tokenized texts as the encoder's hidden state at position 0 ([CLS]).

  Args:
    encodings: Mapping of field name -> per-sample sequences; must contain
      'input_ids'. Every field is sliced per batch and passed to the encoder
      as a keyword argument.
    batch_size: Number of samples sent through the encoder at a time.
    encoder: Model to run. Defaults to the notebook-level `model`, so existing
      calls that pass only `encodings` behave as before.

  Returns:
    2-D numpy array with one row per sample (n_samples x hidden_size).

  Raises:
    ValueError: If `encodings` holds no samples.
  """
  encoder = model if encoder is None else encoder
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  encoder.to(device)
  encoder.eval()  # dropout off, so repeated calls give the same vectors

  num_samples = len(encodings['input_ids'])
  if num_samples == 0:
    raise ValueError("encodings contains no samples to embed")

  all_vectors = []
  with torch.no_grad():
    for start in range(0, num_samples, batch_size):
      # Move input tensors to the same device as the encoder
      inputs = {
        key: torch.tensor(val[start:start + batch_size]).to(device)
        for key, val in encodings.items()
      }
      outputs = encoder(**inputs)
      # [:, 0, :] is already (batch, hidden). It must NOT be squeezed: a final
      # batch holding a single sample would collapse to 1-D and break (or
      # silently mis-shape) the concatenation below.
      batch_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()
      all_vectors.append(batch_vectors)

  return np.concatenate(all_vectors, axis=0)

# Embed the already tokenized train and test splits.
trainVectors = tokenizedTextToVector(train_encodings)
testVectors = tokenizedTextToVector(test_encodings)

# Logistic regression on the embeddings; max_iter raised to 1000 to help convergence.
lr = LogisticRegression(max_iter=1000)
lr.fit(trainVectors, train_labels)

# Report accuracy on the test split first, then on the training split.
for _heading, _vectors, _labels in (
    ("Test Accuracy: ", testVectors, test_labels),
    ("Training Accuracy: ", trainVectors, train_labels),
):
    accuracy = lr.score(_vectors, _labels)
    print(_heading, accuracy)

Some weights of BertModel were not initialized from the model checkpoint at markusbayer/CySecBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy:  0.7155671296296297
Training Accuracy:  0.828723800074414


In [None]:
from sklearn.naive_bayes import GaussianNB
from transformers import AutoModel # Import AutoModel for base embeddings
import torch

# Define model_name explicitly in this cell
model_name = "markusbayer/CySecBERT"

# Initialize the base model to extract embeddings
# NOTE(review): rebinds the notebook-level name `model` (also assigned by the
# fine-tuning cell and by the logistic-regression embedding cell) to a newly
# loaded bare encoder.
model = AutoModel.from_pretrained(model_name)

def tokenizedTextToVector(encodings, batch_size=64, encoder=None):
  """Embed tokenized texts as the encoder's hidden state at position 0 ([CLS]).

  Args:
    encodings: Mapping of field name -> per-sample sequences; must contain
      'input_ids'. Every field is sliced per batch and passed to the encoder
      as a keyword argument.
    batch_size: Number of samples sent through the encoder at a time.
    encoder: Model to run. Defaults to the notebook-level `model`, so existing
      calls that pass only `encodings` behave as before.

  Returns:
    2-D numpy array with one row per sample (n_samples x hidden_size).

  Raises:
    ValueError: If `encodings` holds no samples.
  """
  encoder = model if encoder is None else encoder
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  encoder.to(device)
  encoder.eval()  # dropout off, so repeated calls give the same vectors

  num_samples = len(encodings['input_ids'])
  if num_samples == 0:
    raise ValueError("encodings contains no samples to embed")

  all_vectors = []
  with torch.no_grad():
    for start in range(0, num_samples, batch_size):
      # Move input tensors to the same device as the encoder
      inputs = {
        key: torch.tensor(val[start:start + batch_size]).to(device)
        for key, val in encodings.items()
      }
      outputs = encoder(**inputs)
      # [:, 0, :] is already (batch, hidden). It must NOT be squeezed: a final
      # batch holding a single sample would collapse to 1-D and break (or
      # silently mis-shape) the concatenation below.
      batch_vectors = outputs.last_hidden_state[:, 0, :].cpu().numpy()
      all_vectors.append(batch_vectors)

  return np.concatenate(all_vectors, axis=0)

# Embed the already tokenized train and test splits.
trainVectors = tokenizedTextToVector(train_encodings)
testVectors = tokenizedTextToVector(test_encodings)

# Gaussian naive Bayes on the embeddings.
nb = GaussianNB()
nb.fit(trainVectors, train_labels)

# Report accuracy on the test split first, then on the training split.
for _heading, _vectors, _labels in (
    ("Test Accuracy: ", testVectors, test_labels),
    ("Training Accuracy: ", trainVectors, train_labels),
):
    accuracy = nb.score(_vectors, _labels)
    print(_heading, accuracy)

Some weights of BertModel were not initialized from the model checkpoint at markusbayer/CySecBERT and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Accuracy:  0.32002314814814814
Training Accuracy:  0.3256852288230187


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Bag-of-words counts: vocabulary is learned from the training texts only and
# then reused to encode the test texts.
vectorizer = CountVectorizer()
trainVector = vectorizer.fit_transform(train_texts)
testVector = vectorizer.transform(test_texts)

# Multinomial naive Bayes on the word counts.
nb = MultinomialNB()
nb.fit(trainVector, train_labels)

# Report accuracy on the training split first, then on the test split.
for _heading, _matrix, _labels in (
    ("Training Accuracy: ", trainVector, train_labels),
    ("Test Accuracy: ", testVector, test_labels),
):
    accuracy = nb.score(_matrix, _labels)
    print(_heading, accuracy)

Training Accuracy:  0.8222435817933772
Test Accuracy:  0.7330729166666666


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
# Bag-of-words counts: vocabulary is learned from the training texts only and
# then reused to encode the test texts.
vectorizer = CountVectorizer()
trainVector = vectorizer.fit_transform(train_texts)
testVector = vectorizer.transform(test_texts)

# With max_iter=100 the solver stopped at the iteration limit before
# converging, so the reported accuracies came from an unfinished fit. Use the
# same 1000-iteration budget as the embedding-based logistic regression.
# NOTE(review): if the warning persists, raise it further or scale the features.
lr = LogisticRegression(max_iter=1000)
lr.fit(trainVector, train_labels)

accuracy = lr.score(trainVector, train_labels)
print("Training Accuracy: ", accuracy)

accuracy = lr.score(testVector, test_labels)
print("Test Accuracy: ", accuracy)

Training Accuracy:  0.9863574351978172
Test Accuracy:  0.8233506944444444


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
