In [1]:
import os

directory = 'data'

texts = []
labels = []

for filename in os.listdir(directory):
    if filename.endswith('.txt'):  
        with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
            for line in file:
                if '|' in line:
                    parts = line.rsplit('|', 1)
                    if parts[1].strip() == "":
                        continue
                    text = parts[0].strip()
                    label = parts[1].strip() if len(parts) > 1 else 'O'  
                    texts.append(text)
                    labels.append(label)

In [2]:
from collections import Counter
Counter(labels)

Counter({'O': 10209,
         'AttackRansom': 1494,
         'AttackRansom, AttackDatabreach': 24,
         'DiscoverVulnerability': 1467,
         'AttackDatabreach': 1405,
         'PatchVulnerability': 949,
         'AttackPhishing': 1283,
         'PatchVulnerability, DiscoverVulnerability': 83,
         'DiscoverVulnerability , PatchVulnerability': 40,
         'DiscoverVulnerability, PatchVulnerability': 121,
         'AttackPhishing, AttackDatabreach': 26,
         'AttackDatabreach, AttackPhishing': 21,
         'DiscoverVulnerability, AttackDatabreach': 22,
         'PatchVulnerability, AttackRansom': 4,
         'PatchVulnerability , DiscoverVulnerability': 9,
         'DiscoverVulnerability, AttackPhishing': 2,
         'AttackRansom, AttackDatabreach, AttackPhishing': 1,
         'AttackDatabreach, AttackRansom': 14,
         'PatchVulnerability , AttackDatabreach': 1,
         'AttackDatabreach, DiscoverVulnerability': 8,
         'AttackDatabreach, PatchVulnerability': 3,

In [3]:
temp  = []
for label in labels:
    for t in label.split(','):
        temp.append(t.strip())

In [4]:
Counter(temp)

Counter({'O': 10209,
         'AttackRansom': 1581,
         'AttackDatabreach': 1571,
         'DiscoverVulnerability': 1771,
         'PatchVulnerability': 1228,
         'AttackPhishing': 1374})

In [5]:
import pandas as pd


df = pd.DataFrame({
    'Text': texts,
    'Label': labels
})

all_labels = ["AttackDatabreach", "AttackPhishing", "AttackRansom", "DiscoverVulnerability", "PatchVulnerability","O"]


def label_columns(row, label_list):
    label_data = {label: 0 for label in label_list}
    entries = row['Label'].split(',')
    for entry in entries:
        entry = entry.strip()
        if entry in label_data:
            label_data[entry] = 1
    return pd.Series(label_data)

label_df = df.apply(lambda row: label_columns(row, all_labels), axis=1)


final_df = pd.concat([df['Text'], label_df], axis=1)
final_df


Unnamed: 0,Text,AttackDatabreach,AttackPhishing,AttackRansom,DiscoverVulnerability,PatchVulnerability,O
0,Attackers start wiping data from CouchDB and H...,0,0,0,0,0,1
1,Researchers are now observing similar destruct...,0,0,0,0,0,1
2,Security researchers Victor Gevers and Niall M...,0,0,0,0,0,1
3,The two have put together spreadsheets on Goog...,0,0,0,0,0,1
4,"In the case of Hadoop, a framework used for di...",0,0,0,0,0,1
...,...,...,...,...,...,...,...
17264,"""Ransomware has become a billion-dollar cash c...",0,0,0,0,0,1
17265,In order to help prevent falling victim to ran...,0,0,0,0,0,1
17266,"Launched in 2016, the No More Ransom scheme br...",0,0,1,0,0,0
17267,The portal is available in 29 languages and si...,0,0,0,0,0,1


In [6]:
labels = ["AttackDatabreach", "AttackPhishing", "AttackRansom", "DiscoverVulnerability", "PatchVulnerability", "O"]

# Create label2id dictionary
label2id = {label: i for i, label in enumerate(labels)}

# Create id2label dictionary
id2label = {i: label for i, label in enumerate(labels)}

# Output the dictionaries
print("label2id:", label2id)
print("id2label:", id2label)


label2id: {'AttackDatabreach': 0, 'AttackPhishing': 1, 'AttackRansom': 2, 'DiscoverVulnerability': 3, 'PatchVulnerability': 4, 'O': 5}
id2label: {0: 'AttackDatabreach', 1: 'AttackPhishing', 2: 'AttackRansom', 3: 'DiscoverVulnerability', 4: 'PatchVulnerability', 5: 'O'}


In [7]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np
from transformers import DataCollatorWithPadding

dataset = Dataset.from_pandas(final_df)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['Text'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Convert integer labels to floats for BCEWithLogitsLoss
def format_labels(example):
    labels = [float(example[col]) for col in ['AttackDatabreach', 'AttackPhishing', 'AttackRansom', 
                                              'DiscoverVulnerability', 'PatchVulnerability', 'O']]
    return {'labels': labels}

tokenized_datasets = tokenized_datasets.map(format_labels)

# Split the dataset into train, validation, and test sets
train_testvalid = tokenized_datasets.train_test_split(test_size=0.2, shuffle=True)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, shuffle=True)
train_dataset = train_testvalid['train']
valid_dataset = test_valid['train']
test_dataset = test_valid['test']

# Metrics computation
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Apply sigmoid function to logits and threshold at 0.5 for binary prediction
    preds = (1 / (1 + np.exp(-logits)) > 0.5).astype(float)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    return {'precision': precision, 'recall': recall, 'f1': f1}



data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Training
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01
)

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6, problem_type="multi_label_classification")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Map:   0%|          | 0/17269 [00:00<?, ? examples/s]

Map:   0%|          | 0/17269 [00:00<?, ? examples/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mt111599004[0m ([33mntut-biolab[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.170718,0.831597,0.796831,0.811533
2,0.231500,0.157981,0.839505,0.807583,0.821488
3,0.128700,0.178294,0.823116,0.804188,0.809432
4,0.087000,0.187704,0.816062,0.804754,0.808804
5,0.056500,0.208515,0.821589,0.791171,0.803554
6,0.039500,0.225088,0.819952,0.815507,0.816034
7,0.027100,0.233527,0.824789,0.813243,0.817461
8,0.027100,0.236844,0.815463,0.807583,0.810445
9,0.021500,0.245484,0.81867,0.810979,0.813508
10,0.016900,0.246568,0.818185,0.810413,0.812824


TrainOutput(global_step=4320, training_loss=0.07158374488353729, metrics={'train_runtime': 4206.3193, 'train_samples_per_second': 32.843, 'train_steps_per_second': 1.027, 'total_flos': 3.63500977425408e+16, 'train_loss': 0.07158374488353729, 'epoch': 10.0})

In [8]:
# Evaluate on test data
trainer.evaluate(test_dataset)


{'eval_loss': 0.2451830953359604,
 'eval_precision': 0.8347406598219786,
 'eval_recall': 0.8228990411731528,
 'eval_f1': 0.8261504714057493,
 'eval_runtime': 18.7643,
 'eval_samples_per_second': 92.036,
 'eval_steps_per_second': 11.511,
 'epoch': 10.0}

In [9]:
model_path = "./saved_model"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [10]:
# baseline

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier



# Convert labels to a list of sets for MultiLabelBinarizer
labels = final_df[['AttackDatabreach', 'AttackPhishing', 'AttackRansom', 
                   'DiscoverVulnerability', 'PatchVulnerability', 'O']].apply(
    lambda x: set(x[x == 1].index), axis=1)

# Binarize the labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

# Ensure you have the correct number of text samples and labels
assert len(final_df['Text']) == len(y), "Mismatch in number of samples between Text and labels."

# Feature extraction
vectorizer = TfidfVectorizer(max_features=1000)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(final_df['Text'], y, test_size=0.2, random_state=42)

# Logistic Regression in a OneVsRest framework
pipeline = Pipeline([
    ('tfidf', vectorizer),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

pipeline.fit(X_train, y_train)

# Prediction and evaluation
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

                       precision    recall  f1-score   support

     AttackDatabreach       0.67      0.35      0.46       304
       AttackPhishing       0.90      0.48      0.63       268
         AttackRansom       0.87      0.64      0.74       326
DiscoverVulnerability       0.78      0.44      0.56       354
                    O       0.80      0.87      0.84      2051
   PatchVulnerability       0.87      0.54      0.67       243

            micro avg       0.81      0.71      0.76      3546
            macro avg       0.82      0.55      0.65      3546
         weighted avg       0.81      0.71      0.74      3546
          samples avg       0.72      0.72      0.72      3546



  _warn_prf(average, modifier, msg_start, len(result))
