In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel
import logging
import numpy as np
import sklearn
import re
import nltk
from nltk.corpus import stopwords
import ssl

In [None]:
# Define metrics function
def metrics_fn(truth, predictions):
    """Compute accuracy and weighted F1 / precision / recall for a classifier.

    Args:
        truth: 1-D array-like of true class labels.
        predictions: Either a 1-D array-like of predicted class labels, or a
            2-D array of per-class scores shaped (n_samples, n_classes), in
            which case the highest-scoring class of each row is used.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    truth = np.asarray(truth)
    predictions = np.asarray(predictions)
    # A 1-D input already holds class labels and must be used as is: expanding
    # it to shape (n, 1) and taking argmax over axis 1 yields 0 for every row.
    if predictions.ndim == 1:
        predicted_labels = predictions
    else:
        predicted_labels = predictions.argmax(axis=1)
    return {
        'accuracy': (truth == predicted_labels).mean(),
        'f1': sklearn.metrics.f1_score(truth, predicted_labels, average='weighted'),
        'precision': sklearn.metrics.precision_score(truth, predicted_labels, average='weighted'),
        'recall': sklearn.metrics.recall_score(truth, predicted_labels, average='weighted')
    }

In [None]:
# Read the labelled article spreadsheet into a DataFrame
DATASET_PATH = 'TheHackerNews_Dataset.xlsx'
df = pd.read_excel(DATASET_PATH)

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # The corpus is missing, so it has to be downloaded. Certificate
    # verification is switched off for this one download only and restored
    # afterwards, so HTTPS requests made later in the process stay verified.
    _verified_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        nltk.download('stopwords')
    finally:
        ssl._create_default_https_context = _verified_https_context

# English stop words, used by the text preprocessing step
stop_words = stopwords.words('english')

def preprocess_text(text):
    """Lowercase text, drop stop words and strip punctuation.

    Uses the module-level ``stop_words`` collection.

    Args:
        text: String to clean.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    # Set membership instead of scanning the stop-word list for every token
    blocked = set(stop_words)
    kept = []
    # convert text to lowercase and tokenize on whitespace
    for token in text.lower().split():
        # Check the token against the stop words BEFORE its inner punctuation
        # is removed, so contractions such as "don't" match their stop-word
        # entry instead of becoming "dont" and slipping through. Only
        # punctuation at the token edges is trimmed for this comparison, and
        # a typographic apostrophe is treated as a plain one.
        core = re.sub(r"^[^\w\s]+|[^\w\s]+$", '', token).replace('\u2019', "'")
        if core in blocked:
            continue
        # remove special characters and punctuation
        cleaned = re.sub(r'[^\w\s]', '', token)
        # drop tokens that were pure punctuation or are stop words once cleaned
        if cleaned and cleaned not in blocked:
            kept.append(cleaned)
    return " ".join(kept)

In [None]:
# Replace every missing cell with an empty string
df = df.fillna(value='')

In [None]:
# Preprocess data
LABEL_TO_ID = {'Cyber_Attack':0, 'Malware':1, 'Vulnerability':2, 'Data_Breaches':3}

label_ids = df['Label'].map(LABEL_TO_ID)

# Labels that are not keys of LABEL_TO_ID map to NaN. Such rows cannot be used
# for training, so report and drop them instead of passing NaN labels on.
unmapped = label_ids.isna()
if unmapped.any():
    logging.warning(
        "Dropping %d row(s) with unrecognised Label values: %s",
        int(unmapped.sum()),
        sorted(df.loc[unmapped, 'Label'].astype(str).unique()),
    )
labelled_rows = df.loc[~unmapped]

# Model input is "<Title>. <Article>" with backslashes replaced by spaces,
# then cleaned with preprocess_text. Building a fresh two-column frame avoids
# assigning into a slice of the original frame.
df = pd.DataFrame({
    'Text': (labelled_rows['Title'] + ". " + labelled_rows['Article']).apply(
        lambda raw: preprocess_text(raw.replace('\\', " "))
    ),
    'Label': label_ids[~unmapped].astype('int64'),
})

In [None]:
# Log at INFO level in general, but only warnings from the "transformers" logger
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
train_df, eval_df = train_test_split(df, random_state=42, test_size=0.2)

In [None]:
# Define model parameters
model_args = {
    'reprocess_input_data': True,
    'num_train_epochs': 5,
    'overwrite_output_dir': True,
    'learning_rate': 2e-5,
    'evaluate_during_training': True,
    'evaluate_during_training_steps': 1000,
    'evaluate_during_training_verbose': True,
    # NOTE(review): 'use_pretrained_model', 'architecture' and 'dropout' do not
    # appear among Simple Transformers' documented options and are presumably
    # ignored; the model type is chosen in the ClassificationModel(...) call.
    # Confirm and remove them, or express the intent through supported options.
    'use_pretrained_model': True,
    'architecture': 'bert',
    # Early stopping has to be switched on explicitly; without this flag the
    # patience value below has no effect.
    'use_early_stopping': True,
    # NOTE(review): patience is presumably counted in evaluation rounds (see
    # 'evaluate_during_training_steps'), not in epochs - confirm.
    'early_stopping_patience': 3,  # stop after 3 evaluations without improvement
    'dropout': 0.1,  # intended: 10% dropout (see review note above)
    'train_batch_size': 8,  # use smaller batch size
}

In [None]:
# Build a 4-label 'roberta-base' classifier with CUDA disabled
model = ClassificationModel(
    'roberta',
    'roberta-base',
    args=model_args,
    num_labels=4,
    use_cuda=False,
)

# Fine-tune on the training split; eval_df and metrics_fn are handed over
# through the `eval_df` and `metrics` keywords
model.train_model(train_df, metrics=metrics_fn, eval_df=eval_df)