# Import

In [41]:
import os
import random
import re
from collections import Counter
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import hamming_loss
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch.utils.data import RandomSampler
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer


def warn(*args, **kwargs):
    pass


import warnings

warnings.warn = warn

# Configuration

In [42]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running on device {DEVICE}")

RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.backends.cudnn.benchmark = False

PATH_TO_DATASET = os.path.join("dataset")
BERT_MODEL_TYPE = 'microsoft/codebert-base'

MAX_FEATURES = 500
BATCH_SIZE = 2
NUM_EPOCHS = 30
NUM_LABELS = 20
LR = 0.001

TEST_SIZE = 0.2
FILE_TYPE = "source"
FILE_EXT = ".sol"
FILE_ID = "sol"

Running on device cpu


# Dataset

Create PyTorch dataset feeding either source code, bytecode or runtime to the models.

## Preprocessing

In [43]:
def preprocess_hex(hex_data: str) -> str:
    # Reads a hex file and converts it to a byte string
    byte_data = bytes.fromhex(hex_data.strip())

    # Convert byte data to a readable ASCII string, ignoring non-ASCII characters
    return ' '.join(f'{byte:02x}' for byte in byte_data)

In [44]:
def preprocess_solidity_code(code: str) -> str:
    # Remove single-line comments
    code = re.sub(r'//.*', '', code)

    # Remove multi-line comments
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

    # Remove blank lines (lines only containing whitespace)
    lines = code.split('\n')
    non_blank_lines = [line for line in lines if line.strip() != '']
    code = '\n'.join(non_blank_lines)

    return code

In [45]:
def preprocess(data: str):
    return preprocess_solidity_code(data) if FILE_TYPE == "source" else preprocess_hex(data)

## Labels Management

In [46]:
def init_docs_and_gt(data: pd.DataFrame) -> Tuple:
    docs, labels, gt = {}, {}, {}
    for _, row in tqdm(data.iterrows(), desc="Initializing documents and groundtruth data"):
        item_id, file_id = row["id"], row["fp_" + FILE_ID]

        # Check if file exists
        path_to_file = os.path.join(PATH_TO_DATASET, FILE_TYPE, str(file_id) + FILE_EXT)
        if os.path.exists(path_to_file):

            # Initialize the documents
            docs[item_id] = preprocess(open(path_to_file, 'r', encoding="utf8").read())

            # Initialize the label
            labels[item_id] = [0] * NUM_LABELS

            # Initialize the groundtruth
            prop = row["property"].lower()
            if prop not in gt.keys():
                gt[prop] = len(gt.values())

    return list(docs.values()), labels, gt

In [47]:
def set_labels(data: pd.DataFrame, labels: Dict, gt: Dict) -> List:
    for _, row in tqdm(data.iterrows(), desc="Setting up the labels"):
        item_id, file_id = row["id"], row["fp_" + FILE_ID]

        # Check if file exists
        path_to_file = os.path.join(PATH_TO_DATASET, FILE_TYPE, str(file_id) + FILE_EXT)
        if os.path.exists(path_to_file):

            # Set label   
            prop = row["property"].lower()
            if row['property_holds'] == 't':
                labels[item_id][gt[prop]] = 1

    return list(labels.values())

## Initialization of the dataset

In [48]:
# Read the dataset from CSV
dataset = pd.read_csv(os.path.join(PATH_TO_DATASET, "consolidated.csv"), sep=";")

# Count the frequency of each item in the column
frequency = dataset['dataset'].value_counts()

# Find the item with the maximum occurrence
most_frequent_item = frequency.idxmax()
most_frequent_count = frequency.max()

print(f"The most frequent item in the column is '{most_frequent_item}' and it appears {most_frequent_count} times.")

# Exclude outliers from the dataset
dataset = dataset[dataset["dataset"] == most_frequent_item]

# Initialize the documents and the groundtruth
documents, labels, gt = init_docs_and_gt(dataset)

# Set the labels for the multilabel classification problem
labels = set_labels(dataset, labels, gt)

The most frequent item in the column is 'CodeSmells' and it appears 10395 times.


Initializing documents and groundtruth data: 10395it [00:01, 7451.97it/s]
Setting up the labels: 10395it [00:00, 49058.04it/s]


# BERT-like Models

In [49]:
model = RobertaForSequenceClassification.from_pretrained(BERT_MODEL_TYPE, num_labels=20, ignore_mismatched_sizes=True)
model.config.problem_type = "multi_label_classification"
model.to(DEVICE)

tokenizer = RobertaTokenizer.from_pretrained(BERT_MODEL_TYPE, ignore_mismatched_sizes=True)

optimizer = AdamW(model.parameters(), lr=LR)
loss_fn = nn.BCEWithLogitsLoss()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
def compute_metrics(true_labels, pred_labels):
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='samples', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='samples', zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average='samples', zero_division=0)
    hamming = hamming_loss(true_labels, pred_labels)
    return accuracy, precision, recall, f1, hamming

In [51]:
def train_and_evaluate(model, train_dataloader, test_dataloader):
    for epoch in range(NUM_EPOCHS):
        model.train()
        train_losses, train_metrics = [], []

        # Training loop
        for batch in tqdm(train_dataloader, desc="Training"):
            batch = tuple(b.to(model.device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            model.zero_grad()
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            train_losses.append(loss.item())
            loss.backward()
            optimizer.step()

            # Compute metrics for the batch
            with torch.no_grad():
                predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
                batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
                train_metrics.append(batch_metrics)

        avg_train_loss = np.mean(train_losses)
        avg_train_metrics = np.mean(train_metrics, axis=0)
        print(
            f"\nEpoch {epoch + 1}/{NUM_EPOCHS} | Train loss: {avg_train_loss:.4f} | Train Metrics: Precision: {avg_train_metrics[1]:.4f}, Recall: {avg_train_metrics[2]:.4f}, F1: {avg_train_metrics[3]:.4f}, Hamming Loss: {avg_train_metrics[4]:.4f}\n")

        # Validation phase
        model.eval()
        test_losses, test_metrics = [], []

        for batch in tqdm(test_dataloader, desc="Testing"):
            batch = tuple(b.to(model.device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            with torch.no_grad():
                outputs = model(**inputs)
                loss = loss_fn(outputs.logits, inputs['labels'])
                test_losses.append(loss.item())
                predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
                batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
                test_metrics.append(batch_metrics)

        avg_test_loss = np.mean(test_losses)
        avg_test_metrics = np.mean(test_metrics, axis=0)
        print(
            f"\nEpoch {epoch + 1}/{NUM_EPOCHS} | Test loss: {avg_test_loss:.4f} | Test Metrics: Precision: {avg_test_metrics[1]:.4f}, Recall: {avg_test_metrics[2]:.4f}, F1: {avg_test_metrics[3]:.4f}, Hamming Loss: {avg_test_metrics[4]:.4f}\n")


In [52]:
def evaluate_on_test_set(model, test_dataloader):
    model.eval()
    test_losses, test_metrics = [], []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating on Test Set"):
            batch = tuple(b.to(model.device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            test_losses.append(loss.item())

            predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
            batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
            test_metrics.append(batch_metrics)

    avg_test_loss = np.mean(test_losses)
    avg_test_metrics = np.mean(test_metrics, axis=0)
    print(
        f"\nTest Set Evaluation | Loss: {avg_test_loss:.4f} | Precision: {avg_test_metrics[1]:.4f}, Recall: {avg_test_metrics[2]:.4f}, F1: {avg_test_metrics[3]:.4f}, Hamming Loss: {avg_test_metrics[4]:.4f}\n")


In [53]:
encoding = tokenizer(documents, add_special_tokens=True, max_length=512,
                     return_token_type_ids=False, padding="max_length",
                     truncation=True, return_attention_mask=True,
                     return_tensors='pt')

# Splitting data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(encoding['input_ids'], labels, test_size=TEST_SIZE)
train_masks, test_masks, _, _ = train_test_split(encoding['attention_mask'], labels, test_size=TEST_SIZE)

# Creating datasets
train_dataset = TensorDataset(x_train, train_masks, torch.tensor(y_train).float())
test_dataset = TensorDataset(x_test, test_masks, torch.tensor(y_test).float())

# K-Fold Configuration
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True)

# Applying K-Fold Cross-Validation
for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
    train_subsampler = Subset(train_dataset, train_idx)
    val_subsampler = Subset(train_dataset, val_idx)

    train_loader = DataLoader(train_subsampler, sampler=RandomSampler(train_subsampler), batch_size=BATCH_SIZE)
    val_loader = DataLoader(val_subsampler, batch_size=BATCH_SIZE)  # No need for shuffling

    print(f"Starting Fold {fold + 1}")
    train_and_evaluate(model, train_loader, val_loader)

# Create test dataloader (assuming test_dataset is already created)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Evaluate on the test set
evaluate_on_test_set(model, test_dataloader)

Starting Fold 1


Training:   1%|          | 2/176 [00:09<13:15,  4.57s/it]


KeyboardInterrupt: 

# SVM, Random Forest, Gradient Boosting

In [None]:
classifiers = {
    "svm": SVC(kernel='linear', probability=True),
    "random_forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED),
    "gradient_boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
}

In [None]:
# Preprocessing and Feature Extraction
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)
x = vectorizer.fit_transform(documents)
y = labels

# Splitting Data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

for classifier_name, classifier in classifiers.items():
    print(f"\nTESTING CLASSIFIER: {classifier_name}")

    # Train the SVM model with One-vs-Rest strategy
    model = OneVsRestClassifier(classifier)
    model.fit(x_train, y_train)

    # Evaluate the model
    predictions = model.predict(x_test)
    print("\n-> Accuracy: .............. :", accuracy_score(y_test, predictions))
    print("\n-> Classification Report ... :", classification_report(y_test, predictions))

    # Define a pipeline combining a text vectorizer, and a classifier
    pipeline = Pipeline([('tfidf', vectorizer), ('clf', model)])

    # Perform cross-validation
    scores = cross_val_score(pipeline, documents, y, cv=5, scoring='f1_samples')
    print("\n-> Mean CV F1 score ....... :", scores.mean())

# Simple Neural Network

In [54]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(MAX_FEATURES, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, y.shape[1])
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [55]:
# Preprocessing and Feature Extraction
x = TfidfVectorizer(max_features=MAX_FEATURES).fit_transform(documents).toarray()
x = torch.FloatTensor(x)
y = torch.FloatTensor(labels)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# K-Fold Configuration
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=RANDOM_SEED)

# Prepare the test DataLoader
test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

# Loss and optimizer
criterion = nn.BCELoss()

for fold, (train_idx, val_idx) in enumerate(kf.split(x_train)):
    print(f"Fold {fold + 1}/{num_folds}")

    # Creating data subsets for the current fold
    x_train_fold, x_val_fold = x_train[train_idx], x_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]

    train_data = DataLoader(TensorDataset(x_train_fold, y_train_fold), batch_size=BATCH_SIZE, shuffle=True)
    val_data = DataLoader(TensorDataset(x_val_fold, y_val_fold), batch_size=BATCH_SIZE, shuffle=False)

    # Initialize the network
    model = SimpleNN()

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=LR)

    # Training and validation loop
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for inputs, targets in tqdm(train_data, desc="Training"):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation phase
        model.eval()
        with torch.no_grad():
            all_predictions, all_targets = [], []
            for inputs, targets in tqdm(val_data, desc="Validation"):
                outputs = model(inputs)
                all_predictions.append(outputs)
                all_targets.append(targets)

            all_predictions = torch.cat(all_predictions).cpu()
            all_targets = torch.cat(all_targets).cpu()
            predicted_labels = (all_predictions > 0.5).type(torch.float)
            acc = accuracy_score(all_targets.numpy(), predicted_labels.numpy())
            print(f'Validation - Fold {fold + 1}, Epoch {epoch + 1}: Accuracy: {acc:.4f}')

# Evaluate on the test set
model.eval()
with torch.no_grad():
    all_predictions, all_targets = [], []
    for inputs, targets in tqdm(test_data, desc="Testing"):
        outputs = model(inputs)
        all_predictions.append(outputs)
        all_targets.append(targets)

    all_predictions = torch.cat(all_predictions).cpu()
    all_targets = torch.cat(all_targets).cpu()
    predicted_labels = (all_predictions > 0.5).type(torch.float)
    acc = accuracy_score(all_targets.numpy(), predicted_labels.numpy())
    precision = precision_score(all_targets.numpy(), predicted_labels.numpy(), average='samples', zero_division=0)
    recall = recall_score(all_targets.numpy(), predicted_labels.numpy(), average='samples', zero_division=0)
    f1 = f1_score(all_targets.numpy(), predicted_labels.numpy(), average='samples', zero_division=0)
    print(f'Test Set Evaluation - Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1:{f1:.4f}')


Fold 1/5


Training: 100%|██████████| 176/176 [00:00<00:00, 661.67it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6772.21it/s]


Validation - Fold 1, Epoch 1: Accuracy: 0.0795


Training: 100%|██████████| 176/176 [00:00<00:00, 763.51it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3518.58it/s]


Validation - Fold 1, Epoch 2: Accuracy: 0.1477


Training: 100%|██████████| 176/176 [00:00<00:00, 763.89it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6543.61it/s]


Validation - Fold 1, Epoch 3: Accuracy: 0.1818


Training: 100%|██████████| 176/176 [00:00<00:00, 782.52it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 2824.27it/s]


Validation - Fold 1, Epoch 4: Accuracy: 0.2045


Training: 100%|██████████| 176/176 [00:00<00:00, 772.98it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9134.75it/s]


Validation - Fold 1, Epoch 5: Accuracy: 0.1932


Training: 100%|██████████| 176/176 [00:00<00:00, 751.84it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5482.42it/s]


Validation - Fold 1, Epoch 6: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 792.98it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6604.49it/s]


Validation - Fold 1, Epoch 7: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 753.18it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4765.02it/s]


Validation - Fold 1, Epoch 8: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 640.23it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4370.72it/s]


Validation - Fold 1, Epoch 9: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 634.05it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6720.42it/s]


Validation - Fold 1, Epoch 10: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 605.03it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6677.86it/s]


Validation - Fold 1, Epoch 11: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 523.84it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5034.77it/s]


Validation - Fold 1, Epoch 12: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 670.12it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8994.07it/s]


Validation - Fold 1, Epoch 13: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 705.45it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8523.83it/s]


Validation - Fold 1, Epoch 14: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 674.59it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9319.73it/s]


Validation - Fold 1, Epoch 15: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 795.11it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6674.72it/s]


Validation - Fold 1, Epoch 16: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 672.49it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6299.26it/s]


Validation - Fold 1, Epoch 17: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 718.68it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5108.07it/s]


Validation - Fold 1, Epoch 18: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 724.32it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4496.60it/s]


Validation - Fold 1, Epoch 19: Accuracy: 0.1477


Training: 100%|██████████| 176/176 [00:00<00:00, 774.91it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6712.60it/s]


Validation - Fold 1, Epoch 20: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 776.70it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7315.26it/s]


Validation - Fold 1, Epoch 21: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 769.79it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9195.29it/s]


Validation - Fold 1, Epoch 22: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 641.95it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5632.17it/s]


Validation - Fold 1, Epoch 23: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 711.30it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7962.61it/s]


Validation - Fold 1, Epoch 24: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 664.78it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9642.58it/s]


Validation - Fold 1, Epoch 25: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 678.07it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5425.37it/s]


Validation - Fold 1, Epoch 26: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 551.10it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6544.54it/s]


Validation - Fold 1, Epoch 27: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 482.10it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9016.04it/s]


Validation - Fold 1, Epoch 28: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 623.04it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6003.56it/s]


Validation - Fold 1, Epoch 29: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 634.93it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5876.06it/s]


Validation - Fold 1, Epoch 30: Accuracy: 0.1932
Fold 2/5


Training: 100%|██████████| 176/176 [00:00<00:00, 607.61it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6586.81it/s]


Validation - Fold 2, Epoch 1: Accuracy: 0.1818


Training: 100%|██████████| 176/176 [00:00<00:00, 590.36it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3830.10it/s]


Validation - Fold 2, Epoch 2: Accuracy: 0.2045


Training: 100%|██████████| 176/176 [00:00<00:00, 579.79it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8906.82it/s]


Validation - Fold 2, Epoch 3: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 561.97it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4200.13it/s]


Validation - Fold 2, Epoch 4: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 542.44it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3975.30it/s]


Validation - Fold 2, Epoch 5: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 611.13it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6075.70it/s]


Validation - Fold 2, Epoch 6: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 593.71it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4134.63it/s]


Validation - Fold 2, Epoch 7: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 605.92it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5883.93it/s]


Validation - Fold 2, Epoch 8: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 580.47it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4953.28it/s]


Validation - Fold 2, Epoch 9: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 493.68it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3191.68it/s]


Validation - Fold 2, Epoch 10: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 447.94it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4032.28it/s]


Validation - Fold 2, Epoch 11: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 564.01it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4950.62it/s]


Validation - Fold 2, Epoch 12: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 570.25it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3810.56it/s]


Validation - Fold 2, Epoch 13: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 589.93it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5880.18it/s]


Validation - Fold 2, Epoch 14: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 593.01it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5638.88it/s]


Validation - Fold 2, Epoch 15: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 596.24it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9251.52it/s]


Validation - Fold 2, Epoch 16: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 619.97it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6309.16it/s]


Validation - Fold 2, Epoch 17: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 608.80it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3781.98it/s]


Validation - Fold 2, Epoch 18: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 418.61it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3771.70it/s]


Validation - Fold 2, Epoch 19: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 461.96it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9056.75it/s]


Validation - Fold 2, Epoch 20: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 643.48it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9621.97it/s]


Validation - Fold 2, Epoch 21: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 592.04it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4013.78it/s]


Validation - Fold 2, Epoch 22: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 645.84it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4754.59it/s]


Validation - Fold 2, Epoch 23: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 640.41it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5990.31it/s]


Validation - Fold 2, Epoch 24: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 635.28it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3977.27it/s]


Validation - Fold 2, Epoch 25: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 605.17it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7189.58it/s]


Validation - Fold 2, Epoch 26: Accuracy: 0.3068


Training: 100%|██████████| 176/176 [00:00<00:00, 566.95it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3332.30it/s]


Validation - Fold 2, Epoch 27: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 611.47it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 3730.76it/s]


Validation - Fold 2, Epoch 28: Accuracy: 0.3068


Training: 100%|██████████| 176/176 [00:00<00:00, 557.07it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4067.65it/s]


Validation - Fold 2, Epoch 29: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 365.79it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7805.67it/s]


Validation - Fold 2, Epoch 30: Accuracy: 0.3068
Fold 3/5


Training: 100%|██████████| 176/176 [00:00<00:00, 593.84it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5069.76it/s]


Validation - Fold 3, Epoch 1: Accuracy: 0.0568


Training: 100%|██████████| 176/176 [00:00<00:00, 578.25it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9674.43it/s]


Validation - Fold 3, Epoch 2: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 639.51it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8461.30it/s]


Validation - Fold 3, Epoch 3: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 635.96it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9098.27it/s]


Validation - Fold 3, Epoch 4: Accuracy: 0.3182


Training: 100%|██████████| 176/176 [00:00<00:00, 637.37it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5964.56it/s]


Validation - Fold 3, Epoch 5: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 698.63it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6894.66it/s]


Validation - Fold 3, Epoch 6: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 501.97it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7433.71it/s]


Validation - Fold 3, Epoch 7: Accuracy: 0.3295


Training: 100%|██████████| 176/176 [00:00<00:00, 451.05it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9542.37it/s]


Validation - Fold 3, Epoch 8: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 619.95it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9544.34it/s]


Validation - Fold 3, Epoch 9: Accuracy: 0.3523


Training: 100%|██████████| 176/176 [00:00<00:00, 608.30it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8379.85it/s]


Validation - Fold 3, Epoch 10: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 637.26it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9308.45it/s]


Validation - Fold 3, Epoch 11: Accuracy: 0.3409


Training: 100%|██████████| 176/176 [00:00<00:00, 636.94it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8741.86it/s]


Validation - Fold 3, Epoch 12: Accuracy: 0.3182


Training: 100%|██████████| 176/176 [00:00<00:00, 607.62it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8079.03it/s]


Validation - Fold 3, Epoch 13: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 592.90it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9345.69it/s]


Validation - Fold 3, Epoch 14: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 623.21it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6997.66it/s]


Validation - Fold 3, Epoch 15: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 577.39it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4163.36it/s]


Validation - Fold 3, Epoch 16: Accuracy: 0.2841


Training: 100%|██████████| 176/176 [00:00<00:00, 459.72it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8812.40it/s]


Validation - Fold 3, Epoch 17: Accuracy: 0.3068


Training: 100%|██████████| 176/176 [00:00<00:00, 712.23it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5459.88it/s]


Validation - Fold 3, Epoch 18: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 648.11it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6013.93it/s]


Validation - Fold 3, Epoch 19: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 702.07it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8758.04it/s]


Validation - Fold 3, Epoch 20: Accuracy: 0.2955


Training: 100%|██████████| 176/176 [00:00<00:00, 680.13it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6858.79it/s]


Validation - Fold 3, Epoch 21: Accuracy: 0.3182


Training: 100%|██████████| 176/176 [00:00<00:00, 621.91it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5048.26it/s]


Validation - Fold 3, Epoch 22: Accuracy: 0.3182


Training: 100%|██████████| 176/176 [00:00<00:00, 609.41it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4350.01it/s]


Validation - Fold 3, Epoch 23: Accuracy: 0.3409


Training: 100%|██████████| 176/176 [00:00<00:00, 583.72it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9031.49it/s]


Validation - Fold 3, Epoch 24: Accuracy: 0.3295


Training: 100%|██████████| 176/176 [00:00<00:00, 646.17it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7458.95it/s]


Validation - Fold 3, Epoch 25: Accuracy: 0.3068


Training: 100%|██████████| 176/176 [00:00<00:00, 625.88it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9302.35it/s]


Validation - Fold 3, Epoch 26: Accuracy: 0.3295


Training: 100%|██████████| 176/176 [00:00<00:00, 679.23it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9230.24it/s]


Validation - Fold 3, Epoch 27: Accuracy: 0.3409


Training: 100%|██████████| 176/176 [00:00<00:00, 642.36it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5915.80it/s]


Validation - Fold 3, Epoch 28: Accuracy: 0.3295


Training: 100%|██████████| 176/176 [00:00<00:00, 597.93it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6882.32it/s]


Validation - Fold 3, Epoch 29: Accuracy: 0.3636


Training: 100%|██████████| 176/176 [00:00<00:00, 454.14it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7554.83it/s]


Validation - Fold 3, Epoch 30: Accuracy: 0.3295
Fold 4/5


Training: 100%|██████████| 176/176 [00:00<00:00, 535.51it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5145.67it/s]


Validation - Fold 4, Epoch 1: Accuracy: 0.1591


Training: 100%|██████████| 176/176 [00:00<00:00, 726.63it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 7118.86it/s]


Validation - Fold 4, Epoch 2: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 693.81it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6649.47it/s]


Validation - Fold 4, Epoch 3: Accuracy: 0.2273


Training: 100%|██████████| 176/176 [00:00<00:00, 775.32it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 4447.62it/s]


Validation - Fold 4, Epoch 4: Accuracy: 0.2614


Training: 100%|██████████| 176/176 [00:00<00:00, 561.75it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 6634.65it/s]


Validation - Fold 4, Epoch 5: Accuracy: 0.2727


Training: 100%|██████████| 176/176 [00:00<00:00, 644.28it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 5888.81it/s]


Validation - Fold 4, Epoch 6: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 638.40it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 8956.10it/s]


Validation - Fold 4, Epoch 7: Accuracy: 0.2386


Training: 100%|██████████| 176/176 [00:00<00:00, 674.97it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 1347.72it/s]


Validation - Fold 4, Epoch 8: Accuracy: 0.2500


Training: 100%|██████████| 176/176 [00:00<00:00, 636.51it/s]
Validation:  55%|█████▍    | 24/44 [00:00<00:00, 2277.24it/s]


KeyboardInterrupt: 

# LSTM

In [56]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(pretrained_embeddings, dtype=torch.float32))
        self.embedding.weight.requires_grad = True  # Optionally freeze the embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        packed_output, (hidden, cell) = self.lstm(embedded)
        hidden = hidden.squeeze(0)
        output = self.fc(hidden)
        return torch.sigmoid(output)

In [57]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in tqdm(file, desc="Loading GloVe Embeddings"):
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

In [58]:
glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')  # Update path as necessary

# Tokenization and vocabulary creation
word_count = Counter(word for sentence in documents for word in sentence.lower().split())
vocabulary = {word: i + 1 for i, word in enumerate(word_count)}  # start indexing from 1
vocabulary['<PAD>'] = 0  # Padding value

# Embedding matrix creation
embedding_dim = 100  # Dimensionality of GloVe embeddings used
embedding_matrix = np.zeros((len(vocabulary), embedding_dim))
for word, i in tqdm(vocabulary.items(), desc='Creating Embedding Matrix'):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Convert text to sequence of integers
sequences = [[vocabulary[word] for word in text.lower().split()] for text in documents]

# Finding the longest sequence
max_seq_len = max(len(seq) for seq in sequences)

# Pad sequences
seq_padded = [seq + [vocabulary['<PAD>']] * (max_seq_len - len(seq)) for seq in sequences]

Loading GloVe Embeddings: 400000it [00:03, 100997.07it/s]
Creating Embedding Matrix: 100%|██████████| 45901/45901 [00:00<00:00, 1349143.29it/s]


In [59]:
x_tensor = torch.tensor(seq_padded, dtype=torch.long)
y_tensor = torch.tensor(labels, dtype=torch.float)

# Split dataset for final evaluation
x_train_val, x_test, y_train_val, y_test = train_test_split(x_tensor, y_tensor, test_size=TEST_SIZE,
                                                            random_state=RANDOM_SEED)
test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

# K-Fold Configuration
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=RANDOM_SEED)

# Perform K-fold Cross Validation
for fold, (train_idx, val_idx) in enumerate(kf.split(x_train_val)):
    print(f"Training Fold {fold + 1}/{num_folds}")

    # Create training and validation data loaders
    train_data = DataLoader(TensorDataset(x_train_val[train_idx], y_train_val[train_idx]), batch_size=BATCH_SIZE,
                            shuffle=True)
    val_data = DataLoader(TensorDataset(x_train_val[val_idx], y_train_val[val_idx]), batch_size=BATCH_SIZE,
                          shuffle=False)

    # Model, loss, and optimizer setup
    model = LSTMClassifier(len(vocabulary), embedding_dim, hidden_dim=100, output_dim=y_train_val.shape[1],
                           pretrained_embeddings=embedding_matrix)
    model = model.to(DEVICE)
    criterion = nn.BCELoss().to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR)

    # Training loop
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0
        for inputs, targets in tqdm(train_data, desc=f"Training Epoch {epoch + 1}, Fold {fold + 1}"):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        with torch.no_grad():
            all_predictions, all_targets = [], []
            for inputs, targets in tqdm(val_data, desc="Validating"):
                inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
                outputs = model(inputs)
                predicted_labels = (outputs > 0.5).float()
                all_predictions.append(predicted_labels)
                all_targets.append(targets)

            all_predictions = torch.cat(all_predictions).cpu()
            all_targets = torch.cat(all_targets).cpu()
            acc = accuracy_score(all_targets.numpy(), all_predictions.numpy())
            precision = precision_score(all_targets.numpy(), all_predictions.numpy(), average='samples',
                                        zero_division=0)
            recall = recall_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)
            f1 = f1_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)

            print(
                f"Epoch {epoch + 1}/{NUM_EPOCHS}, Fold {fold + 1} - Loss: {total_loss / len(train_data):.4f}, Acc: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

# Evaluate on the separate test set
model.eval()
with torch.no_grad():
    all_predictions, all_targets = [], []
    for inputs, targets in tqdm(test_data, desc="Evaluating on Test Set"):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        outputs = model(inputs)
        predicted_labels = (outputs > 0.5).float()
        all_predictions.append(predicted_labels)
        all_targets.append(targets)

    all_predictions = torch.cat(all_predictions).cpu()
    all_targets = torch.cat(all_targets).cpu()
    acc = accuracy_score(all_targets.numpy(), all_predictions.numpy())
    precision = precision_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)
    recall = recall_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)
    f1 = f1_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)

    print(f"Test Set Evaluation - Acc: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

Training Fold 1/5


Training Epoch 1, Fold 1:   6%|▋         | 11/176 [00:10<02:42,  1.01it/s]


KeyboardInterrupt: 