# Import

In [131]:
import os
import random
import re
from collections import Counter
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer

# Configuration

In [132]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running on device {DEVICE}")

RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.backends.cudnn.benchmark = False

PATH_TO_DATASET = os.path.join("dataset")
BERT_MODEL_TYPE = 'microsoft/codebert-base'

MAX_FEATURES = 500
BATCH_SIZE = 2
NUM_EPOCHS = 30
NUM_LABELS = 20
LR = 0.001

TEST_SIZE = 0.2
FILE_TYPE = "source"
FILE_EXT = ".sol"
FILE_ID = "sol"

Running on device cpu


# Dataset

Create PyTorch dataset feeding either source code, bytecode or runtime to the models.

## Preprocessing

In [133]:
def preprocess_hex(hex_data: str) -> str:
    # Reads a hex file and converts it to a byte string
    byte_data = bytes.fromhex(hex_data.strip())

    # Convert byte data to a readable ASCII string, ignoring non-ASCII characters
    return ' '.join(f'{byte:02x}' for byte in byte_data)

In [134]:
def preprocess_solidity_code(code: str) -> str:
    # Remove single-line comments
    code = re.sub(r'//.*', '', code)

    # Remove multi-line comments
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

    # Remove blank lines (lines only containing whitespace)
    lines = code.split('\n')
    non_blank_lines = [line for line in lines if line.strip() != '']
    code = '\n'.join(non_blank_lines)

    return code

In [135]:
def preprocess(data: str):
    return preprocess_solidity_code(data) if FILE_TYPE == "source" else preprocess_hex(data)

## Labels Management

In [136]:
def init_docs_and_gt(data: pd.DataFrame) -> Tuple:
    docs, labels, gt = {}, {}, {}
    for _, row in tqdm(data.iterrows(), desc="Initializing documents and groundtruth data"):
        item_id, file_id = row["id"], row["fp_" + FILE_ID]

        # Check if file exists
        path_to_file = os.path.join(PATH_TO_DATASET, FILE_TYPE, str(file_id) + FILE_EXT)
        if os.path.exists(path_to_file):

            # Initialize the documents
            docs[item_id] = preprocess(open(path_to_file, 'r', encoding="utf8").read())

            # Initialize the label
            labels[item_id] = [0] * NUM_LABELS

            # Initialize the groundtruth
            prop = row["property"].lower()
            if prop not in gt.keys():
                gt[prop] = len(gt.values())

    return list(docs.values()), labels, gt

In [137]:
def set_labels(data: pd.DataFrame, labels: Dict, gt: Dict) -> List:
    for _, row in tqdm(data.iterrows(), desc="Setting up the labels"):
        item_id, file_id = row["id"], row["fp_" + FILE_ID]

        # Check if file exists
        path_to_file = os.path.join(PATH_TO_DATASET, FILE_TYPE, str(file_id) + FILE_EXT)
        if os.path.exists(path_to_file):

            # Set label   
            prop = row["property"].lower()
            if row['property_holds'] == 't':
                labels[item_id][gt[prop]] = 1

    return list(labels.values())

## Initialization of the dataset

In [138]:
# Read the dataset from CSV
dataset = pd.read_csv(os.path.join(PATH_TO_DATASET, "consolidated.csv"), sep=";")

# Count the frequency of each item in the column
frequency = dataset['dataset'].value_counts()

# Find the item with the maximum occurrence
most_frequent_item = frequency.idxmax()
most_frequent_count = frequency.max()

print(f"The most frequent item in the column is '{most_frequent_item}' and it appears {most_frequent_count} times.")

# Exclude outliers from the dataset
dataset = dataset[dataset["dataset"] == most_frequent_item]

# Initialize the documents and the groundtruth
documents, labels, gt = init_docs_and_gt(dataset)

# Set the labels for the multilabel classification problem
labels = set_labels(dataset, labels, gt)

The most frequent item in the column is 'CodeSmells' and it appears 10395 times.


Initializing documents and groundtruth data: 10395it [00:01, 7207.25it/s]
Setting up the labels: 10395it [00:00, 47593.97it/s]


# BERT-like Models

In [139]:
model = RobertaForSequenceClassification.from_pretrained(BERT_MODEL_TYPE, num_labels=20, ignore_mismatched_sizes=True)
model.config.problem_type = "multi_label_classification"
model.to(DEVICE)

tokenizer = RobertaTokenizer.from_pretrained(BERT_MODEL_TYPE, ignore_mismatched_sizes=True)

optimizer = AdamW(model.parameters(), lr=LR)
loss_fn = nn.BCEWithLogitsLoss()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [140]:
encoding = tokenizer(documents, add_special_tokens=True, max_length=512, return_token_type_ids=False,
                     padding="max_length", truncation=True, return_attention_mask=True, return_tensors='pt')

x_train, x_test, y_train, y_test = train_test_split(encoding['input_ids'], labels, test_size=TEST_SIZE)
train_masks, test_masks, _, _ = train_test_split(encoding['attention_mask'], labels, test_size=TEST_SIZE)

train_dataset = TensorDataset(x_train, train_masks, torch.tensor(y_train).float())
test_dataset = TensorDataset(x_test, test_masks, torch.tensor(y_test).float())

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=BATCH_SIZE)

In [141]:
def compute_metrics(true_labels, pred_labels):
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average='samples', zero_division=0)
    recall = recall_score(true_labels, pred_labels, average='samples', zero_division=0)
    f1 = f1_score(true_labels, pred_labels, average='samples', zero_division=0)
    hamming = hamming_loss(true_labels, pred_labels)
    return accuracy, precision, recall, f1, hamming

In [142]:
for epoch in range(NUM_EPOCHS):
    model.train()
    train_losses, train_metrics = [], []

    for batch in tqdm(train_dataloader, desc="Training"):
        batch = tuple(b.to(model.device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        model.zero_grad()
        outputs = model(**inputs)
        loss = loss_fn(outputs.logits, inputs['labels'])
        train_losses.append(loss.item())
        loss.backward()
        optimizer.step()

        # Compute metrics for the batch
        with torch.no_grad():
            predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
            batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
            train_metrics.append(batch_metrics)

    avg_train_loss = np.mean(train_losses)
    avg_train_metrics = np.mean(train_metrics, axis=0)
    print(
        f"\nEpoch {epoch + 1}/{NUM_EPOCHS} | Train loss: {avg_train_loss:.4f} | Train Metrics: Precision: {avg_train_metrics[1]:.4f}, Recall: {avg_train_metrics[2]:.4f}, F1: {avg_train_metrics[3]:.4f}, Hamming Loss: {avg_train_metrics[4]:.4f}\n")

    # Test phase
    model.eval()
    test_losses, test_metrics = [], []

    for batch in tqdm(test_dataloader, desc="Testing"):
        batch = tuple(b.to(model.device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        with torch.no_grad():
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            test_losses.append(loss.item())
            predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
            batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
            test_metrics.append(batch_metrics)

    avg_test_loss = np.mean(test_losses)
    avg_test_metrics = np.mean(test_metrics, axis=0)
    print(
        f"\nEpoch {epoch + 1}/{NUM_EPOCHS} | Test loss: {avg_test_loss:.4f} | Test Metrics: Precision: {avg_test_metrics[1]:.4f}, Recall: {avg_test_metrics[2]:.4f}, F1: {avg_test_metrics[3]:.4f}, Hamming Loss: {avg_test_metrics[4]:.4f}\n")

Training:  31%|███▏      | 69/220 [03:47<08:18,  3.30s/it]


KeyboardInterrupt: 

# SVM

In [143]:
# Preprocessing and Feature Extraction
x = TfidfVectorizer(max_features=MAX_FEATURES).fit_transform(documents)
y = labels

# Splitting Data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# Train the SVM model with One-vs-Rest strategy
model = OneVsRestClassifier(SVC(kernel='linear', probability=True))
model.fit(x_train, y_train)

# Evaluate the model
predictions = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))



Accuracy: 0.27927927927927926
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         2
           6       1.00      0.36      0.53        22
           7       1.00      0.29      0.44        14
           8       0.00      0.00      0.00         3
           9       0.98      0.89      0.93        47
          10       0.95      1.00      0.97       105
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.89      0.68      0.77        47
          14       0.88      0.96      0.92        97
          15       0.00      0.00      0.00         8
          16       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Random Forest

In [144]:
# Preprocessing and Feature Extraction
x = TfidfVectorizer(max_features=MAX_FEATURES).fit_transform(documents)
y = labels

# Splitting Data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)

# Train the Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
model.fit(x_train, y_train)

# Evaluate the model
predictions = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))


Accuracy: 0.2972972972972973
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       1.00      0.17      0.29         6
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         2
           6       1.00      0.45      0.62        22
           7       1.00      0.21      0.35        14
           8       0.00      0.00      0.00         3
           9       0.98      0.89      0.93        47
          10       0.95      1.00      0.98       105
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.90      0.74      0.81        47
          14       0.89      0.97      0.93        97
          15       1.00      0.12      0.22         8
          16       0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Simple Neural Network

In [145]:
# Preprocessing and Feature Extraction
x = TfidfVectorizer(max_features=MAX_FEATURES).fit_transform(documents).toarray()
x = torch.FloatTensor(x)
y = torch.FloatTensor(labels)

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)
train_data = DataLoader(TensorDataset(x_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)


# Step 4: Defining the Neural Network
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(MAX_FEATURES, 512)  # 1000 features to 512
        self.fc2 = nn.Linear(512, 128)  # 512 to 128
        self.fc3 = nn.Linear(128, y_train.shape[1])  # Output layer: number of labels
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # Sigmoid activation for multilabel classification
        return x


# Initialize the network
model = SimpleNN()

# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss for multilabel classification
optimizer = AdamW(model.parameters(), lr=LR)

# Training loop with metric tracking
for epoch in range(NUM_EPOCHS):

    model.train()
    total_loss = 0
    for inputs, targets in tqdm(train_data, desc="Training"):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    with torch.no_grad():
        # Collect all predictions and true labels across batches to calculate metrics.
        all_predictions, all_targets = [], []
        for inputs, targets in tqdm(test_data, desc="Testing"):
            outputs = model(inputs)
            all_predictions.append(outputs)
            all_targets.append(targets)

        all_predictions = torch.cat(all_predictions).cpu()
        all_targets = torch.cat(all_targets).cpu()

        # Threshold predictions to binary values
        predicted_labels = (all_predictions > 0.5).type(torch.float)

        # Calculate metrics
        acc = accuracy_score(all_targets, predicted_labels)
        precision = precision_score(all_targets, predicted_labels, average='samples', zero_division=0)
        recall = recall_score(all_targets, predicted_labels, average='samples', zero_division=0)
        f1 = f1_score(all_targets, predicted_labels, average='samples', zero_division=0)

    print(
        f'Epoch {epoch + 1}/{NUM_EPOCHS} - Loss: {total_loss / len(train_data):.4f}, Acc: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

Training: 100%|██████████| 220/220 [00:00<00:00, 596.43it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 7278.39it/s]


Epoch 1/30 - Loss: 0.2797, Acc: 0.1261, Precision: 0.8266, Recall: 0.7107, F1: 0.7409


Training: 100%|██████████| 220/220 [00:00<00:00, 763.72it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8257.08it/s]


Epoch 2/30 - Loss: 0.2073, Acc: 0.2342, Precision: 0.8407, Recall: 0.7864, F1: 0.7929


Training: 100%|██████████| 220/220 [00:00<00:00, 859.00it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 4941.22it/s]


Epoch 3/30 - Loss: 0.1819, Acc: 0.2432, Precision: 0.8417, Recall: 0.8225, F1: 0.8144


Training: 100%|██████████| 220/220 [00:00<00:00, 833.50it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 6499.74it/s]


Epoch 4/30 - Loss: 0.1631, Acc: 0.2703, Precision: 0.8503, Recall: 0.8278, F1: 0.8213


Training: 100%|██████████| 220/220 [00:00<00:00, 850.97it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9473.68it/s]


Epoch 5/30 - Loss: 0.1478, Acc: 0.2523, Precision: 0.8495, Recall: 0.8242, F1: 0.8186


Training: 100%|██████████| 220/220 [00:00<00:00, 813.59it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 6465.03it/s]


Epoch 6/30 - Loss: 0.1349, Acc: 0.2613, Precision: 0.8395, Recall: 0.8369, F1: 0.8194


Training: 100%|██████████| 220/220 [00:00<00:00, 567.21it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 6979.91it/s]


Epoch 7/30 - Loss: 0.1249, Acc: 0.2793, Precision: 0.8465, Recall: 0.8350, F1: 0.8232


Training: 100%|██████████| 220/220 [00:00<00:00, 625.18it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 5895.31it/s]


Epoch 8/30 - Loss: 0.1134, Acc: 0.2703, Precision: 0.8249, Recall: 0.8471, F1: 0.8177


Training: 100%|██████████| 220/220 [00:00<00:00, 698.56it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8948.87it/s]


Epoch 9/30 - Loss: 0.1012, Acc: 0.2973, Precision: 0.8332, Recall: 0.8355, F1: 0.8153


Training: 100%|██████████| 220/220 [00:00<00:00, 863.80it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9148.95it/s]


Epoch 10/30 - Loss: 0.0925, Acc: 0.2613, Precision: 0.8577, Recall: 0.8094, F1: 0.8153


Training: 100%|██████████| 220/220 [00:00<00:00, 815.52it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 7171.94it/s]


Epoch 11/30 - Loss: 0.0809, Acc: 0.2703, Precision: 0.8508, Recall: 0.8407, F1: 0.8272


Training: 100%|██████████| 220/220 [00:00<00:00, 811.55it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8767.82it/s]


Epoch 12/30 - Loss: 0.0733, Acc: 0.2703, Precision: 0.8505, Recall: 0.8387, F1: 0.8239


Training: 100%|██████████| 220/220 [00:00<00:00, 848.94it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 7861.86it/s]


Epoch 13/30 - Loss: 0.0662, Acc: 0.2973, Precision: 0.8473, Recall: 0.8476, F1: 0.8271


Training: 100%|██████████| 220/220 [00:00<00:00, 876.03it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9214.63it/s]


Epoch 14/30 - Loss: 0.0584, Acc: 0.2252, Precision: 0.8520, Recall: 0.8249, F1: 0.8142


Training: 100%|██████████| 220/220 [00:00<00:00, 855.12it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9045.02it/s]


Epoch 15/30 - Loss: 0.0526, Acc: 0.2883, Precision: 0.8359, Recall: 0.8688, F1: 0.8313


Training: 100%|██████████| 220/220 [00:00<00:00, 834.20it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8620.75it/s]


Epoch 16/30 - Loss: 0.0480, Acc: 0.3063, Precision: 0.8363, Recall: 0.8629, F1: 0.8299


Training: 100%|██████████| 220/220 [00:00<00:00, 887.38it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 6945.24it/s]


Epoch 17/30 - Loss: 0.0437, Acc: 0.2973, Precision: 0.8372, Recall: 0.8609, F1: 0.8287


Training: 100%|██████████| 220/220 [00:00<00:00, 834.38it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 7737.80it/s]


Epoch 18/30 - Loss: 0.0398, Acc: 0.2883, Precision: 0.8441, Recall: 0.8531, F1: 0.8291


Training: 100%|██████████| 220/220 [00:00<00:00, 821.15it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 4928.26it/s]


Epoch 19/30 - Loss: 0.0373, Acc: 0.2162, Precision: 0.8291, Recall: 0.8464, F1: 0.8165


Training: 100%|██████████| 220/220 [00:00<00:00, 582.25it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 6721.06it/s]


Epoch 20/30 - Loss: 0.0322, Acc: 0.2613, Precision: 0.8516, Recall: 0.8396, F1: 0.8226


Training: 100%|██████████| 220/220 [00:00<00:00, 606.15it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8388.01it/s]


Epoch 21/30 - Loss: 0.0329, Acc: 0.2793, Precision: 0.8485, Recall: 0.8328, F1: 0.8171


Training: 100%|██████████| 220/220 [00:00<00:00, 825.16it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8388.31it/s]


Epoch 22/30 - Loss: 0.0301, Acc: 0.2883, Precision: 0.8383, Recall: 0.8493, F1: 0.8218


Training: 100%|██████████| 220/220 [00:00<00:00, 824.06it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9061.42it/s]


Epoch 23/30 - Loss: 0.0280, Acc: 0.2432, Precision: 0.8477, Recall: 0.8163, F1: 0.8076


Training: 100%|██████████| 220/220 [00:00<00:00, 885.59it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8354.30it/s]


Epoch 24/30 - Loss: 0.0259, Acc: 0.2613, Precision: 0.8422, Recall: 0.8405, F1: 0.8186


Training: 100%|██████████| 220/220 [00:00<00:00, 854.75it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9647.23it/s]


Epoch 25/30 - Loss: 0.0262, Acc: 0.2432, Precision: 0.8390, Recall: 0.8198, F1: 0.8063


Training: 100%|██████████| 220/220 [00:00<00:00, 822.01it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9351.85it/s]


Epoch 26/30 - Loss: 0.0238, Acc: 0.2703, Precision: 0.8344, Recall: 0.8403, F1: 0.8168


Training: 100%|██████████| 220/220 [00:00<00:00, 835.78it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 9293.02it/s]


Epoch 27/30 - Loss: 0.0222, Acc: 0.2883, Precision: 0.8384, Recall: 0.8389, F1: 0.8148


Training: 100%|██████████| 220/220 [00:00<00:00, 815.05it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 8987.22it/s]


Epoch 28/30 - Loss: 0.0238, Acc: 0.3063, Precision: 0.8584, Recall: 0.8336, F1: 0.8242


Training: 100%|██████████| 220/220 [00:00<00:00, 918.93it/s] 
Testing: 100%|██████████| 56/56 [00:00<00:00, 7706.07it/s]


Epoch 29/30 - Loss: 0.0209, Acc: 0.2523, Precision: 0.8541, Recall: 0.8177, F1: 0.8124


Training: 100%|██████████| 220/220 [00:00<00:00, 861.16it/s]
Testing: 100%|██████████| 56/56 [00:00<00:00, 7294.67it/s]

Epoch 30/30 - Loss: 0.0203, Acc: 0.2523, Precision: 0.8247, Recall: 0.8634, F1: 0.8206





# LSTM

In [146]:
# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in tqdm(file, desc="Loading GloVe Embeddings"):
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings


glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')  # Update path as necessary

# Tokenization and vocabulary creation
word_count = Counter(word for sentence in documents for word in sentence.lower().split())
vocabulary = {word: i + 1 for i, word in enumerate(word_count)}  # start indexing from 1
vocabulary['<PAD>'] = 0  # Padding value

# Embedding matrix creation
embedding_dim = 100  # Dimensionality of GloVe embeddings used
embedding_matrix = np.zeros((len(vocabulary), embedding_dim))
for word, i in tqdm(vocabulary.items(), desc='Creating Embedding Matrix'):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Convert text to sequence of integers
sequences = [[vocabulary[word] for word in text.lower().split()] for text in documents]

# Finding the longest sequence
max_seq_len = max(len(seq) for seq in sequences)

# Pad sequences
seq_padded = [seq + [vocabulary['<PAD>']] * (max_seq_len - len(seq)) for seq in sequences]
seq_padded = torch.tensor(seq_padded)

labels = torch.tensor(labels, dtype=torch.float32).to(DEVICE)

# Split dataset
x_train, x_test, y_train, y_test = train_test_split(seq_padded, labels, test_size=TEST_SIZE, random_state=RANDOM_SEED)
train_data = DataLoader(TensorDataset(x_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)


# Define the LSTM model with pre-trained embeddings
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(pretrained_embeddings, dtype=torch.float32))
        self.embedding.weight.requires_grad = True  # Optionally freeze the embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        packed_output, (hidden, cell) = self.lstm(embedded)
        hidden = hidden.squeeze(0)
        output = self.fc(hidden)
        return torch.sigmoid(output)


# Model, loss, and optimizer
model = LSTMClassifier(len(vocabulary), embedding_dim, hidden_dim=100, output_dim=y_train.shape[1],
                       pretrained_embeddings=embedding_matrix)
model = model.to(DEVICE)

criterion = nn.BCELoss().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    for inputs, targets in tqdm(train_data, desc="Training"):
        inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        all_predictions, all_targets = [], []
        for inputs, targets in tqdm(test_data, desc="Testing"):
            inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)
            outputs = model(inputs)
            predicted_labels = (outputs > 0.5).float()
            all_predictions.append(predicted_labels)
            all_targets.append(targets)

        all_predictions = torch.cat(all_predictions).cpu()
        all_targets = torch.cat(all_targets).cpu()

        acc = accuracy_score(all_targets.numpy(), all_predictions.numpy())
        precision = precision_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)
        recall = recall_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)
        f1 = f1_score(all_targets.numpy(), all_predictions.numpy(), average='samples', zero_division=0)

    print(
        f'Epoch {epoch + 1}/{NUM_EPOCHS} - Loss: {loss.item():.4f}, Acc: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

Loading GloVe Embeddings: 400000it [00:05, 76487.53it/s]
Creating Embedding Matrix: 100%|██████████| 45901/45901 [00:00<00:00, 1516834.86it/s]
Training:   4%|▎         | 8/220 [00:09<04:02,  1.15s/it]


KeyboardInterrupt: 