# Import

In [102]:
import os
import random
import re
from collections import Counter
from typing import Tuple, Dict, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch.utils.data import RandomSampler
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer


def warn(*args, **kwargs):
    pass


import warnings

warnings.warn = warn

# Configuration

In [90]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Running on device {DEVICE}")

RANDOM_SEED = 0
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.backends.cudnn.benchmark = False

LOG_DIR = os.path.join("log")
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)

PATH_TO_DATASET = os.path.join("..", "dataset", "cgt")
BERT_MODEL_TYPE = 'microsoft/codebert-base'

MAX_FEATURES = 500
BATCH_SIZE = 2
NUM_FOLDS = 2
NUM_EPOCHS = 1
NUM_LABELS = 20
LR = 0.001
TEST_SIZE = 0.2

FILE_TYPE = "runtime"
FILE_EXT = ".rt.hex"
FILE_ID = "runtime"

Running on device cpu


# Dataset

Create PyTorch dataset feeding either source code, bytecode or runtime to the models.

## Preprocessing

In [91]:
def preprocess_hex(hex_data: str) -> str:
    # Reads a hex file and converts it to a byte string
    byte_data = bytes.fromhex(hex_data.strip())

    # Convert byte data to a readable ASCII string, ignoring non-ASCII characters
    return ' '.join(f'{byte:02x}' for byte in byte_data)

In [92]:
def preprocess_solidity_code(code: str) -> str:
    # Remove single-line comments
    code = re.sub(r'//.*', '', code)

    # Remove multi-line comments
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

    # Remove blank lines (lines only containing whitespace)
    lines = code.split('\n')
    non_blank_lines = [line for line in lines if line.strip() != '']
    code = '\n'.join(non_blank_lines)

    return code

In [93]:
def preprocess(data: str):
    return preprocess_solidity_code(data) if FILE_TYPE == "source" else preprocess_hex(data)

## Labels Management

In [94]:
def init_docs_and_gt(data: pd.DataFrame, file_type: str = FILE_TYPE, file_ext: str = FILE_EXT) -> Tuple:
    docs, labels, gt = {}, {}, {}
    for _, row in tqdm(data.iterrows(), desc="Initializing documents and groundtruth data"):
        item_id, file_id = row["id"], row["fp_" + FILE_ID]

        # Check if file exists
        path_to_file = os.path.join(PATH_TO_DATASET, file_type, str(file_id) + file_ext)
        if os.path.exists(path_to_file):

            # Initialize the documents
            docs[item_id] = preprocess(open(path_to_file, 'r', encoding="utf8").read())

            # Initialize the label
            labels[item_id] = [0] * NUM_LABELS

            # Initialize the groundtruth
            prop = row["property"].lower()
            if prop not in gt.keys():
                gt[prop] = len(gt.values())

    return list(docs.values()), labels, gt

In [95]:
def set_labels(data: pd.DataFrame, labels: Dict, gt: Dict, file_type: str = FILE_TYPE,
               file_ext: str = FILE_EXT) -> List:
    for _, row in tqdm(data.iterrows(), desc="Setting up the labels"):
        item_id, file_id = row["id"], row["fp_" + FILE_ID]

        # Check if file exists
        path_to_file = os.path.join(PATH_TO_DATASET, file_type, str(file_id) + file_ext)
        if os.path.exists(path_to_file):

            # Set label   
            prop = row["property"].lower()
            if row['property_holds'] == 't':
                labels[item_id][gt[prop]] = 1

    return list(labels.values())

## Initialization of the dataset

In [96]:
# Read the dataset from CSV
dataset = pd.read_csv(os.path.join(PATH_TO_DATASET, "consolidated.csv"), sep=";")

# Count the frequency of each item in the column
frequency = dataset['dataset'].value_counts()

# Find the item with the maximum occurrence
most_frequent_item = frequency.idxmax()
most_frequent_count = frequency.max()

print(f"The most frequent item in the column is '{most_frequent_item}' and it appears {most_frequent_count} times.")

# Exclude outliers from the dataset
dataset = dataset[dataset["dataset"] == most_frequent_item]

# Initialize the documents and the groundtruth
documents, labels, gt = init_docs_and_gt(dataset)

# Set the labels for the multilabel classification problem
labels = set_labels(dataset, labels, gt)

vectorizer = TfidfVectorizer(max_features=MAX_FEATURES)

The most frequent item in the column is 'CodeSmells' and it appears 10395 times.


Initializing documents and groundtruth data: 10395it [00:06, 1487.77it/s]
Setting up the labels: 10395it [00:00, 47924.23it/s]


# Utility functions

In [97]:
def compute_metrics(true_labels, pred_labels):
    return {
        "precision": precision_score(true_labels, pred_labels, average='samples', zero_division=0),
        "recall": recall_score(true_labels, pred_labels, average='samples', zero_division=0),
        "f1": f1_score(true_labels, pred_labels, average='samples', zero_division=0)
    }


def save_results(results, filename):
    df = pd.DataFrame(results)
    df.to_csv(os.path.join(LOG_DIR, filename), index=False)
    print(f"All fold results saved to '{filename}'")

# BERT-like Models

In [52]:
model = RobertaForSequenceClassification.from_pretrained(BERT_MODEL_TYPE, num_labels=20, ignore_mismatched_sizes=True)
model.config.problem_type = "multi_label_classification"
model.to(DEVICE)

tokenizer = RobertaTokenizer.from_pretrained(BERT_MODEL_TYPE, ignore_mismatched_sizes=True)

optimizer = AdamW(model.parameters(), lr=LR)
loss_fn = nn.BCEWithLogitsLoss()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
def train_and_evaluate(model, train_dataloader, test_dataloader):
    for epoch in range(NUM_EPOCHS):
        print(f"\n --- Epoch {epoch + 1}/{NUM_EPOCHS} ---")

        # Training Phase
        model.train()  # Set the model to training mode
        train_losses, train_metrics_list = [], []

        for batch in tqdm(train_dataloader, desc="Training"):
            batch = tuple(b.to(model.device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

            model.zero_grad()  # Clear existing gradients
            outputs = model(**inputs)
            loss = loss_fn(outputs.logits, inputs['labels'])
            train_losses.append(loss.item())
            loss.backward()  # Compute gradient
            optimizer.step()  # Update model parameters

            with torch.no_grad():
                predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
                batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
                train_metrics_list.append(batch_metrics)

        avg_train_loss = np.mean(train_losses)
        # Calculate average training metrics
        avg_train_metrics = {metric: np.mean([m[metric] for m in train_metrics_list]) for metric in
                             train_metrics_list[0]}
        print(
            f"\n TRAIN | Loss: {avg_train_loss:.4f} | Precision: {avg_train_metrics['precision']:.4f}, Recall: {avg_train_metrics['recall']:.4f}, F1: {avg_train_metrics['f1']:.4f}\n")

        # Validation Phase
        model.eval()  # Set the model to evaluation mode
        test_losses, test_metrics_list = [], []

        for batch in tqdm(test_dataloader, desc="Testing"):
            batch = tuple(b.to(model.device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

            with torch.no_grad():
                outputs = model(**inputs)
                loss = loss_fn(outputs.logits, inputs['labels'])
                test_losses.append(loss.item())
                predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()
                batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
                test_metrics_list.append(batch_metrics)

        avg_test_loss = np.mean(test_losses)
        # Calculate average testing metrics
        avg_test_metrics = {metric: np.mean([m[metric] for m in test_metrics_list]) for metric in test_metrics_list[0]}
        print(
            f" VALID | Loss: {avg_test_loss:.4f} | Precision: {avg_test_metrics['precision']:.4f}, Recall: {avg_test_metrics['recall']:.4f}, F1: {avg_test_metrics['f1']:.4f}\n")

In [55]:
def evaluate_on_test_set(model, test_dataloader):
    # Put the model in evaluation mode which turns off specific layers like dropout
    model.eval()

    # Lists to store losses and metrics for each batch
    test_losses, test_metrics = [], []

    # Disable gradient calculations since we're only running inference, not training
    with torch.no_grad():
        # Iterate over batches in the provided DataLoader
        for batch in tqdm(test_dataloader, desc="Evaluating on Test Set"):
            # Move the batch to the device (e.g., GPU) the model is on
            batch = tuple(b.to(model.device) for b in batch)

            # Prepare inputs dictionary according to the model's expected input
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

            # Pass the inputs to the model and get outputs
            outputs = model(**inputs)

            # Compute loss using the loss function defined outside this function
            loss = loss_fn(outputs.logits, inputs['labels'])
            test_losses.append(loss.item())

            # Convert model logits to binary predictions
            predictions = torch.sigmoid(outputs.logits).round().cpu().numpy()

            # Compute metrics (e.g., precision, recall, F1) for the batch
            batch_metrics = compute_metrics(batch[2].cpu().numpy(), predictions)
            test_metrics.append(batch_metrics)

    # Calculate the average loss over all batches
    avg_test_loss = np.mean(test_losses)

    # Calculate average metrics across all batches
    avg_test_metrics = {metric: np.mean([m[metric] for m in test_metrics]) for metric in test_metrics[0]}

    # Print out the average loss and other metrics for all batches
    print(
        f"\nTest Set Evaluation | Loss: {avg_test_loss:.4f} | Precision: {avg_test_metrics['precision']:.4f}, Recall: {avg_test_metrics['recall']:.4f}, F1: {avg_test_metrics['f1']:.4f}\n")

    # Return a dictionary of average metrics
    return avg_test_metrics


In [56]:
# Tokenization
encoding = tokenizer(documents, add_special_tokens=True, max_length=512,
                     return_token_type_ids=False, padding="max_length",
                     truncation=True, return_attention_mask=True,
                     return_tensors='pt')

# Splitting data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(encoding['input_ids'], labels, test_size=TEST_SIZE)
train_masks, test_masks, _, _ = train_test_split(encoding['attention_mask'], labels, test_size=TEST_SIZE)

# Creating datasets
train_dataset = TensorDataset(x_train, train_masks, torch.tensor(y_train).float())
test_dataset = TensorDataset(x_test, test_masks, torch.tensor(y_test).float())

# K-Fold Configuration
kf = KFold(n_splits=NUM_FOLDS, shuffle=True)

# Initialize a list to store metrics
fold_metrics = []

# Applying K-Fold Cross-Validation
for fold, (train_idx, val_idx) in enumerate(kf.split(train_dataset)):
    train_subsampler = Subset(train_dataset, train_idx)
    val_subsampler = Subset(train_dataset, val_idx)

    train_loader = DataLoader(train_subsampler, sampler=RandomSampler(train_subsampler), batch_size=BATCH_SIZE)
    val_loader = DataLoader(val_subsampler, batch_size=BATCH_SIZE)  # No need for shuffling

    print(f"Starting Fold {fold + 1}/{NUM_FOLDS}")
    train_and_evaluate(model, train_loader, val_loader)

    # Evaluate on the test set after each fold
    metrics = evaluate_on_test_set(model, DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False))
    fold_metrics.append(metrics)

# Save metrics to CSV
df = pd.DataFrame(fold_metrics)
df.to_csv(os.path.join(LOG_DIR, "bert.csv"), index=False)
print("Metrics saved to bert.csv")

# Calculate average and standard deviation of each metric across all folds
metric_keys = fold_metrics[0].keys()  # Assuming all metrics dictionaries have the same structure
average_metrics = {key: np.mean([metric[key] for metric in fold_metrics]) for key in metric_keys}
std_dev_metrics = {key: np.std([metric[key] for metric in fold_metrics]) for key in metric_keys}

# Print average metrics and their standard deviations
print("Average Metrics Over All Folds:")
for key, value in average_metrics.items():
    print(f"{key}: {value:.4f} (±{std_dev_metrics[key]:.4f})")

Starting Fold 1/2

 --- Epoch 1/1 ---


Training: 100%|██████████| 110/110 [05:04<00:00,  2.77s/it]



 TRAIN | Loss: 0.3526 | Precision: 0.7370, Recall: 0.7016, F1: 0.6939



Testing: 100%|██████████| 110/110 [00:52<00:00,  2.11it/s]


 VALID | Loss: 0.2513 | Precision: 0.8288, Recall: 0.7087, F1: 0.7404



Evaluating on Test Set: 100%|██████████| 56/56 [00:29<00:00,  1.88it/s]



Test Set Evaluation | Loss: 0.2715 | Precision: 0.8155, Recall: 0.6594, F1: 0.7044

Starting Fold 2/2

 --- Epoch 1/1 ---


Training: 100%|██████████| 110/110 [05:34<00:00,  3.04s/it]



 TRAIN | Loss: 0.2691 | Precision: 0.7900, Recall: 0.7211, F1: 0.7247



Testing: 100%|██████████| 110/110 [00:46<00:00,  2.34it/s]


 VALID | Loss: 0.2451 | Precision: 0.8030, Recall: 0.6772, F1: 0.7184



Evaluating on Test Set: 100%|██████████| 56/56 [00:23<00:00,  2.35it/s]


Test Set Evaluation | Loss: 0.2638 | Precision: 0.8155, Recall: 0.6594, F1: 0.7044

Metrics saved to model_evaluation_metrics.csv
Average Metrics Over All Folds:
precision: 0.8155 (±0.0000)
recall: 0.6594 (±0.0000)
f1: 0.7044 (±0.0000)





# SVM, Random Forest, Gradient Boosting

In [109]:
def evaluate_classifier(classifier, X, y):
    kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    results = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Use a pipeline to handle TF-IDF vectorization and classification
        pipeline = make_pipeline(
            TfidfVectorizer(max_features=MAX_FEATURES),
            OneVsRestClassifier(classifier)
        )
        pipeline.fit(X_train, y_train)
        predictions = pipeline.predict(X_test)

        # Calculate metrics
        precision = precision_score(y_test, predictions, average='samples')
        recall = recall_score(y_test, predictions, average='samples')
        f1 = f1_score(y_test, predictions, average='samples')
        results.append({'precision': precision, 'recall': recall, 'f1': f1})

    return pd.DataFrame(results)

In [111]:
x = vectorizer.fit_transform(documents)
y = labels

# Classifier configurations
classifiers = {
    "svm": SVC(kernel='linear', probability=True),
    "random_forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED),
    "gradient_boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
}

for classifier_name, classifier in classifiers.items():
    print(f"\nTesting classifier: {classifier_name}")
    metrics_df = evaluate_classifier(classifier, x, y)  # Use the full dataset for cross-validation
    file_path = os.path.join(LOG_DIR, f"{classifier_name}_metrics.csv")
    metrics_df.to_csv(file_path, index=False)
    print(f"Saved {classifier_name} cross-validation metrics to {file_path}")


Testing classifier: svm


AttributeError: 'csr_matrix' object has no attribute 'lower'

# Simple Neural Network

In [59]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(MAX_FEATURES, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, y.shape[1])
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

In [None]:
def train(model, data_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_predictions, all_labels = [], []
    for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Collect predictions for metrics calculation
        predicted = outputs.sigmoid().round()
        all_predictions.extend(predicted.detach().cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    metrics = compute_metrics(np.array(all_labels), np.array(all_predictions))
    return total_loss / len(data_loader), metrics


def evaluate(model, data_loader, device):
    model.eval()
    all_predictions, all_labels = [], []
    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predicted = outputs.sigmoid().round()
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    metrics = compute_metrics(np.array(all_labels), np.array(all_predictions))
    return metrics

In [60]:
x = torch.FloatTensor(vectorizer.fit_transform(documents).toarray())
y = torch.FloatTensor(labels)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE, random_state=RANDOM_SEED)
train_data = DataLoader(TensorDataset(x_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)

# Model setup
model = SimpleNN().to(DEVICE)
criterion = nn.BCEWithLogitsLoss().to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# K-Fold training and validation
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(x_train)):
    print(f"Training Fold {fold + 1}/5")
    train_subset = DataLoader(TensorDataset(x_train[train_idx], y_train[train_idx]), batch_size=BATCH_SIZE,
                              shuffle=True)
    val_subset = DataLoader(TensorDataset(x_train[val_idx], y_train[val_idx]), batch_size=BATCH_SIZE, shuffle=False)

    for epoch in range(NUM_EPOCHS):
        train_loss, train_metrics = train(model, train_subset, criterion, optimizer, DEVICE)
        print(f"Fold {fold + 1}, Epoch {epoch + 1} - Training Loss: {train_loss:.4f}, Metrics: {train_metrics}")

        val_metrics = evaluate(model, val_subset, DEVICE)
        print(f"Fold {fold + 1}, Epoch {epoch + 1} - Validation Metrics: {val_metrics}")

    # Test evaluation
    test_metrics = evaluate(model, test_data, DEVICE)
    fold_results.append(test_metrics)
    print(f"Fold {fold + 1} - Test Metrics: {test_metrics}")

# Save all results
save_results(fold_results, "ffnn.csv")

Fold 1/5


Training: 100%|██████████| 176/176 [00:00<00:00, 1067.70it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 10154.58it/s]


Validation - Fold 1, Epoch 1: Accuracy: 0.0909
Fold 2/5


Training: 100%|██████████| 176/176 [00:00<00:00, 1366.31it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 11212.67it/s]


Validation - Fold 2, Epoch 1: Accuracy: 0.1705
Fold 3/5


Training: 100%|██████████| 176/176 [00:00<00:00, 1424.92it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 11707.76it/s]


Validation - Fold 3, Epoch 1: Accuracy: 0.1591
Fold 4/5


Training: 100%|██████████| 176/176 [00:00<00:00, 1450.91it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 9478.65it/s]


Validation - Fold 4, Epoch 1: Accuracy: 0.1591
Fold 5/5


Training: 100%|██████████| 176/176 [00:00<00:00, 1400.27it/s]
Validation: 100%|██████████| 44/44 [00:00<00:00, 11790.03it/s]


Validation - Fold 5, Epoch 1: Accuracy: 0.1477


Testing: 100%|██████████| 56/56 [00:00<00:00, 5144.69it/s]

Test Set Evaluation - Accuracy: 0.1171, Precision: 0.8138, Recall: 0.6619, F1:0.7058





# LSTM

In [61]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight = nn.Parameter(torch.tensor(pretrained_embeddings, dtype=torch.float32))
        self.embedding.weight.requires_grad = True  # Optionally freeze the embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        packed_output, (hidden, cell) = self.lstm(embedded)
        hidden = hidden.squeeze(0)
        output = self.fc(hidden)
        return torch.sigmoid(output)

In [None]:
def train(model, train_data, criterion, optimizer, device):
    model.train()
    total_loss = 0
    all_predictions, all_targets = [], []
    for inputs, targets in train_data:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # For metrics calculation, we need binary predictions and actual targets
        predictions = outputs.sigmoid().round()
        all_predictions.extend(predictions.detach().cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

    avg_loss = total_loss / len(train_data)
    training_metrics = compute_metrics(np.vstack(all_targets), np.vstack(all_predictions))
    return avg_loss, training_metrics


def evaluate(model, data, device):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for inputs, labels in data:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predictions.extend(outputs.round().detach().cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return np.vstack(targets), np.vstack(predictions)


# Load GloVe embeddings
def load_glove_embeddings(glove_file):
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in tqdm(file, desc="Loading GloVe Embeddings"):
            parts = line.split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

In [63]:
glove_embeddings = load_glove_embeddings('glove.6B.100d.txt')  # Update path as necessary

# Tokenization and vocabulary creation
word_count = Counter(word for sentence in documents for word in sentence.lower().split())
vocabulary = {word: i + 1 for i, word in enumerate(word_count)}  # start indexing from 1
vocabulary['<PAD>'] = 0  # Padding value

# Embedding matrix creation
embedding_dim = 100  # Dimensionality of GloVe embeddings used
embedding_matrix = np.zeros((len(vocabulary), embedding_dim))
for word, i in tqdm(vocabulary.items(), desc='Creating Embedding Matrix'):
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Convert text to sequence of integers
sequences = [[vocabulary[word] for word in text.lower().split()] for text in documents]

# Finding the longest sequence
max_seq_len = max(len(seq) for seq in sequences)

# Pad sequences
seq_padded = [seq + [vocabulary['<PAD>']] * (max_seq_len - len(seq)) for seq in sequences]

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.100d.txt'

In [None]:
x_tensor = torch.tensor(seq_padded, dtype=torch.long)
y_tensor = torch.tensor(labels, dtype=torch.float)

# Data loading and model setup
x_train_val, x_test, y_train_val, y_test = train_test_split(x_tensor, y_tensor, test_size=TEST_SIZE,
                                                            random_state=RANDOM_SEED)
test_data = DataLoader(TensorDataset(x_test, y_test), batch_size=BATCH_SIZE, shuffle=False)
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=RANDOM_SEED)
fold_results = []

for fold, (train_idx, val_idx) in enumerate(kf.split(x_train_val)):
    print(f"Training Fold {fold + 1}/5")
    train_data = DataLoader(TensorDataset(x_train_val[train_idx], y_train_val[train_idx]), BATCH_SIZE, shuffle=True)
    val_data = DataLoader(TensorDataset(x_train_val[val_idx], y_train_val[val_idx]), BATCH_SIZE, shuffle=False)

    model = LSTMClassifier(1000, 50, 100, 1, glove_embeddings)
    criterion = nn.BCEWithLogitsLoss().to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR)

    for epoch in range(NUM_EPOCHS):
        train_loss, train_metrics = train(model, train_data, criterion, optimizer, DEVICE)
        print(f'Fold {fold + 1}, Epoch {epoch + 1} - Training Loss: {train_loss:.4f}, Metrics: {train_metrics}')

        val_targets, val_predictions = evaluate(model, val_data, DEVICE)
        val_metrics = compute_metrics(val_targets, val_predictions)
        print(f'Fold {fold + 1}, Epoch {epoch + 1} - Validation Metrics: {val_metrics}')

    test_targets, test_predictions = evaluate(model, test_data, DEVICE)
    test_metrics = compute_metrics(test_targets, test_predictions)
    fold_results.append(test_metrics)
    print(f'Test Set Evaluation - Fold {fold + 1}: {test_metrics}')

save_results(fold_results, "lstm.csv")