Installing Packages

In [None]:
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
from textblob import TextBlob
import requests
from collections import Counter

import warnings
warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('words')
stop_words = stopwords.words('english')
words = set(nltk.corpus.words.words())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Loading function

In [None]:
  def load_data_label(path):
      temp_data = []
      lines = [line.strip() for line in open(path)]
      for idx in range(0, len(lines), 4):
          id = lines[idx].split("\t")[0]
          relation = lines[idx + 1]

          sentence = lines[idx].split("\t")[1][1:-1]
          sentence = sentence.replace("<e1>", " _e1_ ").replace("</e1>", " _/e1_ ")
          sentence = sentence.replace("<e2>", " _e2_ ").replace("</e2>", " _/e2_ ")
          sentence = sentence.replace("<e1>", "<e1> ").replace("</e1>", " </e11>")
          sentence = sentence.replace("<e2>", "<e2> ").replace("</e2>", " </e22>")

          tokens = nltk.word_tokenize(sentence)

          tokens.remove('_/e1_')
          tokens.remove('_/e2_')

          e1 = tokens.index("_e1_")
          del tokens[e1]
          element1=tokens[e1]
          e2 = tokens.index("_e2_")
          del tokens[e2]
          element2=tokens[e2]
          sentence = " ".join(tokens)
          temp_data.append([id, sentence, e1, element1, e2, element2, relation])
      df = pd.DataFrame(data=temp_data, columns=["id", "sentence", "e1_position","element1", "e2_position","element2", "class"])
      #print (df)
      labelsMapping = {'Other': 0,'Message-Topic(e1,e2)': 1, 'Message-Topic(e2,e1)': 2,
                      'Product-Producer(e1,e2)': 3, 'Product-Producer(e2,e1)': 4,
                      'Instrument-Agency(e1,e2)': 5, 'Instrument-Agency(e2,e1)': 6,
                      'Entity-Destination(e1,e2)': 7, 'Entity-Destination(e2,e1)': 8,
                      'Cause-Effect(e1,e2)': 9, 'Cause-Effect(e2,e1)': 10,
                      'Component-Whole(e1,e2)': 11, 'Component-Whole(e2,e1)': 12,
                      'Entity-Origin(e1,e2)': 13, 'Entity-Origin(e2,e1)': 14,
                      'Member-Collection(e1,e2)': 15, 'Member-Collection(e2,e1)': 16,
                      'Content-Container(e1,e2)': 17, 'Content-Container(e2,e1)': 18}
      df['tag'] = [labelsMapping[r] for r in df['class']]
      #print(df)
      x_sentence = df['sentence'].tolist()

      #Label Data
      y = df['tag']
      return df

In [None]:
df = pd.read_csv("large_ds_df_exported.csv") #  (for training data we have already loaded while creation in Distant Supervision file)

path_to_test_file = "/content/TEST_FILE.txt"
df_test = load_data_label(path_to_test_file) # (for the test data)

Cleaning

In [None]:
def preprocessor(sentence):

    sentence = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", sentence)
    sentence = sentence.lower()
    sentence = re.sub(r'\d+','', sentence)
    sentence = sentence.replace("user", "")
    return  sentence

def clean_text(df):

    train_cleaned = df['sentence'].apply(preprocessor)
    df['sentence'] = train_cleaned.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

    return df

In [None]:
#input format
df['sentence'] = df['sentence'] + " " + df['element1'] + " " + df['element2']
df_test['sentence'] = df_test['sentence'] + " " + df_test['element1'] + " " + df_test['element2']

train_cleaned = clean_text(df)
test_cleaned = clean_text(df_test)

Installing torch and transformers

In [None]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [None]:
pip install transformers



Tokenizer

In [None]:
from transformers import BertTokenizer

# loading BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#  tokenizing set of texts
def Input_embeddings(data,length):

    input_ids = []
    attention_masks = []

    for sentence in data:

        encoded_sent = tokenizer.encode_plus(
            text=sentence,
            add_special_tokens=True,
            max_length=length,
            pad_to_max_length=True,
            return_attention_mask=True
            )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
sentences = np.concatenate([train_cleaned.sentence.values, test_cleaned.sentence.values])
sentences_encoded = [tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences]
max_length = max([len(sent) for sent in sentences_encoded])
print('Max length: ', max_length)

Max length:  137


In [None]:
data = [train_cleaned.sentence[0]]
token_ids = list(Input_embeddings(data,max_length)[0].squeeze().numpy())
print('Original: ', train_cleaned.sentence[0])
print('Token IDs: ', token_ids)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  the system described above has its greatest application arrayed configuration antenna elements configuration elements
Token IDs:  [101, 1996, 2291, 2649, 2682, 2038, 2049, 4602, 4646, 9140, 2098, 9563, 13438, 3787, 9563, 3787, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


Creating the training and validation data for model

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import torch.nn as nn
import torch.nn.functional as F
import time
import datetime

# Split the data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    train_cleaned['sentence'].values,
    train_cleaned['tag'].values,
    test_size=0.1,
    random_state=42
)

# Tokenize all of the sentences and map the tokens to their word IDs.
train_input_ids, train_attention_masks = Input_embeddings(train_sentences, max_length)
val_input_ids, val_attention_masks = Input_embeddings(val_sentences, max_length)

# Convert the labels to tensors.
train_labels = torch.tensor(train_labels)
val_labels = torch.tensor(val_labels)

# Create the DataLoader for our training set.
train_data = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

# Create the DataLoader for our validation set.
val_data = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=32)



Class Weighting

In [None]:
from collections import Counter
import torch

# Calculate class counts
class_counts = Counter(train_cleaned['tag'])

# Exclude class 8 from the class counts
class_counts_without_8 = {k: v for k, v in class_counts.items() if k != 8}

# Calculate class weights (inverse of class frequencies) excluding class 8
class_weights_without_8 = torch.tensor([1.0 / class_counts_without_8[i] for i in range(19) if i != 8], dtype=torch.float32)

# Normalize the weights so they sum to 1
class_weights_without_8 = class_weights_without_8 / class_weights_without_8.sum()

# Create a tensor for all classes, initially setting weight for class 8 to 0
class_weights = torch.zeros(19, dtype=torch.float32)
for i in range(19):
    if i != 8:
        class_weights[i] = class_weights_without_8[i if i < 8 else i - 1]

# manual adjust for class 8 by adjusting other classes as well
class_weights[8]=0.02
class_weights[14]=0.0786
class_weights[18]=0.0916
print(class_weights)
print(class_weights.sum())

tensor([0.0079, 0.0229, 0.0822, 0.0351, 0.0263, 0.1742, 0.0397, 0.0106, 0.0200,
        0.0602, 0.0268, 0.0268, 0.0252, 0.0198, 0.0786, 0.1840, 0.0265, 0.0415,
        0.0916])
tensor(1.0000)


Loss function

In [None]:
class_weights = class_weights.to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

Evaluation Metrics

In [None]:
# Function to calculate the accuracy of predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

# Function to calculate the F1 score
from sklearn.metrics import f1_score

def flat_f1(preds, labels):
    # Flatten the predictions and labels
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    # Exclude the 'Other' class (label 0) and calculate F1 for the remaining 18 classes
    f1 = f1_score(labels_flat, pred_flat, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18], average='macro')
    return f1

Model Setup and Initial Training

In [None]:
# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=19,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# Number of training epochs
epochs = 4

total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)
# Training loop
import random
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch
loss_values = []

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Move batch tensors to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs.logits  # Extract logits from the model output

        # Calculate loss using the custom loss function with class weights
        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()
    eval_accuracy, eval_f1 = 0, 0

    for batch in val_dataloader:
        # Move batch tensors to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            # Forward pass
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()

        # Calculate metrics
        eval_accuracy += flat_accuracy(logits, labels)
        eval_f1 += flat_f1(logits, labels)

    print(f"Validation Accuracy: {eval_accuracy / len(val_dataloader)}")
    print(f"Validation F1 Score: {eval_f1 / len(val_dataloader)}")
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...

  Average training loss: 2.12
  Training epoch took: 0:05:31

Running Validation...
Validation Accuracy: 0.5047516648168701
Validation F1 Score: 0.36637412203449926
  Validation took: 0:00:13

Training...

  Average training loss: 1.00
  Training epoch took: 0:05:32

Running Validation...
Validation Accuracy: 0.7177788568257492
Validation F1 Score: 0.4961861796073782
  Validation took: 0:00:13

Training...

  Average training loss: 0.55
  Training epoch took: 0:05:32

Running Validation...
Validation Accuracy: 0.783157602663707
Validation F1 Score: 0.5309102625481112
  Validation took: 0:00:13

Training...

  Average training loss: 0.36
  Training epoch took: 0:05:32

Running Validation...
Validation Accuracy: 0.8066731409544949
Validation F1 Score: 0.5411454268713259
  Validation took: 0:00:13

Training complete!


Initial Testing

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Tokenize the test sentences and prepare the test DataLoader
test_input_ids, test_attention_masks = Input_embeddings(test_cleaned['sentence'].values, max_length)
test_labels = torch.tensor(test_cleaned['tag'].values)

# Create the TensorDataset
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Create the DataLoader
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=32)

In [None]:
# Test evaluation loop
model.eval()
test_accuracy, test_f1 = 0, 0
nb_test_steps = 0

for batch in test_dataloader:
    # Move batch to GPU
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

    # Calculate predictions
    preds = torch.argmax(logits, dim=1)

    # Calculate accuracy
    correct_predictions = torch.sum(preds == b_labels).item()
    total_predictions = len(b_labels)
    tmp_test_accuracy = correct_predictions / total_predictions

    # Calculate f1,  Exclude 'Other' class
    tmp_test_f1 = f1_score(
        b_labels.cpu().numpy(),
        preds.cpu().numpy(),
        average='macro',         # Macro-averaged F1 score
        labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
    )


    test_accuracy += tmp_test_accuracy
    test_f1 += tmp_test_f1
    nb_test_steps += 1


print(f"Test Accuracy: {test_accuracy / nb_test_steps}")
print(f"Test F1 Score (excluding 'Other' class): {test_f1 / nb_test_steps}")

Test Accuracy: 0.7436992900608519
Test F1 Score (excluding 'Other' class): 0.5315555577320283


Hyperparameter Tuning

In [None]:
pip install optuna

In [None]:
import optuna
from optuna import Trial
import random
import numpy as np
import torch

# Set a fixed random seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Define the objective function for Optuna
def objective(trial: Trial):
    # Hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 2e-5, 8e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.025)

    # Print the current combination of hyperparameters
    print(f"\nTrial {trial.number}:")
    print(f"  Learning Rate: {learning_rate}")
    print(f"  Batch Size: {batch_size}")
    print(f"  Weight Decay: {weight_decay}")

    # Update the DataLoader with the new batch size
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    # Reinitialize the model, optimizer, and scheduler with the new hyperparameters
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=19,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=1e-8,  # Fixed epsilon
        weight_decay=weight_decay,  # Tuned weight decay
    )
    total_steps = len(train_dataloader) * 3  # Fixed epochs = 3
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),  # 10% warmup steps
        num_training_steps=total_steps,
    )

    # Training loop
    for epoch_i in range(3):  # Fixed epochs = 3
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"  Epoch {epoch_i + 1}: Training Loss = {avg_train_loss:.4f}")

    # Validation loop
    model.eval()
    eval_accuracy, eval_f1 = 0, 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()

        eval_accuracy += flat_accuracy(logits, labels)
        eval_f1 += flat_f1(logits, labels)

    # Compute average validation metrics
    avg_eval_accuracy = eval_accuracy / len(val_dataloader)
    avg_eval_f1 = eval_f1 / len(val_dataloader)

    # Print the validation metrics
    print(f"  Validation Accuracy: {avg_eval_accuracy:.4f}")
    print(f"  Validation F1 Score: {avg_eval_f1:.4f}")

    # Return the metric to optimize
    return avg_eval_f1

# Create an Optuna study and optimize
study = optuna.create_study(direction="maximize")  # Maximize F1 score
study.optimize(objective, n_trials=12)  # Limit to 15 combinations

# Print the best hyperparameters
print("\nBest hyperparameters:", study.best_params)
print("Best F1 score:", study.best_value)

[I 2025-02-24 02:57:57,758] A new study created in memory with name: no-name-a2c4faef-d630-4605-abc1-b8597395e324



Trial 0:
  Learning Rate: 4.2972789809091055e-05
  Batch Size: 32
  Weight Decay: 0.019859829933948636


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.9133
  Epoch 2: Training Loss = 0.5611
  Epoch 3: Training Loss = 0.2344


[I 2025-02-24 03:14:54,758] Trial 0 finished with value: 0.5680511576090561 and parameters: {'learning_rate': 4.2972789809091055e-05, 'batch_size': 32, 'weight_decay': 0.019859829933948636}. Best is trial 0 with value: 0.5680511576090561.


  Validation Accuracy: 0.8597
  Validation F1 Score: 0.5681

Trial 1:
  Learning Rate: 4.9623507005211336e-05
  Batch Size: 64
  Weight Decay: 0.02422743737693746


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.0766
  Epoch 2: Training Loss = 0.7022
  Epoch 3: Training Loss = 0.3122


[I 2025-02-24 03:31:07,845] Trial 1 finished with value: 0.6832869192391429 and parameters: {'learning_rate': 4.9623507005211336e-05, 'batch_size': 64, 'weight_decay': 0.02422743737693746}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.8336
  Validation F1 Score: 0.6833

Trial 2:
  Learning Rate: 2.2506750661077515e-05
  Batch Size: 16
  Weight Decay: 0.016141925324305778


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.9611
  Epoch 2: Training Loss = 0.6912
  Epoch 3: Training Loss = 0.3279


[I 2025-02-24 03:49:43,906] Trial 2 finished with value: 0.3944766624011906 and parameters: {'learning_rate': 2.2506750661077515e-05, 'batch_size': 16, 'weight_decay': 0.016141925324305778}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.8402
  Validation F1 Score: 0.3945

Trial 3:
  Learning Rate: 6.810398432663404e-05
  Batch Size: 32
  Weight Decay: 0.002577966495717829


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.6833
  Epoch 2: Training Loss = 0.4287
  Epoch 3: Training Loss = 0.1454


[I 2025-02-24 04:06:42,642] Trial 3 finished with value: 0.582767475332997 and parameters: {'learning_rate': 6.810398432663404e-05, 'batch_size': 32, 'weight_decay': 0.002577966495717829}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.8833
  Validation F1 Score: 0.5828

Trial 4:
  Learning Rate: 5.946825651847558e-05
  Batch Size: 16
  Weight Decay: 0.006548308256690053


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.5339
  Epoch 2: Training Loss = 0.3882
  Epoch 3: Training Loss = 0.1353


[I 2025-02-24 04:25:18,526] Trial 4 finished with value: 0.41329425582570206 and parameters: {'learning_rate': 5.946825651847558e-05, 'batch_size': 16, 'weight_decay': 0.006548308256690053}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.9045
  Validation F1 Score: 0.4133

Trial 5:
  Learning Rate: 6.0529375215779714e-05
  Batch Size: 32
  Weight Decay: 0.008603738297666154


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.7466
  Epoch 2: Training Loss = 0.4711
  Epoch 3: Training Loss = 0.1705


[I 2025-02-24 04:42:16,906] Trial 5 finished with value: 0.5728622521323612 and parameters: {'learning_rate': 6.0529375215779714e-05, 'batch_size': 32, 'weight_decay': 0.008603738297666154}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.8768
  Validation F1 Score: 0.5729

Trial 6:
  Learning Rate: 6.8429672141983e-05
  Batch Size: 32
  Weight Decay: 0.013040132330181801


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.6860
  Epoch 2: Training Loss = 0.4276
  Epoch 3: Training Loss = 0.1472


[I 2025-02-24 04:59:15,603] Trial 6 finished with value: 0.5806685637706727 and parameters: {'learning_rate': 6.8429672141983e-05, 'batch_size': 32, 'weight_decay': 0.013040132330181801}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.8916
  Validation F1 Score: 0.5807

Trial 7:
  Learning Rate: 2.264513834473916e-05
  Batch Size: 32
  Weight Decay: 0.01896378333767471


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.2156
  Epoch 2: Training Loss = 1.0107
  Epoch 3: Training Loss = 0.5728


[I 2025-02-24 05:16:14,349] Trial 7 finished with value: 0.5318022474763522 and parameters: {'learning_rate': 2.264513834473916e-05, 'batch_size': 32, 'weight_decay': 0.01896378333767471}. Best is trial 1 with value: 0.6832869192391429.


  Validation Accuracy: 0.7761
  Validation F1 Score: 0.5318

Trial 8:
  Learning Rate: 7.647458264348458e-05
  Batch Size: 64
  Weight Decay: 0.010145868032051947


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.8870
  Epoch 2: Training Loss = 0.5454
  Epoch 3: Training Loss = 0.2142


[I 2025-02-24 05:32:28,881] Trial 8 finished with value: 0.7052491301275872 and parameters: {'learning_rate': 7.647458264348458e-05, 'batch_size': 64, 'weight_decay': 0.010145868032051947}. Best is trial 8 with value: 0.7052491301275872.


  Validation Accuracy: 0.8740
  Validation F1 Score: 0.7052

Trial 9:
  Learning Rate: 3.362765347630469e-05
  Batch Size: 64
  Weight Decay: 6.999172219382677e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.3193
  Epoch 2: Training Loss = 1.0382
  Epoch 3: Training Loss = 0.5909


[I 2025-02-24 05:48:43,307] Trial 9 finished with value: 0.6334136041316071 and parameters: {'learning_rate': 3.362765347630469e-05, 'batch_size': 64, 'weight_decay': 6.999172219382677e-05}. Best is trial 8 with value: 0.7052491301275872.


  Validation Accuracy: 0.7682
  Validation F1 Score: 0.6334

Trial 10:
  Learning Rate: 3.2352549870656554e-05
  Batch Size: 64
  Weight Decay: 0.009620449439701894


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model training with Selected Hyperparameters

In [None]:
epochs=5
batch_size = 64
# Update the DataLoader with the batch size
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
# Reinitialize the model, optimizer, and scheduler with the new hyperparameters
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=19,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

optimizer = AdamW(
    model.parameters(),
    lr=7.6e-05, # Tuned learning rate
    eps=1e-8,  # Fixed epsilon
    weight_decay=0.01,  # Tuned weight decay
)
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),  # 10% warmup steps
    num_training_steps=total_steps,
)
# Training loop
import random
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch
loss_values = []

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Move batch tensors to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        logits = outputs.logits  # Extract logits from the model output

        # Calculate loss using the custom loss function with class weights
        loss = loss_fn(logits, b_labels)
        total_loss += loss.item()

        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        model.zero_grad()

    avg_train_loss = total_loss / len(train_dataloader)


    loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()
    eval_accuracy, eval_f1 = 0, 0

    for batch in val_dataloader:
        # Move batch tensors to the device
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            # Forward pass
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

        # Move logits and labels to CPU for metric calculation
        logits = logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()

        # Calculate accuracy and F1 score
        eval_accuracy += flat_accuracy(logits, labels)
        eval_f1 += flat_f1(logits, labels)

    print(f"Validation Accuracy: {eval_accuracy / len(val_dataloader)}")
    print(f"Validation F1 Score: {eval_f1 / len(val_dataloader)}")
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Training...

  Average training loss: 1.99
  Training epoch took: 0:05:01

Running Validation...
Validation Accuracy: 0.6693899782135077
Validation F1 Score: 0.5427622130771617
  Validation took: 0:00:12

Training...

  Average training loss: 0.60
  Training epoch took: 0:05:04

Running Validation...
Validation Accuracy: 0.8320057189542484
Validation F1 Score: 0.6780302914596893
  Validation took: 0:00:12

Training...

  Average training loss: 0.23
  Training epoch took: 0:05:04

Running Validation...
Validation Accuracy: 0.8625408496732027
Validation F1 Score: 0.6870374561841547
  Validation took: 0:00:12

Training...

  Average training loss: 0.10
  Training epoch took: 0:05:04

Running Validation...
Validation Accuracy: 0.8884463507625272
Validation F1 Score: 0.7053086474459532
  Validation took: 0:00:12

Training...

  Average training loss: 0.04
  Training epoch took: 0:05:04

Running Validation...
Validation Accuracy: 0.8994417211328977
Validation F1 Score: 0.7130106724841184
  

Test Evaluation

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Tokenize the test sentences and prepare the test DataLoader
test_input_ids, test_attention_masks = Input_embeddings(test_cleaned['sentence'].values, max_length)
test_labels = torch.tensor(test_cleaned['tag'].values)

# Create the TensorDataset
test_data = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Create the DataLoader
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=64)

In [None]:
# Test evaluation loop
model.eval()
test_accuracy, test_f1 = 0, 0
nb_test_steps = 0

for batch in test_dataloader:
    # Move batch to GPU
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]

    # Calculate predictions on GPU
    preds = torch.argmax(logits, dim=1)

    # Calculate accuracy on GPU
    correct_predictions = torch.sum(preds == b_labels).item()
    total_predictions = len(b_labels)
    tmp_test_accuracy = correct_predictions / total_predictions

    # Calculate F1 score on GPU (move to CPU temporarily for sklearn)
    tmp_test_f1 = f1_score(
        b_labels.cpu().numpy(),  # Move labels to CPU for sklearn
        preds.cpu().numpy(),     # Move predictions to CPU for sklearn
        average='macro',         # Macro-averaged F1 score
        labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]  # Exclude 'Other' class
    )

    # Accumulate metrics
    test_accuracy += tmp_test_accuracy
    test_f1 += tmp_test_f1
    nb_test_steps += 1

# Report test metrics
print(f"Test Accuracy: {test_accuracy / nb_test_steps}")
print(f"Test F1 Score (excluding 'Other' class): {test_f1 / nb_test_steps}")

Test Accuracy: 0.7951834402566159
Test F1 Score (excluding 'Other' class): 0.661488059790204


In [None]:
torch.save(model, 'cw_ds_bert_model.pt')

Miscellaneous Hyperparameter Checks

In [None]:
import optuna
from optuna import Trial
import random
import numpy as np
import torch

# Set a fixed random seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Define the objective function for Optuna
def objective(trial: Trial):
    # Hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 4e-5, 7e-5, log=True)  # Adjusted range
    weight_decay = trial.suggest_float("weight_decay", 0.003, 0.009)  # Reduced range


    batch_size = 64 #chosen
    # Print the current combination of hyperparameters
    print(f"\nTrial {trial.number}:")
    print(f"  Learning Rate: {learning_rate}")
    print(f"  Batch Size: {batch_size}")
    print(f"  Weight Decay: {weight_decay}")

    # Update the DataLoader with the new batch size
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    # Reinitialize the model, optimizer, and scheduler with the new hyperparameters
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=19,
        output_attentions=False,
        output_hidden_states=False,
    )
    model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=1e-8,  # Fixed epsilon
        weight_decay=weight_decay,  # Tuned weight decay
    )
    total_steps = len(train_dataloader) * 3  # Fixed epochs = 3
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),  # 10% warmup steps
        num_training_steps=total_steps,
    )

    # Training loop
    for epoch_i in range(3):  # Fixed epochs = 3
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        avg_train_loss = total_loss / len(train_dataloader)
        print(f"  Epoch {epoch_i + 1}: Training Loss = {avg_train_loss:.4f}")

    # Validation loop
    model.eval()
    eval_accuracy, eval_f1 = 0, 0

    for batch in val_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()

        eval_accuracy += flat_accuracy(logits, labels)
        eval_f1 += flat_f1(logits, labels)

    # Compute average validation metrics
    avg_eval_accuracy = eval_accuracy / len(val_dataloader)
    avg_eval_f1 = eval_f1 / len(val_dataloader)

    # Print the validation metrics
    print(f"  Validation Accuracy: {avg_eval_accuracy:.4f}")
    print(f"  Validation F1 Score: {avg_eval_f1:.4f}")

    # Return the metric to optimize (e.g., F1 score or a combination of accuracy and F1)
    return avg_eval_f1

# Create an Optuna study and optimize
study = optuna.create_study(direction="maximize")  # Maximize F1 score
study.optimize(objective, n_trials=6)  # Limit to 15 combinations

# Print the best hyperparameters
print("\nBest hyperparameters:", study.best_params)
print("Best F1 score:", study.best_value)

[I 2025-02-24 09:40:02,126] A new study created in memory with name: no-name-37bd19ca-a375-4720-86b1-502407366159



Trial 0:
  Learning Rate: 5.819578047519045e-05
  Batch Size: 64
  Weight Decay: 0.0031002403184018862


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.0184
  Epoch 2: Training Loss = 0.6087
  Epoch 3: Training Loss = 0.2670


[I 2025-02-24 09:57:25,280] Trial 0 finished with value: 0.6907141622055384 and parameters: {'learning_rate': 5.819578047519045e-05, 'weight_decay': 0.0031002403184018862}. Best is trial 0 with value: 0.6907141622055384.


  Validation Accuracy: 0.8508
  Validation F1 Score: 0.6907

Trial 1:
  Learning Rate: 4.076966194822149e-05
  Batch Size: 64
  Weight Decay: 0.003165006959976539


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.1743
  Epoch 2: Training Loss = 0.8719
  Epoch 3: Training Loss = 0.4259


[I 2025-02-24 10:14:49,216] Trial 1 finished with value: 0.6613305417731885 and parameters: {'learning_rate': 4.076966194822149e-05, 'weight_decay': 0.003165006959976539}. Best is trial 0 with value: 0.6907141622055384.


  Validation Accuracy: 0.8052
  Validation F1 Score: 0.6613

Trial 2:
  Learning Rate: 5.186460168977477e-05
  Batch Size: 64
  Weight Decay: 0.008800043328350281


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.0171
  Epoch 2: Training Loss = 0.7067
  Epoch 3: Training Loss = 0.3208


[I 2025-02-24 10:32:11,016] Trial 2 finished with value: 0.6748088999714874 and parameters: {'learning_rate': 5.186460168977477e-05, 'weight_decay': 0.008800043328350281}. Best is trial 0 with value: 0.6907141622055384.


  Validation Accuracy: 0.8255
  Validation F1 Score: 0.6748

Trial 3:
  Learning Rate: 5.6633075602830086e-05
  Batch Size: 64
  Weight Decay: 0.004270762866873623


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 1.9868
  Epoch 2: Training Loss = 0.6364
  Epoch 3: Training Loss = 0.2714


[I 2025-02-24 10:49:33,155] Trial 3 finished with value: 0.691240855049432 and parameters: {'learning_rate': 5.6633075602830086e-05, 'weight_decay': 0.004270762866873623}. Best is trial 3 with value: 0.691240855049432.


  Validation Accuracy: 0.8406
  Validation F1 Score: 0.6912

Trial 4:
  Learning Rate: 4.294485617125591e-05
  Batch Size: 64
  Weight Decay: 0.006966116640116172


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  Epoch 1: Training Loss = 2.1387
  Epoch 2: Training Loss = 0.7902
  Epoch 3: Training Loss = 0.3862


[I 2025-02-24 11:06:55,420] Trial 4 finished with value: 0.671895773798206 and parameters: {'learning_rate': 4.294485617125591e-05, 'weight_decay': 0.006966116640116172}. Best is trial 3 with value: 0.691240855049432.


  Validation Accuracy: 0.8249
  Validation F1 Score: 0.6719

Trial 5:
  Learning Rate: 6.7829159644573e-05
  Batch Size: 64
  Weight Decay: 0.0058995274228931275


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
