1. Load data and model

In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available. Check your setup.")


GPU is available!
Using GPU: NVIDIA GeForce RTX 4080 Laptop GPU


In [3]:
# Import required libraries
import os
from datasets import Dataset

# Function to read IOB files and split into paragraphs
def read_iob_file(file_path):
    """Read IOB file and convert to token-label samples"""
    examples = []
    words, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                token, label = line.split("\t")
                words.append(token)
                labels.append(label.strip())
            else:
                # End of a sample
                if words and labels:
                    examples.append({"tokens": words, "labels": labels})
                    words, labels = [], []
    # Add the last example if file doesn't end with a blank line
    if words and labels:
        examples.append({"tokens": words, "labels": labels})
    return examples



In [4]:
# Load datasets using the updated function
train_path = 'C:\\S24-25\\TxM\\dataset\\train.tsv'
val_path = 'C:\\S24-25\\TxM\\dataset\\val_gold.tsv'
test_path = 'C:\\S24-25\\TxM\\dataset\\test_gold.tsv'
train_data = read_iob_file(train_path)
val_data = read_iob_file(val_path)
test_data = read_iob_file(test_path)

print(f"Train samples: {len(train_data)}, Val samples: {len(val_data)}, Test samples: {len(test_data)}")


Train samples: 998, Val samples: 124, Test samples: 126


Tokenize and Align Labels

In [5]:
from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Define label mappings
label_list = ["O", "B-ADR", "I-ADR", "B-DRU", "I-DRU", "B-DIS", "I-DIS", "B-SYM", "I-SYM"]
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True, padding=True
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # Ignore padding
            elif word_idx != previous_word_idx:
                aligned_labels.append(label_to_id[label[word_idx]])
            else:
                aligned_labels.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize datasets
train_dataset = Dataset.from_list(train_data).map(tokenize_and_align_labels, batched=True)
test_dataset = Dataset.from_list(test_data).map(tokenize_and_align_labels, batched=True)
val_dataset = Dataset.from_list(val_data).map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 998/998 [00:00<00:00, 2618.51 examples/s]
Map: 100%|██████████| 126/126 [00:00<00:00, 2290.75 examples/s]
Map: 100%|██████████| 124/124 [00:00<00:00, 2724.52 examples/s]


In [6]:
from transformers import AutoModelForTokenClassification


# Load the pre-trained BERT model

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2. Train and evaluate with default parameters on the test set

In [7]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=10,
)


In [8]:
from transformers import Trainer
from evaluate import load
import torch

# Load metric for evaluation
metric = load("seqeval")

def compute_metrics(predictions):
    predictions, labels = predictions
    predictions = torch.argmax(torch.tensor(predictions), dim=2)
    
    # Convert predictions and labels to CPU and then to plain Python integers
    predictions = predictions.cpu().numpy()
    
    true_labels = [
        [id_to_label[label] for label in label_seq if label != -100] 
        for label_seq in labels
    ]
    true_predictions = [
        [id_to_label[int(p)] for (p, l) in zip(prediction, label_seq) if l != -100]  # Convert tensor to int
        for prediction, label_seq in zip(predictions, labels)
    ]
    return metric.compute(predictions=true_predictions, references=true_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()



  5%|▌         | 10/189 [00:04<01:09,  2.59it/s]

{'loss': 1.4865, 'grad_norm': 1.8258541822433472, 'learning_rate': 1.8941798941798943e-05, 'epoch': 0.16}


 11%|█         | 20/189 [00:08<01:03,  2.67it/s]

{'loss': 0.5903, 'grad_norm': 1.1083478927612305, 'learning_rate': 1.7883597883597884e-05, 'epoch': 0.32}


 16%|█▌        | 30/189 [00:12<00:59,  2.68it/s]

{'loss': 0.4873, 'grad_norm': 1.5807186365127563, 'learning_rate': 1.6825396825396828e-05, 'epoch': 0.48}


 21%|██        | 40/189 [00:16<00:55,  2.66it/s]

{'loss': 0.366, 'grad_norm': 0.6019790172576904, 'learning_rate': 1.576719576719577e-05, 'epoch': 0.63}


 26%|██▋       | 50/189 [00:19<00:53,  2.58it/s]

{'loss': 0.3875, 'grad_norm': 1.2651851177215576, 'learning_rate': 1.470899470899471e-05, 'epoch': 0.79}


 32%|███▏      | 60/189 [00:23<00:50,  2.57it/s]

{'loss': 0.336, 'grad_norm': 0.7563637495040894, 'learning_rate': 1.3650793650793652e-05, 'epoch': 0.95}


  _warn_prf(average, modifier, msg_start, len(result))
                                                
 33%|███▎      | 63/189 [00:25<00:39,  3.16it/s]

{'eval_loss': 0.3093837797641754, 'eval_ADR': {'precision': 0.33065595716198126, 'recall': 0.37142857142857144, 'f1': 0.34985835694050993, 'number': 665}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 58}, 'eval_DRU': {'precision': 0.7016317016317016, 'recall': 0.7525, 'f1': 0.7261761158021713, 'number': 400}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 50}, 'eval_overall_precision': 0.46598639455782315, 'eval_overall_recall': 0.46717817561807334, 'eval_overall_f1': 0.46658152405278847, 'eval_overall_accuracy': 0.8985115911485775, 'eval_runtime': 1.1432, 'eval_samples_per_second': 110.215, 'eval_steps_per_second': 6.998, 'epoch': 1.0}


 37%|███▋      | 70/189 [00:29<00:53,  2.22it/s]

{'loss': 0.2654, 'grad_norm': 0.7308688163757324, 'learning_rate': 1.2592592592592593e-05, 'epoch': 1.11}


 42%|████▏     | 80/189 [00:33<00:40,  2.67it/s]

{'loss': 0.2766, 'grad_norm': 1.024377703666687, 'learning_rate': 1.1534391534391536e-05, 'epoch': 1.27}


 48%|████▊     | 90/189 [00:37<00:37,  2.67it/s]

{'loss': 0.2889, 'grad_norm': 1.607595443725586, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}


 53%|█████▎    | 100/189 [00:40<00:33,  2.65it/s]

{'loss': 0.2724, 'grad_norm': 0.8333780765533447, 'learning_rate': 9.417989417989418e-06, 'epoch': 1.59}


 58%|█████▊    | 110/189 [00:44<00:29,  2.66it/s]

{'loss': 0.2648, 'grad_norm': 0.8623822331428528, 'learning_rate': 8.35978835978836e-06, 'epoch': 1.75}


 63%|██████▎   | 120/189 [00:48<00:25,  2.66it/s]

{'loss': 0.2457, 'grad_norm': 1.1264517307281494, 'learning_rate': 7.301587301587301e-06, 'epoch': 1.9}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 67%|██████▋   | 126/189 [00:51<00:19,  3.27it/s]

{'eval_loss': 0.26164135336875916, 'eval_ADR': {'precision': 0.38826185101580135, 'recall': 0.5172932330827068, 'f1': 0.443584784010316, 'number': 665}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 58}, 'eval_DRU': {'precision': 0.7941834451901566, 'recall': 0.8875, 'f1': 0.8382526564344746, 'number': 400}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 50}, 'eval_overall_precision': 0.5243810952738185, 'eval_overall_recall': 0.5959079283887468, 'eval_overall_f1': 0.5578611332801278, 'eval_overall_accuracy': 0.9163593256059009, 'eval_runtime': 1.107, 'eval_samples_per_second': 113.821, 'eval_steps_per_second': 7.227, 'epoch': 2.0}


 69%|██████▉   | 130/189 [00:54<00:34,  1.71it/s]

{'loss': 0.2246, 'grad_norm': 1.78065824508667, 'learning_rate': 6.243386243386243e-06, 'epoch': 2.06}


 74%|███████▍  | 140/189 [00:57<00:18,  2.61it/s]

{'loss': 0.213, 'grad_norm': 1.284313678741455, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


 79%|███████▉  | 150/189 [01:01<00:14,  2.65it/s]

{'loss': 0.2176, 'grad_norm': 1.2121782302856445, 'learning_rate': 4.126984126984127e-06, 'epoch': 2.38}


 85%|████████▍ | 160/189 [01:05<00:10,  2.64it/s]

{'loss': 0.2238, 'grad_norm': 1.2065999507904053, 'learning_rate': 3.068783068783069e-06, 'epoch': 2.54}


 90%|████████▉ | 170/189 [01:09<00:07,  2.65it/s]

{'loss': 0.1912, 'grad_norm': 1.5118227005004883, 'learning_rate': 2.0105820105820108e-06, 'epoch': 2.7}


 95%|█████████▌| 180/189 [01:12<00:03,  2.65it/s]

{'loss': 0.2279, 'grad_norm': 1.275444746017456, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
100%|██████████| 189/189 [01:18<00:00,  3.21it/s]

{'eval_loss': 0.2564546465873718, 'eval_ADR': {'precision': 0.41148325358851673, 'recall': 0.5172932330827068, 'f1': 0.45836109260493, 'number': 665}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 58}, 'eval_DRU': {'precision': 0.8484107579462102, 'recall': 0.8675, 'f1': 0.857849196538937, 'number': 400}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 50}, 'eval_overall_precision': 0.5550200803212851, 'eval_overall_recall': 0.5890878090366581, 'eval_overall_f1': 0.5715467328370554, 'eval_overall_accuracy': 0.9190595363540569, 'eval_runtime': 1.1124, 'eval_samples_per_second': 113.267, 'eval_steps_per_second': 7.192, 'epoch': 3.0}


100%|██████████| 189/189 [01:19<00:00,  2.37it/s]

{'train_runtime': 79.7466, 'train_samples_per_second': 37.544, 'train_steps_per_second': 2.37, 'train_loss': 0.3579519532975696, 'epoch': 3.0}





TrainOutput(global_step=189, training_loss=0.3579519532975696, metrics={'train_runtime': 79.7466, 'train_samples_per_second': 37.544, 'train_steps_per_second': 2.37, 'total_flos': 782372000176128.0, 'train_loss': 0.3579519532975696, 'epoch': 3.0})

In [9]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 8/8 [00:01<00:00,  7.98it/s]

Evaluation Results: {'eval_loss': 0.2564546465873718, 'eval_ADR': {'precision': 0.41148325358851673, 'recall': 0.5172932330827068, 'f1': 0.45836109260493, 'number': 665}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 58}, 'eval_DRU': {'precision': 0.8484107579462102, 'recall': 0.8675, 'f1': 0.857849196538937, 'number': 400}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 50}, 'eval_overall_precision': 0.5550200803212851, 'eval_overall_recall': 0.5890878090366581, 'eval_overall_f1': 0.5715467328370554, 'eval_overall_accuracy': 0.9190595363540569, 'eval_runtime': 1.131, 'eval_samples_per_second': 111.407, 'eval_steps_per_second': 7.073, 'epoch': 3.0}





3. Train and test on val set

4. HPO