1. Load data and model

In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available. Check your setup.")


GPU is available!
Using GPU: NVIDIA GeForce RTX 4080 Laptop GPU


In [3]:
# Import required libraries
import os
from datasets import Dataset

# Function to read IOB files and split into paragraphs
def read_iob_file(file_path):
    """Read IOB file and convert to token-label samples"""
    examples = []
    words, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                token, label = line.split("\t")
                words.append(token)
                labels.append(label.strip())
            else:
                # End of a sample
                if words and labels:
                    examples.append({"tokens": words, "labels": labels})
                    words, labels = [], []
    # Add the last example if file doesn't end with a blank line
    if words and labels:
        examples.append({"tokens": words, "labels": labels})
    return examples



In [4]:
# Load datasets using the updated function
train_path = 'C:\\S24-25\\TxM\\dataset\\train.tsv'
val_path = 'C:\\S24-25\\TxM\\dataset\\val_gold.tsv'
test_path = 'C:\\S24-25\\TxM\\dataset\\test_gold.tsv'
train_data = read_iob_file(train_path)
val_data = read_iob_file(val_path)
test_data = read_iob_file(test_path)

print(f"Train samples: {len(train_data)}, Val samples: {len(val_data)}, Test samples: {len(test_data)}")


Train samples: 998, Val samples: 124, Test samples: 126


In [6]:
from transformers import AutoTokenizer, AutoModelForPreTraining
model_checkpoint="kamalkraj/bioelectra-base-discriminator-pubmed"
tokenizer = AutoTokenizer.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed")
model = AutoModelForPreTraining.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed")
# Define label mappings
label_list = ["O", "B-ADR", "I-ADR", "B-DRU", "I-DRU", "B-DIS", "I-DIS", "B-SYM", "I-SYM"]
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        is_split_into_words=True,
        padding="max_length",      # or True, depending on your preference
        truncation=True,          # enable truncation
        max_length=512,           # explicitly set maximum sequence length 
        return_overflowing_tokens=False,
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # Ignore padding
            elif word_idx != previous_word_idx:
                aligned_labels.append(label_to_id[label[word_idx]])
            else:
                aligned_labels.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize datasets
train_dataset = Dataset.from_list(train_data).map(tokenize_and_align_labels, batched=True)
test_dataset = Dataset.from_list(test_data).map(tokenize_and_align_labels, batched=True)
val_dataset = Dataset.from_list(val_data).map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 998/998 [00:00<00:00, 2607.07 examples/s]
Map: 100%|██████████| 126/126 [00:00<00:00, 2532.93 examples/s]
Map: 100%|██████████| 124/124 [00:00<00:00, 2656.52 examples/s]


In [7]:
from transformers import AutoModelForTokenClassification


# Load the pre-trained BERT model

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True
)


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at kamalkraj/bioelectra-base-discriminator-pubmed and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2. Train and evaluate with default parameters on the test set

In [8]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./bio_electra_logs",
    logging_steps=10,
)


In [9]:
from transformers import Trainer
from evaluate import load
import torch

# Load metric for evaluation
metric = load("seqeval")

def compute_metrics(predictions):
    predictions, labels = predictions
    predictions = torch.argmax(torch.tensor(predictions), dim=2)
    
    # Convert predictions and labels to CPU and then to plain Python integers
    predictions = predictions.cpu().numpy()
    
    true_labels = [
        [id_to_label[label] for label in label_seq if label != -100] 
        for label_seq in labels
    ]
    true_predictions = [
        [id_to_label[int(p)] for (p, l) in zip(prediction, label_seq) if l != -100]  # Convert tensor to int
        for prediction, label_seq in zip(predictions, labels)
    ]
    return metric.compute(predictions=true_predictions, references=true_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  5%|▌         | 10/189 [02:27<43:57, 14.73s/it]

{'loss': 1.6947, 'grad_norm': 8.60431957244873, 'learning_rate': 1.8941798941798943e-05, 'epoch': 0.16}


 11%|█         | 20/189 [04:54<40:57, 14.54s/it]

{'loss': 0.8684, 'grad_norm': 1.3569080829620361, 'learning_rate': 1.7883597883597884e-05, 'epoch': 0.32}


 16%|█▌        | 30/189 [07:18<38:08, 14.39s/it]

{'loss': 0.5876, 'grad_norm': 0.6973162293434143, 'learning_rate': 1.6825396825396828e-05, 'epoch': 0.48}


 21%|██        | 40/189 [09:51<37:37, 15.15s/it]

{'loss': 0.4674, 'grad_norm': 0.6656844615936279, 'learning_rate': 1.576719576719577e-05, 'epoch': 0.63}


 26%|██▋       | 50/189 [12:23<35:14, 15.21s/it]

{'loss': 0.4717, 'grad_norm': 0.5613645315170288, 'learning_rate': 1.470899470899471e-05, 'epoch': 0.79}


 32%|███▏      | 60/189 [14:55<32:45, 15.23s/it]

{'loss': 0.4353, 'grad_norm': 0.5676628351211548, 'learning_rate': 1.3650793650793652e-05, 'epoch': 0.95}


  _warn_prf(average, modifier, msg_start, len(result))
                                                
 33%|███▎      | 63/189 [15:33<24:39, 11.75s/it]

{'eval_loss': 0.39326393604278564, 'eval_ADR': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 584}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 39}, 'eval_DRU': {'precision': 0.8333333333333334, 'recall': 0.0176678445229682, 'f1': 0.03460207612456747, 'number': 283}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}, 'eval_overall_precision': 0.8333333333333334, 'eval_overall_recall': 0.005263157894736842, 'eval_overall_f1': 0.010460251046025104, 'eval_overall_accuracy': 0.8785520739248246, 'eval_runtime': 4.2187, 'eval_samples_per_second': 29.867, 'eval_steps_per_second': 1.896, 'epoch': 1.0}


 37%|███▋      | 70/189 [17:20<29:48, 15.03s/it]

{'loss': 0.3473, 'grad_norm': 0.5149990916252136, 'learning_rate': 1.2592592592592593e-05, 'epoch': 1.11}


 42%|████▏     | 80/189 [19:52<27:33, 15.17s/it]

{'loss': 0.3737, 'grad_norm': 0.7052136063575745, 'learning_rate': 1.1534391534391536e-05, 'epoch': 1.27}


 48%|████▊     | 90/189 [22:24<25:02, 15.18s/it]

{'loss': 0.3659, 'grad_norm': 1.7506002187728882, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}


 53%|█████▎    | 100/189 [24:56<22:31, 15.18s/it]

{'loss': 0.3326, 'grad_norm': 0.5470075607299805, 'learning_rate': 9.417989417989418e-06, 'epoch': 1.59}


 58%|█████▊    | 110/189 [27:28<20:05, 15.26s/it]

{'loss': 0.349, 'grad_norm': 1.2141553163528442, 'learning_rate': 8.35978835978836e-06, 'epoch': 1.75}


 63%|██████▎   | 120/189 [30:00<17:26, 15.16s/it]

{'loss': 0.3035, 'grad_norm': 0.6778350472450256, 'learning_rate': 7.301587301587301e-06, 'epoch': 1.9}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 67%|██████▋   | 126/189 [31:23<12:16, 11.69s/it]

{'eval_loss': 0.3069069981575012, 'eval_ADR': {'precision': 0.2767203513909224, 'recall': 0.3236301369863014, 'f1': 0.2983425414364641, 'number': 584}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 39}, 'eval_DRU': {'precision': 0.7403314917127072, 'recall': 0.4734982332155477, 'f1': 0.5775862068965518, 'number': 283}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}, 'eval_overall_precision': 0.3738425925925926, 'eval_overall_recall': 0.34, 'eval_overall_f1': 0.35611907386990077, 'eval_overall_accuracy': 0.9043979712360175, 'eval_runtime': 4.3421, 'eval_samples_per_second': 29.018, 'eval_steps_per_second': 1.842, 'epoch': 2.0}


 69%|██████▉   | 130/189 [32:26<14:45, 15.01s/it]

{'loss': 0.2995, 'grad_norm': 1.240178108215332, 'learning_rate': 6.243386243386243e-06, 'epoch': 2.06}


 74%|███████▍  | 140/189 [34:58<12:22, 15.16s/it]

{'loss': 0.2831, 'grad_norm': 1.5248281955718994, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


 79%|███████▉  | 150/189 [37:31<09:57, 15.31s/it]

{'loss': 0.276, 'grad_norm': 0.6992630958557129, 'learning_rate': 4.126984126984127e-06, 'epoch': 2.38}


 85%|████████▍ | 160/189 [40:04<07:17, 15.10s/it]

{'loss': 0.3054, 'grad_norm': 1.071355938911438, 'learning_rate': 3.068783068783069e-06, 'epoch': 2.54}


 90%|████████▉ | 170/189 [42:38<04:49, 15.22s/it]

{'loss': 0.2619, 'grad_norm': 0.8284920454025269, 'learning_rate': 2.0105820105820108e-06, 'epoch': 2.7}


 95%|█████████▌| 180/189 [45:11<02:16, 15.20s/it]

{'loss': 0.2951, 'grad_norm': 0.7920874357223511, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
100%|██████████| 189/189 [47:21<00:00, 11.67s/it]

{'eval_loss': 0.28889429569244385, 'eval_ADR': {'precision': 0.2753623188405797, 'recall': 0.3578767123287671, 'f1': 0.31124348473566643, 'number': 584}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 39}, 'eval_DRU': {'precision': 0.7798165137614679, 'recall': 0.6007067137809188, 'f1': 0.6786427145708583, 'number': 283}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}, 'eval_overall_precision': 0.38792221084953943, 'eval_overall_recall': 0.3989473684210526, 'eval_overall_f1': 0.39335755059678257, 'eval_overall_accuracy': 0.9083582296949906, 'eval_runtime': 4.111, 'eval_samples_per_second': 30.649, 'eval_steps_per_second': 1.946, 'epoch': 3.0}


100%|██████████| 189/189 [47:23<00:00, 15.04s/it]

{'train_runtime': 2843.3146, 'train_samples_per_second': 1.053, 'train_steps_per_second': 0.066, 'train_loss': 0.4534431260729593, 'epoch': 3.0}





TrainOutput(global_step=189, training_loss=0.4534431260729593, metrics={'train_runtime': 2843.3146, 'train_samples_per_second': 1.053, 'train_steps_per_second': 0.066, 'total_flos': 782372000176128.0, 'train_loss': 0.4534431260729593, 'epoch': 3.0})

In [None]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 8/8 [00:03<00:00,  2.07it/s]

Evaluation Results: {'eval_loss': 0.28889429569244385, 'eval_ADR': {'precision': 0.2753623188405797, 'recall': 0.3578767123287671, 'f1': 0.31124348473566643, 'number': 584}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 39}, 'eval_DRU': {'precision': 0.7798165137614679, 'recall': 0.6007067137809188, 'f1': 0.6786427145708583, 'number': 283}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 44}, 'eval_overall_precision': 0.38792221084953943, 'eval_overall_recall': 0.3989473684210526, 'eval_overall_f1': 0.39335755059678257, 'eval_overall_accuracy': 0.9083582296949906, 'eval_runtime': 4.4505, 'eval_samples_per_second': 28.311, 'eval_steps_per_second': 1.798, 'epoch': 3.0}





: 

3. Train and test on val set

4. HPO