1. Load data and model

In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available. Check your setup.")


GPU is available!
Using GPU: NVIDIA GeForce RTX 4080 Laptop GPU


In [3]:
# Import required libraries
import os
from datasets import Dataset

# Function to read IOB files and split into paragraphs
def read_iob_file(file_path):
    """Read IOB file and convert to token-label samples"""
    examples = []
    words, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                token, label = line.split("\t")
                words.append(token)
                labels.append(label.strip())
            else:
                # End of a sample
                if words and labels:
                    examples.append({"tokens": words, "labels": labels})
                    words, labels = [], []
    # Add the last example if file doesn't end with a blank line
    if words and labels:
        examples.append({"tokens": words, "labels": labels})
    return examples



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load datasets using the updated function
train_path = 'C:\\S24-25\\TxM\\dataset\\train.tsv'
val_path = 'C:\\S24-25\\TxM\\dataset\\val_gold.tsv'
test_path = 'C:\\S24-25\\TxM\\dataset\\test_gold.tsv'
train_data = read_iob_file(train_path)
val_data = read_iob_file(val_path)
test_data = read_iob_file(test_path)

print(f"Train samples: {len(train_data)}, Val samples: {len(val_data)}, Test samples: {len(test_data)}")


Train samples: 998, Val samples: 124, Test samples: 126


In [5]:
from transformers import AutoTokenizer
# Load model directly
# Use a pipeline as a high-level helper
from transformers import pipeline
model_checkpoint="google/electra-small-discriminator"
pipe = pipeline("fill-mask", model="google/electra-small-discriminator")
# Define label mappings
label_list = ["O", "B-ADR", "I-ADR", "B-DRU", "I-DRU", "B-DIS", "I-DIS", "B-SYM", "I-SYM"]
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

def tokenize_and_align_labels(examples):
    tokenized_inputs = pipe.tokenizer(
        examples["tokens"], 
        is_split_into_words=True,
        padding="max_length",      # or True, depending on your preference
        truncation=True,          # enable truncation
        max_length=512,           # explicitly set maximum sequence length 
        return_overflowing_tokens=False,
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                aligned_labels.append(-100)  # Ignore padding
            elif word_idx != previous_word_idx:
                aligned_labels.append(label_to_id[label[word_idx]])
            else:
                aligned_labels.append(label_to_id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(aligned_labels)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize datasets
train_dataset = Dataset.from_list(train_data).map(tokenize_and_align_labels, batched=True)
test_dataset = Dataset.from_list(test_data).map(tokenize_and_align_labels, batched=True)
val_dataset = Dataset.from_list(val_data).map(tokenize_and_align_labels, batched=True)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['generator_lm_head.bias', 'generator_predictions.LayerNorm.bias', 'generator_predictions.LayerNorm.weight', 'generator_predictions.dense.bias', 'generator_predictions.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Map: 100%|██████████| 998/998 [00:00<00:00, 2574.11 examples/s]
Map: 100%|██████████| 126/126 [00:00<00:00, 2393.78 examples/s]
Map: 100%|██████████| 12

In [6]:
from transformers import AutoModelForTokenClassification


# Load the pre-trained BERT model

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id
)


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2. Train and evaluate with default parameters on the test set

In [10]:
from transformers import TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    logging_dir="./electra_logs",
    logging_steps=10,
)


In [11]:
from transformers import Trainer
from evaluate import load
import torch

# Load metric for evaluation
metric = load("seqeval")

def compute_metrics(predictions):
    predictions, labels = predictions
    predictions = torch.argmax(torch.tensor(predictions), dim=2)
    
    # Convert predictions and labels to CPU and then to plain Python integers
    predictions = predictions.cpu().numpy()
    
    true_labels = [
        [id_to_label[label] for label in label_seq if label != -100] 
        for label_seq in labels
    ]
    true_predictions = [
        [id_to_label[int(p)] for (p, l) in zip(prediction, label_seq) if l != -100]  # Convert tensor to int
        for prediction, label_seq in zip(predictions, labels)
    ]
    return metric.compute(predictions=true_predictions, references=true_labels)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=pipe.tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  6%|▌         | 11/189 [00:01<00:22,  8.07it/s]

{'loss': 0.4572, 'grad_norm': 1.2515650987625122, 'learning_rate': 1.8941798941798943e-05, 'epoch': 0.16}


 11%|█         | 21/189 [00:02<00:20,  8.24it/s]

{'loss': 0.4637, 'grad_norm': 0.9536014199256897, 'learning_rate': 1.7883597883597884e-05, 'epoch': 0.32}


 16%|█▋        | 31/189 [00:03<00:19,  8.24it/s]

{'loss': 0.4724, 'grad_norm': 1.3160325288772583, 'learning_rate': 1.6825396825396828e-05, 'epoch': 0.48}


 22%|██▏       | 41/189 [00:05<00:18,  8.16it/s]

{'loss': 0.4014, 'grad_norm': 1.4200518131256104, 'learning_rate': 1.576719576719577e-05, 'epoch': 0.63}


 27%|██▋       | 51/189 [00:06<00:16,  8.16it/s]

{'loss': 0.444, 'grad_norm': 0.8572367429733276, 'learning_rate': 1.470899470899471e-05, 'epoch': 0.79}


 32%|███▏      | 61/189 [00:07<00:15,  8.24it/s]

{'loss': 0.413, 'grad_norm': 0.8080768585205078, 'learning_rate': 1.3650793650793652e-05, 'epoch': 0.95}


                                                
 33%|███▎      | 63/189 [00:08<00:15,  8.39it/s]

{'eval_loss': 0.3888791501522064, 'eval_ADR': {'precision': 0.22287968441814596, 'recall': 0.1978984238178634, 'f1': 0.20964749536178107, 'number': 571}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 52}, 'eval_DRU': {'precision': 0.8444444444444444, 'recall': 0.25249169435215946, 'f1': 0.38874680306905374, 'number': 301}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 48}, 'eval_overall_precision': 0.3165829145728643, 'eval_overall_recall': 0.19444444444444445, 'eval_overall_f1': 0.2409177820267686, 'eval_overall_accuracy': 0.8924641314946371, 'eval_runtime': 0.5328, 'eval_samples_per_second': 236.504, 'eval_steps_per_second': 15.016, 'epoch': 1.0}


 38%|███▊      | 71/189 [00:09<00:16,  7.28it/s]

{'loss': 0.3581, 'grad_norm': 0.7626343369483948, 'learning_rate': 1.2592592592592593e-05, 'epoch': 1.11}


 43%|████▎     | 81/189 [00:10<00:13,  8.19it/s]

{'loss': 0.3847, 'grad_norm': 0.8594221472740173, 'learning_rate': 1.1534391534391536e-05, 'epoch': 1.27}


 48%|████▊     | 91/189 [00:11<00:11,  8.20it/s]

{'loss': 0.3953, 'grad_norm': 1.1106054782867432, 'learning_rate': 1.0476190476190477e-05, 'epoch': 1.43}


 53%|█████▎    | 101/189 [00:13<00:10,  8.20it/s]

{'loss': 0.3636, 'grad_norm': 0.9679637551307678, 'learning_rate': 9.417989417989418e-06, 'epoch': 1.59}


 59%|█████▊    | 111/189 [00:14<00:09,  8.18it/s]

{'loss': 0.3854, 'grad_norm': 0.824306309223175, 'learning_rate': 8.35978835978836e-06, 'epoch': 1.75}


 64%|██████▍   | 121/189 [00:15<00:08,  8.21it/s]

{'loss': 0.3473, 'grad_norm': 0.9817842841148376, 'learning_rate': 7.301587301587301e-06, 'epoch': 1.9}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
 67%|██████▋   | 126/189 [00:16<00:07,  8.34it/s]

{'eval_loss': 0.34956198930740356, 'eval_ADR': {'precision': 0.23589001447178004, 'recall': 0.28546409807355516, 'f1': 0.2583201267828843, 'number': 571}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 52}, 'eval_DRU': {'precision': 0.8421052631578947, 'recall': 0.53156146179402, 'f1': 0.6517311608961304, 'number': 301}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 48}, 'eval_overall_precision': 0.36662883087400683, 'eval_overall_recall': 0.3323045267489712, 'eval_overall_f1': 0.3486238532110092, 'eval_overall_accuracy': 0.9028416213957375, 'eval_runtime': 0.5254, 'eval_samples_per_second': 239.829, 'eval_steps_per_second': 15.227, 'epoch': 2.0}


 69%|██████▉   | 131/189 [00:17<00:09,  6.08it/s]

{'loss': 0.3597, 'grad_norm': 1.224724292755127, 'learning_rate': 6.243386243386243e-06, 'epoch': 2.06}


 75%|███████▍  | 141/189 [00:18<00:05,  8.13it/s]

{'loss': 0.3419, 'grad_norm': 1.2506340742111206, 'learning_rate': 5.185185185185185e-06, 'epoch': 2.22}


 80%|███████▉  | 151/189 [00:19<00:04,  8.23it/s]

{'loss': 0.3245, 'grad_norm': 1.064380407333374, 'learning_rate': 4.126984126984127e-06, 'epoch': 2.38}


 85%|████████▌ | 161/189 [00:21<00:03,  8.18it/s]

{'loss': 0.3592, 'grad_norm': 1.0506058931350708, 'learning_rate': 3.068783068783069e-06, 'epoch': 2.54}


 90%|█████████ | 171/189 [00:22<00:02,  8.20it/s]

{'loss': 0.3248, 'grad_norm': 1.1256532669067383, 'learning_rate': 2.0105820105820108e-06, 'epoch': 2.7}


 96%|█████████▌| 181/189 [00:23<00:00,  8.21it/s]

{'loss': 0.3566, 'grad_norm': 1.0210720300674438, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}


  _warn_prf(average, modifier, msg_start, len(result))
                                                 
100%|██████████| 189/189 [00:25<00:00,  8.37it/s]

{'eval_loss': 0.3368615508079529, 'eval_ADR': {'precision': 0.2594142259414226, 'recall': 0.3257443082311734, 'f1': 0.2888198757763975, 'number': 571}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 52}, 'eval_DRU': {'precision': 0.8090909090909091, 'recall': 0.5913621262458472, 'f1': 0.6833013435700576, 'number': 301}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 48}, 'eval_overall_precision': 0.38847385272145146, 'eval_overall_recall': 0.37448559670781895, 'eval_overall_f1': 0.3813514929282347, 'eval_overall_accuracy': 0.9056971723081209, 'eval_runtime': 0.5631, 'eval_samples_per_second': 223.778, 'eval_steps_per_second': 14.208, 'epoch': 3.0}


100%|██████████| 189/189 [00:25<00:00,  7.45it/s]

{'train_runtime': 25.3804, 'train_samples_per_second': 117.965, 'train_steps_per_second': 7.447, 'train_loss': 0.38417816162109375, 'epoch': 3.0}





TrainOutput(global_step=189, training_loss=0.38417816162109375, metrics={'train_runtime': 25.3804, 'train_samples_per_second': 117.965, 'train_steps_per_second': 7.447, 'total_flos': 87493824681984.0, 'train_loss': 0.38417816162109375, 'epoch': 3.0})

In [12]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)


  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 8/8 [00:00<00:00, 14.72it/s]

Evaluation Results: {'eval_loss': 0.3368615508079529, 'eval_ADR': {'precision': 0.2594142259414226, 'recall': 0.3257443082311734, 'f1': 0.2888198757763975, 'number': 571}, 'eval_DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 52}, 'eval_DRU': {'precision': 0.8090909090909091, 'recall': 0.5913621262458472, 'f1': 0.6833013435700576, 'number': 301}, 'eval_SYM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 48}, 'eval_overall_precision': 0.38847385272145146, 'eval_overall_recall': 0.37448559670781895, 'eval_overall_f1': 0.3813514929282347, 'eval_overall_accuracy': 0.9056971723081209, 'eval_runtime': 0.7153, 'eval_samples_per_second': 176.139, 'eval_steps_per_second': 11.183, 'epoch': 3.0}





: 

3. Train and test on val set

4. HPO