In [1]:
!pip install transformers[torch] datasets seqeval accelerate -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m99.0 MB/s[0m eta [36

In [4]:
import zipfile
import os

# data_r = zipfile.ZipFile('data.zip', 'r')
# data_r.printdir()

# data_r.extractall()

os.getcwd()
print(os.listdir("data"))

['20977862.ann', '26395443.ann', '27773410.ann', '28422883.ann', '19860006.ann', '26457578.ann', '26714786.ann', '18787726.ann', '26584481.ann', '28296749.txt', '28353604.ann', '28250406.ann', '28248891.ann', '26309459.ann', '28079821.ann', '21477357.txt', '21129213.txt', '24526194.txt', '26444414.txt', '27974938.ann', '27196481.txt', '28321071.ann', '28121940.ann', '24898994.ann', '28173879.txt', '23124805.ann', '28193213.ann', '28154669.ann', '20146086.ann', '28353588.txt', '26285706.txt', '25759562.txt', '26530965.txt', '19860925.txt', '28250304.ann', '16778410.ann', '26683938.ann', '21720478.txt', '18561524.ann', '25246819.ann', '18815636.txt', '22814979.txt', '28538413.txt', '23077697.txt', '28193213.txt', '28321073.ann', '28353561.ann', '26469535.txt', '28103924.ann', '26309459.txt', '25155594.ann', '28239141.txt', '25023062.ann', '26692730.ann', '27749582.txt', '26629302.txt', '28296775.ann', '23468586.txt', '23035161.ann', '23155491.ann', '28151882.txt', '25410034.ann', '186663

In [5]:
import os
import gc
from collections import defaultdict
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
import torch

In [6]:
gc.collect()
torch.cuda.empty_cache()

In [7]:
def parse_ann_file(ann_path):
    entities = []
    with open(ann_path, 'r') as f:
        for line in f:
            if line.startswith('T'):
                parts = line.strip().split('\t')
                entity_info = parts[1].split()
                label = entity_info[0]
                start = int(entity_info[1].split(';')[0])
                end = int(entity_info[2].split(';')[0])
                entities.append((start, end, label))
    return entities

In [8]:
def process_data_streaming(directory):
    texts = []
    all_tags = []
    unique_tags = set()

    # Process files one-by-one (no full dataset in RAM)
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            txt_path = os.path.join(directory, filename)
            ann_path = os.path.join(directory, filename.replace(".txt", ".ann"))

            with open(txt_path, "r") as f:
                text = f.read()

            # Split text into words with offsets
            tokens = []
            word_offsets = []
            current_pos = 0
            for word in text.split():
                start = current_pos
                end = start + len(word)
                tokens.append(word)
                word_offsets.append((start, end))
                current_pos = end + 1  # +1 for space

            # Initialize tags
            tags = ['O'] * len(tokens)
            entities = parse_ann_file(ann_path)

            # Map entities to tokens
            for (char_start, char_end, label) in entities:
                entity_words = []
                for i, (word_start, word_end) in enumerate(word_offsets):
                    if (char_start <= word_start < char_end) or (word_start <= char_start < word_end):
                        entity_words.append(i)

                if entity_words:
                    start_idx = entity_words[0]
                    end_idx = entity_words[-1]
                    tags[start_idx] = f"B-{label}"
                    for j in range(start_idx + 1, end_idx + 1):
                        tags[j] = f"I-{label}"

            texts.append(tokens)
            all_tags.append(tags)
            unique_tags.update(tags)

    # Create label mappings
    unique_tags = sorted(list(unique_tags))
    id2label = {i: tag for i, tag in enumerate(unique_tags)}
    label2id = {tag: i for i, tag in enumerate(unique_tags)}
    num_labels = len(unique_tags)

    # Free memory
    del entities, tokens, word_offsets, tags
    gc.collect()

    return Dataset.from_dict({"tokens": texts, "tags": all_tags}), id2label, label2id, num_labels

In [9]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        max_length=256,  # Reduce sequence length
        padding=False,  # Dynamic padding later
        is_split_into_words=True
    )

    labels = []
    for i, tags in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[tags[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
# Process data in streaming mode
raw_dataset, id2label, label2id, num_labels = process_data_streaming("data")

# Split dataset before tokenization
split = raw_dataset.train_test_split(test_size=0.2, seed=42)
train_raw = split["train"]
test_raw = split["test"]

# Tokenize datasets (lazily)
train_tokenized = train_raw.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "tags"],
    batch_size=8  # Smaller batches during tokenization
)

test_tokenized = test_raw.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "tags"],
    batch_size=8
)

Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [13]:
model = AutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=200,  # Evaluate less frequently
    learning_rate=3e-5,
    per_device_train_batch_size=8,  # Reduce batch size
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # Enable mixed-precision
    gradient_checkpointing=True,  # Trade compute for memory
    optim="adamw_torch_fused",  # Faster optimizer
    report_to="none",  # Disable logging to save RAM
    logging_steps=50,
    save_total_limit=1,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (-100) and convert label IDs to strings
    true_labels = [
        [id2label[l] for l in label if l != -100]
        for label in labels
    ]
    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    # Compute seqeval metrics (entity-level)
    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    # Compute token-level accuracy (excluding ignored tokens)
    flat_true_labels = [l for sublist in true_labels for l in sublist]
    flat_predictions = [p for sublist in true_predictions for p in sublist]
    accuracy = accuracy_score(flat_true_labels, flat_predictions)

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [20]:
gc.collect()
torch.cuda.empty_cache()

In [21]:
# Train
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=60, training_loss=1.699284553527832, metrics={'train_runtime': 960.2658, 'train_samples_per_second': 0.5, 'train_steps_per_second': 0.062, 'total_flos': 31402101473280.0, 'train_loss': 1.699284553527832, 'epoch': 3.0})

In [22]:
results = trainer.evaluate()
print(f"Precision: {results['eval_precision']:.4f}")
print(f"Recall: {results['eval_recall']:.4f}")
print(f"F1-Score: {results['eval_f1']:.4f}")
print(f"Accuracy: {results['eval_accuracy']:.4f}")

Precision: 0.3755
Recall: 0.4215
F1-Score: 0.3972
Accuracy: 0.6501
