<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Transformers-Hub/blob/main/NER-Data-Tokenizer-And-Model-Training-CONLL2003/NER_Data_Tokenizer_%26_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup & Imports

In [None]:
# Install libraries
%%bash
pip install -q transformers datasets seqeval evaluate

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.6/43.6 kB 2.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 480.6/480.6 kB 10.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.0/84.0 kB 4.9 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 3.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 179.3/179.3 kB 6.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 5.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 5.9 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.


In [None]:
# ==================== #
# Import Libraries
# ==================== #

import os
import random
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline
                        )
import evaluate

In [None]:
# ==================== #
# Configuration
# ==================== #

# Pretrained checkpoint to fine-tune and the dataset it is fine-tuned on
MODEL_CHECKPOINT = "distilbert-base-cased"
DATASET_NAME = "conll2003"

# Directory used as the Trainer output dir and as the final save location
OUTPUT_DIR = "distilbert-finetuned-ner"

# Optimisation hyperparameters
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3
WEIGHT_DECAY = 0.01

# Seed passed to set_seed() and to TrainingArguments
SEED = 32

# Set random seed
def set_seed(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG. When PyTorch is
    installed it is seeded as well (CPU generator and, if CUDA is available,
    every CUDA device), so that torch-based random initialisation - such as a
    newly created classification head - is repeatable across runs.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    np.random.seed(seed)

    # Imported lazily so the function still works where PyTorch is not installed.
    try:
        import torch
    except ImportError:
        return

    torch.manual_seed(seed)
    # The availability check lives under torch.cuda (there is no torch.is_available()).
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(SEED)

# Data Loading & Basic Exploration

In [None]:
# ==================== #
# Data Loading
# ==================== #

def load_and_explore_data(dataset_name, **load_kwargs):
    """Load a dataset and print a basic overview of it.

    Prints the dataset object, the first training example, the training
    features and the NER label names.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        **load_kwargs: Optional keyword arguments forwarded unchanged to
            ``load_dataset``. For example, ``trust_remote_code=True`` skips the
            interactive "run the custom code?" prompt on library versions that
            ask it, which lets the notebook run unattended.

    Returns:
        A tuple ``(data, label_names)``: the loaded dataset and the class
        names of the ``ner_tags`` feature of the ``train`` split.
    """
    data = load_dataset(dataset_name, **load_kwargs)
    print("Dataset: ", data)

    train_split = data['train']
    print("Sample from train data: ", train_split[0])
    print("Features: ", train_split.features)

    # `ner_tags` wraps an inner feature; that inner feature carries the class names.
    label_names = train_split.features['ner_tags'].feature.names
    print("Label Names: ", label_names)
    return data, label_names

data, label_names = load_and_explore_data(DATASET_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

conll2003.py:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

The repository for conll2003 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/conll2003.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset:  DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Sample from train data:  {'id': '0', 'tokens': ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7], 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0], 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}
Features:  {'id': Value(dtype='string', id=None), 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'N

# Tokenization

In [None]:
# ============================== #
# Tokenization & Label Alignment
# ============================== #

# Create tokenizer instance
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Map the id of every "B-<type>" label to the id of its "I-<type>" counterpart.
# Derived from `label_names` instead of hard-coding indices, so it stays correct
# if the label set or its order changes. A "B-" label with no matching "I-"
# label is simply left out of the mapping.
begin2inside = {
    begin_id: label_names.index("I-" + name[2:])
    for begin_id, name in enumerate(label_names)
    if name.startswith("B-") and "I-" + name[2:] in label_names
}

def align_labels(labels, word_ids, begin2inside):
    """Expand word-level labels to one label per token.

    Args:
        labels: Label ids, one per word.
        word_ids: For each token, the index of the word it belongs to, or
            ``None`` for tokens that belong to no word.
        begin2inside: Mapping from a label id to the id that replaces it on
            the second and later tokens of the same word.

    Returns:
        A list with one label per entry of ``word_ids``. Tokens with no word
        get ``-100``; the first token of a word gets the word's label; later
        tokens of the same word get the word's label passed through
        ``begin2inside`` (unchanged when the label is not a key).
    """
    token_labels = []
    previous_word = None
    for current_word in word_ids:
        if current_word is None:
            # Token that belongs to no word (special token)
            token_labels.append(-100)
        else:
            tag = labels[current_word]
            if current_word == previous_word:
                # Continuation of the previous token's word
                tag = begin2inside.get(tag, tag)
            token_labels.append(tag)
        previous_word = current_word
    return token_labels


def tokenize_and_align(batch, tokenizer, label_names, begin2inside):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        batch: Mapping with a ``'tokens'`` entry (word lists) and a
            ``'ner_tags'`` entry (word-level label ids), one item per example.
        tokenizer: Callable tokenizer; its result must expose ``word_ids(i)``.
        label_names: Accepted for interface compatibility; not used here.
        begin2inside: Mapping forwarded to ``align_labels``.

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding the
        aligned labels of every example in the batch.
    """
    encodings = tokenizer(
        batch['tokens'],
        truncation=True,
        is_split_into_words=True,
    )
    # One aligned label list per example, using that example's token-to-word map
    encodings['labels'] = [
        align_labels(word_tags, encodings.word_ids(example_idx), begin2inside)
        for example_idx, word_tags in enumerate(batch['ner_tags'])
    ]
    return encodings

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
# Tokenize every split in batches; the original columns are removed so only
# what tokenize_and_align returns is kept.
tokenized_datasets = data.map(
    tokenize_and_align,
    batched=True,
    remove_columns=data['train'].column_names,
    fn_kwargs=dict(
        tokenizer=tokenizer,
        label_names=label_names,
        begin2inside=begin2inside,
    ),
)

print("Tokenized Datasets: ", tokenized_datasets)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Tokenized Datasets:  DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})


# Define TrainingArguments & Trainer

In [None]:
# ==================== #
# Data Collator
# ==================== #

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# OPTIONAL :
# Example of how the data collator works (optional for demonstration)
# example_batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
# print("Example Batch Labels:", example_batch['labels'])
# print("Example Batch Input IDs:", example_batch['input_ids'])

In [None]:
# ==================== #
# Evaluation Metric
# ==================== #

metric = evaluate.load('seqeval')

def compute_metrics(p):
    """Compute overall seqeval precision, recall, F1 and accuracy.

    Args:
        p: Evaluation result exposing ``predictions`` (logits) and
            ``label_ids``.

    Returns:
        Dict with the keys ``precision``, ``recall``, ``f1`` and ``accuracy``
        taken from the metric's ``overall_*`` values.
    """
    gold_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=-1)

    # Reference tags: every position whose gold label is not the -100 marker
    references = []
    for gold_row in gold_ids:
        references.append([label_names[gold] for gold in gold_row if gold != -100])

    # Predicted tags at exactly those same positions
    hypotheses = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        hypotheses.append(
            [label_names[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        )

    scores = metric.compute(predictions=hypotheses, references=references)
    return {
        name: scores["overall_" + name]
        for name in ("precision", "recall", "f1", "accuracy")
    }


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
# ==================== #
# Model Definition
# ==================== #

# Two-way mapping between class indices and label names, handed to the model
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    MODEL_CHECKPOINT, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ============================= #
# TrainingArguments & Trainer
# ============================= #

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`
    # argument, which recent transformers releases no longer accept.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to='none',
    logging_steps=100,
    seed=SEED,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `processing_class` replaces the deprecated `tokenizer` argument, which
    # triggers a FutureWarning on the version this notebook was run with.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [None]:
# ================ #
# Train Model
# ================ #

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0954,0.09232,0.8645,0.904073,0.883843,0.972597
2,0.0544,0.070954,0.904143,0.925446,0.914671,0.98047
3,0.0367,0.068961,0.901158,0.929822,0.915265,0.981648


TrainOutput(global_step=2634, training_loss=0.09405742310717329, metrics={'train_runtime': 268.8221, 'train_samples_per_second': 156.695, 'train_steps_per_second': 9.798, 'total_flos': 526389870368628.0, 'train_loss': 0.09405742310717329, 'epoch': 3.0})

In [None]:
# ==================== #
# Saving the Model
# ==================== #

trainer.save_model(OUTPUT_DIR)

# Inference

In [None]:
def perform_inference(text, model_path, aggregation_strategy='simple', device=-1):
    """Run token classification on ``text`` with a saved model.

    Builds a ``"token-classification"`` pipeline on every call, applies it to
    ``text``, prints the predictions and returns them.

    Args:
        text: Input passed to the pipeline.
        model_path: Model location handed to the pipeline.
        aggregation_strategy: Passed through to the pipeline.
        device: Passed through to the pipeline (-1 for CPU, 0 for the first
            GPU, etc.).

    Returns:
        Whatever the pipeline returns for ``text``.
    """
    pipeline_options = {
        "model": model_path,
        "aggregation_strategy": aggregation_strategy,
        "device": device,  # -1 for CPU, 0 for the first GPU, etc.
    }
    tagger = pipeline("token-classification", **pipeline_options)

    predictions = tagger(text)
    print("Inference Results:", predictions)
    return predictions

In [None]:
example_text = "Bill Gates was the CEO of Microsoft in Seattle, Washington."
# perform_inference already prints the results. Binding the return value stops
# the notebook from echoing the same list a second time as the cell output.
inference_results = perform_inference(example_text, OUTPUT_DIR)

Device set to use cpu


Inference Results: [{'entity_group': 'PER', 'score': 0.9989759, 'word': 'Bill Gates', 'start': 0, 'end': 10}, {'entity_group': 'ORG', 'score': 0.996924, 'word': 'Microsoft', 'start': 26, 'end': 35}, {'entity_group': 'LOC', 'score': 0.9972996, 'word': 'Seattle', 'start': 39, 'end': 46}, {'entity_group': 'LOC', 'score': 0.9980089, 'word': 'Washington', 'start': 48, 'end': 58}]


[{'entity_group': 'PER',
  'score': 0.9989759,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.996924,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9972996,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.9980089,
  'word': 'Washington',
  'start': 48,
  'end': 58}]

# Save Files

In [None]:
# ======================== #
# Zip distilbert Model
# ======================== #

!zip -r /content/distilbert_fine_tuned.zip /content/distilbert-finetuned-ner

  adding: content/distilbert-finetuned-ner/ (stored 0%)
  adding: content/distilbert-finetuned-ner/model.safetensors (deflated 8%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/ (stored 0%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/scheduler.pt (deflated 55%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/model.safetensors (deflated 8%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/trainer_state.json (deflated 64%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/vocab.txt (deflated 49%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/rng_state.pth (deflated 25%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/tokenizer_config.json (deflated 75%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/special_tokens_map.json (deflated 42%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/training_args.bin (deflated 51%)
  adding: content/distilbert-finetuned-ner/checkpoint-878/tokenizer.json (def

In [None]:
# Mount Google Drive so the model archive can be saved there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil

# Name of zip file in colab env
source_path = '/content/distilbert_fine_tuned.zip'

# Path of desired folder in google drive
destination_path = '/content/drive/My Drive/Transformers-Model/distilbert_fine_tuned.zip'

# shutil.move does not create missing parent folders, so make sure the target
# folder exists; otherwise the move fails on a Drive without it.
os.makedirs(os.path.dirname(destination_path), exist_ok=True)

shutil.move(source_path, destination_path)
print("File moved to Google Drive:", destination_path)

File moved to Google Drive: /content/drive/My Drive/Transformers-Model/distilbert_fine_tuned.zip
