<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Transformers-Hub/blob/main/Restaurant%20NER%20Recognition%20-%20DistilBert/Resaurant_Search_NER_DistilBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Libraries

In [None]:
!pip install -q transformers accelerate datasets seqeval evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.6 kB[0m [31m71.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m764.6 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3

# Import Libraries

In [None]:
import inspect
import json
import warnings

import evaluate
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForTokenClassification,
                          TrainingArguments, Trainer, DataCollatorForTokenClassification, pipeline)

# Suppress warnings for clean output
warnings.filterwarnings("ignore")

# Preparing Project & Dataset

In [None]:
# Define constants
# NOTE(review): despite the *_URL names, these two values are filesystem paths
# (they carry no http/https scheme), so whatever loads them must be able to read local files.
TRAIN_URL = "/content/train.bio"
TEST_URL = "/content/test.bio"
# Pre-trained checkpoint used for both the tokenizer and the token-classification model.
MODEL_CKPT = "distilbert-base-uncased"
# Directory handed to TrainingArguments/save_model and later reloaded by the inference pipeline.
OUTPUT_DIR = "finetuned-ner"

In [None]:
def load_data(url):
    """Load a BIO-tagged dataset and group it into sentences.

    Every non-blank line must contain a tag and a token separated by a single
    tab (tag first). Blank lines mark sentence boundaries.

    Args:
        url: Either an ``http://`` / ``https://`` URL or a path to a local file.

    Returns:
        A ``(tokens, tags)`` tuple of two parallel lists; ``tokens[i]`` and
        ``tags[i]`` are the tokens and tags of the i-th sentence.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
        requests.HTTPError: If a remote source answers with an error status.
        OSError: If a local file cannot be opened.
    """
    source = str(url)
    if source.startswith(("http://", "https://")):
        # Remote source: fail loudly instead of parsing an HTTP error page as data.
        response = requests.get(source, timeout=30)
        response.raise_for_status()
        lines = response.text.splitlines()
    else:
        # Local source: requests cannot read plain filesystem paths.
        with open(source, encoding="utf-8") as handle:
            lines = handle.read().splitlines()

    tokens, tags = [], []
    temp_tokens, temp_tags = [], []

    for line in lines:
        stripped = line.strip()
        if stripped:
            tag, token = stripped.split("\t")
            temp_tags.append(tag)
            temp_tokens.append(token)
        elif temp_tokens:
            # Sentence boundary; the guard skips repeated blank lines so no
            # empty sentences end up in the dataset.
            tokens.append(temp_tokens)
            tags.append(temp_tags)
            temp_tokens, temp_tags = [], []

    # Flush the last sentence when the file does not end with a blank line.
    if temp_tokens:
        tokens.append(temp_tokens)
        tags.append(temp_tags)

    return tokens, tags

In [None]:
def _to_hf_dataset(sentences, labels):
    """Wrap parallel token / tag lists in a Hugging Face ``Dataset``."""
    frame = pd.DataFrame({'tokens': sentences, 'ner_tags_str': labels})
    return Dataset.from_pandas(frame)


# Read both splits from disk/network
train_tokens, train_tags = load_data(TRAIN_URL)
test_tokens, test_tags = load_data(TEST_URL)

# Assemble the splits.
# NOTE(review): 'validation' is built from the same test data as 'test',
# so validation metrics are not an independent estimate.
splits = {
    'train': _to_hf_dataset(train_tokens, train_tags),
    'test': _to_hf_dataset(test_tokens, test_tags),
    'validation': _to_hf_dataset(test_tokens, test_tags),
}
dataset = DatasetDict(splits)

# Generate unique NER tags
# Entity types are the tags with their "B-"/"I-" prefix stripped. They are
# sorted so that label ids are identical on every run (set order is not stable).
unique_tags = sorted({tag[2:] for sublist in dataset['train']['ner_tags_str'] for tag in sublist if tag != 'O'})

# "O" gets id 0; every B-/I- tag gets its OWN consecutive id. Sharing one id
# between B-X and I-X would make the inverse mapping drop one of them.
label_list = ['O'] + [f'{prefix}-{tag}' for tag in unique_tags for prefix in ('B', 'I')]
tag2index = {label: idx for idx, label in enumerate(label_list)}
index2tag = {idx: label for label, idx in tag2index.items()}

# Guard against the mapping ever becoming non-invertible again.
assert len(index2tag) == len(tag2index), "tag ids must be unique"

# Encoding

In [None]:
def encode_tags(example):
    """Translate one example's string tags into integer ids.

    Looks every entry of ``example['ner_tags_str']`` up in the module-level
    ``tag2index`` mapping and returns the ids under the ``"ner_tags"`` key.
    """
    tag_ids = []
    for tag_name in example['ner_tags_str']:
        tag_ids.append(tag2index[tag_name])
    return {"ner_tags": tag_ids}

# Apply encode_tags to the dataset, adding the integer `ner_tags` field it returns.
dataset = dataset.map(encode_tags)

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

# Tokenization

In [None]:
# Load the tokenizer belonging to the MODEL_CKPT checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    For every sentence, the first token of each word receives that word's
    label id; special tokens (word id ``None``) and any further tokens of the
    same word receive ``-100``. The aligned ids are stored under ``'labels'``
    in the tokenizer output, which is returned.
    """
    tokenized_inputs = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)
        # Pair each word id with the one of the token right before it
        # (None for the very first token).
        preceding_ids = [None] + word_ids[:-1]
        aligned_labels.append([
            word_labels[current] if current is not None and current != previous else -100
            for current, previous in zip(word_ids, preceding_ids)
        ])

    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

# Tokenize dataset
# batched=True: tokenize_and_align_labels receives a batch of examples per call.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/7659 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

Map:   0%|          | 0/1520 [00:00<?, ? examples/s]

In [None]:
# Prepare data collator
# Built from the same tokenizer used above; it is passed to the Trainer as `data_collator`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Evaluation

In [None]:
# Load evaluation metric
metric = evaluate.load('seqeval')
# compute_metrics indexes this list by label id, so position i must hold the
# name of id i. Build it from index2tag (the mapping given to the model as
# id2label) rather than from the insertion order of tag2index's keys, which
# only matches the ids when every tag has a distinct, consecutive id.
label_names = [index2tag[idx] for idx in sorted(index2tag)]

def compute_metrics(eval_preds):
    """Score predictions with the module-level ``metric``.

    ``eval_preds`` is a ``(logits, labels)`` pair. The arg-max over the last
    logits axis gives the predicted id per token; positions whose gold label
    is ``-100`` are dropped from both sequences before ids are translated to
    names via ``label_names``. Returns overall precision, recall, F1 and
    accuracy as reported by the metric.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    true_labels = []
    for gold_row in labels:
        true_labels.append([label_names[gold_id] for gold_id in gold_row if gold_id != -100])

    true_predictions = []
    for pred_row, gold_row in zip(predictions, labels):
        kept = []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                continue  # ignored position
            kept.append(label_names[pred_id])
        true_predictions.append(kept)

    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    reported = ("precision", "recall", "f1", "accuracy")
    return {name: all_metrics[f"overall_{name}"] for name in reported}


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

# Define & Train Model

In [None]:
# Load pre-trained model and set training arguments
# id2label / label2id store the tag names in the model config.
model = AutoModelForTokenClassification.from_pretrained(MODEL_CKPT, id2label=index2tag, label2id=tag2index)

# The install cell does not pin transformers, and `evaluation_strategy` is named
# `eval_strategy` in newer releases. Use whichever keyword the installed version
# accepts so the notebook survives Restart & Run All on either.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = 'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=4,
    weight_decay=0.01,
    report_to='none',
    **{_eval_strategy_key: 'epoch'},  # evaluate once per epoch, matching save_strategy
)

# Same story for Trainer: newer releases take the tokenizer as `processing_class`.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Train the model
trainer.train()

# Save model to OUTPUT_DIR, from where the inference cell reloads it
trainer.save_model(OUTPUT_DIR)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4556,0.235819,0.840461,0.869066,0.854524,0.923919
2,0.1885,0.232255,0.853191,0.869282,0.861162,0.927569
3,0.152,0.239168,0.86799,0.862346,0.865159,0.929885
4,0.1285,0.241907,0.862121,0.874268,0.868152,0.931008


# Test The Model

In [None]:
# Load trained model for inference
# The pipeline is loaded from OUTPUT_DIR, i.e. from what trainer.save_model wrote.
# With aggregation_strategy='simple' the results are reported as `entity_group`
# spans (see the output below, e.g. 'new york' as a single Location).
ner_pipeline = pipeline('token-classification', model=OUTPUT_DIR, aggregation_strategy='simple')

# Example inference
example_text = "which restaurant serves the best sushi in New York?"
# Last expression of the cell, so the list of detected entities is displayed.
ner_pipeline(example_text)

[{'entity_group': 'Rating',
  'score': 0.9841209,
  'word': 'best',
  'start': 28,
  'end': 32},
 {'entity_group': 'Dish',
  'score': 0.87054574,
  'word': 'sushi',
  'start': 33,
  'end': 38},
 {'entity_group': 'Location',
  'score': 0.9823859,
  'word': 'new york',
  'start': 42,
  'end': 50}]