<a href="https://colab.research.google.com/github/loganathanspr/nlp_course/blob/main/fast_tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fast tokenizers' special powers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m481.3/519.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
print(type(encoding))

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [3]:
encoding

{'input_ids': [101, 1422, 1271, 1110, 156, 7777, 2497, 1394, 1105, 146, 1250, 1120, 20164, 10932, 10289, 1107, 6010, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
tokenizer.is_fast

True

In [None]:
encoding.is_fast

True

In [None]:
encoding.tokens()

['[CLS]', 'My', 'name', 'is', 'S', '##yl', '##va', '##in', 'and', 'I', 'work', 'at', 'Hu', '##gging', 'Face', 'in',
 'Brooklyn', '.', '[SEP]']

In [5]:
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 3, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, None]

# Test

In [7]:
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
encoding_roberta = roberta_tokenizer("80s")
encoding_roberta

{'input_ids': [0, 2940, 29, 2], 'attention_mask': [1, 1, 1, 1]}

In [8]:
encoding_roberta.tokens()

['<s>', '80', 's', '</s>']

In [9]:
tokenizer("80s").tokens()

['[CLS]', '80s', '[SEP]']

In [10]:
start, end = encoding.word_to_chars(3)
example[start:end]

'Sylvain'

# Test

In [13]:
texts = ["Today is Monday", "Next month is November ."]
encoding_texts = roberta_tokenizer(texts)
encoding_texts

{'input_ids': [[0, 5625, 16, 302, 2], [0, 19192, 353, 16, 759, 479, 2]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}

In [18]:
encoding_texts.word_ids()

[None, 0, 1, 2, None]

In [19]:
encoding_texts.word_ids(1)

[None, 0, 1, 2, 3, 4, None]

In [32]:
# word to chars
start, end = encoding_texts[1].word_to_chars(1)
texts[1][start:end]

'month'

In [33]:
# tokens to chars
start, end = encoding_texts[1].token_to_chars(1)
texts[1][start:end]

'Next'

In [43]:
# char to word
w = encoding_texts[1].char_to_word(0)
start, end = encoding_texts[1].word_to_chars(w)
texts[1][start:end]

'Next'

In [53]:
[enc for enc in encoding_texts.tokens()]

['<s>', 'Today', 'Ġis', 'ĠMonday', '</s>']

In [62]:
from transformers import pipeline
token_classifier = pipeline("token-classification")
output = token_classifier("My name is Loganathan Ramasamy. I live in New York.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [63]:
output

[{'entity': 'I-PER',
  'score': 0.9973621,
  'index': 4,
  'word': 'Logan',
  'start': 11,
  'end': 16},
 {'entity': 'I-PER',
  'score': 0.99289787,
  'index': 5,
  'word': '##ath',
  'start': 16,
  'end': 19},
 {'entity': 'I-PER',
  'score': 0.9914608,
  'index': 6,
  'word': '##an',
  'start': 19,
  'end': 21},
 {'entity': 'I-PER',
  'score': 0.9992663,
  'index': 7,
  'word': 'Rama',
  'start': 22,
  'end': 26},
 {'entity': 'I-PER',
  'score': 0.9945457,
  'index': 8,
  'word': '##sa',
  'start': 26,
  'end': 28},
 {'entity': 'I-PER',
  'score': 0.9875486,
  'index': 9,
  'word': '##my',
  'start': 28,
  'end': 30},
 {'entity': 'I-LOC',
  'score': 0.99931014,
  'index': 14,
  'word': 'New',
  'start': 42,
  'end': 45},
 {'entity': 'I-LOC',
  'score': 0.9988575,
  'index': 15,
  'word': 'York',
  'start': 46,
  'end': 50}]

In [64]:
from transformers import pipeline

token_classifier = pipeline("token-classification", aggregation_strategy="simple")
token_classifier("My name is Loganathan Ramasamy. I live in New York.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'entity_group': 'PER',
  'score': 0.9938469,
  'word': 'Loganathan Ramasamy',
  'start': 11,
  'end': 30},
 {'entity_group': 'LOC',
  'score': 0.9990838,
  'word': 'New York',
  'start': 42,
  'end': 50}]

In [65]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

example = "My name is Loganathan Ramasamy and I live in New York."
inputs = tokenizer(example, return_tensors="pt")
outputs = model(**inputs)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [66]:
print(inputs["input_ids"].shape)
print(outputs.logits.shape)

torch.Size([1, 18])
torch.Size([1, 18, 9])


In [74]:
import torch
probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = outputs.logits.argmax(dim=-1)[0].tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 8, 8, 0, 0]


In [75]:
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [76]:
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
inputs_with_offsets["offset_mapping"]

[(0, 0),
 (0, 2),
 (3, 7),
 (8, 10),
 (11, 16),
 (16, 19),
 (19, 21),
 (22, 26),
 (26, 28),
 (28, 30),
 (31, 34),
 (35, 36),
 (37, 41),
 (42, 44),
 (45, 48),
 (49, 53),
 (53, 54),
 (0, 0)]

In [77]:
example[11:30]

'Loganathan Ramasamy'

In [78]:
example[5:16]

'me is Logan'

In [79]:
results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred]
    if label != "0":
        start, end = offsets[idx]
        results.append(
            {
                "entity": label,
                "score": probabilities[idx][pred],
                "word": tokens[idx],
                "start": start,
                "end": end
            }
        )
print(results)

[{'entity': 'O', 'score': 0.9990185499191284, 'word': '[CLS]', 'start': 0, 'end': 0}, {'entity': 'O', 'score': 0.9997076392173767, 'word': 'My', 'start': 0, 'end': 2}, {'entity': 'O', 'score': 0.999670147895813, 'word': 'name', 'start': 3, 'end': 7}, {'entity': 'O', 'score': 0.9998857975006104, 'word': 'is', 'start': 8, 'end': 10}, {'entity': 'I-PER', 'score': 0.9975009560585022, 'word': 'Logan', 'start': 11, 'end': 16}, {'entity': 'I-PER', 'score': 0.9939070343971252, 'word': '##ath', 'start': 16, 'end': 19}, {'entity': 'I-PER', 'score': 0.9931873083114624, 'word': '##an', 'start': 19, 'end': 21}, {'entity': 'I-PER', 'score': 0.9993284940719604, 'word': 'Rama', 'start': 22, 'end': 26}, {'entity': 'I-PER', 'score': 0.9954164028167725, 'word': '##sa', 'start': 26, 'end': 28}, {'entity': 'I-PER', 'score': 0.9891738891601562, 'word': '##my', 'start': 28, 'end': 30}, {'entity': 'O', 'score': 0.9998189806938171, 'word': 'and', 'start': 31, 'end': 34}, {'entity': 'O', 'score': 0.999759256839

In [80]:
example[33:45]

'd I live in '

In [81]:
import numpy as np

results = []
inputs_with_offsets = tokenizer(example, return_offsets_mapping=True)
tokens = inputs_with_offsets.tokens()
offsets = inputs_with_offsets["offset_mapping"]

idx = 0
while idx < len(predictions):
    pred = predictions[idx]
    label = model.config.id2label[pred]
    if label != "O":
        # Remove the B- or I-
        label = label[2:]
        start, _ = offsets[idx]

        # Grab all the tokens labeled with I-label
        all_scores = []
        while (
            idx < len(predictions)
            and model.config.id2label[predictions[idx]] == f"I-{label}"
        ):
            all_scores.append(probabilities[idx][pred])
            _, end = offsets[idx]
            idx += 1

        # The score is the mean of all the scores of the tokens in that grouped entity
        score = np.mean(all_scores).item()
        word = example[start:end]
        results.append(
            {
                "entity_group": label,
                "score": score,
                "word": word,
                "start": start,
                "end": end,
            }
        )
    idx += 1

print(results)

[{'entity_group': 'PER', 'score': 0.9947523474693298, 'word': 'Loganathan Ramasamy', 'start': 11, 'end': 30}, {'entity_group': 'LOC', 'score': 0.9990269839763641, 'word': 'New York', 'start': 45, 'end': 53}]
