In [3]:
from transformers import AutoTokenizer

# Sentence that the following cells tokenize and slice back into.
example = "My name is Pavan and I work at Orbcomm India."

# Encode the sentence with the cased BERT tokenizer and show what kind of
# object the tokenizer call hands back.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoding = tokenizer(example)
print(type(encoding))

<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [5]:
# Report whether the loaded tokenizer is a "fast" tokenizer implementation.
tokenizer.is_fast

True

In [7]:
# The same flag is exposed on the encoding returned by the tokenizer call.
encoding.is_fast

True

In [9]:
# List the tokens of the encoded example, special tokens ([CLS]/[SEP]) included;
# word pieces that continue a word are shown with a "##" prefix.
encoding.tokens()

['[CLS]',
 'My',
 'name',
 'is',
 'Pa',
 '##van',
 'and',
 'I',
 'work',
 'at',
 'Or',
 '##b',
 '##com',
 '##m',
 'India',
 '.',
 '[SEP]']

In [11]:
# For each token, the index of the word it came from. Tokens split from the
# same word share an index; the special tokens map to None.
encoding.word_ids()

[None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 10, None]

In [13]:
# Look up the character span of word index 3 and slice the original string
# with it to recover that word's text.
start, end = encoding.word_to_chars(3)
example[start:end]

'Pavan'

In [19]:
# Getting the base results with the pipeline
import torch
from transformers import pipeline

# Choose the accelerator at runtime instead of hard-coding Apple's MPS backend,
# so the cell also runs on CUDA machines and on CPU-only environments.
if torch.backends.mps.is_available():
    device = "mps:0"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

token_classifier = pipeline(
    "token-classification",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
    device=device,
    aggregation_strategy="simple",
)

# NOTE(review): the saved output of this cell lists 'Sylvain', 'Hugging Face'
# and 'Brooklyn', none of which occur in the sentence below — the output is
# stale and the cell needs to be re-run to refresh it.
token_classifier("My name is Pavan and I work at OrbComm in India.")

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


[{'entity_group': 'PER',
  'score': 0.9981694,
  'word': 'Sylvain',
  'start': 11,
  'end': 18},
 {'entity_group': 'ORG',
  'score': 0.9796019,
  'word': 'Hugging Face',
  'start': 33,
  'end': 45},
 {'entity_group': 'LOC',
  'score': 0.9932106,
  'word': 'Brooklyn',
  'start': 49,
  'end': 57}]

In [25]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"

# NOTE: this rebinds `tokenizer`; from here on it is the tokenizer of
# `model_checkpoint`, not the "bert-base-cased" one loaded in the first cell.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

sample = "My name is Pavan and I work at OrbComm in India."
inputs = tokenizer(sample, return_tensors="pt")

# Pure inference: disable autograd so no computation graph is built or kept
# alive for the logits. The logit values themselves are unchanged.
with torch.no_grad():
    outputs = model(**inputs)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [27]:
# Show the input-id shape and the logits shape on consecutive lines.
print(inputs["input_ids"].shape, outputs.logits.shape, sep="\n")

torch.Size([1, 19])
torch.Size([1, 19, 9])


In [29]:
import torch

# Turn the logits of the single sequence in the batch into per-token class
# probabilities, and separately pick the highest-scoring class id per token.
probabilities = torch.softmax(outputs.logits, dim=-1)[0].tolist()
predictions = torch.argmax(outputs.logits, dim=-1)[0].tolist()
print(predictions)

[0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 6, 6, 6, 6, 6, 0, 8, 0, 0]


In [31]:
# Mapping from predicted class id to its label string; used in the next cell
# to translate `predictions` into entity labels.
model.config.id2label

{0: 'O',
 1: 'B-MISC',
 2: 'I-MISC',
 3: 'B-PER',
 4: 'I-PER',
 5: 'B-ORG',
 6: 'I-ORG',
 7: 'B-LOC',
 8: 'I-LOC'}

In [33]:
# Collect one record per token whose predicted label is anything other than
# "O", pairing the label with its probability and the token text.
tokens = inputs.tokens()
id2label = model.config.id2label

results = [
    {
        "entity": id2label[pred],
        "score": probabilities[idx][pred],
        "word": tokens[idx],
    }
    for idx, pred in enumerate(predictions)
    if id2label[pred] != "O"
]

print(results)

[{'entity': 'I-PER', 'score': 0.9991655349731445, 'word': 'Pa'}, {'entity': 'I-PER', 'score': 0.9969972372055054, 'word': '##van'}, {'entity': 'I-ORG', 'score': 0.9980290532112122, 'word': 'Or'}, {'entity': 'I-ORG', 'score': 0.9854332804679871, 'word': '##b'}, {'entity': 'I-ORG', 'score': 0.9984001517295837, 'word': '##C'}, {'entity': 'I-ORG', 'score': 0.9940326809883118, 'word': '##om'}, {'entity': 'I-ORG', 'score': 0.994273841381073, 'word': '##m'}, {'entity': 'I-LOC', 'score': 0.9997760653495789, 'word': 'India'}]
