In [1]:
from transformers import pipeline

# Individual tokens inside the sentences are classified into different types.
default_model = "dbmdz/bert-large-cased-finetuned-conll03-english"  # 1.4GB model
# classifier = pipeline("ner")

In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [3]:
ner_tokenizer = AutoTokenizer.from_pretrained(default_model,
                                              resume_download=True)

O means the word doesn’t correspond to any entity.

B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.

B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.

B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.

B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.

O, Outside of a named entity

B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity

I-MIS, Miscellaneous entity

B-PER, Beginning of a person’s name right after another person’s name

I-PER, Person’s name

B-ORG, Beginning of an organisation right after another organisation

I-ORG, Organisation

B-LOC, Beginning of a location right after another location

I-LOC, Location

In [4]:
ner_model = AutoModelForTokenClassification.from_pretrained(default_model,
                                                            resume_download=True)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# ner_model.config
from rich import print
from transformers import AutoConfig
print(AutoConfig.from_pretrained(default_model))

In [9]:
label_list = [
"O",       # Outside of a named entity
"B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
"I-MISC",  # Miscellaneous entity
"B-PER",   # Beginning of a person's name right after another person's name
"I-PER",   # Person's name
"B-ORG",   # Beginning of an organisation right after another organisation
"I-ORG",   # Organisation
"B-LOC",   # Beginning of a location right after another location
"I-LOC"    # Location 
]

In [8]:
sentence = "I am from United States and I like playing with data Ramesh"

In [9]:
sentence_tokenized = ner_tokenizer.tokenize(sentence)
sentence_tokenized

['I',
 'am',
 'from',
 'United',
 'States',
 'and',
 'I',
 'like',
 'playing',
 'with',
 'data',
 'Ram',
 '##esh']

In [11]:
sentence_input = ner_tokenizer.encode(sentence, return_tensors='pt')
sentence_input

tensor([[  101,   146,  1821,  1121,  1244,  1311,  1105,   146,  1176,  1773,
          1114,  2233, 11447, 10654,   102]])

In [14]:
outputs = ner_model(sentence_input)[0]
print(outputs)

In [15]:
import torch
predictions = torch.argmax(outputs, dim=2) 
predictions

tensor([[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 4, 4, 0]])

In [16]:
class_sentence = ner_model(sentence)
class_sentence

TypeError: string indices must be integers

In [17]:
pos_classifier_model = "vblagoje/bert-english-uncased-finetuned-pos"  # 470 MB model

In [19]:
# Parts of Speech tagging is a Token classification sub-variant

pos_classifier = pipeline("token-classification",
                          model=pos_classifier_model) 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at vblagoje/bert-english-uncased-finetuned-pos were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [20]:
print(pos_classifier)

In [12]:
pos_classifier(sentence)

[{'entity': 'PRON',
  'score': 0.99954295,
  'index': 1,
  'word': 'i',
  'start': 0,
  'end': 1},
 {'entity': 'AUX',
  'score': 0.9976078,
  'index': 2,
  'word': 'am',
  'start': 2,
  'end': 4},
 {'entity': 'ADP',
  'score': 0.99935,
  'index': 3,
  'word': 'from',
  'start': 5,
  'end': 9},
 {'entity': 'PROPN',
  'score': 0.99864393,
  'index': 4,
  'word': 'india',
  'start': 10,
  'end': 15},
 {'entity': 'CCONJ',
  'score': 0.9992494,
  'index': 5,
  'word': 'and',
  'start': 16,
  'end': 19},
 {'entity': 'PRON',
  'score': 0.99945444,
  'index': 6,
  'word': 'i',
  'start': 20,
  'end': 21},
 {'entity': 'VERB',
  'score': 0.99256027,
  'index': 7,
  'word': 'like',
  'start': 22,
  'end': 26},
 {'entity': 'VERB',
  'score': 0.99910283,
  'index': 8,
  'word': 'playing',
  'start': 27,
  'end': 34},
 {'entity': 'ADP',
  'score': 0.99935657,
  'index': 9,
  'word': 'with',
  'start': 35,
  'end': 39},
 {'entity': 'NOUN',
  'score': 0.9962528,
  'index': 10,
  'word': 'data',
  'sta