In [5]:
from transformers import pipeline

# Individual tokens inside the sentences are classified into different types.
default_model = "dbmdz/bert-large-cased-finetuned-conll03-english"  # 1.4GB model
# classifier = pipeline("ner")

In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [7]:
ner_tokenizer = AutoTokenizer.from_pretrained(default_model,
                                              resume_download=True)

O means the word doesn’t correspond to any entity.

B-PER/I-PER means the word corresponds to the beginning of/is inside a person entity.

B-ORG/I-ORG means the word corresponds to the beginning of/is inside an organization entity.

B-LOC/I-LOC means the word corresponds to the beginning of/is inside a location entity.

B-MISC/I-MISC means the word corresponds to the beginning of/is inside a miscellaneous entity.

O, Outside of a named entity

B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity

I-MIS, Miscellaneous entity

B-PER, Beginning of a person’s name right after another person’s name

I-PER, Person’s name

B-ORG, Beginning of an organisation right after another organisation

I-ORG, Organisation

B-LOC, Beginning of a location right after another location

I-LOC, Location

In [8]:
ner_model = AutoModelForTokenClassification.from_pretrained(default_model,
                                                            resume_download=True)

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
# ner_model.config

from transformers import AutoConfig
AutoConfig.from_pretrained(default_model)

BertConfig {
  "_name_or_path": "dbmdz/bert-large-cased-finetuned-conll03-english",
  "_num_labels": 9,
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-MISC",
    "2": "I-MISC",
    "3": "B-PER",
    "4": "I-PER",
    "5": "B-ORG",
    "6": "I-ORG",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 1,
    "B-ORG": 5,
    "B-PER": 3,
    "I-LOC": 8,
    "I-MISC": 2,
    "I-ORG": 6,
    "I-PER": 4,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_s

In [9]:
label_list = [
"O",       # Outside of a named entity
"B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
"I-MISC",  # Miscellaneous entity
"B-PER",   # Beginning of a person's name right after another person's name
"I-PER",   # Person's name
"B-ORG",   # Beginning of an organisation right after another organisation
"I-ORG",   # Organisation
"B-LOC",   # Beginning of a location right after another location
"I-LOC"    # Location 
]

In [13]:
sentence = "I am from United States and I like playing with data Ramesh"

In [15]:
sentence_tokenized = ner_tokenizer.tokenize(sentence)
sentence_tokenized

['I',
 'am',
 'from',
 'United',
 'States',
 'and',
 'I',
 'like',
 'playing',
 'with',
 'data',
 'Ram',
 '##esh']

In [16]:
sentence_input = ner_tokenizer.encode(sentence, return_tensors='pt')
sentence_input

tensor([[  101,   146,  1821,  1121,  1244,  1311,  1105,   146,  1176,  1773,
          1114,  2233, 11447, 10654,   102]])

In [17]:
outputs = ner_model(sentence_input)[0]
outputs

tensor([[[ 1.0070e+01, -2.1285e+00, -1.5390e+00, -2.0444e+00, -1.5033e+00,
          -1.8817e+00, -1.0944e+00, -1.9918e+00,  5.8473e-01],
         [ 1.0482e+01, -2.5018e+00, -1.7410e+00, -2.6160e+00, -6.1306e-02,
          -2.0731e+00,  3.4023e-01, -2.1875e+00, -5.3043e-01],
         [ 1.1118e+01, -2.3193e+00, -1.7046e+00, -2.4368e+00, -6.8214e-01,
          -1.8959e+00,  1.4921e-01, -1.8533e+00, -5.1833e-01],
         [ 1.0280e+01, -2.5857e+00, -1.0144e+00, -2.3723e+00, -1.0276e+00,
          -1.7403e+00, -2.1357e-03, -2.1665e+00,  3.0278e-01],
         [-4.8959e-01, -2.0961e+00, -6.2558e-01, -2.2961e+00, -1.3618e+00,
          -2.0690e+00, -1.0093e+00, -1.3048e+00,  8.8662e+00],
         [-1.4161e+00, -1.8178e+00, -5.6483e-01, -2.2271e+00, -1.4522e+00,
          -2.2266e+00, -1.2002e+00, -1.2413e+00,  8.7637e+00],
         [ 1.1036e+01, -2.4121e+00, -1.5921e+00, -2.4863e+00, -1.3190e+00,
          -1.5500e+00,  6.4275e-03, -1.7939e+00, -3.4909e-01],
         [ 1.1186e+01, -2.3898e+00

In [18]:
import torch
predictions = torch.argmax(outputs, dim=2) 
predictions

tensor([[0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 4, 4, 0]])

In [19]:
class_sentence = ner_model(sentence)
class_sentence

TypeError: string indices must be integers

In [7]:
pos_classifier_model = "vblagoje/bert-english-uncased-finetuned-pos"  # 470 MB model

In [8]:
# Parts of Speech tagging is a Token classification sub-variant

pos_classifier = pipeline("token-classification",
                          model=pos_classifier_model) 

In [12]:
pos_classifier(sentence)

[{'entity': 'PRON',
  'score': 0.99954295,
  'index': 1,
  'word': 'i',
  'start': 0,
  'end': 1},
 {'entity': 'AUX',
  'score': 0.9976078,
  'index': 2,
  'word': 'am',
  'start': 2,
  'end': 4},
 {'entity': 'ADP',
  'score': 0.99935,
  'index': 3,
  'word': 'from',
  'start': 5,
  'end': 9},
 {'entity': 'PROPN',
  'score': 0.99864393,
  'index': 4,
  'word': 'india',
  'start': 10,
  'end': 15},
 {'entity': 'CCONJ',
  'score': 0.9992494,
  'index': 5,
  'word': 'and',
  'start': 16,
  'end': 19},
 {'entity': 'PRON',
  'score': 0.99945444,
  'index': 6,
  'word': 'i',
  'start': 20,
  'end': 21},
 {'entity': 'VERB',
  'score': 0.99256027,
  'index': 7,
  'word': 'like',
  'start': 22,
  'end': 26},
 {'entity': 'VERB',
  'score': 0.99910283,
  'index': 8,
  'word': 'playing',
  'start': 27,
  'end': 34},
 {'entity': 'ADP',
  'score': 0.99935657,
  'index': 9,
  'word': 'with',
  'start': 35,
  'end': 39},
 {'entity': 'NOUN',
  'score': 0.9962528,
  'index': 10,
  'word': 'data',
  'sta