In [1]:
from transformers import pipeline
import pickle

In [2]:
ner = pipeline('ner', aggregation_strategy='simple', device=0)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [14]:
with open('ner_train.pkl', 'rb') as f:
    corpus_train = pickle.load(f)

with open('ner_test.pkl', 'rb') as f:
    corpus_test = pickle.load(f)

In [13]:
# corpus_test

In [15]:
inputs = []
targets = []

for sentence_tag_pairs in corpus_test:
  tokens = []
  target = []
  for token, tag in sentence_tag_pairs:
    tokens.append(token)
    target.append(tag)
  inputs.append(tokens)
  targets.append(target)

In [16]:
len(inputs)

2970

In [19]:
inputs[9]

['He',
 'was',
 'well',
 'backed',
 'by',
 'England',
 'hopeful',
 'Mark',
 'Butcher',
 'who',
 'made',
 '70',
 'as',
 'Surrey',
 'closed',
 'on',
 '429',
 'for',
 'seven',
 ',',
 'a',
 'lead',
 'of',
 '234',
 '.']

In [20]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

In [21]:
detokenizer.detokenize(inputs[9])

'He was well backed by England hopeful Mark Butcher who made 70 as Surrey closed on 429 for seven, a lead of 234.'

In [22]:
ner(detokenizer.detokenize(inputs[9]))

[{'entity_group': 'LOC',
  'score': 0.99967515,
  'word': 'England',
  'start': 22,
  'end': 29},
 {'entity_group': 'PER',
  'score': 0.99974275,
  'word': 'Mark Butcher',
  'start': 38,
  'end': 50},
 {'entity_group': 'ORG',
  'score': 0.9996264,
  'word': 'Surrey',
  'start': 66,
  'end': 72}]

In [23]:
def compute_prediction(tokens, input_, ner_result):
  # map hugging face ner result to list of tags
  # tokens is the original tokenized sentence
  # input_ is the detokenized string

  predicted_tags = []
  state = 'O'
  current_index = 0
  for token in tokens:
    # find the token in input_
    index = input_.find(token)
    assert(index >= 0)
    current_index += index # currently pointing to
    # check if this index belongs to an entity and assign label
    tag = 'O'
    for entity in ner_result:
      if current_index >= entity['start'] and current_index < entity['end']:
        tag = entity['entity_group']
        # then this token belongs to an entity
        if state == 'O':
          state = 'B'
        else:
          state = 'I'
        tag = f" {state}-{entity['entity_group']}"
        break
    if tag == 'O':
      # reset state
      state = 'O'
    predicted_tags.append(tag)
    input_ = input_[index + len(token):]
    current_index += len(token)
  assert(len(tokens) == len(predicted_tags))
  return predicted_tags


In [24]:
input_ = detokenizer.detokenize(inputs[9])
ner_result = ner(input_)
predicted_tags = compute_prediction(inputs[9], input_, ner_result)

In [25]:
from sklearn.metrics import accuracy_score, f1_score

In [26]:
accuracy_score(targets[9], predicted_tags)

0.84

In [27]:
for target, predicted_tag in zip(targets[9], predicted_tags):
  print(target, predicted_tag)

O O
O O
O O
O O
O O
B-LOC  B-LOC
O O
B-PER  B-PER
I-PER  I-PER
O O
O O
O O
O O
B-ORG  B-ORG
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O
O O


In [28]:
# get detokenized inputs to pass into ner model
detoke_inputs = []
for input_ in inputs:
  detoke_inputs.append(detokenizer.detokenize(input_))

In [29]:
ner_results = ner(detoke_inputs)

In [30]:
predications = []
for tokens, text, ner_result in zip(inputs, detoke_inputs, ner_results):
  predications.append(compute_prediction(tokens, text, ner_result))

In [31]:
def flatten(list_of_lists):
  flattened = [val for sublist in list_of_lists for val in sublist]
  return flattened

In [32]:
flat_predictions = flatten(predications)
flat_targets = flatten(targets)

In [33]:
accuracy_score(flat_targets, flat_predictions)

0.8338941692740618

In [35]:
f1_score(flat_targets, flat_predictions, average='macro')

0.05870784824015689