In [4]:
import torch
import pickle
import textwrap
import numpy as np 
import pandas as pd 
from pprint import pprint
from transformers import pipeline

In [5]:
ner = pipeline("ner", aggregation_strategy='simple', device='cuda:0')

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


# LOAD DATA

In [6]:
with open('data/ner/ner_train.pkl', 'rb') as f:
  corpus_train = pickle.load(f)

with open('data/ner/ner_test.pkl', 'rb') as f:
  corpus_test = pickle.load(f)

In [7]:
corpus_test

[[('CRICKET', 'O'),
  ('-', 'O'),
  ('LEICESTERSHIRE', 'B-ORG'),
  ('TAKE', 'O'),
  ('OVER', 'O'),
  ('AT', 'O'),
  ('TOP', 'O'),
  ('AFTER', 'O'),
  ('INNINGS', 'O'),
  ('VICTORY', 'O'),
  ('.', 'O')],
 [('West', 'B-MISC'),
  ('Indian', 'I-MISC'),
  ('all-rounder', 'O'),
  ('Phil', 'B-PER'),
  ('Simmons', 'I-PER'),
  ('took', 'O'),
  ('four', 'O'),
  ('for', 'O'),
  ('38', 'O'),
  ('on', 'O'),
  ('Friday', 'O'),
  ('as', 'O'),
  ('Leicestershire', 'B-ORG'),
  ('beat', 'O'),
  ('Somerset', 'B-ORG'),
  ('by', 'O'),
  ('an', 'O'),
  ('innings', 'O'),
  ('and', 'O'),
  ('39', 'O'),
  ('runs', 'O'),
  ('in', 'O'),
  ('two', 'O'),
  ('days', 'O'),
  ('to', 'O'),
  ('take', 'O'),
  ('over', 'O'),
  ('at', 'O'),
  ('the', 'O'),
  ('head', 'O'),
  ('of', 'O'),
  ('the', 'O'),
  ('county', 'O'),
  ('championship', 'O'),
  ('.', 'O')],
 [('Their', 'O'),
  ('stay', 'O'),
  ('on', 'O'),
  ('top', 'O'),
  (',', 'O'),
  ('though', 'O'),
  (',', 'O'),
  ('may', 'O'),
  ('be', 'O'),
  ('short-lived', 

# Data Preprocessing

In [8]:
inputs = []
targets = []

for sentence_tag_pairs in corpus_test:
    tokens = []
    target = []
    for token, tag in sentence_tag_pairs:
        tokens.append(token)
        target.append(tag)
    inputs.append(tokens)
    targets.append(target)

In [19]:
inputs[1]

['West',
 'Indian',
 'all-rounder',
 'Phil',
 'Simmons',
 'took',
 'four',
 'for',
 '38',
 'on',
 'Friday',
 'as',
 'Leicestershire',
 'beat',
 'Somerset',
 'by',
 'an',
 'innings',
 'and',
 '39',
 'runs',
 'in',
 'two',
 'days',
 'to',
 'take',
 'over',
 'at',
 'the',
 'head',
 'of',
 'the',
 'county',
 'championship',
 '.']

In [21]:
targets[1]

['B-MISC',
 'I-MISC',
 'O',
 'B-PER',
 'I-PER',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-ORG',
 'O',
 'B-ORG',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [11]:
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()

In [22]:
detokenizer.detokenize(inputs[1])

'West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship.'

#### The reason for detokenizing is that transformer pipeline only works for sentences not for tokens

# Predictions

In [23]:
ner(detokenizer.detokenize(inputs[1]))

[{'entity_group': 'MISC',
  'score': 0.9971397,
  'word': 'West Indian',
  'start': 0,
  'end': 11},
 {'entity_group': 'PER',
  'score': 0.9996698,
  'word': 'Phil Simmons',
  'start': 24,
  'end': 36},
 {'entity_group': 'ORG',
  'score': 0.9983859,
  'word': 'Leicestershire',
  'start': 67,
  'end': 81},
 {'entity_group': 'ORG',
  'score': 0.99905473,
  'word': 'Somerset',
  'start': 87,
  'end': 95}]