In [12]:
import datasets
import numpy as np

In [6]:
import os
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
#os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import torch
print(f'available devices: {torch.cuda.device_count()}')
print(f'current device: { torch.cuda.current_device()}')

available devices: 1
current device: 0


In [1]:
from transformers import RobertaForTokenClassification, PreTrainedTokenizerFast

In [2]:
model = RobertaForTokenClassification.from_pretrained("tner/roberta-large-conll2003")

In [3]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("tner/roberta-large-conll2003")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [7]:
finer_ord = datasets.load_dataset('gtfintechlab/finer-ord')

label_to_label_idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-ORG': 5, 'I-ORG': 6}
label_idx_to_label = {v:k for k,v in label_to_label_idx.items()}

Using custom data configuration gtfintechlab--finer-ord-f945a026865ebcab
Found cached dataset csv (/home/ottowg/.cache/huggingface/datasets/gtfintechlab___csv/gtfintechlab--finer-ord-f945a026865ebcab/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
def save_as_connl(finer_ord_orig, part, target_filename):
    print(part)
    connl = []
    last_sent, last_doc = 0, 0
    for i in finer_ord[part]:
        if i["sent_idx"] != last_sent or i["doc_idx"] != last_doc:
            connl.append("\n")
        label = label_idx_to_label[i["gold_label"]]
        if i["gold_token"] is None:
            print(f"Why is this token None? {i}")
            i["gold_token"] = ""
        token = i['gold_token'].strip()
        if token:
            connl.append(f"{token} {label}")
        last_sent, last_doc = i["sent_idx"], i["doc_idx"]
    print(len(connl))
    connl = "\n".join(connl)
    with open(target_filename, "w") as f:
        f.write(connl)
save_as_connl(finer_ord, "train", "data/train.txt")
save_as_connl(finer_ord, "validation", "data/validation.txt")
save_as_connl(finer_ord, "test", "data/test.txt")

train
83792
validation
10635
test
27031


In [10]:
label2id_roberta = {'B-LOC': 5, 'B-MISC': 2, 'B-ORG': 1, 'B-PER': 3, 'I-LOC': 8, 'I-MISC': 7, 'I-ORG': 6, 'I-PER': 4, 'O': 0}
id2label_roberta = {v:k for k, v in label2id_roberta.items()}
id2label_roberta

{5: 'B-LOC',
 2: 'B-MISC',
 1: 'B-ORG',
 3: 'B-PER',
 8: 'I-LOC',
 7: 'I-MISC',
 6: 'I-ORG',
 4: 'I-PER',
 0: 'O'}

In [13]:
def predict_batch(batch, id2label):
    token = tokenizer(text=batch, is_split_into_words=True, return_tensors="pt", padding=True)
    pred = np.argmax(model(**token).logits.detach().numpy(), axis=2)[:,1:-1]
    word_ids_batch = [token.word_ids(i)[1:-1] for i in range(len(token.input_ids))]
    word_ids_batch = [[w if w is not None else len(b) - 1 for w in wb]
                      for wb, b in zip(word_ids_batch, batch)]
    word_ids_batch = np.array(word_ids_batch)
    word_ids_batch_next = np.roll(word_ids_batch, shift=1, axis=1)
    word_ids_batch_next[:,0] = -1
    mask_first_token = word_ids_batch != word_ids_batch_next
    mask_attention = token["attention_mask"].numpy().astype("bool")[:,1:-1]
    mask_first_token = mask_first_token & mask_attention
    pred_tag_idxs = np.ma.masked_array(pred, mask=~mask_first_token)
    to_label = np.vectorize(id2label.get, otypes=["O"])
    tags = np.apply_along_axis(to_label, 0, pred_tag_idxs).astype('U').tolist()
    tags = [[t for t in s if t is not None] for s in tags]
    for s, t in zip(batch, tags):
        assert len(s) == len(t)
    return tags

In [14]:
batch = [
    ["hello", "here","and", "Thomas", "Obama", "no", "no", "London", "London", "London", "London", "London", "London"],
    ["hi", "here", "Paris", "and", "London"],
    ['Tue', ',', 'Oct', '20', ',', '2015', ',', '03:57', 'BST', '-', 'UK'],
    ['…'],
    [],
]
#for b in batch:
#    print(len(b))
expected = [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG']]
predict_batch(batch, id2label_roberta)

[['O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'B-LOC',
  'B-LOC'],
 ['O', 'O', 'B-LOC', 'I-LOC', 'I-LOC'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC'],
 ['O'],
 []]

In [17]:
# load connl format into sentences
fn = "data/test.txt"

In [18]:
with open(fn) as f:
    lines = f.readlines()
    sentences = [[]]
    sentences_label = [[]]
    for idx, line in enumerate(lines):
        if line.strip():
            word, label = line.strip().split()
            sentences[-1].append(word)
            sentences_label[-1].append(label)
        elif sentences and len(sentences[-1]) != 0:#if idx + 1 < len(lines):
            sentences.append([])
            sentences_label.append([])
len(sentences), len(sentences_label)

(1075, 1075)

In [20]:
#sentences[351], sentences_label[351]

In [21]:
id2label_roberta_wo_misc = {
 5: 'B-LOC',
 2: 'O', # 'B-MISC',
 1: 'B-ORG',
 3: 'B-PER',
 8: 'I-LOC',
 7: 'O', # 'I-MISC',
 6: 'I-ORG',
 4: 'I-PER',
 0: 'O'}

In [22]:
batch_size = 16
start = 0
current_batch = None
predictions = []
while current_batch or not predictions:
    current_batch = sentences[:None][start:start+batch_size]
    if current_batch:
        prediction = predict_batch(current_batch, id2label_roberta_wo_misc)
        predictions.extend(prediction)
    print(f"\r{len(predictions)}", end="")
    start += batch_size
#predict_batch(sentences[100:103], id2label_roberta)

1075

In [23]:
import evaluate
import pandas as pd
from itertools import chain
metric = evaluate.load('seqeval')

def compute(predictions, references):
    performance = metric.compute(predictions=predictions, references=references)
    micro = pd.Series({k[8:]: v for k, v in performance.items() if k.startswith("overall_")})
    label_performance = {k: v for k, v in performance.items() if not k.startswith("overall_")}    
    metrics_df = pd.DataFrame(label_performance).T
    weights = metrics_df.number.divide(metrics_df.number.sum())
    weighted_average_macro = metrics_df[["precision", "recall", "f1"]].multiply(weights, axis=0).sum()
    metrics_df.loc["micro"] = micro
    metrics_df.loc["macro"] = metrics_df[["precision", "recall", "f1"]].mean()
    metrics_df.loc["macro_weighted"] = weighted_average_macro
    return metrics_df

In [24]:
compute(predictions, sentences_label)

Unnamed: 0,precision,recall,f1,number
LOC,0.740299,0.826667,0.781102,300.0
ORG,0.673721,0.690778,0.682143,553.0
PER,0.908451,0.902098,0.905263,286.0
micro,0.748735,0.779631,0.763871,
macro,0.767801,0.799793,0.783095,
macro_weighted,0.750197,0.779631,0.764233,
