In [1]:
import datasets
from tner import TransformersNER
from tner import get_dataset

In [2]:
import os
#os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
#os.environ["CUDA_VISIBLE_DEVICES"] = "3"
import torch
print(f'available devices: {torch.cuda.device_count()}')
print(f'current device: { torch.cuda.current_device()}')

available devices: 8
current device: 0


# load finer ord dataset from huggingface

In [3]:
finer_ord = datasets.load_dataset('gtfintechlab/finer-ord')

In [4]:
label_to_label_idx = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-ORG': 5, 'I-ORG': 6}
label_idx_to_label = {v:k for k,v in label_to_label_idx.items()}

## Finer ord to connl format files

In [5]:
def save_as_connl(finer_ord_orig, part, target_filename):
    print(part)
    connl = []
    last_sent, last_doc = 0, 0
    for i in finer_ord[part]:
        if i["sent_idx"] != last_sent or i["doc_idx"] != last_doc:
            connl.append("\n")
        label = label_idx_to_label[i["gold_label"]]
        if i["gold_token"] is None:
            print(f"Why is this token None? {i}")
            i["gold_token"] = ""
        token = i['gold_token'].strip()
        if token:
            connl.append(f"{token} {label}")
        last_sent, last_doc = i["sent_idx"], i["doc_idx"]
    print(len(connl))
    connl = "\n".join(connl)
    with open(target_filename, "w") as f:
        f.write(connl)
save_as_connl(finer_ord, "train", "data/train.txt")
save_as_connl(finer_ord, "validation", "data/validation.txt")
save_as_connl(finer_ord, "test", "data/test.txt")

train
Why is this token None? {'gold_label': 0, 'gold_token': None, 'doc_idx': 152, 'sent_idx': 11}
83791
validation
10635
test
27031


# load connl 2003 pretrained roberta

In [6]:
model = TransformersNER("tner/roberta-large-conll2003")
model.predict(["Jacob Collier is a Grammy awarded English artist from London"])

2024-04-03 00:07:10 INFO     initialize language model with `tner/roberta-large-conll2003`
2024-04-03 00:07:15 INFO     use CRF
2024-04-03 00:07:15 INFO     loading pre-trained CRF layer
2024-04-03 00:07:15 INFO     label2id: {'B-LOC': 5, 'B-MISC': 2, 'B-ORG': 1, 'B-PER': 3, 'I-LOC': 8, 'I-MISC': 7, 'I-ORG': 6, 'I-PER': 4, 'O': 0}
    There is an imbalance between your GPUs. You may want to exclude GPU 4 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
2024-04-03 00:07:17 INFO     device   : cuda
2024-04-03 00:07:17 INFO     gpus     : 8
2024-04-03 00:07:20 INFO     encode all the data: 1
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.34s/it]


{'prediction': [['B-PER',
   'I-PER',
   'O',
   'O',
   'B-MISC',
   'I-MISC',
   'B-MISC',
   'O',
   'O',
   'B-LOC']],
 'probability': [[0.9999880790710449,
   0.9999984502792358,
   0.9999995231628418,
   0.9999996423721313,
   0.9984089732170105,
   0.7698124051094055,
   0.9999961853027344,
   0.9999994039535522,
   0.9999992847442627,
   0.999987006187439]],
 'input': [['Jacob',
   'Collier',
   'is',
   'a',
   'Grammy',
   'awarded',
   'English',
   'artist',
   'from',
   'London']],
 'entity_prediction': [[{'type': 'PER',
    'entity': ['Jacob', 'Collier'],
    'position': [0, 1],
    'probability': [0.9999880790710449, 0.9999984502792358]},
   {'type': 'MISC',
    'entity': ['Grammy', 'awarded'],
    'position': [4, 5],
    'probability': [0.9984089732170105, 0.7698124051094055]},
   {'type': 'MISC',
    'entity': ['English'],
    'position': [6],
    'probability': [0.9999961853027344]},
   {'type': 'LOC',
    'entity': ['London'],
    'position': [9],
    'probability':

In [7]:
label2id_wanted = {'B-LOC': 5, 'B-MISC': 2, 'B-ORG': 1, 'B-PER': 3, 'I-LOC': 8, 'I-MISC': 7, 'I-ORG': 6, 'I-PER': 4, 'O': 0}
finer_ord_data, label2id = get_dataset(local_dataset={
    "valid": "data/validation.txt",
    "train": "data/train.txt",
    "test": "data/test.txt"
})
label2id

{'B-LOC': 0,
 'B-ORG': 1,
 'B-PER': 2,
 'I-LOC': 3,
 'I-ORG': 4,
 'I-PER': 5,
 'O': 6}

In [8]:
# Map to the right model label!
mapping = {}
for label, idx in label2id.items():
    mapping[idx] = label2id_wanted[label]
finer_ord_data["train"]["tags"] = [[mapping[t] for t in s] for s in finer_ord_data["train"]["tags"]]
finer_ord_data["valid"]["tags"] = [[mapping[t] for t in s] for s in finer_ord_data["valid"]["tags"]]
finer_ord_data["test"]["tags"] = [[mapping[t] for t in s] for s in finer_ord_data["test"]["tags"]]

In [9]:
import logging, sys
logging.disable(sys.maxsize)

In [10]:
def get_preds(data):
    preds = []
    for idx, sent in enumerate(data["tokens"]):
        print(f"\r({idx}/{len(data['tokens'])})", end="")
        pred = model.predict([sent])["prediction"][0]
        preds.append(pred)
        if len(pred) != len(sent):
            print(len(pred), len(sent))
            print("noooo")
    len(preds)
    return preds

In [22]:
preds_test = get_preds(finer_ord_data["test"]);

In [26]:
# preds_validation = get_preds(finer_ord_data["valid"])

In [None]:
#preds_train = get_preds(finer_ord_data["train"])

# Evaluate conll2003 model on finer ord

In [23]:
import evaluate
import pandas as pd

metric = evaluate.load('seqeval')

def compute(predictions, references):
    performance = metric.compute(predictions=predictions, references=references)
    micro = pd.Series({k[8:]: v for k, v in performance.items() if k.startswith("overall_")})
    label_performance = {k: v for k, v in performance.items() if not k.startswith("overall_")}
    
    metrics_df = pd.DataFrame(label_performance).T
    weights = metrics_df.number.divide(metrics_df.number.sum())
    weighted_average_macro = metrics_df[["precision", "recall", "f1"]].multiply(weights, axis=0).sum()
    metrics_df.loc["micro"] = micro
    metrics_df.loc["macro"] = metrics_df[["precision", "recall", "f1"]].mean()
    metrics_df.loc["macro_weighted"] = weighted_average_macro
    return metrics_df
id2label_wanted = {v:k for k, v in label2id_wanted.items()}

#preds_train_corrected = [[t if "MISC" not in t else "O" for t in s] for s in preds_train]
#gold_train = [[id2label_wanted[t] for t in s] for s in finer_ord_data["train"]['tags']]


#preds_validation_corrected = [[t if "MISC" not in t else "O" for t in s] for s in preds_validation]
#gold_validation = [[id2label_wanted[t] for t in s] for s in finer_ord_data["valid"]['tags']]

preds_test_corrected = [[t if "MISC" not in t else "O" for t in s] for s in preds_test]
gold_test = [[id2label_wanted[t] for t in s] for s in finer_ord_data["test"]['tags']]

gold_test[939], preds_test_corrected[939], finer_ord_data["test"]["tokens"][939]

(['O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['Scout', 'Finch', 'is', 'all', 'grown', 'up', ',', 'indeed', '.'])

In [14]:
test_perfomance = compute(preds_test_corrected, gold_test)

In [15]:
print("test")
compute(preds_test_corrected, gold_test)

test


Unnamed: 0,precision,recall,f1,number
LOC,0.775076,0.85,0.810811,300.0
ORG,0.720539,0.77396,0.746295,553.0
PER,0.925424,0.954545,0.939759,286.0
micro,0.784893,0.839333,0.811201,
macro,0.801483,0.85446,0.827016,
macro_weighted,0.786349,0.839333,0.811866,


In [24]:
print(test_perfomance.to_markdown())

|                |   precision |   recall |       f1 |   number |
|:---------------|------------:|---------:|---------:|---------:|
| LOC            |    0.775076 | 0.85     | 0.810811 |      300 |
| ORG            |    0.720539 | 0.77396  | 0.746295 |      553 |
| PER            |    0.925424 | 0.954545 | 0.939759 |      286 |
| micro          |    0.784893 | 0.839333 | 0.811201 |      nan |
| macro          |    0.801483 | 0.85446  | 0.827016 |      nan |
| macro_weighted |    0.786349 | 0.839333 | 0.811866 |      nan |


In [25]:
print("validation")
compute(preds_validation_corrected, gold_validation)

validation


Unnamed: 0,precision,recall,f1,number
LOC,0.892308,0.901554,0.896907,193.0
ORG,0.850202,0.755396,0.8,278.0
PER,0.924138,0.971014,0.946996,138.0
micro,0.882453,0.850575,0.866221,
macro,0.887275,0.869635,0.877531,
macro_weighted,0.8803,0.850575,0.864021,


In [65]:
print("train")
compute(preds_train_corrected, gold_train)

train


Unnamed: 0,precision,recall,f1,number
LOC,0.83871,0.861284,0.849847,966.0
ORG,0.832283,0.842079,0.837153,2039.0
PER,0.918671,0.970944,0.944085,826.0
micro,0.853106,0.874706,0.863771,
macro,0.860692,0.887253,0.873714,
macro_weighted,0.85253,0.874706,0.863409,


### Label count

In [None]:
org = 2039 + 278 + 553
per = 826 + 138 + 286
loc = 966 + 193 + 300
org, per, loc

In [None]:
model.train()

In [26]:
sents = list(zip(finer_ord_data["train"]["tokens"], finer_ord_data["train"]["tags"]))

In [27]:
sents[45]

(['In',
  'the',
  'first',
  'sign',
  'that',
  'turmoil',
  'in',
  'the',
  'stock',
  'market',
  'could',
  'affect',
  'spending',
  'in',
  'the',
  'real',
  'economy',
  ',',
  'China',
  "'s",
  'automakers',
  "'",
  'association',
  'on',
  'Friday',
  'slashed',
  'its',
  '2015',
  'forecast',
  'for',
  'vehicle',
  'sales',
  'growth',
  'to',
  'a',
  'meagre',
  '3',
  'percent',
  '.'],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])