In [106]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline, Trainer
from transformers import TrainingArguments

In [46]:
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

In [99]:
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
model.classifier = nn.Linear(768,15)

In [67]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [44]:
example = "This dataset was derived from the Reuters corpus which consists of Reuters news stories. You can read more about how this dataset was created in the CoNLL-2003 paper."

In [68]:
ner_results = nlp(example)
for x in ner_results:
    print(x)

KeyError: 13

In [64]:
from torchsummary import summary
import torch
import torch.nn as nn

In [60]:
model.train()
nlp('Rex')
# for x, y in model.named_parameters():
#     print(x,y)

[{'entity': 'B-PER',
  'score': 0.9878875,
  'index': 1,
  'word': 'Rex',
  'start': 0,
  'end': 3}]

In [66]:
# print(model)
model.classifier = nn.Linear(768, 15)

In [71]:
tokenizer('This dataset was derived from the Reuters corpus'.split(' '), is_split_into_words=True)

{'input_ids': [101, 1188, 2233, 9388, 1108, 4408, 1121, 1103, 11336, 27603, 26661, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [88]:
from pathlib import Path
import re

def read_wnut(file_path):
    file_path = Path(file_path)

    raw_text = file_path.read_text().strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, _, _, tag = line.split(' ')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

In [93]:
import torch.datasets.dataset as dataset

ModuleNotFoundError: No module named 'torch.datasets'

In [94]:
token_docs, tag_docs = read_wnut('../data/test.txt')
dataset = [{'text': x, 'label': y} for x, y in zip(token_docs, tag_docs)]

In [127]:
encodings = tokenizer(token_docs, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [129]:
encodings.pop('offset_mapping')

[[(0, 0),
  (0, 1),
  (0, 12),
  (0, 3),
  (0, 4),
  (0, 2),
  (0, 8),
  (0, 7),
  (0, 15),
  (0, 2),
  (0, 2),
  (0, 7),
  (0, 3),
  (0, 7),
  (0, 2),
  (0, 9),
  (0, 2),
  (0, 1),
  (0, 4),
  (0, 1),
  (0, 2),
  (2, 3),
  (3, 6),
  (0, 6),
  (0, 1),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0),
  (0, 0)

In [119]:
tags = tag_docs
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [115]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [120]:
labels = encode_tags(tag_docs, encodings)

In [121]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [130]:
dataset = MyDataset(encodings, labels)

In [101]:
training_args = TrainingArguments(output_dir="test_trainer")

In [103]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return (predictions==labels).mean()

In [131]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics,
)

In [132]:
trainer.train()

***** Running training *****
  Num examples = 46
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3


ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/melodia/anaconda3/envs/sakura/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/home/melodia/anaconda3/envs/sakura/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/melodia/anaconda3/envs/sakura/lib/python3.6/site-packages/transformers/models/bert/modeling_bert.py", line 1762, in forward
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  File "/home/melodia/anaconda3/envs/sakura/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/melodia/anaconda3/envs/sakura/lib/python3.6/site-packages/torch/nn/modules/loss.py", line 1152, in forward
    label_smoothing=self.label_smoothing)
  File "/home/melodia/anaconda3/envs/sakura/lib/python3.6/site-packages/torch/nn/functional.py", line 2846, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
ValueError: Expected input batch_size (1040) to match target batch_size (624).
