In [36]:
import numpy as np
import torch
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer, pipeline 

In [2]:
data = load_dataset('conll2003')

In [3]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
data['train'][0]

{'id': '0',
 'tokens': ['EU',
  'rejects',
  'German',
  'call',
  'to',
  'boycott',
  'British',
  'lamb',
  '.'],
 'pos_tags': [22, 42, 16, 21, 35, 37, 16, 21, 7],
 'chunk_tags': [11, 21, 11, 12, 21, 22, 11, 12, 0],
 'ner_tags': [3, 0, 7, 0, 0, 0, 7, 0, 0]}

In [5]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [6]:
data['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [7]:
#will help in assigning corresponding I tags to B tags
data['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [8]:
# will help in parsing labels and predictions for compute_metrics fn
label_names = data['train'].features['ner_tags'].feature.names

In [9]:
# casing is better for ner(Bill vs bill)
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
idx = 0
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)
t

{'input_ids': [101, 7270, 22961, 1528, 1840, 1106, 21423, 1418, 2495, 12913, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
type(t)

transformers.tokenization_utils_base.BatchEncoding

In [12]:
t.tokens()

['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [13]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

In [14]:
#mapping B to corresponding I tags
#['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
begin2inside = {
    1:2,
    3:4,
    5:6,
    7:8
}

In [15]:
#function for aligning targets:
def align_targets(labels, word_ids):
    aligned_labels = []
    last_id = None
    for id in word_ids:
        if id == None:
            label = -100 #transformers(pytorch crossentropy loss) uses -100 for depicting targets that should not affect loss function 
        elif id != last_id:
            label = labels[id] # B tag
        else:
            label = labels[id] # I tag
            if label in begin2inside:
                label = begin2inside[label]
        aligned_labels.append(label)
        last_id = id
    return aligned_labels 

In [16]:
#checking target alignment:
labels = data['train'][idx+1]['ner_tags']
t = tokenizer(data['train'][idx+1]['tokens'], is_split_into_words=True)
word_ids = t.word_ids()
print(word_ids)
aligned_targets = align_targets(labels, word_ids)
aligned_targets

[None, 0, 1, None]


[-100, 1, 2, -100]

In [17]:
aligned_labels =  [label_names[t] if t>=0 else None for t in aligned_targets]
for x, y in zip(t.tokens(), aligned_labels):
    print(f"{x}\t{y}")

[CLS]	None
Peter	B-PER
Blackburn	I-PER
[SEP]	None


In [18]:
#tokenize fn:
def tokenize_fn(batch):
    tokenized_inputs = tokenizer(
        batch['tokens'],
        truncation=True,
        is_split_into_words=True
    )
    labels_batch = batch['ner_tags']
    aligned_labels_batch = []
    for i, labels in enumerate(labels_batch):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels_batch.append(align_targets(labels, word_ids))
    tokenized_inputs['labels'] = aligned_labels_batch
    return tokenized_inputs

In [19]:
data['train'].column_names

['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags']

In [20]:
tokenized_datasets = data.map(
    function=tokenize_fn,
    batched=True,
    remove_columns=data['train'].column_names
)

In [21]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

In [22]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [23]:
#using data collator explicitly:
batch = data_collator([tokenized_datasets['train'][i] for i in range(2)])
batch['labels'] #even padding tokens have -100 as target

tensor([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0, -100],
        [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100, -100]])

In [24]:
metric = load_metric('seqeval')

  metric = load_metric('seqeval')
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [25]:
metric.compute(
    predictions=[[0, 0, 0]],
    references=[[0, 0, 1]],
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


{'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [26]:
metric.compute(
    predictions=[['A', 'A', 'A']],
    references=[['A', 'A', 'B']],
)



{'_': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1},
 'overall_precision': 0.0,
 'overall_recall': 0.0,
 'overall_f1': 0.0,
 'overall_accuracy': 0.6666666666666666}

In [27]:
metric.compute(
    predictions=[['O', 'I-ORG', 'B-MISC']],
    references=[['O', 'B-ORG', 'B-MISC']],
)

{'MISC': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'ORG': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 0.6666666666666666}

In [28]:
#compute metrics fn:
def compute_metrics(logits_and_labels):
    logits, labels = logits_and_labels
    predictions = np.argmax(logits, axis=-1)
    str_labels = [[label_names[target] for target in row if target != -100] for row in labels]
    str_predictions = [[label_names[prediction] for prediction, target in zip(pred_row, label_row) if target != -100] for pred_row, label_row in zip(predictions, labels)]
    metrics = metric.compute(
        predictions=str_predictions,
        references=str_labels
    )
    return {
        'precision': metrics['overall_precision'],
        'recall': metrics['overall_recall'],
        'f1': metrics['overall_f1'],
        'accuracy': metrics['overall_accuracy']
    }

In [29]:
id2label = {key: value for key, value in enumerate(label_names)}
label2id = {value: key for key, value in id2label.items()}

In [30]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
training_args = TrainingArguments(
    output_dir='output_dir',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01
)

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [33]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
trainer.train()

 10%|▉         | 502/5268 [00:52<08:26,  9.40it/s]

{'loss': 0.2613, 'learning_rate': 1.810174639331815e-05, 'epoch': 0.28}


 19%|█▉        | 1002/5268 [01:44<07:37,  9.32it/s]

{'loss': 0.11, 'learning_rate': 1.6203492786636296e-05, 'epoch': 0.57}


 29%|██▊       | 1502/5268 [02:39<07:08,  8.79it/s]

{'loss': 0.076, 'learning_rate': 1.4305239179954442e-05, 'epoch': 0.85}


                                                   
 33%|███▎      | 1756/5268 [03:17<06:27,  9.07it/s]

{'eval_loss': 0.07418139278888702, 'eval_precision': 0.9124629080118695, 'eval_recall': 0.9315045439246045, 'eval_f1': 0.9218854097268487, 'eval_accuracy': 0.9807647024194973, 'eval_runtime': 11.5489, 'eval_samples_per_second': 281.412, 'eval_steps_per_second': 35.241, 'epoch': 1.0}


 38%|███▊      | 2002/5268 [03:45<05:45,  9.44it/s]  

{'loss': 0.0589, 'learning_rate': 1.240698557327259e-05, 'epoch': 1.14}


 47%|████▋     | 2501/5268 [04:38<04:59,  9.25it/s]

{'loss': 0.0423, 'learning_rate': 1.0508731966590738e-05, 'epoch': 1.42}


 57%|█████▋    | 3002/5268 [05:32<03:57,  9.52it/s]

{'loss': 0.0395, 'learning_rate': 8.610478359908885e-06, 'epoch': 1.71}


 66%|██████▋   | 3502/5268 [06:24<03:08,  9.34it/s]

{'loss': 0.0379, 'learning_rate': 6.712224753227031e-06, 'epoch': 1.99}


                                                   
 67%|██████▋   | 3512/5268 [06:37<03:23,  8.64it/s]

{'eval_loss': 0.05646337568759918, 'eval_precision': 0.9224194608809994, 'eval_recall': 0.9444631437226523, 'eval_f1': 0.9333111591551638, 'eval_accuracy': 0.9849885206334256, 'eval_runtime': 11.9365, 'eval_samples_per_second': 272.273, 'eval_steps_per_second': 34.097, 'epoch': 2.0}


 76%|███████▌  | 4002/5268 [07:33<02:14,  9.42it/s]  

{'loss': 0.019, 'learning_rate': 4.8139711465451785e-06, 'epoch': 2.28}


 85%|████████▌ | 4501/5268 [08:27<01:22,  9.31it/s]

{'loss': 0.0204, 'learning_rate': 2.9157175398633257e-06, 'epoch': 2.56}


 95%|█████████▍| 5001/5268 [09:22<00:31,  8.57it/s]

{'loss': 0.0243, 'learning_rate': 1.0174639331814731e-06, 'epoch': 2.85}


                                                   
100%|██████████| 5268/5268 [10:03<00:00,  9.77it/s]

{'eval_loss': 0.05672231689095497, 'eval_precision': 0.9326065411298315, 'eval_recall': 0.9501851228542578, 'eval_f1': 0.9413137712570858, 'eval_accuracy': 0.986342497203744, 'eval_runtime': 12.1064, 'eval_samples_per_second': 268.454, 'eval_steps_per_second': 33.619, 'epoch': 3.0}


100%|██████████| 5268/5268 [10:05<00:00,  8.70it/s]

{'train_runtime': 605.4614, 'train_samples_per_second': 69.572, 'train_steps_per_second': 8.701, 'train_loss': 0.06649394268870082, 'epoch': 3.0}





TrainOutput(global_step=5268, training_loss=0.06649394268870082, metrics={'train_runtime': 605.4614, 'train_samples_per_second': 69.572, 'train_steps_per_second': 8.701, 'train_loss': 0.06649394268870082, 'epoch': 3.0})

In [34]:
trainer.save_model('ner_model')

In [37]:
ner_model = pipeline(
    task='token-classification',
    model='ner_model',
    aggregation_strategy='simple',
    device=device
)

In [38]:
ner_model("Bill Gates was the CEO of Microsoft in Seattle, Washington")

[{'entity_group': 'PER',
  'score': 0.99913335,
  'word': 'Bill Gates',
  'start': 0,
  'end': 10},
 {'entity_group': 'ORG',
  'score': 0.9985202,
  'word': 'Microsoft',
  'start': 26,
  'end': 35},
 {'entity_group': 'LOC',
  'score': 0.9984333,
  'word': 'Seattle',
  'start': 39,
  'end': 46},
 {'entity_group': 'LOC',
  'score': 0.99770254,
  'word': 'Washington',
  'start': 48,
  'end': 58}]