In [1]:
!pip install datasets -q
!pip install tokenizers -q
!pip install transformers==4.28.0 -q
!pip install seqeval -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset

dataset = load_dataset("conllpp")



  0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
label_names = dataset["train"].features["ner_tags"].feature.names
id2label = {i:x for i,x in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}
print(label_names)
print(label2id)
print(id2label)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC'}


In [4]:
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification

In [5]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", padding='max_length', truncation=True, return_tensors='pt')

In [6]:
def tokenize_and_align_labels(examples, label_all_tokens=True): 
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) 
    labels = [] 
    for i, label in enumerate(examples["ner_tags"]): 
        word_ids = tokenized_inputs.word_ids(batch_index=i) 
        previous_word_idx = None 
        label_ids = []
        for word_idx in word_ids: 
            if word_idx is None: 
                # set –100 as the label for these special tokens
                label_ids.append(-100)
            elif word_idx != previous_word_idx:             
                label_ids.append(label[word_idx]) 
            else: 
                label_ids.append(label[word_idx] if label_all_tokens else -100) 
                 
            previous_word_idx = word_idx 
        labels.append(label_ids) 
    tokenized_inputs["labels"] = labels 
    return tokenized_inputs 

In [7]:
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)



Map:   0%|          | 0/3250 [00:00<?, ? examples/s]



In [8]:
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(label_names), id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [19]:
import numpy as np
import datasets
metric = datasets.load_metric("seqeval") 
def compute_metrics(eval_preds): 
    pred_logits, labels = eval_preds 
    
    pred_logits = np.argmax(pred_logits, axis=2)
    
    # We remove all the values where the label is -100
    predictions = [ 
        [label_names[eval_preds] for (eval_preds, l) in zip(prediction, label) if l != -100] 
        for prediction, label in zip(pred_logits, labels) 
    ] 
    
    true_labels = [ 
      [label_names[l] for (eval_preds, l) in zip(prediction, label) if l != -100] 
       for prediction, label in zip(pred_logits, labels) 
   ] 
    results = metric.compute(predictions=predictions, references=true_labels) 
    return { 
   "precision": results["overall_precision"], 
   "recall": results["overall_recall"], 
   "f1": results["overall_f1"], 
  "accuracy": results["overall_accuracy"], 
  } 

  metric = datasets.load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [21]:
from transformers import TrainingArguments, Trainer

batch_size = 16
epochs = 10

training_args = TrainingArguments(
    output_dir="retrained_ner",
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_strategy="epoch",
    save_strategy="no",
    fp16=True,
    fp16_full_eval=False,
    push_to_hub=True,
    hub_strategy="end",
    hub_token="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"
    )

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


/content/retrained_ner is already a clone of https://huggingface.co/sentientconch/retrained_ner. Make sure you pull the latest changes with `repo.git_pull()`.


In [23]:
trainer.train()



Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0174,0.08315,0.937289,0.931312,0.934291,0.983288
2,0.0193,0.087386,0.925161,0.932095,0.928615,0.98208
3,0.0104,0.081822,0.925276,0.937801,0.931496,0.984479
4,0.0071,0.083336,0.931264,0.939702,0.935464,0.98459
5,0.0037,0.08144,0.939996,0.948093,0.944027,0.986401
6,0.0029,0.094329,0.94094,0.944625,0.942779,0.98567
7,0.0018,0.097541,0.942612,0.946303,0.944454,0.986449
8,0.0012,0.102708,0.939505,0.95033,0.944886,0.986052
9,0.0013,0.104016,0.942118,0.948652,0.945373,0.985988
10,0.0009,0.104473,0.942994,0.949323,0.946148,0.986369


TrainOutput(global_step=8780, training_loss=0.0066062237923129, metrics={'train_runtime': 917.5937, 'train_samples_per_second': 153.02, 'train_steps_per_second': 9.569, 'total_flos': 3405986046170064.0, 'train_loss': 0.0066062237923129, 'epoch': 10.0})

In [32]:
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/415M [00:00<?, ?B/s]

Upload file runs/May12_04-16-32_701efb0688c3/events.out.tfevents.1683865302.701efb0688c3.3100.0:   0%|        …

Upload file training_args.bin:   0%|          | 1.00/3.56k [00:00<?, ?B/s]

Upload file runs/May12_04-28-58_701efb0688c3/1683865746.450723/events.out.tfevents.1683865746.701efb0688c3.310…

Upload file runs/May12_04-28-58_701efb0688c3/events.out.tfevents.1683865746.701efb0688c3.3100.2:   0%|        …

Upload file runs/May12_04-16-32_701efb0688c3/1683865302.8347883/events.out.tfevents.1683865302.701efb0688c3.31…

Upload file runs/May12_04-37-25_701efb0688c3/events.out.tfevents.1683866260.701efb0688c3.3100.4:   0%|        …

Upload file runs/May12_04-37-25_701efb0688c3/1683866260.5910938/events.out.tfevents.1683866260.701efb0688c3.31…

To https://huggingface.co/sentientconch/retrained_ner
   45d1486..1840fec  main -> main

   45d1486..1840fec  main -> main

To https://huggingface.co/sentientconch/retrained_ner
   1840fec..72f6b2c  main -> main

   1840fec..72f6b2c  main -> main



'https://huggingface.co/sentientconch/retrained_ner/commit/1840fec45f81f5baeb3860ad50adb09bd44d4339'

In [24]:
model=model.eval()

In [29]:
inference_encodings = tokenizer(["Golden State Warriors are a basketball team from san francisco"], padding = "max_length", truncation=True)

In [30]:
import torch
res = model(torch.tensor(inference_encodings["input_ids"]).cuda(), attention_mask=torch.tensor(inference_encodings["attention_mask"]).cuda())[0].argmax(dim=2)

In [31]:
for i, enc in enumerate(inference_encodings['input_ids'][0]):
 if enc:
  print(tokenizer.decode(enc), "\t", label_names[res[0][i].item()])

[CLS] 	 O
golden 	 B-ORG
state 	 I-ORG
warriors 	 I-ORG
are 	 O
a 	 O
basketball 	 O
team 	 O
from 	 O
san 	 B-LOC
francisco 	 I-LOC
[SEP] 	 O
