In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


## Login

2.1 Store the Hugging Face token

In [None]:
!git config --global user.email "jose.bucheli@correounivalle.edu.co"
!git config --global user.name "TheCryss"

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

2.2 Load the dataset

In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("conll2003")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


View the tokens and NER tags

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [None]:
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

In [None]:
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [None]:
raw_datasets["train"].features["ner_tags"]

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [None]:
ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [None]:
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

2.3.1 Define the tokenizer (BETO)

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "NazaGara/NER-fine-tuned-BETO"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
tokenizer.is_fast

True

In [None]:
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
inputs.tokens()

['[CLS]',
 'EU',
 're',
 '##ject',
 '##s',
 'Ger',
 '##man',
 'cal',
 '##l',
 'to',
 'boy',
 '##co',
 '##tt',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [None]:
inputs.word_ids()

[None, 0, 1, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids into one label id per token.

    Args:
        labels: Sequence of integer label ids, indexed by word position.
        word_ids: Sequence with one entry per token; each entry is the index
            of the word the token belongs to, or None for a special token.

    Returns:
        A list with the same length as ``word_ids``. Special tokens get -100,
        the first token of a word gets that word's label, and every following
        token of the same word gets the label with odd ids bumped by one
        (in this notebook's label list the odd ids are the B-XXX tags and the
        next id is the matching I-XXX tag).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: masked out, and it ends the current word.
            aligned.append(-100)
            previous_word = None
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First token of a new word keeps the word's own label.
            previous_word = word_id
        elif word_label % 2 == 1:
            # Continuation of the same word: B-XXX becomes I-XXX.
            word_label += 1
        aligned.append(word_label)

    return aligned

In [None]:
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(word_ids)

print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[None, 0, 1, 1, 1, 2, 2, 3, 3, 4, 5, 5, 5, 6, 7, 7, 8, None]
[-100, 3, 0, 0, 0, 7, 8, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, -100]


In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a "tokens" entry (one list of words per
            example) and a "ner_tags" entry (one list of label ids per
            example).

    Returns:
        The tokenizer output for the batch, with an added "labels" entry that
        holds, for each example, the result of ``align_labels_with_tokens``
        applied to its tags and its token-to-word mapping.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) gives the token-to-word mapping of the idx-th example.
    encoded["labels"] = [
        align_labels_with_tokens(word_tags, encoded.word_ids(idx))
        for idx, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
tokenize_and_align_labels(raw_datasets['train'][:5])

{'input_ids': [[4, 12993, 1073, 25844, 30934, 6888, 1262, 1933, 30938, 1166, 28134, 1180, 4640, 25011, 1030, 7389, 1009, 5], [4, 5603, 10371, 981, 23522, 5], [4, 18174, 7078, 5693, 30970, 30960, 4698, 1149, 4514, 1149, 2534, 5], [4, 3131, 11207, 27062, 2441, 2967, 4172, 3857, 15782, 17330, 4290, 1342, 2931, 25706, 30939, 1004, 7481, 6888, 1262, 15168, 1154, 1166, 28970, 30254, 1166, 12270, 1034, 25011, 1030, 7389, 1049, 2702, 1020, 8226, 2559, 3018, 20289, 1004, 1874, 10046, 6573, 1345, 979, 5146, 2402, 1620, 2519, 5619, 7936, 30939, 1166, 12270, 8415, 30944, 1009, 5], [4, 6888, 1262, 30950, 3, 1020, 5150, 6028, 1166, 3247, 11207, 19803, 3, 1020, 16978, 14003, 1098, 24747, 1066, 30931, 1005, 30583, 984, 979, 1838, 10157, 2441, 2967, 4172, 1005, 1493, 4836, 17330, 28970, 30254, 7304, 16768, 4602, 30950, 12270, 8415, 30944, 23497, 30940, 18202, 22934, 2228, 1019, 1074, 10046, 7067, 1026, 25594, 21311, 1049, 2702, 3247, 1020, 8226, 2543, 15168, 1154, 1004, 1021, 3406, 9869, 1015, 1009, 5]

In [None]:
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

2.3.2 Padding (data collator)

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    3,    0,    0,    0,    7,    8,    0,    0,    0,    0,    0,
            0,    7,    0,    0,    0, -100],
        [-100,    1,    2,    2,    2, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100]])

In [None]:
!pip install seqeval



In [None]:
import evaluate

metric = evaluate.load("seqeval")

In [None]:
labels = raw_datasets["train"][0]["ner_tags"]
labels = [label_names[i] for i in labels]
labels

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [None]:
# Two-way mapping between integer label ids and label names; both are handed
# to the model configuration when the model is loaded.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

2.4 Finetuning

In [None]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
model.config.num_labels

9

2.5 Training strategy

In [None]:
from transformers import TrainingArguments

# Training configuration for the fine-tuning run.
# `repositoryName` is passed as the first positional argument of
# TrainingArguments — presumably the local output directory, which is also
# used as the Hub repository name because push_to_hub=True (TODO confirm
# against the transformers documentation for the installed version).
repositoryName = "NER-finetuned-BETO"
args = TrainingArguments(
    repositoryName,
    eval_strategy="epoch",  # evaluation schedule: "epoch"
    save_strategy="epoch",  # checkpoint schedule: same setting as evaluation
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=True,  # requires the Hub login performed earlier in the notebook
    #batch_eval_metrics=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8
)

2.6 Evaluation metrics

In [None]:
import evaluate
import numpy as np

def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of each
            position is the argmax of ``logits`` over its last axis; positions
            whose label is -100 are ignored.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", filled
        from the corresponding "overall_*" entries computed by ``metric``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tag names, with the masked (-100) positions dropped.
    reference_rows = []
    for gold_row in labels:
        reference_rows.append([label_names[tag] for tag in gold_row if tag != -100])

    # Predicted tag names, kept only where the reference is not masked.
    predicted_rows = []
    for pred_row, gold_row in zip(predictions, labels):
        predicted_rows.append(
            [label_names[pred] for pred, tag in zip(pred_row, gold_row) if tag != -100]
        )

    scores = metric.compute(predictions=predicted_rows, references=reference_rows)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short_name: scores[score_key] for short_name, score_key in reported}

2.7 Training

In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, the training arguments, the tokenized
# train/validation splits and the padding collator defined above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    # Without this argument the per-epoch evaluation reports only the loss;
    # passing compute_metrics adds precision / recall / F1 / accuracy.
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Epoch,Training Loss,Validation Loss
1,0.1712,0.072545
2,0.0524,0.071664
3,0.026,0.067089
4,0.015,0.072779
5,0.009,0.075345


TrainOutput(global_step=4390, training_loss=0.04799505040422931, metrics={'train_runtime': 1166.2359, 'train_samples_per_second': 60.198, 'train_steps_per_second': 3.764, 'total_flos': 2661124242267306.0, 'train_loss': 0.04799505040422931, 'epoch': 5.0})

In [None]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.0189,0.077683
2,0.0141,0.080756
3,0.0072,0.082225
4,0.004,0.091659
5,0.0022,0.091635


TrainOutput(global_step=4390, training_loss=0.009354664700992558, metrics={'train_runtime': 1211.1686, 'train_samples_per_second': 57.965, 'train_steps_per_second': 3.625, 'total_flos': 2661124242267306.0, 'train_loss': 0.009354664700992558, 'epoch': 5.0})