In [62]:
from datasets import load_dataset
from transformers import DataCollatorForTokenClassification
from datasets import load_metric
from transformers import  RobertaTokenizerFast,AutoTokenizer
from transformers import RobertaForTokenClassification,AutoModelForTokenClassification
from transformers import pipeline, TrainingArguments,Trainer
import numpy as np
%env CUDA_VISIBLE_DEVICES=4

env: CUDA_VISIBLE_DEVICES=4


In [63]:
ds_reviews = load_dataset("pandas", data_files = {"train": "df_train.pkl","test":"df_test.pkl","validation":"df_val.pkl"})

Using custom data configuration default-b6c22ba7c928d56f
Reusing dataset pandas (/datos/luis/.cache/datasets/pandas/default-b6c22ba7c928d56f/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade)


  0%|          | 0/3 [00:00<?, ?it/s]

In [64]:
ds_reviews = ds_reviews.remove_columns("__index_level_0__")

In [65]:
df_test = ds_reviews["test"].to_pandas()

In [66]:
df_test.sample(3)

Unnamed: 0,document,tokens,labels,ner_tags
486,"la atención maravillosa, imposible pedir más.","[la, atención, maravillosa, ,, imposible, pedi...","[O, O, B-MODIFIER, O, O, O, O, O]","[0, 0, 1, 0, 0, 0, 0, 0]"
22,salchicha alemana muy grande y el secreto muy ...,"[salchicha, alemana, muy, grande, y, el, secre...","[B-CONCEPT, O, B-MODIFIER, I-MODIFIER, O, O, B...","[3, 0, 1, 2, 0, 0, 3, 1, 2, 0]"
464,"los mejillones al vino blanco y puerros, riquí...","[los, mejillones, al, vino, blanco, y, puerros...","[O, B-CONCEPT, O, B-CONCEPT, I-CONCEPT, O, B-C...","[0, 3, 0, 3, 4, 0, 3, 0, 1, 0]"


In [67]:
label_names = ["O","B-MODIFIER","I-MODIFIER","B-CONCEPT","I-CONCEPT"]

In [68]:
model_checkpoint  = "PlanTL-GOB-ES/roberta-large-bne-sqac"
tokenizer         = RobertaTokenizerFast.from_pretrained(model_checkpoint,add_prefix_space=True) 
#model_checkpoint = 'mrm8488/TinyBERT-spanish-uncased-finetuned-ner'
#tokenizer        = AutoTokenizer.from_pretrained(model_checkpoint)

In [69]:
tokenizer.is_fast

True

In [70]:
inputs = tokenizer(ds_reviews["train"][1]["tokens"], is_split_into_words=True)
inputs.tokens()

['<s>',
 'Ġtanto',
 'Ġla',
 'ĠpanaderÃŃa',
 'Ġcomo',
 'Ġel',
 'Ġrestaurante',
 'Ġdan',
 'Ġun',
 'Ġservicio',
 'Ġmuy',
 'Ġbueno',
 'Ġ',
 '.',
 '</s>']

In [71]:
inputs

{'input_ids': [0, 1118, 332, 41367, 469, 344, 5684, 3592, 355, 1416, 728, 3333, 275, 68, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [72]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [73]:
labels = ds_reviews["train"][1]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[0, 0, 0, 0, 0, 3, 0, 0, 3, 1, 2, 0]
[-100, 0, 0, 0, 0, 0, 3, 0, 0, 3, 1, 2, 0, 0, -100]


In [74]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [75]:
tokenized_datasets = ds_reviews.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns= ds_reviews["train"].column_names,
)


Loading cached processed dataset at /datos/luis/.cache/datasets/pandas/default-b6c22ba7c928d56f/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade/cache-c44e146872687141.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /datos/luis/.cache/datasets/pandas/default-b6c22ba7c928d56f/0.0.0/6197c1e855b639d75a767140856841a562b7a71d129104973fe1962594877ade/cache-40fcf00a91ec5c02.arrow


In [76]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [77]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(5)])
batch["labels"].shape

torch.Size([5, 32])

In [78]:
metric = load_metric("seqeval")

In [79]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [80]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [81]:
model = RobertaForTokenClassification.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-large-bne-sqac were not used when initializing RobertaForTokenClassification: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne-sqac and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:
model.config._num_labels = 5
model.config.id2label    = id2label
model.config.label2id    = label2id
new_conf = model.config

In [83]:
model_new = RobertaForTokenClassification.from_pretrained(model_checkpoint,config=new_conf)

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-large-bne-sqac were not used when initializing RobertaForTokenClassification: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-large-bne-sqac and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [84]:
args = TrainingArguments(
    "ner_roberta",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    push_to_hub=False,
)

In [85]:
trainer = Trainer(
    model                  = model_new,
    args                   = args,
    train_dataset          = tokenized_datasets["train"],
    eval_dataset           = tokenized_datasets["validation"],
    data_collator          = data_collator,
    compute_metrics        = compute_metrics,
    tokenizer              = tokenizer,
)

In [86]:
trainer.train()

***** Running training *****
  Num examples = 2600
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1625


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.132171,0.838548,0.89971,0.868053,0.953966
2,0.128800,0.111602,0.905622,0.914629,0.910103,0.966233
3,0.128800,0.124722,0.901116,0.936593,0.918512,0.971125
4,0.027500,0.132445,0.908391,0.928719,0.918443,0.970574
5,0.004700,0.138404,0.907958,0.936179,0.921853,0.971883


***** Running Evaluation *****
  Num examples = 763
  Batch size = 8
Saving model checkpoint to ner_roberta/checkpoint-325
Configuration saved in ner_roberta/checkpoint-325/config.json
Model weights saved in ner_roberta/checkpoint-325/pytorch_model.bin
tokenizer config file saved in ner_roberta/checkpoint-325/tokenizer_config.json
Special tokens file saved in ner_roberta/checkpoint-325/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 763
  Batch size = 8
Saving model checkpoint to ner_roberta/checkpoint-650
Configuration saved in ner_roberta/checkpoint-650/config.json
Model weights saved in ner_roberta/checkpoint-650/pytorch_model.bin
tokenizer config file saved in ner_roberta/checkpoint-650/tokenizer_config.json
Special tokens file saved in ner_roberta/checkpoint-650/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 763
  Batch size = 8
Saving model checkpoint to ner_roberta/checkpoint-975
Configuration saved in ner_roberta/checkpoint-975/c

TrainOutput(global_step=1625, training_loss=0.04958951918895428, metrics={'train_runtime': 270.5596, 'train_samples_per_second': 48.049, 'train_steps_per_second': 6.006, 'total_flos': 1020282074889120.0, 'train_loss': 0.04958951918895428, 'epoch': 5.0})

In [91]:
# Replace this with your own checkpoint
model_checkpoint = "ner_roberta/checkpoint-975"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint
)

loading configuration file ner_roberta/checkpoint-975/config.json
Model config RobertaConfig {
  "_name_or_path": "ner_roberta/checkpoint-975",
  "_num_labels": 5,
  "architectures": [
    "RobertaForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "id2label": {
    "0": "O",
    "1": "B-MODIFIER",
    "2": "I-MODIFIER",
    "3": "B-CONCEPT",
    "4": "I-CONCEPT"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "B-CONCEPT": "3",
    "B-MODIFIER": "1",
    "I-CONCEPT": "4",
    "I-MODIFIER": "2",
    "O": "0"
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transfor

In [95]:
review = df_test.iloc[9].document
print(review)

por cierto, el café, muy bueno.


In [96]:
token_classifier(review)

[{'entity': 'B-CONCEPT',
  'score': 0.9999236,
  'index': 5,
  'word': 'ĠcafÃ©',
  'start': 15,
  'end': 19},
 {'entity': 'B-MODIFIER',
  'score': 0.9991999,
  'index': 7,
  'word': 'Ġmuy',
  'start': 21,
  'end': 24},
 {'entity': 'I-MODIFIER',
  'score': 0.9997008,
  'index': 8,
  'word': 'Ġbueno',
  'start': 25,
  'end': 30}]

In [97]:
review[15:19]

'café'

In [98]:
review[21:30]

'muy bueno'