# Installation

In [None]:
!pip install datasets
!pip install transformers
!pip install seqeval

In [None]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

# Functions

In [None]:
def read_dataset(dir):
  """Load a CSV whose cells hold stringified Python lists and parse them.

  Every string cell is expected to contain the ``repr`` of a list of strings
  (e.g. ``"['El', 'Presidente']"``). Cells are parsed with
  ``ast.literal_eval`` so that items containing apostrophes, which ``repr``
  writes with double quotes (``"d'accord"``), are split correctly. Cells that
  are not valid list literals fall back to the previous behaviour (strip the
  outer ``['``/``']`` and split on ``', '``).

  Args:
    dir: Path of the CSV file, passed unchanged to ``pandas.read_csv``.

  Returns:
    A DataFrame with the same columns as the CSV in which every string cell
    has been replaced by a list of strings. Non-string cells (e.g. NaN or
    numeric columns) are returned untouched.
  """
  import ast  # local import so the function does not depend on the import cell

  def _legacy_split(cell):
    # Previous parsing strategy, kept for cells literal_eval cannot handle.
    return cell[2:-2].split("', '")

  def _parse_cell(cell):
    if not isinstance(cell, str):
      return cell
    try:
      parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError):
      return _legacy_split(cell)
    if not isinstance(parsed, (list, tuple)):
      return _legacy_split(cell)
    return [str(item) for item in parsed]

  res = pd.read_csv(dir)
  return res.apply(lambda column: column.map(_parse_cell))

# Dataset Reading

Read all datasets

<br>

**VALIDATION**

ProfNER original validation dataset

In [None]:
# Parse the validation CSV; ending the cell on the frame name displays a preview.
df_valid = read_dataset('valid.csv')
df_valid

Unnamed: 0,tokens,ner_tags
0,"[COMUNICADO, POR, CORONAVIRUS]","[O, O, O]"
1,"[El, Presidente, Ruso, Vladimir, Putin, ha, di...","[O, B-PROFESION, O, O, O, O, O, O, O, O, O, O,..."
2,"[#, nomequedoencasa]","[O, O]"
3,"[“, La, falta, de, transparencia, en, asuntos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[Esto, nos, demuestra, que, los, regímenes, au...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
4248,"[89, tacos, y, virus, superado, .]","[O, O, O, O, O, O]"
4249,[😍],[O]
4250,"[Sigan, para, bingo, en, este, 2020]","[O, O, O, O, O, O]"
4251,"[🇪, 🇸, |, URGENTE, -, CORONAVIRUS, :, España, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


**BASE**

ProfNER original training dataset

In [None]:
# Parse the base training CSV; ending the cell on the frame name displays a preview.
df_base = read_dataset('base.csv')
df_base

Unnamed: 0,tokens,ner_tags
0,"[Cerramos, nuestra, querida, Radio, 😢, Nuestro...","[O, O, O, O, O, O, B-PROFESION, O, B-PROFESION..."
1,"[Desde, mañana, todos, los, programas, de, Rad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
2,"[Seguimos, al, aire, con, el, compromiso, de, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[#, OtroEscandalo, #, HastaCuando, #, Denuncia...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[¿, Es, necesario, entregar, nuestra, privacid...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
12702,"[Se, os, ve, el, plumero, ...]","[O, O, O, O, O, O]"
12703,"[¡A, TOD@S, !]","[O, O, O]"
12704,"[Corríjanme, si, me, equivoco, ,, pero, somos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
12705,"[Viendo, que, tenemos, 7, veces, más, casos, q...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


**FILTER**

ProfNER training dataset filtered with only 50% of entities

In [None]:
# Parse the filtered training CSV; ending the cell on the frame name displays a preview.
df_filter = read_dataset('filter.csv')
df_filter

Unnamed: 0,tokens,ner_tags
0,"[Desde, mañana, todos, los, programas, de, Rad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[Seguimos, al, aire, con, el, compromiso, de, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[#, OtroEscandalo, #, HastaCuando, #, Denuncia...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[¿, Es, necesario, entregar, nuestra, privacid...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[La, privacidad, en, tiempos, de, coronavirus,...","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
11951,"[Se, os, ve, el, plumero, ...]","[O, O, O, O, O, O]"
11952,"[¡A, TOD@S, !]","[O, O, O]"
11953,"[Corríjanme, si, me, equivoco, ,, pero, somos,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
11954,"[Viendo, que, tenemos, 7, veces, más, casos, q...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


**WIKIPEDIA**

ProfNER filtered set augmented with sentences generated from Wikipedia

In [None]:
# Parse the Wikipedia-augmented training CSV; ending the cell on the frame name displays a preview.
df_wikipedia = read_dataset('dataaug_wikipedia.csv')
df_wikipedia

Unnamed: 0,tokens,ner_tags
0,"[Desde, mañana, todos, los, programas, de, Rad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[Seguimos, al, aire, con, el, compromiso, de, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[#, OtroEscandalo, #, HastaCuando, #, Denuncia...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[¿, Es, necesario, entregar, nuestra, privacid...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[La, privacidad, en, tiempos, de, coronavirus,...","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
12702,"[Los, crupiers, suelen, ser, empleados, por, l...","[O, B-PROFESION, O, O, O, O, O, O, O]"
12703,"[El, crupier, en, cada, juego, se, atendrá, ex...","[O, B-PROFESION, O, O, O, O, O, O, O, O, O, O,..."
12704,"[Los, métodos, de, entrenamiento, para, conver...","[O, O, O, O, O, O, O, O, B-PROFESION, O, O, O,..."
12705,"[En, Estados, Unidos, de, América, ,, el, blac...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


**WORD EMBEDDING**

ProfNER filtered set augmented with sentences generated from Wikipedia (using word-embedding similarity to select semantically relevant sentences)

In [None]:
# Parse the Wikipedia+embedding-augmented training CSV; ending the cell on the frame name displays a preview.
df_embedding = read_dataset('dataaug_wikipedia+embedding.csv')
df_embedding

Unnamed: 0,tokens,ner_tags
0,"[Desde, mañana, todos, los, programas, de, Rad...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[Seguimos, al, aire, con, el, compromiso, de, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[#, OtroEscandalo, #, HastaCuando, #, Denuncia...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[¿, Es, necesario, entregar, nuestra, privacid...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,"[La, privacidad, en, tiempos, de, coronavirus,...","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
12702,"[No, todos, los, operadores, lineales, interes...","[O, O, O, B-PROFESION, O, O, O, O, O, O, O, O,..."
12703,"[Si, un, operador, está, definido, entre, dos,...","[O, O, B-PROFESION, O, O, O, O, O, O, O, O, O,..."
12704,"[Su, nombre, depende, del, autor, ,, son, los,...","[O, O, O, O, O, O, O, O, B-PROFESION, O, O, O,..."
12705,"[Los, operadores, de, orden, establecen, o, ve...","[O, B-PROFESION, O, O, O, O, O, O, O, O, O, O,..."


# Model Settings

In [None]:
# Tag inventory; the position of a tag in this list is its integer class id.
label_list = [
    'O',
    'B-PROFESION', 'I-PROFESION',
    'B-SITUACION_LABORAL', 'I-SITUACION_LABORAL',
    'B-ACTIVIDAD', 'I-ACTIVIDAD',
    'B-FIGURATIVA', 'I-FIGURATIVA',
]
# Tag -> class id, derived from label_list so the two can never drift apart.
label_encoding_dict = {tag: idx for idx, tag in enumerate(label_list)}

# Used to build the tag column name (f"{task}_tags") and the trainer output directory.
task = "ner"
model_checkpoint = "CenIA/distillbert-base-spanish-uncased"
batch_size = 16

# NOTE(review): truncation / padding / max_length are normally arguments of the
# tokenizer *call*, not of from_pretrained - confirm they have any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    add_prefix_space=True,
    truncation=True,
    padding=True,
    max_length=512,
)

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and project word-level tags onto sub-tokens.

    Args:
        examples: Batch with a "tokens" column (one list of words per example)
            and a f"{task}_tags" column (one parallel list of tag strings per
            example).
        label_all_tokens: When True (the default, matching the previously
            hard-coded value) every sub-token of a word receives that word's
            tag id. When False only the first sub-token of each word is
            labelled and the remaining ones get -100.

    Returns:
        The tokenizer output with an extra "labels" entry holding one list of
        label ids per example, aligned with the produced sub-tokens.

    Raises:
        KeyError: If a tag is not a key of ``label_encoding_dict``.
    """
    tokenized_inputs = tokenizer(list(examples["tokens"]), padding='max_length', truncation=True, is_split_into_words=True)

    labels = []
    for i, word_tags in enumerate(examples[f"{task}_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Position with no source word: use -100, the value that
                # compute_metrics filters out.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # The former special case compared against '0' (digit zero),
                # which never matched the 'O' tag; the dictionary already maps
                # 'O' to 0, so a plain lookup covers every tag.
                label_ids.append(label_encoding_dict[word_tags[word_idx]])
            else:
                # Continuation sub-token that should not be labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Training

### Base

In [None]:
# Wrap the base training frame and the validation frame as Dataset objects.
train_dataset = Dataset.from_pandas(df_base)
test_dataset = Dataset.from_pandas(df_valid)

# Tokenize in batches; tokenize_and_align_labels adds the "labels" column.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Load the pretrained checkpoint with a classification head sized to label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Dedicated output directory for this experiment, so its checkpoints are not
# mixed with those written by the other training cells.
args = TrainingArguments(
    f"test-{task}-base",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): `load_metric` is presumably deprecated in recent `datasets`
# releases - confirm against the installed version.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score token-level predictions with the module-level `metric`.

    Args:
        p: Pair ``(predictions, labels)``. Predictions are per-class scores
            that are arg-maxed over axis 2; labels are integer class ids in
            which -100 marks positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is a real class id.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Assemble the trainer for the base experiment from the objects built above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, evaluate on the validation set, then persist the model.
trainer.train()
trainer.evaluate()
trainer.save_model('base.model')

Downloading:   0%|          | 0.00/530 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/257M [00:00<?, ?B/s]

Some weights of the model checkpoint at CenIA/distillbert-base-spanish-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at CenIA/distillbert-base-spanish-uncased and are newly initialized: ['classifier.weight', 'classifier.bias

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The following columns in the training set don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 12707
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2385


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: ignored

### Filter

In [None]:
# Wrap the filtered training frame and the validation frame as Dataset objects.
train_dataset = Dataset.from_pandas(df_filter)
test_dataset = Dataset.from_pandas(df_valid)

# Tokenize in batches; tokenize_and_align_labels adds the "labels" column.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Load the pretrained checkpoint with a classification head sized to label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Dedicated output directory for this experiment, so its checkpoints are not
# mixed with those written by the other training cells.
args = TrainingArguments(
    f"test-{task}-filter",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): `load_metric` is presumably deprecated in recent `datasets`
# releases - confirm against the installed version.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score token-level predictions with the module-level `metric`.

    Args:
        p: Pair ``(predictions, labels)``. Predictions are per-class scores
            that are arg-maxed over axis 2; labels are integer class ids in
            which -100 marks positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is a real class id.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Assemble the trainer for the filter experiment from the objects built above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, evaluate on the validation set, then persist the model.
trainer.train()
trainer.evaluate()
trainer.save_model('filter.model')

### Wikipedia

In [None]:
# Wrap the Wikipedia-augmented training frame and the validation frame as Dataset objects.
train_dataset = Dataset.from_pandas(df_wikipedia)
test_dataset = Dataset.from_pandas(df_valid)

# Tokenize in batches; tokenize_and_align_labels adds the "labels" column.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Load the pretrained checkpoint with a classification head sized to label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Dedicated output directory for this experiment, so its checkpoints are not
# mixed with those written by the other training cells.
args = TrainingArguments(
    f"test-{task}-wikipedia",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): `load_metric` is presumably deprecated in recent `datasets`
# releases - confirm against the installed version.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score token-level predictions with the module-level `metric`.

    Args:
        p: Pair ``(predictions, labels)``. Predictions are per-class scores
            that are arg-maxed over axis 2; labels are integer class ids in
            which -100 marks positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is a real class id.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Assemble the trainer for the Wikipedia experiment from the objects built above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, evaluate on the validation set, then persist the model.
trainer.train()
trainer.evaluate()
trainer.save_model('wikipedia.model')

### Embedding

In [None]:
# Wrap the embedding-augmented training frame and the validation frame as Dataset objects.
train_dataset = Dataset.from_pandas(df_embedding)
test_dataset = Dataset.from_pandas(df_valid)

# Tokenize in batches; tokenize_and_align_labels adds the "labels" column.
train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

In [None]:
# Load the pretrained checkpoint with a classification head sized to label_list.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Dedicated output directory for this experiment, so its checkpoints are not
# mixed with those written by the other training cells.
args = TrainingArguments(
    f"test-{task}-embedding",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

data_collator = DataCollatorForTokenClassification(tokenizer)
# NOTE(review): `load_metric` is presumably deprecated in recent `datasets`
# releases - confirm against the installed version.
metric = load_metric("seqeval")


def compute_metrics(p):
    """Score token-level predictions with the module-level `metric`.

    Args:
        p: Pair ``(predictions, labels)``. Predictions are per-class scores
            that are arg-maxed over axis 2; labels are integer class ids in
            which -100 marks positions to skip.

    Returns:
        Dict with the overall "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions whose gold label is a real class id.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
    
# Assemble the trainer for the embedding experiment from the objects built above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune, evaluate on the validation set, then persist the model.
trainer.train()
trainer.evaluate()
trainer.save_model('embedding.model')

# Saving

In [None]:
!pip install pyocclient
import owncloud
oc = owncloud.Client('https://delicias.dia.fi.upm.es/nextcloud/')
oc.login('asanchez', 'AS.sczz.448')

In [None]:
# Recursively archive each saved model path into a single .zip file.
!zip -r ./base.model.zip ./base.model
!zip -r ./filter.model.zip ./filter.model
!zip -r ./wikipedia.model.zip ./wikipedia.model
!zip -r ./embedding.model.zip ./embedding.model

In [None]:
# (remote destination, local archive) pairs, uploaded in the listed order.
uploads = [
    ('profner/new-filter/base-model.zip', 'base.model.zip'),
    ('profner/new-filter/filter-model.zip', 'filter.model.zip'),
    ('profner/new-filter/wikipedia-model.zip', 'wikipedia.model.zip'),
    ('profner/new-filter/embedding-model.zip', 'embedding.model.zip'),
]
for remote_path, local_file in uploads:
    oc.put_file(remote_path, local_file)