# Install

In [None]:
!pip install datasets -q
!pip install transformers -q
!pip install seqeval -q
!pip install wandb -q
!pip install emoji -q


[K     |████████████████████████████████| 451 kB 10.7 MB/s 
[K     |████████████████████████████████| 182 kB 37.9 MB/s 
[K     |████████████████████████████████| 212 kB 33.8 MB/s 
[K     |████████████████████████████████| 132 kB 43.6 MB/s 
[K     |████████████████████████████████| 127 kB 45.2 MB/s 
[K     |████████████████████████████████| 5.5 MB 14.3 MB/s 
[K     |████████████████████████████████| 7.6 MB 55.8 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.9 MB 15.6 MB/s 
[K     |████████████████████████████████| 168 kB 73.9 MB/s 
[K     |████████████████████████████████| 182 kB 68.5 MB/s 
[K     |████████████████████████████████| 62 kB 1.3 MB/s 
[K     |████████████████████████████████| 168 kB 77.4 MB/s 
[K     |████████████████████████████████| 166 kB 77.2 MB/s 
[K     |████████████████████████████████| 166 kB 79.1 MB/s 
[K     |█████████████

In [None]:
import os
import itertools
import pandas as pd
import numpy as np
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import torch

# Weights & Biases (W&B)


In [None]:
# Authenticate this runtime with Weights & Biases; the command prompts for an
# API key interactively. Never paste the key into a cell, and clear the cell
# output before sharing the notebook.
!wandb login


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[REDACTED: W&B API key removed from the saved output - revoke and rotate this key]

# Dataset

In [None]:
import pandas as pd

def read_bio_dataset(dir):
  """Read a BIO-tagged text file into a DataFrame with one row per sentence.

  Each non-blank line holds one token: the token is the first space-separated
  field and its tag is the last one. Sentences are separated by blank lines.

  Args:
    dir: Path of the file to read. (The name shadows the builtin `dir`; it is
      kept so existing callers keep working.)

  Returns:
    A pandas DataFrame with the columns 'tokens' and 'ner_tags'; every cell is
    the list of tokens / tags of one sentence.
  """
  sentences = []  # one [tokens, tags] pair per sentence
  tok = []        # tokens of the sentence being read
  bio = []        # tags of the sentence being read

  with open(dir, 'r', encoding='utf-8') as file:
    # Iterate lazily; there is no need to load the whole file with readlines().
    for line in file:
      if not line.strip():
        # Blank line: sentence boundary. Several blank lines in a row must not
        # create empty sentences nor be mistaken for tokens.
        if tok:
          sentences.append([tok, bio])
          tok = []
          bio = []
        continue

      fields = line.rstrip('\n').split(' ')
      tok.append(fields[0])
      bio.append(fields[-1])

  # Keep the last sentence when the file does not end with a blank line.
  if tok:
    sentences.append([tok, bio])

  return pd.DataFrame(sentences, columns=['tokens', 'ner_tags'])



In [None]:
import emoji

def _clean_token(token):
  """Return the cleaned form of `token`, or None when the token must be dropped."""
  # URLs, mentions and typographic opening quotes are dropped wherever they
  # appear inside the token.
  # NOTE(review): 'u200d' matches that literal text, not the zero-width joiner
  # '\u200d' - confirm which was intended. Real ZWJ characters are removed by
  # the code-point filter below anyway.
  if 'http' in token or '@' in token or 'u200d' in token or '“' in token:
    return None
  # Empty strings and stand-alone '#' / '"' symbols carry no information.
  if token in ('', '#', '"'):
    return None
  if emoji.is_emoji(token):
    return None
  # Keep only characters with code point <= 350 (ASCII plus accented Latin
  # letters); a token left empty by this filter is dropped as well.
  kept = ''.join(c for c in token if ord(c) <= 350)
  return kept or None


def clean_data(dataset):
  """Remove noisy tokens and uninformative sentences from a token/tag DataFrame.

  For every row, tokens that are URLs, mentions, emojis, stray symbols or made
  only of characters above code point 350 are removed together with their
  tags; remaining tokens are stripped of characters above code point 350.
  Rows left with fewer than four tokens and only 'O' tags (which includes rows
  left empty) are dropped.

  Args:
    dataset: DataFrame with the list-valued columns 'tokens' and 'ner_tags'.

  Returns:
    The same DataFrame object, modified in place and re-indexed from 0.
  """
  cleaned_tokens = []
  cleaned_tags = []
  rows_delete = []

  # Build the cleaned columns explicitly: writing into the rows yielded by
  # iterrows() is not guaranteed to reach the DataFrame (they may be copies).
  for index, tok, tags in zip(dataset.index, dataset['tokens'], dataset['ner_tags']):
    new_tok = []
    new_tags = []
    for token, tag in zip(tok, tags):
      cleaned = _clean_token(token)
      if cleaned is None:
        continue
      new_tok.append(cleaned)
      new_tags.append(tag)

    cleaned_tokens.append(new_tok)
    cleaned_tags.append(new_tags)

    # Judge the sentence by what is left after cleaning; an empty sentence
    # satisfies this test too, so every index is queued at most once.
    if len(new_tok) < 4 and all(tag == 'O' for tag in new_tags):
      rows_delete.append(index)

  dataset['tokens'] = pd.Series(cleaned_tokens, index=dataset.index, dtype=object)
  dataset['ner_tags'] = pd.Series(cleaned_tags, index=dataset.index, dtype=object)

  # Callers rely on the frame being updated in place and returned.
  dataset.drop(rows_delete, axis=0, inplace=True)
  dataset.reset_index(inplace=True, drop=True)
  return dataset


In [None]:
# Validation split: parse the BIO file, then clean it (clean_data returns the
# same frame it was given).
valid_data = clean_data(read_bio_dataset('valid_spacy.txt'))


In [None]:
# Original training split: parse the BIO file, then clean it (clean_data
# returns the same frame it was given).
training_data_or = clean_data(read_bio_dataset('train_spacy.txt'))


In [None]:
def read_tsv_dataset(name):
  """Load a tab-separated dataset whose list columns were stored as text.

  The 'tokens' and 'ner_tags' columns hold the textual form of Python lists
  (e.g. "['O', 'B-PROFESION']"); they are parsed back into real lists.

  Args:
    name: Path of the UTF-8 encoded .tsv file.

  Returns:
    A pandas DataFrame whose 'tokens' and 'ner_tags' cells are lists.

  Raises:
    ValueError / SyntaxError: if a cell is not a valid Python literal.
  """
  import ast  # local import keeps this cell self-contained

  training_data = pd.read_csv(name, sep="\t", encoding='utf8')
  # literal_eval only accepts literals, so a crafted file cannot execute code
  # the way it could with eval().
  for column in ('tokens', 'ner_tags'):
    training_data[column] = training_data[column].apply(ast.literal_eval)
  return training_data


In [None]:
# Load the pre-built training sets from their .tsv files.
# NOTE(review): presumably 10/30/50 are differently sized variants of the
# training data - confirm how these files were produced.
training_data_10= read_tsv_dataset('training_10.tsv')
training_data_30= read_tsv_dataset('training_30.tsv')
training_data_50= read_tsv_dataset('training_50.tsv')
# This rebinds training_data_or, replacing the frame that an earlier cell built
# from 'train_spacy.txt'.
training_data_or= read_tsv_dataset('training_or.tsv')
# The "double" variant is not loaded:
#training_data_do= read_tsv_dataset('training_do_mr.tsv')



In [None]:
# Preview the first rows only instead of rendering the whole frame.
training_data_50.head()

In [None]:
# The cleaned validation frame is the evaluation set used by every training run.
test_dataset = Dataset.from_pandas(valid_data)

In [None]:
# Wrap every training frame in a Hugging Face `Dataset` so it can be tokenised
# with `.map(...)`.
train_dataset_10 = Dataset.from_pandas(training_data_10)
train_dataset_30 = Dataset.from_pandas(training_data_30)
train_dataset_50 = Dataset.from_pandas(training_data_50)
train_dataset_or = Dataset.from_pandas(training_data_or)

In [None]:
# NOTE(review): redundant - the previous cell already builds train_dataset_or
# from the same frame; this cell only matters if training_data_or was changed
# in between.
train_dataset_or = Dataset.from_pandas(training_data_or)

# Model Settings

In [None]:
# Tag inventory of the task; the position of a tag in this list is its class id.
labels_list = ['O', 'B-PROFESION', 'I-PROFESION']
label_num_list = [class_id for class_id, _ in enumerate(labels_list)]

In [None]:
# Lookup tables between tag names and class ids, in both directions.
label2id = dict(zip(labels_list, label_num_list))
id2label = dict(zip(label_num_list, labels_list))

In [None]:
# Display the tag -> class-id mapping as a sanity check.
label2id

{'O': 0, 'B-PROFESION': 1, 'I-PROFESION': 2}

In [None]:
# Prefix of the label column read by tokenize_and_align_labels ("ner_tags").
task = "ner"

# Checkpoint to fine-tune; the commented lines are alternatives to switch to.
model_checkpoint = "PlanTL-GOB-ES/roberta-base-bne"
#model_checkpoint = 'dccuchile/bert-base-spanish-wwm-cased'
#model_checkpoint = "PlanTL-GOB-ES/roberta-large-bne"
#model_checkpoint = "bertin-project/bertin-roberta-base-spanish"
from transformers import RobertaTokenizerFast, RobertaModel, AutoTokenizer

# add_prefix_space=True is required to feed pre-split words to a RoBERTa tokenizer.
# `truncation` / `max_length` are arguments of the tokenizer *call* and have no
# effect in from_pretrained; the load-time way to cap sequences at 512 tokens is
# `model_max_length`, which the call-time `truncation=True` then honours.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True, model_max_length=512)

Downloading:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/509k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split sentences and give every sub-token a label id.

    Reads the module-level `tokenizer`, `task` and `label2id`.

    Args:
        examples: Batch with a "tokens" column (list of words per sentence) and
            a f"{task}_tags" column (list of tag names per sentence).

    Returns:
        The tokenizer output with an extra "labels" entry: one list of ids per
        sentence, with -100 on positions that have no source word.
    """
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    def _align(batch_idx, word_tags):
        # Map each sub-token back to its source word and pick its label id.
        aligned = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                # Position without a source word.
                piece_label = -100
            elif word_tags[word_idx] == 'O':
                piece_label = 0
            elif word_idx != last_word or label_all_tokens:
                # First piece of a word, or a later piece when every piece is labelled.
                piece_label = label2id[word_tags[word_idx]]
            else:
                piece_label = -100
            aligned.append(piece_label)
            last_word = word_idx
        return aligned

    tokenized_inputs["labels"] = [
        _align(batch_idx, word_tags)
        for batch_idx, word_tags in enumerate(examples[f"{task}_tags"])
    ]
    return tokenized_inputs

In [None]:
# Tokenise the evaluation set in batches and attach the aligned label ids.
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Tokenise the original training set in batches and attach the aligned label ids.
train_tokenized_datasets_or = train_dataset_or.map(tokenize_and_align_labels, batched=True)


  0%|          | 0/12 [00:00<?, ?ba/s]

In [None]:
# Tokenise the 10 / 30 / 50 training variants the same way as the other splits.
train_tokenized_datasets_10 = train_dataset_10.map(tokenize_and_align_labels, batched=True)
train_tokenized_datasets_30 = train_dataset_30.map(tokenize_and_align_labels, batched=True)
train_tokenized_datasets_50 = train_dataset_50.map(tokenize_and_align_labels, batched=True)
# The "double" variant is not tokenised, so train_tokenized_datasets_do is never created here:
#train_tokenized_datasets_do = train_dataset_do.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [None]:

def compute_metrics(p):
    """Turn raw model outputs into overall precision / recall / F1 / accuracy.

    Reads the module-level `labels_list` and `metric`.

    Args:
        p: Pair (predictions, labels). Predictions are class scores whose
            arg-max over axis 2 is the predicted class id; labels are the gold
            ids, with -100 marking positions to ignore.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the metric's "overall_*" results.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label.
        kept = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([labels_list[pred] for pred, _ in kept])
        true_labels.append([labels_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {name: results[f"overall_{name}"] for name in ("precision", "recall", "f1", "accuracy")}



# Training 10

In [None]:
import wandb
# Open a W&B run in the "profner_mr" project of the "pcalleja" entity.
# NOTE(review): the later training cells leave their own wandb.init(...) call
# commented out - confirm whether their metrics are meant to land in this run.
wandb.init(project="profner_mr", entity="pcalleja")

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpcalleja[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
from transformers import  RobertaForTokenClassification, AutoModelForTokenClassification

# Load the pretrained checkpoint with a token-classification head sized for the
# three tags; the id<->label maps are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(labels_list), id2label = id2label, label2id = label2id)


Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.bias', 'classifier.weight']
You shoul

In [None]:
from transformers import Trainer, TrainingArguments

# Run configuration: 6 epochs, 32 examples per device for training and evaluation.
batch_size = 32
epochs = 6

args = TrainingArguments(
    output_dir="profner_model_10",
    # Score on the evaluation set after every epoch; never write checkpoints.
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=1e-5,
    learning_rate=1e-4,
    optim="adamw_torch",
    # Left disabled in this run: fp16=True, report_to="wandb"
)

# compute_metrics reads the module-level `metric`, so bind it before training.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets_10,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The following columns in the training set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1105
  Num Epochs = 6
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 210
  Number of trainable parameters = 124055043
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.075643,0.0,0.0,0.0,0.987486
2,No log,0.076137,0.0,0.0,0.0,0.987486
3,No log,0.075102,0.0,0.0,0.0,0.987486
4,No log,0.062722,0.0,0.0,0.0,0.987486
5,No log,0.064114,0.412281,0.13643,0.205016,0.988601
6,No log,0.054558,0.431604,0.265602,0.328841,0.989033


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forw

TrainOutput(global_step=210, training_loss=0.07194346019199917, metrics={'train_runtime': 190.577, 'train_samples_per_second': 34.789, 'train_steps_per_second': 1.102, 'total_flos': 201812231170638.0, 'train_loss': 0.07194346019199917, 'epoch': 6.0})

# Training 30

In [None]:
import wandb
from transformers import AutoModelForTokenClassification, RobertaForTokenClassification
from transformers import Trainer, TrainingArguments
#wandb.init(project="profner_mr", entity="pcalleja")

# Start again from the pretrained checkpoint so this run is independent of the previous one.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
)

# Run configuration: 6 epochs, 32 examples per device for training and evaluation.
batch_size = 32
epochs = 6

args = TrainingArguments(
    output_dir="profner_model_30",
    # Score on the evaluation set after every epoch; never write checkpoints.
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=1e-5,
    learning_rate=1e-4,
    optim="adamw_torch",
    # Left disabled in this run: fp16=True, report_to="wandb"
)

# compute_metrics reads the module-level `metric`, so bind it before training.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets_30,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--PlanTL-GOB-ES--roberta-base-bne/snapshots/6f11dfab050340aebc487ccf58ce349b8f88bc67/config.json
Model config RobertaConfig {
  "_name_or_path": "PlanTL-GOB-ES/roberta-base-bne",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PROFESION",
    "2": "I-PROFESION"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-PROFESION": 1,
    "I-PROFESION": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.044859,0.0,0.0,0.0,0.987486
2,No log,0.057114,0.0,0.0,0.0,0.987486
3,No log,0.051946,0.492063,0.044993,0.082447,0.987587
4,No log,0.046238,0.55814,0.208999,0.304118,0.988487
5,0.055900,0.049132,0.361991,0.348331,0.35503,0.988272
6,0.055900,0.0459,0.468835,0.251089,0.327032,0.988069


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forw

TrainOutput(global_step=624, training_loss=0.05206607167537396, metrics={'train_runtime': 320.7173, 'train_samples_per_second': 62.036, 'train_steps_per_second': 1.946, 'total_flos': 600220407776832.0, 'train_loss': 0.05206607167537396, 'epoch': 6.0})

# Training 50

In [None]:
import wandb
from transformers import AutoModelForTokenClassification, RobertaForTokenClassification
from transformers import Trainer, TrainingArguments
#wandb.init(project="profner_mr", entity="pcalleja")

# Start again from the pretrained checkpoint so this run is independent of the previous one.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
)

# Run configuration: 6 epochs, 32 examples per device for training and evaluation.
# NOTE(review): the recorded output of this cell shows precision / recall / F1
# of 0.0 on every epoch, i.e. no entity is ever predicted - the learning rate
# may be too high for this run; confirm before relying on these results.
batch_size = 32
epochs = 6

args = TrainingArguments(
    output_dir="profner_model_50",
    # Score on the evaluation set after every epoch; never write checkpoints.
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=1e-5,
    learning_rate=1e-4,
    optim="adamw_torch",
    # Left disabled in this run: fp16=True, report_to="wandb"
)

# compute_metrics reads the module-level `metric`, so bind it before training.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets_50,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--PlanTL-GOB-ES--roberta-base-bne/snapshots/6f11dfab050340aebc487ccf58ce349b8f88bc67/config.json
Model config RobertaConfig {
  "_name_or_path": "PlanTL-GOB-ES/roberta-base-bne",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PROFESION",
    "2": "I-PROFESION"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-PROFESION": 1,
    "I-PROFESION": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.075115,0.0,0.0,0.0,0.987486
2,No log,0.075548,0.0,0.0,0.0,0.987486
3,0.075900,0.075608,0.0,0.0,0.0,0.987486
4,0.075900,0.075224,0.0,0.0,0.0,0.987486
5,0.075900,0.075273,0.0,0.0,0.0,0.987486
6,0.070700,0.075172,0.0,0.0,0.0,0.987486


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forw

TrainOutput(global_step=1038, training_loss=0.07368411103654689, metrics={'train_runtime': 474.755, 'train_samples_per_second': 69.838, 'train_steps_per_second': 2.186, 'total_flos': 1014024308066640.0, 'train_loss': 0.07368411103654689, 'epoch': 6.0})

# Training Double

In [None]:
import wandb

from transformers import  RobertaForTokenClassification, AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer

# The "double" training set is not built by the data cells above (its loading
# and tokenising lines are commented out), so this cell used to stop with a
# NameError after a W&B run had already been opened. Build the set here when it
# is missing, before any run is started; a missing file now fails early with a
# clear FileNotFoundError.
if "train_tokenized_datasets_do" not in globals():
    training_data_do = read_tsv_dataset('training_do_mr.tsv')
    train_dataset_do = Dataset.from_pandas(training_data_do)
    train_tokenized_datasets_do = train_dataset_do.map(tokenize_and_align_labels, batched=True)

wandb.init(project="profner_mr", entity="pcalleja")

# Start again from the pretrained checkpoint so this run is independent of the previous ones.
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(labels_list), id2label = id2label, label2id = label2id)

# Run configuration: 6 epochs, 32 examples per device for training and evaluation.
batch_size = 32
epochs= 6
args = TrainingArguments(
    "profner_model_do",
    # Score on the evaluation set after every epoch; never write checkpoints.
    evaluation_strategy = "epoch",
    save_strategy="no",

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,

    weight_decay=1e-5,
    learning_rate=1e-4,
    #fp16=True,

    optim="adamw_torch",

    # Send the training logs to the W&B run opened above.
    report_to="wandb"

)

data_collator = DataCollatorForTokenClassification(tokenizer)
# compute_metrics reads the module-level `metric`.
metric = load_metric("seqeval")

trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets_do,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Training Original

In [None]:
import wandb
from transformers import AutoModelForTokenClassification, RobertaForTokenClassification
from transformers import Trainer, TrainingArguments
#wandb.init(project="profner_mr", entity="pcalleja")

# Start again from the pretrained checkpoint so this run is independent of the previous ones.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels_list),
    id2label=id2label,
    label2id=label2id,
)

# Run configuration: 6 epochs, 32 examples per device for training and evaluation.
# NOTE(review): the recorded output of this cell shows precision / recall / F1
# of 0.0 on every epoch, i.e. no entity is ever predicted - the learning rate
# may be too high for this run; confirm before relying on these results.
batch_size = 32
epochs = 6

args = TrainingArguments(
    output_dir="profner_model_or",
    # Score on the evaluation set after every epoch; never write checkpoints.
    evaluation_strategy="epoch",
    save_strategy="no",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=1e-5,
    learning_rate=1e-4,
    optim="adamw_torch",
    # Left disabled in this run: fp16=True, report_to="wandb"
)

# compute_metrics reads the module-level `metric`, so bind it before training.
metric = load_metric("seqeval")
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets_or,
    eval_dataset=test_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--PlanTL-GOB-ES--roberta-base-bne/snapshots/6f11dfab050340aebc487ccf58ce349b8f88bc67/config.json
Model config RobertaConfig {
  "_name_or_path": "PlanTL-GOB-ES/roberta-base-bne",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PROFESION",
    "2": "I-PROFESION"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-PROFESION": 1,
    "I-PROFESION": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.075395,0.0,0.0,0.0,0.987486
2,0.077200,0.075362,0.0,0.0,0.0,0.987486
3,0.067400,0.073883,0.0,0.0,0.0,0.987486
4,0.067400,0.068381,0.0,0.0,0.0,0.987486
5,0.067300,0.07122,0.0,0.0,0.0,0.987486
6,0.069500,0.071621,0.0,0.0,0.0,0.987486


The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 3693
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `RobertaForTokenClassification.forw

TrainOutput(global_step=2076, training_loss=0.07034020387138705, metrics={'train_runtime': 851.4921, 'train_samples_per_second': 77.877, 'train_steps_per_second': 2.438, 'total_flos': 2005858605003120.0, 'train_loss': 0.07034020387138705, 'epoch': 6.0})

In [None]:
# Re-apply clean_data to the original training frame.
# NOTE(review): train_dataset_or / train_tokenized_datasets_or were built from
# this frame in earlier cells and are not refreshed by this call.
training_data_or =clean_data(training_data_or)

In [None]:
# Inspect the first tokenised training example (input ids, mask and aligned labels).
train_tokenized_datasets_or[0]

In [None]:
!CUDA_LAUNCH_BLOCKING=1



In [None]:
# Delete the local `profner_model` directory.
# NOTE(review): the training cells write to profner_model_10 / _30 / _50 / _do /
# _or, none to `profner_model` - confirm that this is the directory meant here.
!rm -r profner_model

In [None]:
# Evaluate the most recently trained model on the evaluation set and keep the
# metrics: the result used to be discarded because the call was not the last
# expression of the cell.
eval_metrics = trainer.evaluate()

# Persist the model in ./profner1, the directory the inference cell loads from.
trainer.save_model('profner1')

eval_metrics

In [None]:
!pip install numba

from numba import cuda
device = cuda.get_current_device()
device.reset()

# Saving

In [None]:
!pip install pyocclient -q


  Building wheel for pyocclient (setup.py) ... [?25l[?25hdone


In [None]:
import os
from getpass import getpass

import owncloud

# Credentials must never be written into the notebook. They are read from the
# OWNCLOUD_USER / OWNCLOUD_PASSWORD environment variables, falling back to an
# interactive prompt (the password prompt does not echo).
# NOTE(review): the passwords that used to be hard-coded in this cell have been
# exposed and should be changed.
oc = owncloud.Client('https://delicias.dia.fi.upm.es/nextcloud/')
oc_user = os.environ.get('OWNCLOUD_USER') or input('Nextcloud user: ')
oc_password = os.environ.get('OWNCLOUD_PASSWORD') or getpass('Nextcloud password: ')
oc.login(oc_user, oc_password)

In [None]:
# Compress the ./base.model directory into base.model-5.zip for upload.
# NOTE(review): no visible cell creates ./base.model (the model is saved to
# 'profner1') - confirm that this is the directory that should be archived.
!zip -r ./base.model-5.zip ./base.model


In [None]:
# Upload the archive to the Nextcloud account logged in above.
# NOTE(review): presumably the arguments are (remote path, local file); note the
# names differ ('base-model-5.zip' vs 'base.model-5.zip') - confirm it is intended.
oc.put_file('base-model-5.zip', 'base.model-5.zip')


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Reload the tokenizer and model from the 'profner1' directory; this rebinds
# the module-level `tokenizer` and `model` used by the training cells.
tokenizer = AutoTokenizer.from_pretrained("profner1")
model = AutoModelForTokenClassification.from_pretrained("profner1")

# Token-classification pipeline used for the quick manual checks below.
nlp = pipeline("ner", model=model, tokenizer=tokenizer)


In [None]:
# Smoke test on a Spanish sentence that mentions professions.
example = "hola conductores de ambulancia y viva la guardia civil"

# NOTE(review): the recorded output is an empty list, i.e. no entity was
# detected - presumably because the saved model labels every token as 'O'.
ner_results = nlp(example)
print(ner_results)

[]


In [None]:
# Second manual check, this time on an English sentence with an organisation name.
nlp('Regarding Mossack Fonseca S.A.')