## Imports

In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 21.7 MB/s eta 0:00:01[K     |██                              | 20 kB 25.3 MB/s eta 0:00:01[K     |███                             | 30 kB 15.6 MB/s eta 0:00:01[K     |████                            | 40 kB 7.1 MB/s eta 0:00:01[K     |█████                           | 51 kB 4.3 MB/s eta 0:00:01[K     |██████                          | 61 kB 5.1 MB/s eta 0:00:01[K     |███████                         | 71 kB 5.8 MB/s eta 0:00:01[K     |████████                        | 81 kB 6.5 MB/s eta 0:00:01[K     |█████████                       | 92 kB 7.2 MB/s eta 0:00:01[K     |██████████                      | 102 kB 7.9 MB/s eta 0:00:01[K     |███████████                     | 112 kB 7.9 MB/s eta 0:00:01[K     |████████████                    | 122 kB 7.9 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 7.9 MB/s eta 0:00:01

In [2]:
from typing import List, Tuple
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

import datasets
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

from torch import cuda


## define helper functions

In [3]:
def get_balanced_set(df):
    """
    generate dataframe where labels are equally distributed (over-sampling)
    """
    max_size = df['se_label'].value_counts().max()
    balanced_list = [df]
    for class_index, group in df.groupby('se_label'):
        balanced_list.append(group.sample(max_size-len(group), replace=True))
    return pd.concat(balanced_list)

In [4]:
def tokenize(sentences): 
    """
    tokenize all tokens in the tokens column in each sentence
    """
    tokenized_inputs = tokenizer(sentences["tokenized_text"], truncation=True, is_split_into_words=False, max_length=512,)
    return tokenized_inputs


In [5]:
# Roberta
"""
def tokenize(sentences): 
    #tokenize all tokens in the tokens column in each sentence
    tokenized_inputs = tokenizer(sentences["tokenized_text"], truncation=True, is_split_into_words=True, max_length=512, add_prefix_space=True)
    return tokenized_inputs
"""


'\ndef tokenize(sentences): \n    #tokenize all tokens in the tokens column in each sentence\n    tokenized_inputs = tokenizer(sentences["tokenized_text"], truncation=True, is_split_into_words=True, max_length=512, add_prefix_space=True)\n    return tokenized_inputs\n'

In [6]:

metric = load_metric("accuracy", "f1")

def compute_metrics(model_predictions):
    """
    compute metrics during training with Trainer
    """
    pred, labels = model_predictions
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [7]:
def get_tokenized_dataset(df):
    """
    convert pandas dataframe into dataset object
    """
    #df.se_label = df.se_label.map(lambda x:[x]) # convert values in rows to list of values: 3 -> [3]
    #df.tokenized_text = df.tokenized_text.map(lambda x:[x])
    dataset = datasets.Dataset.from_pandas(df)
    dataset = dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
    dataset = dataset.rename_column("se_label", "labels") # prevent model key mismatch
    dataset = dataset.rename_column("tokenized_text", "inputs")
    return dataset

## define finetuning function

In [8]:
def get_finetuned_model(model_name:str, train_df:pd.DataFrame):
  """
  generate a training dataset and finetune the specify model on it. Returns the instantiated trainer for evaluation purposes
  """
  
  data_collator = DataCollatorWithPadding(tokenizer)
  train_dataset = get_tokenized_dataset(train_df)

  training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )
  
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
  model.to(device)

  train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )
  
  trainer.train()
  trainer.save_model()
  
  return trainer

## load & preprocess data

In [9]:
train_path = '/content/drive/MyDrive/Colab Notebooks/master/es_ancora-ud-train.txt'
test_path = '/content/drive/MyDrive/Colab Notebooks/master/es_ancora-ud-test.txt'
dev_path = '/content/drive/MyDrive/Colab Notebooks/master/es_ancora-ud-dev.txt'
pool_path = '/content/drive/MyDrive/Colab Notebooks/master/sp_text_se_corpus.txt'

train = pd.read_csv(train_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) 
train.drop(columns=['text'], inplace = True)
dev = pd.read_csv(dev_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) # colab
dev.drop(columns=['text'], inplace = True)
test = pd.read_csv(test_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) # colab
test.drop(columns=['text'], inplace = True)

se_corpus = pd.concat([train, dev, test]) # because the data was fileted for 'se' the data splits are not reliable anymore
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'flat')].index)
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'fixed')].index)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(se_corpus.tokenized_text.values, se_corpus.se_label.values,test_size=0.2,random_state=2022, stratify=se_corpus.se_label.values)

train_df = pd.DataFrame(data={'tokenized_text': X_train, 'se_label': y_train}) # rebuild df
test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})

balanced_train_df = get_balanced_set(train_df) # create oversampled training set

print('shape of the data set splits:\n',balanced_train_df.shape, test_df.shape)
print(balanced_train_df.head(3))

shape of the data set splits:
 (8345, 2) (850, 2)
                                      tokenized_text   se_label
0  La sentencia pretende ser un medio de presión ...  expl:pass
1  Con la misma humildad , prudencia y obsesión p...    expl:pv
2  La caída de la corona frente a el dólar no pre...  expl:pass


In [11]:
le = LabelEncoder()
balanced_train_df['se_label'] = le.fit_transform(balanced_train_df.se_label.values)
test_df['se_label'] = le.transform(test_df.se_label.values)


## model instantiation and finetuning

In [12]:
# define the model names
bert_name = "bert-base-cased"
multilingual_name = "bert-base-multilingual-cased"
spanish_name = "PlanTL-GOB-ES/roberta-base-bne" # cased per default

In [13]:
output_dir = '/content/drive/MyDrive/Colab Notebooks/master'
num_labels = len(le.classes_)
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


### 1: Bert

In [14]:
model_output_dir = output_dir+'/bert'

#### finetuning

In [15]:
tokenizer = AutoTokenizer.from_pretrained(bert_name)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [16]:
instantiated_trainer = get_finetuned_model(bert_name, balanced_train_df)

  0%|          | 0/9 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3141,0.700199,0.718563,0.683174,0.718563,0.688148
2,0.5302,0.39694,0.827545,0.817352,0.827545,0.815507
3,0.3137,0.346164,0.858683,0.85033,0.858683,0.852962


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 835
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertF

In [17]:
# for evaluation (with currently set tokenizer)
bert_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [18]:
# load the saved finetuned model
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/bert/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/bert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation)

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [19]:
predictions = instantiated_trainer.predict(bert_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 850
  Batch size = 16


In [20]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

 expl:impers       0.32      0.30      0.31        47
   expl:pass       0.51      0.50      0.51       295
     expl:pv       0.62      0.62      0.62       419
        iobj       0.11      0.18      0.14        22
         obj       0.29      0.28      0.29        67

    accuracy                           0.52       850
   macro avg       0.37      0.38      0.37       850
weighted avg       0.53      0.52      0.52       850



#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [None]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
#pool.tokenized_text = pool.tokenized_text.map(lambda x:[x])
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

In [None]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(second_pool=le.inverse_transform(preds)) # add labels
labeled_pool.head()

In [None]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/bert_labeled_pool.csv', sep='\t')

### 2: multilingual

In [22]:
model_output_dir = output_dir+'/multilingual'

#### finetuning

In [23]:
# rebuild data (has been modified through get_tokenized_dataset())
balanced_train_df = get_balanced_set(train_df)
balanced_train_df['se_label'] = le.transform(balanced_train_df.se_label.values)

test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})
test_df['se_label'] = le.transform(test_df.se_label.values)

In [24]:
tokenizer = AutoTokenizer.from_pretrained(multilingual_name)

https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp6l6eki7t


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpbbv6gmlj


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
creating metadata file for /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidde

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
creating metadata file for /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpj402q_0x


Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449d18dc24
creating metadata file for /root/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449d18dc24
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449

In [25]:
instantiated_trainer = get_finetuned_model(multilingual_name, balanced_train_df)

  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },


Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
creating metadata file for /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
loading weights file https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.b

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.7558,0.276326,0.908982,0.916254,0.908982,0.904367
2,0.1581,0.144232,0.960479,0.960145,0.960479,0.960037
3,0.0562,0.170252,0.965269,0.96513,0.965269,0.964796


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 835
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't

In [26]:
# for evaluation (with currently set tokenizer)
multilingual_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/multilingual/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/multilingual",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_h

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation without the previous fine-tuning step)

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(multilingual_name)
multilingual_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [28]:
predictions = instantiated_trainer.predict(multilingual_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 850
  Batch size = 16


In [29]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

 expl:impers       0.64      0.85      0.73        33
   expl:pass       0.86      0.80      0.83       309
     expl:pv       0.89      0.85      0.87       437
        iobj       0.49      0.69      0.57        26
         obj       0.54      0.78      0.64        45

    accuracy                           0.82       850
   macro avg       0.68      0.79      0.73       850
weighted avg       0.84      0.82      0.83       850



#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [38]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
#pool.tokenized_text = pool.tokenized_text.map(lambda x:[x])
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

  0%|          | 0/13 [00:00<?, ?ba/s]

In [44]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 12579
  Batch size = 16


In [45]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(second_pool=le.inverse_transform(preds)) # add labels
labeled_pool.head()

Unnamed: 0,tokenized_text,second_pool
0,"Los cargos fueron 138 , pero esa suma se incre...",expl:pv
1,Viernes 2 de Agosto a las 09:00 : Congreso Pro...,expl:pv
2,Weretilneck anunció que se reforzará la seguri...,expl:pv
3,"- El gobernador de Río Negro , Alberto Weretil...",expl:pv
4,Paralelamente se tramitará la construcción de ...,expl:pv


In [46]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/multilingual_labeled_pool.csv', sep='\t')

### 3: spanish

In [30]:
model_output_dir = output_dir+'/spanish'

#### finetuning

In [31]:
# rebuild data
balanced_train_df = get_balanced_set(train_df)
balanced_train_df['se_label'] = le.transform(balanced_train_df.se_label.values)

test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})
test_df['se_label'] = le.transform(test_df.se_label.values)

In [32]:
# tokenizer = AutoTokenizer.from_pretrained(spanish_name, add_prefix_space=True, use_fast=False)
tokenizer = AutoTokenizer.from_pretrained(spanish_name)

https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpuc5aq5qk


Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/5f84f2820e0dfd8c4a7b776a5ba9c4ad1893b40d9b41af518e6621648648a633.d8a7d006294d83173a76ac51a95b5a8470bbbc87c93c63633eaf9476656ed660
creating metadata file for /root/.cache/huggingface/transformers/5f84f2820e0dfd8c4a7b776a5ba9c4ad1893b40d9b41af518e6621648648a633.d8a7d006294d83173a76ac51a95b5a8470bbbc87c93c63633eaf9476656ed660
https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpdwgxpo8o


Downloading:   0%|          | 0.00/613 [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/9559bd682b1ae9cf25eb8bed5a9ab64b481da43e670debc0b519981fea4afd13.33b0b03a5bf5e640494a22a3aa4909c661effc0fa0e186b1513b17d9b058ca59
creating metadata file for /root/.cache/huggingface/transformers/9559bd682b1ae9cf25eb8bed5a9ab64b481da43e670debc0b519981fea4afd13.33b0b03a5bf5e640494a22a3aa4909c661effc0fa0e186b1513b17d9b058ca59
loading configuration file https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/9559bd682b1ae9cf25eb8bed5a9ab64b481da43e670debc0b519981fea4afd13.33b0b03a5bf5e640494a22a3aa4909c661effc0fa0e186b1513b17d9b058ca59
Model config RobertaConfig {
  "_name_or_path": "PlanTL-GOB-ES/roberta-base-bne",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gra

Downloading:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/29e9e9b32d49471e6270f83399af38178f2b21c4b221c746c5a844a40d40fd5b.26eadee3bbe78c0682ce89a698fbb1698a0eee50c36cf83be2280a0f2a7b23c1
creating metadata file for /root/.cache/huggingface/transformers/29e9e9b32d49471e6270f83399af38178f2b21c4b221c746c5a844a40d40fd5b.26eadee3bbe78c0682ce89a698fbb1698a0eee50c36cf83be2280a0f2a7b23c1
https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpvdro4_xs


Downloading:   0%|          | 0.00/497k [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/33c2651926c588e986e1467740986ce4dfe7b086fc7d8ce6a5aeb48781dee97a.0d24ae8bd5fabb1f5020f91bc602cefeb5a2938ab77e21769d28776345634b23
creating metadata file for /root/.cache/huggingface/transformers/33c2651926c588e986e1467740986ce4dfe7b086fc7d8ce6a5aeb48781dee97a.0d24ae8bd5fabb1f5020f91bc602cefeb5a2938ab77e21769d28776345634b23
https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpyc0knbua


Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/43dd0ef46be9435d2e263d4cd2c9a77e429d3771c9ed2f01dcb0505b4e3b6f46.bd775ba884c9e650b58a3a333a97e47c8d1b9d37cdbe19b22fb04b1e41beb19d
creating metadata file for /root/.cache/huggingface/transformers/43dd0ef46be9435d2e263d4cd2c9a77e429d3771c9ed2f01dcb0505b4e3b6f46.bd775ba884c9e650b58a3a333a97e47c8d1b9d37cdbe19b22fb04b1e41beb19d
https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpov9z35y_


Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/5751a892d96bece2932abbd3d21fdbd31d3d3ac7294f549557ead0c643243a6d.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0
creating metadata file for /root/.cache/huggingface/transformers/5751a892d96bece2932abbd3d21fdbd31d3d3ac7294f549557ead0c643243a6d.cb2244924ab24d706b02fd7fcedaea4531566537687a539ebb94db511fd122a0
loading file https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/29e9e9b32d49471e6270f83399af38178f2b21c4b221c746c5a844a40d40fd5b.26eadee3bbe78c0682ce89a698fbb1698a0eee50c36cf83be2280a0f2a7b23c1
loading file https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/33c2651926c588e986e1467740986ce4dfe7b086fc7d8ce6a5aeb48781dee97a.0d24ae8bd5fabb1f5020f91bc602cefeb5a2938ab77e2

In [33]:
get_finetuned_model(spanish_name, balanced_train_df)

  0%|          | 0/9 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/9559bd682b1ae9cf25eb8bed5a9ab64b481da43e670debc0b519981fea4afd13.33b0b03a5bf5e640494a22a3aa4909c661effc0fa0e186b1513b17d9b058ca59
Model config RobertaConfig {
  "_name_or_path": "PlanTL-GOB-ES/roberta-base-bne",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

storing https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/7fe257df6064e5fd34555f4aa8cae121eea8f5945d21cc3873956123f8484ef3.c86d60e89da68465cb73e129befe8209faa3ac57b9aa272b87db45ba1f619582
creating metadata file for /root/.cache/huggingface/transformers/7fe257df6064e5fd34555f4aa8cae121eea8f5945d21cc3873956123f8484ef3.c86d60e89da68465cb73e129befe8209faa3ac57b9aa272b87db45ba1f619582
loading weights file https://huggingface.co/PlanTL-GOB-ES/roberta-base-bne/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/7fe257df6064e5fd34555f4aa8cae121eea8f5945d21cc3873956123f8484ef3.c86d60e89da68465cb73e129befe8209faa3ac57b9aa272b87db45ba1f619582
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_nor

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5506,0.148529,0.964072,0.963785,0.964072,0.963762
2,0.0487,0.125948,0.973653,0.973394,0.973653,0.973419
3,0.0114,0.132512,0.971257,0.971008,0.971257,0.970971


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 835
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a correspondi

<transformers.trainer.Trainer at 0x7f2ac18c40d0>

In [34]:
# for evaluation (with currently set tokenizer)
spanish_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [35]:
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/spanish/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/spanish",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "si

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation)

In [None]:
# execute tokenizer cell for Roberta

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(spanish_name, add_prefix_space=True, use_fast=False)
spanish_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=42, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [36]:
predictions = instantiated_trainer.predict(spanish_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 850
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

 expl:impers       0.00      0.00      0.00         7
   expl:pass       0.10      0.45      0.16        65
     expl:pv       0.90      0.49      0.64       759
        iobj       0.00      0.00      0.00         0
         obj       0.02      0.05      0.02        19

    accuracy                           0.48       850
   macro avg       0.20      0.20      0.16       850
weighted avg       0.81      0.48      0.58       850



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [None]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
pool.tokenized_text = pool.tokenized_text.map(lambda x:[x])
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

In [None]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(second_pool=le.inverse_transform(preds)) # add labels
labeled_pool.head()

In [None]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/spanish_labeled_pool.csv', sep='\t')