# Transformer se-classification task: Spanish

maxhof905


## Imports

In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[?25l[K     |█                               | 10 kB 32.8 MB/s eta 0:00:01[K     |██                              | 20 kB 39.0 MB/s eta 0:00:01[K     |███                             | 30 kB 42.3 MB/s eta 0:00:01[K     |████                            | 40 kB 27.6 MB/s eta 0:00:01[K     |█████                           | 51 kB 21.7 MB/s eta 0:00:01[K     |██████                          | 61 kB 24.2 MB/s eta 0:00:01[K     |███████                         | 71 kB 25.8 MB/s eta 0:00:01[K     |████████                        | 81 kB 26.6 MB/s eta 0:00:01[K     |█████████                       | 92 kB 28.7 MB/s eta 0:00:01[K     |██████████                      | 102 kB 27.8 MB/s eta 0:00:01[K     |███████████                     | 112 kB 27.8 MB/s eta 0:00:01[K     |████████████                    | 122 kB 27.8 MB/s eta 0:00:01[K     |█████████████                   | 133 kB 27.8 MB/s eta

In [2]:
from typing import List, Tuple
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

import datasets
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

from torch import cuda


## define helper functions

In [3]:
def get_balanced_set(df):
    """
    generate dataframe where labels are equally distributed (over-sampling)
    """
    max_size = df['se_label'].value_counts().max()
    balanced_list = [df]
    for class_index, group in df.groupby('se_label'):
        balanced_list.append(group.sample(max_size-len(group), replace=True))
    return pd.concat(balanced_list)

In [4]:
def tokenize(sentences): 
    """
    tokenize all tokens in the tokens column in each sentence
    """
    tokenized_inputs = tokenizer(sentences["tokenized_text"], truncation=True, is_split_into_words=False, max_length=512,)
    return tokenized_inputs


In [5]:

metric = load_metric("accuracy", "f1")

def compute_metrics(model_predictions):
    """
    compute metrics during training with Trainer
    """
    pred, labels = model_predictions
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [6]:
def get_tokenized_dataset(df):
    """
    convert pandas dataframe into dataset object
    """
    dataset = datasets.Dataset.from_pandas(df)
    dataset = dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
    dataset = dataset.rename_column("se_label", "labels") # prevent model key mismatch
    dataset = dataset.rename_column("tokenized_text", "inputs")
    return dataset

## define finetuning function

In [7]:
def get_finetuned_model(model_name:str, train_df:pd.DataFrame):
  """
  generate a training dataset and finetune the specify model on it. Returns the instantiated trainer for evaluation purposes
  """
  
  data_collator = DataCollatorWithPadding(tokenizer)
  train_dataset = get_tokenized_dataset(train_df)

  training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )
  
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
  model.to(device)

  train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )
  
  trainer.train()
  trainer.save_model()
  
  return trainer

## load & preprocess data

In [10]:
train_path = '/content/drive/MyDrive/Colab Notebooks/master/es_ancora-ud-train.txt'
test_path = '/content/drive/MyDrive/Colab Notebooks/master/es_ancora-ud-test.txt'
dev_path = '/content/drive/MyDrive/Colab Notebooks/master/es_ancora-ud-dev.txt'
pool_path = '/content/drive/MyDrive/Colab Notebooks/master/sp_text_se_corpus.txt'

train = pd.read_csv(train_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) 
train.drop(columns=['text'], inplace = True)
dev = pd.read_csv(dev_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) # colab
dev.drop(columns=['text'], inplace = True)
test = pd.read_csv(test_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) # colab
test.drop(columns=['text'], inplace = True)

se_corpus = pd.concat([train, dev, test]) # because the data was fileted for 'se' the data splits are not reliable anymore
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'flat')].index)
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'fixed')].index)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(se_corpus.tokenized_text.values, se_corpus.se_label.values,test_size=0.2,random_state=2022, stratify=se_corpus.se_label.values)

train_df = pd.DataFrame(data={'tokenized_text': X_train, 'se_label': y_train}) # rebuild df
test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})

balanced_train_df = get_balanced_set(train_df) # create oversampled training set

print('shape of the data set splits:\n',balanced_train_df.shape, test_df.shape)
print(balanced_train_df.head(3))

shape of the data set splits:
 (8345, 2) (850, 2)
                                      tokenized_text   se_label
0  La sentencia pretende ser un medio de presión ...  expl:pass
1  Con la misma humildad , prudencia y obsesión p...    expl:pv
2  La caída de la corona frente a el dólar no pre...  expl:pass


In [12]:
le = LabelEncoder()
balanced_train_df['se_label'] = le.fit_transform(balanced_train_df.se_label.values)
test_df['se_label'] = le.transform(test_df.se_label.values)


## model instantiation and finetuning

In [13]:
# define the model names
spanish_name = "PlanTL-GOB-ES/roberta-base-bne"

In [14]:
output_dir = '/content/drive/MyDrive/Colab Notebooks/master'
num_labels = len(le.classes_)
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


### 3: spanish

In [15]:
model_output_dir = output_dir+'/spanish'

#### finetuning

In [31]:
# rebuild data
balanced_train_df = get_balanced_set(train_df)
balanced_train_df['se_label'] = le.transform(balanced_train_df.se_label.values)

test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})
test_df['se_label'] = le.transform(test_df.se_label.values)

In [17]:
tokenizer = AutoTokenizer.from_pretrained(spanish_name)

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/613 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/497k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [18]:
instantiated_trainer = get_finetuned_model(spanish_name, balanced_train_df)

  0%|          | 0/9 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.out_proj.weight', 'classi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.605,0.1759,0.94491,0.943871,0.94491,0.944058
2,0.0537,0.138659,0.966467,0.966961,0.966467,0.96618
3,0.0086,0.140829,0.973653,0.973533,0.973653,0.973582


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: __index_level_0__, inputs. If __index_level_0__, inputs are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 835
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/master/spanish/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a correspondi

In [19]:
# for evaluation (with currently set tokenizer)
spanish_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [20]:
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/spanish/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/spanish",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "si

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50262, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation)

In [None]:
# execute tokenizer cell for Roberta

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(spanish_name, add_prefix_space=True, use_fast=False)
spanish_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=42, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [23]:
predictions = instantiated_trainer.predict(spanish_test_dataset)

The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 850
  Batch size = 16


In [24]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

 expl:impers       0.57      0.89      0.69        28
   expl:pass       0.90      0.83      0.86       312
     expl:pv       0.92      0.82      0.87       471
        iobj       0.30      0.65      0.41        17
         obj       0.31      0.91      0.46        22

    accuracy                           0.82       850
   macro avg       0.60      0.82      0.66       850
weighted avg       0.87      0.82      0.84       850



#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [None]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
pool.tokenized_text = pool.tokenized_text.map(lambda x:[x])
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

In [None]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(se_label=le.inverse_transform(preds)) # add labels
labeled_pool.head()

In [None]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/spanish_labeled_pool.csv', header=False, sep='\t')