## Imports

In [1]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 4.9 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 66.3 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 57.1 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.9 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 61.3 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25

In [2]:
from typing import List, Tuple
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

import datasets
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

from torch import cuda


## define helper functions

In [3]:
def get_balanced_set(df):
    """
    generate dataframe where labels are equally distributed (over-sampling)
    """
    max_size = df['se_label'].value_counts().max()
    balanced_list = [df]
    for class_index, group in df.groupby('se_label'):
        balanced_list.append(group.sample(max_size-len(group), replace=True))
    return pd.concat(balanced_list)

In [4]:
def tokenize(sentences): 
    """
    tokenize all tokens in the tokens column in each sentence
    """
    tokenized_inputs = tokenizer(sentences["tokenized_text"], truncation=True, is_split_into_words=False, max_length=512,)
    return tokenized_inputs


In [5]:

metric = load_metric("accuracy", "f1")

def compute_metrics(model_predictions):
    """
    compute metrics during training with Trainer
    """
    pred, labels = model_predictions
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [6]:
def get_tokenized_dataset(df):
    """
    convert pandas dataframe into dataset object
    """
    #df.se_label = df.se_label.map(lambda x:[x]) # convert values in rows to list of values: 3 -> [3]
    #df.tokenized_text = df.tokenized_text.map(lambda x:[x])
    dataset = datasets.Dataset.from_pandas(df)
    dataset = dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
    dataset = dataset.rename_column("se_label", "labels") # prevent model key mismatch
    dataset = dataset.rename_column("tokenized_text", "inputs")
    return dataset

## define finetuning function

In [7]:
def get_finetuned_model(model_name:str, train_df:pd.DataFrame):
  """
  generate a training dataset and finetune the specify model on it. Returns the instantiated trainer for evaluation purposes
  """
  
  data_collator = DataCollatorWithPadding(tokenizer)
  train_dataset = get_tokenized_dataset(train_df)

  training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )
  
  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
  model.to(device)

  train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )
  
  trainer.train()
  trainer.save_model()
  
  return trainer

## load & preprocess data

In [30]:
train_path = '/content/drive/MyDrive/Colab Notebooks/master/pt_bosque-ud-train.txt'
test_path = '/content/drive/MyDrive/Colab Notebooks/master/pt_bosque-ud-test.txt'
dev_path = '/content/drive/MyDrive/Colab Notebooks/master/pt_bosque-ud-dev.txt'
pool_path = '/content/drive/MyDrive/Colab Notebooks/master/pt_text_se_corpus.txt'

train = pd.read_csv(train_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) 
train.drop(columns=['text'], inplace = True)
dev = pd.read_csv(dev_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) # colab
dev.drop(columns=['text'], inplace = True)
test = pd.read_csv(test_path, sep='\t', names=['text', 'tokenized_text', 'se_label']) # colab
test.drop(columns=['text'], inplace = True)

se_corpus = pd.concat([train, dev, test]) # because the data was fileted for 'se' the data splits are not reliable anymore
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'case')].index)
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'nmod')].index)
se_corpus = se_corpus.drop(se_corpus[(se_corpus['se_label'] == 'expl:pass')].index)


In [31]:
X_train, X_test, y_train, y_test = train_test_split(se_corpus.tokenized_text.values, se_corpus.se_label.values,test_size=0.2,random_state=2022, stratify=se_corpus.se_label.values)

train_df = pd.DataFrame(data={'tokenized_text': X_train, 'se_label': y_train}) # rebuild df
test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})

balanced_train_df = get_balanced_set(train_df) # create oversampled training set

print('shape of the data set splits:\n',balanced_train_df.shape, test_df.shape)
print(balanced_train_df.head(3))

shape of the data set splits:
 (3696, 2) (251, 2)
                                      tokenized_text se_label
0  Se sentir que há um desejo coletivo , ele vai ...     mark
1  Esperam se , agora , as oportunas reportagens ...     expl
2  Para se ter uma idéia , em o Brasil a melhor m...    nsubj


In [33]:
le = LabelEncoder()
balanced_train_df['se_label'] = le.fit_transform(balanced_train_df.se_label.values)
test_df['se_label'] = le.transform(test_df.se_label.values)


## model instantiation and finetuning

In [13]:
# define the model names
bert_name = "bert-base-cased"
multilingual_name = "bert-base-multilingual-cased"
portuguese_name = "neuralmind/bert-base-portuguese-cased"

In [14]:
output_dir = '/content/drive/MyDrive/Colab Notebooks/master'
num_labels = len(le.classes_)
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


### 1: Bert

In [15]:
model_output_dir = output_dir+'/bert'

#### finetuning

In [34]:
tokenizer = AutoTokenizer.from_pretrained(bert_name)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve/main/voc

In [35]:
instantiated_trainer = get_finetuned_model(bert_name, balanced_train_df)

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
   

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.3117,0.6503,0.832432,0.819443,0.832432,0.820576
2,0.4447,0.280266,0.921622,0.920065,0.921622,0.919121
3,0.2171,0.233255,0.932432,0.936543,0.932432,0.927688


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 370
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 370
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/bert/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/ma

In [36]:
# for evaluation (with currently set tokenizer)
bert_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [37]:
# load the saved finetuned model
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/bert/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/bert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation)

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [39]:
predictions = instantiated_trainer.predict(bert_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 251
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

        expl       0.68      0.76      0.72       140
       fixed       0.00      0.00      0.00         0
        iobj       0.00      0.00      0.00         0
        mark       0.61      0.79      0.69        24
       nsubj       0.42      0.31      0.36        58
         obj       0.22      0.14      0.17        29

    accuracy                           0.59       251
   macro avg       0.32      0.33      0.32       251
weighted avg       0.56      0.59      0.57       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [None]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
#pool.tokenized_text = pool.tokenized_text.map(lambda x:[x])
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

In [None]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(se_label=le.inverse_transform(preds)) # add labels
labeled_pool.head()

In [None]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/bert_labeled_pool.csv', header=False, sep='\t')

### 2: multilingual

In [41]:
model_output_dir = output_dir+'/multilingual'

#### finetuning

In [42]:
# rebuild data (has been modified through get_tokenized_dataset())
balanced_train_df = get_balanced_set(train_df)
balanced_train_df['se_label'] = le.transform(balanced_train_df.se_label.values)

test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})
test_df['se_label'] = le.transform(test_df.se_label.values)

In [43]:
tokenizer = AutoTokenizer.from_pretrained(multilingual_name)

https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp1rf_xban


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
creating metadata file for /root/.cache/huggingface/transformers/f55e7a2ad4f8d0fff2733b3f79777e1e99247f2e4583703e92ce74453af8c235.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f
https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpyzo_zydm


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
creating metadata file for /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidde

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
creating metadata file for /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpnhu9nr8s


Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449d18dc24
creating metadata file for /root/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449d18dc24
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/eff018e45de5364a8368df1f2df3461d506e2a111e9dd50af1fae061cd460ead.6c5b6600e968f4b5e08c86d8891ea99e51537fc2bf251435fb46922e8f7a7b29
loading file https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/46880f3b0081fda494a4e15b05787692aa4c1e21e0ff2428ba8b14d4eda0784d.b33e51591f94f17c238ee9b1fac75b96ff2678cbaed6e108feadb3449

In [44]:
instantiated_trainer = get_finetuned_model(multilingual_name, balanced_train_df)

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
creating metadata file for /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
loading weights file https://huggingface.co/bert-base-multilingual-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/0a3fd51713dcbb4def175c7f85bddc995d5976ce1dde327f99104e4d33069f17.aa7be4c79d76f4066d9b354496ea477c9ee39c5d889156dd1efb680643c2b052
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0196,0.30835,0.902703,0.908571,0.902703,0.903754
2,0.1693,0.124942,0.967568,0.968865,0.967568,0.967375
3,0.0489,0.116843,0.972973,0.973656,0.972973,0.972582


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 370
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 370
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/multilingual/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Co

In [45]:
# for evaluation (with currently set tokenizer)
multilingual_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [46]:
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/multilingual/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/multilingual",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation without the previous fine-tuning step)

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(multilingual_name)
multilingual_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=2022, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [47]:
predictions = instantiated_trainer.predict(multilingual_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 251
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

        expl       0.90      0.84      0.87       165
       fixed       1.00      1.00      1.00         2
        iobj       0.00      0.00      0.00         0
        mark       0.77      0.89      0.83        27
       nsubj       0.77      0.72      0.74        46
         obj       0.06      0.09      0.07        11

    accuracy                           0.79       251
   macro avg       0.58      0.59      0.58       251
weighted avg       0.82      0.79      0.81       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [None]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

In [None]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(se_label=le.inverse_transform(preds)) # add labels
labeled_pool.head()

In [None]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/multilingual_labeled_pool.csv', header=False, sep='\t')

### 3: portuguese

In [49]:
model_output_dir = output_dir+'/portuguese'

#### finetuning

In [50]:
# rebuild data
balanced_train_df = get_balanced_set(train_df)
balanced_train_df['se_label'] = le.transform(balanced_train_df.se_label.values)

test_df = pd.DataFrame(data={'tokenized_text': X_test, 'se_label': y_test})
test_df['se_label'] = le.transform(test_df.se_label.values)

In [51]:
tokenizer = AutoTokenizer.from_pretrained(portuguese_name)

https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpt7khriqh


Downloading:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

storing https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/f1a9ba41d40e8c6f5ba4988aa2f7702c3b43768183e4b82483e04f2848841ecf.a6c00251b9344c189e2419373d6033016d0cd3d87ea59f6c86069046ac81956d
creating metadata file for /root/.cache/huggingface/transformers/f1a9ba41d40e8c6f5ba4988aa2f7702c3b43768183e4b82483e04f2848841ecf.a6c00251b9344c189e2419373d6033016d0cd3d87ea59f6c86069046ac81956d
https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp803kkj0y


Downloading:   0%|          | 0.00/647 [00:00<?, ?B/s]

storing https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/e716e2151985ba669e7197b64cdde2552acee146494d40ffaf0688a3f152e6ed.18a0b8b86f3ebd4c8a1d8d6199178feae9971ff5420f1d12f0ed8326ffdff716
creating metadata file for /root/.cache/huggingface/transformers/e716e2151985ba669e7197b64cdde2552acee146494d40ffaf0688a3f152e6ed.18a0b8b86f3ebd4c8a1d8d6199178feae9971ff5420f1d12f0ed8326ffdff716
loading configuration file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e716e2151985ba669e7197b64cdde2552acee146494d40ffaf0688a3f152e6ed.18a0b8b86f3ebd4c8a1d8d6199178feae9971ff5420f1d12f0ed8326ffdff716
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hi

Downloading:   0%|          | 0.00/205k [00:00<?, ?B/s]

storing https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/aa6d50227b77416b26162efcf0cc9e9a702d13920840322060a2b41a44a8aff4.af25fb1e29ad0175300146695fd80069be69b211c52fa5486fa8aae2754cc814
creating metadata file for /root/.cache/huggingface/transformers/aa6d50227b77416b26162efcf0cc9e9a702d13920840322060a2b41a44a8aff4.af25fb1e29ad0175300146695fd80069be69b211c52fa5486fa8aae2754cc814
https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/added_tokens.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpu8k1qji4


Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

storing https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/added_tokens.json in cache at /root/.cache/huggingface/transformers/9188d297517828a862f4e0b0700968574ca7ad38fbc0832c409bf7a9e5576b74.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b
creating metadata file for /root/.cache/huggingface/transformers/9188d297517828a862f4e0b0700968574ca7ad38fbc0832c409bf7a9e5576b74.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b
https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp814trx0z


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/eecc45187d085a1169eed91017d358cc0e9cbdd5dc236bcd710059dbf0a2f816.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/eecc45187d085a1169eed91017d358cc0e9cbdd5dc236bcd710059dbf0a2f816.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/aa6d50227b77416b26162efcf0cc9e9a702d13920840322060a2b41a44a8aff4.af25fb1e29ad0175300146695fd80069be69b211c52fa5486fa8aae2754cc814
loading file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/added_tokens.json from cache at 

In [52]:
instantiated_trainer = get_finetuned_model(portuguese_name, balanced_train_df)

  0%|          | 0/4 [00:00<?, ?ba/s]

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/e716e2151985ba669e7197b64cdde2552acee146494d40ffaf0688a3f152e6ed.18a0b8b86f3ebd4c8a1d8d6199178feae9971ff5420f1d12f0ed8326ffdff716
Model config BertConfig {
  "_name_or_path": "neuralmind/bert-base-portuguese-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

storing https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/1e42c907c340c902923496246dae63e33f64955c529720991b7ec5543a98e442.fa492fca6dcee85bef053cc60912a211feb1f7173129e4eb1a5164e817f2f5f2
creating metadata file for /root/.cache/huggingface/transformers/1e42c907c340c902923496246dae63e33f64955c529720991b7ec5543a98e442.fa492fca6dcee85bef053cc60912a211feb1f7173129e4eb1a5164e817f2f5f2
loading weights file https://huggingface.co/neuralmind/bert-base-portuguese-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/1e42c907c340c902923496246dae63e33f64955c529720991b7ec5543a98e442.fa492fca6dcee85bef053cc60912a211feb1f7173129e4eb1a5164e817f2f5f2
Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9884,0.250228,0.927027,0.929131,0.927027,0.923655
2,0.1395,0.092002,0.972973,0.974411,0.972973,0.972597
3,0.0357,0.088482,0.975676,0.976709,0.975676,0.975368


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 370
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs, __index_level_0__. If inputs, __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 370
  Batch size = 16
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/master/portuguese/checkpoint-500
Configuration saved in /content/drive/MyDrive/Colab Notebooks/master/portuguese/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/Colab 

In [53]:
# for evaluation (with currently set tokenizer)
portuguese_test_dataset = get_tokenized_dataset(test_df)

  0%|          | 0/1 [00:00<?, ?ba/s]

#### evaluation

In [54]:
model = AutoModelForSequenceClassification.from_pretrained(model_output_dir, num_labels=num_labels)
model.to(device)

loading configuration file /content/drive/MyDrive/Colab Notebooks/master/portuguese/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Notebooks/master/portuguese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_si

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

##### if trainer not still instantiated: 

we nedd to reinstantiated a trainer to create a tokenized test dataset and to predict its labels (e.g when rerunning evaluation)

In [None]:
# create test dataset with models tokenizer
tokenizer = AutoTokenizer.from_pretrained(portuguese_name, add_prefix_space=True, use_fast=False)
portuguese_test_dataset = get_tokenized_dataset(test_df)

In [None]:
# reinstantiate a trainer on the saved finetuned model
data_collator = DataCollatorWithPadding(tokenizer)
train_dataset = get_tokenized_dataset(train_df)

training_args = TrainingArguments(
    output_dir=model_output_dir,  
    evaluation_strategy="epoch",  # for explicit evaluation during training
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    seed = 2022,
    logging_strategy = 'epoch',
    )

train_dataset_split = train_dataset.train_test_split(test_size=0.1, seed=42, )

instantiated_trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_dataset_split["train"],
      eval_dataset=train_dataset_split["test"],
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
      )

##### predict train_dataset

In [55]:
predictions = instantiated_trainer.predict(portuguese_test_dataset)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 251
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(preds, predictions.label_ids, target_names=le.classes_))

              precision    recall  f1-score   support

        expl       0.90      0.85      0.87       163
       fixed       1.00      1.00      1.00         2
        iobj       0.00      0.00      0.00         0
        mark       0.94      0.97      0.95        30
       nsubj       0.77      0.82      0.80        40
         obj       0.22      0.25      0.24        16

    accuracy                           0.82       251
   macro avg       0.64      0.65      0.64       251
weighted avg       0.84      0.82      0.83       251



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### labelling pool

label the unlabeled pool of data with the current model to use it in a downstram  (evaluation) task (only done for the model with the best accuracy on test set)

In [59]:
pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
pool_dataset = datasets.Dataset.from_pandas(pool)
pool_dataset = pool_dataset.map(tokenize, batched=True) # map function tokenize on to train_dataset['tokenized_text']
pool_dataset = pool_dataset.rename_column("tokenized_text", "inputs")

  0%|          | 0/50 [00:00<?, ?ba/s]

In [60]:
predictions = instantiated_trainer.predict(pool_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: inputs. If inputs are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 49867
  Batch size = 16


In [61]:
labeled_pool = pd.read_csv(pool_path, names=['tokenized_text'], sep='\t')
labeled_pool = labeled_pool.assign(se_label=le.inverse_transform(preds)) # add labels
labeled_pool.head()

Unnamed: 0,tokenized_text,second_pool
0,Este filme conta a história de R um jovem zomb...,expl
1,"É super divertido , leve , fácil de se entender .",nsubj
2,Agora é hora de se agarrarem a os seus sonhos ...,expl
3,Depende de vocês o quanto se importam em acend...,expl
4,Somos todos um e nós esperamos por vocês se ju...,expl


In [62]:
labeled_pool.to_csv('/content/drive/MyDrive/Colab Notebooks/master/portuguese_labeled_pool.csv', header=False, sep='\t')