In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
import numpy as np

In [3]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [4]:
import evaluate

#### Import and split

- Creates a sample of the full dataset to reduce the time it takes. (TODO)

- Uses Pandas DataFrames to split the data into train, validation and test using train_test_split() from sklearn and its stratify parameter for preserving class proportions.

- Converts DataFrames to Datasets and renames columns to aling data structure to be able to use Hugging Face's pre-train model.

In [5]:
df_input = pd.read_csv('/content/drive/MyDrive/train.csv')

In [6]:
n_sample = 1000
min_samples = 3

# Sample reproducibly
df_sample = df_input.sample(n=n_sample, random_state=42)

# Filter spanish since it will use pre-trained BETO Model
df_sample = df_sample[df_sample['language'] == 'spanish']

df_sample = df_sample.drop(["language", "label_quality"], axis = 1)

# Keep only categories with at least `min_samples`
valid_categories = df_sample['category'].value_counts()
valid_categories = valid_categories[valid_categories >= min_samples].index
df_sample = df_sample[df_sample['category'].isin(valid_categories)]

df_sample = df_sample.rename(columns={'title': 'text', 'category': 'labels'})

In [7]:
# Split train, valid
df_train, df_valid = train_test_split(df_sample, test_size=0.3, random_state=42, stratify=df_sample['labels'])

In [8]:
# Convert DataFrame to Dataset
train_dataset = Dataset.from_pandas(df_train.reset_index(drop=True))
val_dataset = Dataset.from_pandas(df_valid.reset_index(drop=True))

In [9]:
len(train_dataset), len(val_dataset)

(37, 16)

#### Preprocess

- Load a BETO cased tokenizer to preprocess the text field.

- Tokenize, pad, and truncate for training.

- Create a map of the expected ids to their labels.

- Use map to preprocess the entire dataset in one step.

- Use Accuracy metric from Evaluate library to evaluate model's performance during training.

- TODO: Check if I should use another metric.

In [10]:
# Create a mapping from category names to ids
label_to_id = {label: id for id, label in enumerate(train_dataset.unique('labels'))}
id_to_label = {id : label for label, id in label_to_id.items()}

In [11]:
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased',
    num_labels = len(train_dataset.unique('labels')),
    id2label = id_to_label,
    label2id = label_to_id)

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # convert the logits to their predicted class
    predictions = np.argmax(logits, axis=-1) # TODO: axis = -1 ?
    # Use the mapped labels for computation
    return accuracy.compute(predictions=predictions, references=labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [12]:
# Map string labels to integers
def map_labels_to_ids(examples):
    return {'labels': label_to_id[examples['labels']]}

train_dataset = train_dataset.map(preprocess_function)
val_dataset = val_dataset.map(preprocess_function)
# test_dataset = test_dataset.map(preprocess_function)

train_dataset = train_dataset.map(map_labels_to_ids)
val_dataset = val_dataset.map(map_labels_to_ids)
# test_dataset = test_dataset.map(map_labels_to_ids)

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [13]:
# Display the first tokenized example from the train_dataset
print(train_dataset[0])

{'text': 'Pantalon Basilotta Vivo Tiro Medio', 'labels': 0, 'input_ids': [4, 5475, 8571, 30935, 4018, 3915, 12101, 19316, 28079, 5549, 5], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


#### Create Model and Train

- Define your training hyperparameters in TrainingArguments. At the end of each epoch, the Trainer will evaluate the accuracy and save the training checkpoint.

- Pass the training arguments to Trainer along with the model, dataset and compute_metrics function.

- TODO: Use datacollator().

  - WHY IS TOKENIZER NOT USED in this example? How does it work within the Training?

- Call train() to finetune your model.

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="beto_model_2",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.781546,0.0625
2,No log,2.765717,0.125
3,No log,2.748746,0.125
4,No log,2.736283,0.1875
5,No log,2.731673,0.1875


TrainOutput(global_step=10, training_loss=2.2510494232177733, metrics={'train_runtime': 377.4268, 'train_samples_per_second': 0.49, 'train_steps_per_second': 0.026, 'total_flos': 2932104010080.0, 'train_loss': 2.2510494232177733, 'epoch': 5.0})

In [None]:
trainer.save_model("beto_model_3")

#### Use the model with test dataset