In [3]:
# Mount Google Drive so that files under /content/drive (e.g. the training CSV
# read later from /content/drive/MyDrive) are accessible from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import pandas as pd
from datasets import Dataset
import numpy as np

In [4]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [5]:
import evaluate

#### Import and split

- Creates a sample of the full dataset to reduce the time it takes. (TODO)

- Uses Pandas DataFrames to split the data into train, validation and test using train_test_split() from sklearn and its stratify parameter for preserving class proportions.

- Converts DataFrames to Datasets and renames columns to align the data structure with what Hugging Face's pre-trained model expects.

In [6]:
# Seed for the down-sampling step, so that "Restart & Run All" reproduces the
# same sample (and therefore the same splits and label mapping).
SAMPLE_SEED = 42

df_input = pd.read_csv('/content/drive/MyDrive/train.csv')

# Keep only the Spanish listings.
df_input = df_input[df_input['language'] == 'spanish']
# Down-sample to reduce training time and drop the columns that are not used.
# min() guards against asking for more rows than are available.
df_input_sample = (
    df_input
    .sample(n=min(70000, len(df_input)), random_state=SAMPLE_SEED)
    .drop(["language", "label_quality"], axis = 1)
)

# Drop categories with 2 or fewer rows before the stratified 80/20 split.
unique_categories = df_input_sample['category'].value_counts()
list_filtered_categories = unique_categories[unique_categories > 2].index.to_list()
df_input_filtered = df_input_sample[df_input_sample["category"].isin(list_filtered_categories)]
train_df, temp_df = train_test_split(df_input_filtered, test_size=0.2, random_state=42, stratify=df_input_filtered['category'])

# Apply the same filter to the held-out 20% before splitting it in half
# (validation / test), again stratified by category.
unique_categories_test = temp_df['category'].value_counts()
list_filtered_categories_test = unique_categories_test[unique_categories_test > 2].index.to_list()
temp_df_filtered = temp_df[temp_df["category"].isin(list_filtered_categories_test)]

val_df, test_df = train_test_split(temp_df_filtered, test_size=0.5, random_state=42, stratify=temp_df_filtered['category'])


def _to_hf_dataset(frame):
    """Convert a split DataFrame into a Dataset with `text` / `labels` columns."""
    dataset = Dataset.from_pandas(frame.reset_index(drop=True))
    return dataset.rename_column("title", "text").rename_column("category", "labels")


train_dataset = _to_hf_dataset(train_df)
val_dataset = _to_hf_dataset(val_df)
test_dataset = _to_hf_dataset(test_df)

In [7]:
# Number of examples in the train / validation / test splits
tuple(len(split) for split in (train_dataset, val_dataset, test_dataset))

(55917, 6748, 6748)

#### Preprocess

- Load a BETO cased tokenizer to preprocess the text field.

- Tokenize, pad, and truncate for training.

- Create a map of the expected ids to their labels.

- Use map to preprocess the entire dataset in one step.

- Use Accuracy metric from Evaluate library to evaluate model's performance during training.

- TODO: Check if I should use another metric.

In [8]:
# Build the category-name <-> integer-id lookup tables from the categories
# present in the training split (ids follow the order returned by unique()).
category_names = train_dataset.unique('labels')
label_to_id = dict(zip(category_names, range(len(category_names))))
id_to_label = dict(enumerate(category_names))

In [9]:
# BETO (cased) tokenizer, used to preprocess the text field.
tokenizer = AutoTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')


def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    No padding is requested here; padding is left to ``data_collator``.
    """
    encoded = tokenizer(examples["text"], truncation=True)
    return encoded


# Collator built on the same tokenizer; it is handed to the Trainer to pad batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import AutoModelForSequenceClassification

# Sequence-classification model initialised from the BETO checkpoint, with one
# output per training category and the name <-> id maps stored in its config.
model = AutoModelForSequenceClassification.from_pretrained(
    'dccuchile/bert-base-spanish-wwm-cased',
    label2id=label_to_id,
    id2label=id_to_label,
    num_labels=len(label_to_id),
)

# Accuracy metric from the Evaluate library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a ``(logits, labels)`` evaluation pair.

    The predicted class of each example is the index of its largest logit.
    ``axis=-1`` takes the argmax along the last axis, which is presumably the
    per-class axis of a (n_examples, n_classes) array.
    """
    scores, references = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=-1)
    return accuracy.compute(predictions=predicted_classes, references=references)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [10]:
# Map string labels to integers
def map_labels_to_ids(examples):
    """Replace category names in ``labels`` with their ids from ``label_to_id``.

    Accepts either a single example (``labels`` is one category name) or a
    batch (``labels`` is a list of names), so it works with ``Dataset.map``
    both with and without ``batched=True``.

    Raises:
        KeyError: if a label is not a key of ``label_to_id``.
    """
    labels = examples['labels']
    if isinstance(labels, (list, tuple)):
        return {'labels': [label_to_id[label] for label in labels]}
    return {'labels': label_to_id[labels]}


def _tokenize_and_encode(dataset):
    """Tokenize ``text`` and convert ``labels`` to ids, processing rows in batches."""
    # batched=True hands the tokenizer many rows per call instead of one at a time.
    tokenized = dataset.map(preprocess_function, batched=True)
    return tokenized.map(map_labels_to_ids, batched=True)


train_dataset = _tokenize_and_encode(train_dataset)
val_dataset = _tokenize_and_encode(val_dataset)
test_dataset = _tokenize_and_encode(test_dataset)

Map:   0%|          | 0/55917 [00:00<?, ? examples/s]

Map:   0%|          | 0/6748 [00:00<?, ? examples/s]

Map:   0%|          | 0/6748 [00:00<?, ? examples/s]

Map:   0%|          | 0/55917 [00:00<?, ? examples/s]

Map:   0%|          | 0/6748 [00:00<?, ? examples/s]

Map:   0%|          | 0/6748 [00:00<?, ? examples/s]

In [11]:
# Display the first tokenized example from the train_dataset: the original
# text, its integer label, and the tokenizer outputs
# (input_ids, token_type_ids, attention_mask).
print(train_dataset[0])

{'text': 'Hidrolavadora Industrial Autónoma (explosión) 200bar 15l/m', 'labels': 0, 'input_ids': [4, 29208, 23700, 13794, 15314, 16152, 1147, 11315, 1135, 1272, 1757, 1860, 30938, 972, 1027, 5], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


#### Create Model and Train

- Define your training hyperparameters in TrainingArguments. At the end of each epoch, the Trainer will evaluate the accuracy and save the training checkpoint.

- Pass the training arguments to Trainer along with the model, dataset and compute_metrics function.

- TODO: Use datacollator().

  - WHY IS TOKENIZER NOT USED in this example? How does it work within the Training?

- Call train() to finetune your model.

In [12]:
from transformers import TrainingArguments, Trainer

# Training hyperparameters. Evaluation and checkpointing both happen once per
# epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="beto_model_2",
    num_train_epochs=2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# The Trainer is given the model, the train/validation splits, the tokenizer,
# the padding collator and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model; the returned TrainOutput is displayed as the cell result.
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfrairem074[0m ([33mfrairem074-mycompanyworks[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,5.9984,5.289242,0.408121
2,5.0371,4.620927,0.467694


TrainOutput(global_step=3496, training_loss=5.769093738267842, metrics={'train_runtime': 752.7025, 'train_samples_per_second': 148.577, 'train_steps_per_second': 4.645, 'total_flos': 1713602580853908.0, 'train_loss': 5.769093738267842, 'epoch': 2.0})

In [13]:
# Save the model held by the trainer to "beto_model_2", the same directory
# that TrainingArguments uses as output_dir.
trainer.save_model("beto_model_2")

In [14]:
# Evaluate the fine-tuned model on the held-out test split; the returned dict
# includes the loss and the accuracy produced by compute_metrics.
trainer.evaluate(test_dataset)

{'eval_loss': 4.613694190979004,
 'eval_accuracy': 0.4693242442205098,
 'eval_runtime': 11.104,
 'eval_samples_per_second': 607.711,
 'eval_steps_per_second': 19.002,
 'epoch': 2.0}

#### Use the model with test dataset

`predictions.predictions` contains the logits (unnormalized scores, similar to probabilities) of each category. To get the most likely category for each example, use `np.argmax(predictions.predictions, axis=-1)`.

In [33]:
# trainer.predict returns a named tuple; its `predictions` field holds the
# logits for every test example.
predictions = trainer.predict(test_dataset)

# The predicted class of each example is the index of its largest logit.
predicted_ids = np.argmax(predictions.predictions, axis=-1)

# Transform predicted_ids to predicted_labels
predicted_labels = [id_to_label[i] for i in predicted_ids]

# Display a few examples (never more than the test split contains)
num_examples_to_show = min(5, len(test_dataset))

print(f"Displaying {num_examples_to_show} examples from the test dataset with predicted categories:")
for i in range(num_examples_to_show):
    # Fetch just this row instead of indexing into the full `text` / `labels`
    # columns, which may materialize the whole column on every iteration.
    example = test_dataset[i]
    print(f"Original Text: {example['text']}")
    print(f"True Category: {id_to_label[example['labels']]}")
    print(f"Predicted Category: {predicted_labels[i]}")
    print("-" * 30)

Displaying 5 examples from the test dataset with predicted categories:
Original Text: Panel Plafon Led X10uni  Redondo 18w Exterior Calido
True Category: CEILING_LIGHTS
Predicted Category: CEILING_LIGHTS
------------------------------
Original Text: Silenblock Tope Corona Buje Original Piaggio Centro Motos
True Category: MOTORCYCLE_SHOCK_ABSORBERS
Predicted Category: SUSPENSION_CONTROL_ARM_BUSHINGS
------------------------------
Original Text: Drop Shot Armada Fiberglass Núcleo Eva. 38 Mm + Regalos
True Category: PADDLE_TENNIS_RACKETS
Predicted Category: DRONES
------------------------------
Original Text: Guías De Válvula Cepillos De Carbono Stl. - Cepillo Investig
True Category: IRRIGATION_VALVES
Predicted Category: HAIR_BRUSHES
------------------------------
Original Text: Coverplast Elastic Impermeable 3.8cm X 3.8cm - Caja X 100 U
True Category: SPORT_AND_MEDICAL_BANDAGES
Predicted Category: AUTOMOBILE_FENDER_LINERS
------------------------------


In [52]:
# Load a tokenizer from the saved "beto_model_2" directory.
# NOTE(review): `pre_train` is not used in the cells visible here, and the name
# suggests a model rather than a tokenizer; confirm the intent.
pre_train = AutoTokenizer.from_pretrained("beto_model_2")

In [50]:
# Number of logits produced for one example vs. number of categories found in
# the held-out 20% split (counted before the rare-category filter).
len(predictions.predictions[0]), len(unique_categories_test)

(1481, 1476)

In [57]:
# Inspect the test example at index 1 (text, integer label and tokenizer outputs).
test_dataset[1]

{'text': 'Silenblock Tope Corona Buje Original Piaggio Centro Motos',
 'labels': 324,
 'input_ids': [4,
  5913,
  1014,
  1307,
  3246,
  981,
  17171,
  30931,
  17620,
  6174,
  1323,
  16818,
  25253,
  18241,
  1228,
  4294,
  28487,
  30934,
  5],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [58]:
# predictions.predictions contains the logits (unnormalized scores) of each category
# predictions.label_ids contains the actual label id of each example
# In the recorded run, example 1 was predicted as SUSPENSION_CONTROL_ARM_BUSHINGS
# instead of MOTORCYCLE_SHOCK_ABSORBERS.
example_index = 1
# Derive both ids from the data rather than hard-coding them, since the
# label <-> id mapping can differ between runs.
true_id = predictions.label_ids[example_index]
predicted_id = np.argmax(predictions.predictions[example_index], axis=-1)
predictions.predictions[example_index], true_id, id_to_label[true_id], predicted_id, id_to_label[predicted_id]

(array([-0.55902815,  0.44506115, -0.47696608, ..., -1.1933162 ,
        -1.280476  , -0.46088094], dtype=float32),
 np.int64(324),
 'MOTORCYCLE_SHOCK_ABSORBERS',
 np.int64(301),
 'SUSPENSION_CONTROL_ARM_BUSHINGS')