In [1]:
model_name = "vinai/bertweet-base"

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()

wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
import transformers
import datasets

print(f"Running on transformers v{transformers.__version__} and datasets v{datasets.__version__}")

Running on transformers v4.20.1 and datasets v2.1.0


In [5]:
!pip3 install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m537.0 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [6]:
import torch
from pathlib import Path
import evaluate
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer)
import numpy as np
import random
import os

## Seed everything

In [7]:
SEED = 42

In [8]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

## Load dataset

In [9]:
ds = load_dataset("tweet_eval", "irony")
ds

Downloading builder script:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_eval/irony (download: 376.58 KiB, generated: 411.24 KiB, post-processed: Unknown size, total: 787.82 KiB) to /root/.cache/huggingface/datasets/tweet_eval/irony/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/32.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2862 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/784 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/955 [00:00<?, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/irony/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 2862
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 784
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 955
    })
})

In [10]:
ds["train"][0]

{'text': 'seeing ppl walking w/ crutches makes me really excited for the next 3 weeks of my life',
 'label': 1}

In [11]:
id2label = {0: "non_irony", 1: "irony"}
label2id = {"non_irony": 0, "irony": 1}

## Tokenize and encode

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/824k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def tokenize_func(examples):
  return tokenizer(examples["text"], truncation=True, max_length=128)

In [14]:
ds_enc = ds.map(tokenize_func, batched=True, remove_columns=["text"])
ds_enc

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2862
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 784
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 955
    })
})

## Load model

In [15]:
def model_init():
    num_labels = 2
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True).to(device)
    return model

## Load trainer

In [16]:
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [17]:
def compute_metrics(eval_pred):
    prediction_scores, labels = eval_pred
    predictions = np.argmax(prediction_scores, axis=1)
    return {
        'acc': acc.compute(predictions=predictions, references=labels)['accuracy'], 
        'f1': f1.compute(predictions=predictions, references=labels)['f1'],
        'precision': precision.compute(predictions=predictions, references=labels)['precision'],
        'recall': recall.compute(predictions=predictions, references=labels)['recall']
    }

In [18]:
training_args = TrainingArguments(
    output_dir="results",
    report_to="wandb",
    num_train_epochs=2,
    learning_rate=4e-5,
    weight_decay=0.1,
    warmup_ratio=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    logging_strategy="steps",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    seed=SEED
)
        

In [19]:
trainer = Trainer(
    model_init=model_init, 
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds_enc["train"],
    eval_dataset=ds_enc["validation"],
    tokenizer=tokenizer
)

loading configuration file https://huggingface.co/vinai/bertweet-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/356366feedcea0917e30f7f235e1e062ffc2d28138445d5672a184be756c8686.a2b6026e688d1b19cebc0981d8f3a5b1668eabfda55b2c42049d5eac0bc8cb2d
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "non_irony",
    "1": "irony"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "irony": 1,
    "non_irony": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",

Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

storing https://huggingface.co/vinai/bertweet-base/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/4e07e2989cb95a6f63c704a7170b48e6e663cc203c05db424e47f4d75562cf0e.7b2adda243ecb4b085eb2d22ef1b2cd12a882a43bbb13a34c11e10f960b9bfc3
creating metadata file for /root/.cache/huggingface/transformers/4e07e2989cb95a6f63c704a7170b48e6e663cc203c05db424e47f4d75562cf0e.7b2adda243ecb4b085eb2d22ef1b2cd12a882a43bbb13a34c11e10f960b9bfc3
loading weights file https://huggingface.co/vinai/bertweet-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/4e07e2989cb95a6f63c704a7170b48e6e663cc203c05db424e47f4d75562cf0e.7b2adda243ecb4b085eb2d22ef1b2cd12a882a43bbb13a34c11e10f960b9bfc3
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.den

In [20]:
trainer.train()

loading configuration file https://huggingface.co/vinai/bertweet-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/356366feedcea0917e30f7f235e1e062ffc2d28138445d5672a184be756c8686.a2b6026e688d1b19cebc0981d8f3a5b1668eabfda55b2c42049d5eac0bc8cb2d
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "non_irony",
    "1": "irony"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "irony": 1,
    "non_irony": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 130,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",

Epoch,Training Loss,Validation Loss,Acc,F1,Precision,Recall
1,No log,0.511616,0.746597,0.702703,0.798883,0.627193
2,No log,0.508097,0.769634,0.756637,0.763393,0.75


***** Running Evaluation *****
  Num examples = 955
  Batch size = 16
Saving model checkpoint to results/checkpoint-179
Configuration saved in results/checkpoint-179/config.json
Model weights saved in results/checkpoint-179/pytorch_model.bin
tokenizer config file saved in results/checkpoint-179/tokenizer_config.json
Special tokens file saved in results/checkpoint-179/special_tokens_map.json
added tokens file saved in results/checkpoint-179/added_tokens.json
***** Running Evaluation *****
  Num examples = 955
  Batch size = 16
Saving model checkpoint to results/checkpoint-358
Configuration saved in results/checkpoint-358/config.json
Model weights saved in results/checkpoint-358/pytorch_model.bin
tokenizer config file saved in results/checkpoint-358/tokenizer_config.json
Special tokens file saved in results/checkpoint-358/special_tokens_map.json
added tokens file saved in results/checkpoint-358/added_tokens.json


Training completed. Do not forget to share your model on huggingface.co/mo

TrainOutput(global_step=358, training_loss=0.49930372184881283, metrics={'train_runtime': 61.8199, 'train_samples_per_second': 92.592, 'train_steps_per_second': 5.791, 'total_flos': 109208560192920.0, 'train_loss': 0.49930372184881283, 'epoch': 2.0})

In [21]:
trainer.evaluate(eval_dataset=ds_enc["test"], metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 784
  Batch size = 16


{'test_loss': 0.40321630239486694,
 'test_acc': 0.8380102040816326,
 'test_f1': 0.8198581560283689,
 'test_precision': 0.733502538071066,
 'test_recall': 0.9292604501607717,
 'test_runtime': 1.5233,
 'test_samples_per_second': 514.658,
 'test_steps_per_second': 32.166,
 'epoch': 2.0}