In [1]:
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch import cuda
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
# Checkpoint directory from which the tokenizer, config and weights are read.
path = '/kaggle/input/bertweet-base-irony/results/checkpoint-358'

# All three artifacts are loaded from the same checkpoint directory.
config = AutoConfig.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(path, config=config)
# Move the classifier onto the device selected at the top of the notebook.
model = model.to(device)

In [3]:
def tokenize_func(examples):
  """Tokenize the "text" field of ``examples`` with the notebook's tokenizer.

  Sequences are truncated and padded to a fixed length of 128 tokens, so
  every encoded row has the same size.
  """
  encode_options = dict(truncation=True, padding='max_length', max_length=128)
  return tokenizer(examples["text"], **encode_options)

In [4]:
from datasets import load_dataset
# Load the "irony" configuration of tweet_eval, then tokenize its validation
# split in batches; the raw "text" column is dropped once it has been encoded.
ds = load_dataset("tweet_eval", "irony")
valid_ds = ds["validation"].map(
    tokenize_func,
    batched=True,
    remove_columns=["text"],
)

Downloading builder script:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Downloading and preparing dataset tweet_eval/irony (download: 376.58 KiB, generated: 411.24 KiB, post-processed: Unknown size, total: 787.82 KiB) to /root/.cache/huggingface/datasets/tweet_eval/irony/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/32.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/36.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/244 [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2862 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/784 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/955 [00:00<?, ? examples/s]

Dataset tweet_eval downloaded and prepared to /root/.cache/huggingface/datasets/tweet_eval/irony/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [5]:
# Expose only the model inputs and the label, returned as torch tensors,
# then serve the validation split in batches of 16.
valid_ds.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'],
)
dataloader = torch.utils.data.DataLoader(dataset=valid_ds, batch_size=16)

In [6]:
# Switch the model to evaluation mode (this is what turns off its Dropout
# layers) before any validation logits are computed.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [7]:
class _ECELoss(nn.Module):
    """Expected Calibration Error over equal-width confidence bins.

    Each prediction is assigned to a bin ``(lower, upper]`` according to its
    top softmax probability. The result is the sum over non-empty bins of
    ``|mean confidence - accuracy|`` weighted by the fraction of samples that
    fall into the bin.
    """

    def __init__(self, n_bins=15):
        """
        n_bins (int): number of confidence interval bins
        """
        super().__init__()
        edges = torch.linspace(0, 1, n_bins + 1)
        # Consecutive edges give the lower and upper limit of every bin.
        self.bin_lowers = edges[:-1]
        self.bin_uppers = edges[1:]

    def forward(self, logits, labels):
        """Return the ECE of ``logits`` against ``labels`` as a 1-element tensor.

        logits: unnormalized class scores; softmax is taken over dim 1.
        labels: true class indices, compared element-wise with the argmax.
        """
        probs = F.softmax(logits, dim=1)
        confidences, predictions = probs.max(dim=1)
        hits = predictions.eq(labels)

        ece = torch.zeros(1, device=logits.device)
        for lower, upper in zip(self.bin_lowers.tolist(), self.bin_uppers.tolist()):
            # Mask of the samples whose confidence lies in (lower, upper].
            in_bin = confidences.gt(lower) & confidences.le(upper)
            share = in_bin.float().mean()
            if not (share.item() > 0):
                # Nothing landed in this bin, so it adds nothing to the sum.
                continue
            gap = torch.abs(confidences[in_bin].mean() - hits[in_bin].float().mean())
            ece += gap * share

        return ece

In [8]:
def temperature_scale(logits, temperature):
    """Divide ``logits`` by ``temperature`` (temperature scaling).

    Args:
        logits: tensor of class scores, expected to be laid out as
            (n_samples, n_classes).
        temperature: the temperature, given either as a tensor (0-dim or
            single-element, e.g. the ``nn.Parameter`` being optimized) or as
            a plain Python number such as the float returned by
            ``find_optimal_temperature``.

    Returns:
        The scaled logits, ``logits / temperature``.
    """
    if not torch.is_tensor(temperature):
        # A plain number has no ``unsqueeze``; wrap it so that a stored float
        # temperature can be applied directly. Tensors are left untouched so
        # gradients still flow to a learnable temperature.
        temperature = torch.as_tensor(temperature, dtype=logits.dtype, device=logits.device)
    # Expand temperature to match the size of logits
    temperature = temperature.unsqueeze(0).expand(logits.size(0), logits.size(1))
    return logits / temperature

In [9]:
def find_optimal_temperature(model, valid_loader, initial_temp=1.5, max_iter=10000):
    """Fit a single softmax temperature on a validation set by minimizing NLL.

    The model is run once over ``valid_loader`` (in eval mode, without
    gradients) to cache its logits. A scalar temperature is then optimized
    with L-BFGS so that ``logits / temperature`` minimizes cross-entropy
    against the labels. NLL and ECE are printed before and after scaling.

    Args:
        model: ``nn.Module`` whose forward output exposes ``.logits``.
        valid_loader: iterable of dict batches holding ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors, plus
            ``'token_type_ids'`` when the tokenizer produces it.
        initial_temp (float): starting value of the temperature.
        max_iter (int): maximum number of L-BFGS iterations.

    Returns:
        float: the optimized temperature.
    """
    # Work on the device the model actually lives on; the notebook-level
    # ``device`` is only a fallback for a model without parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device(device)

    temp = torch.nn.Parameter(
        torch.tensor(initial_temp, dtype=torch.float, requires_grad=True, device=run_device)
    )

    nll_criterion = nn.CrossEntropyLoss().to(run_device)
    ece_criterion = _ECELoss().to(run_device)

    # First: collect all the logits and labels for the validation set
    input_keys = ('input_ids', 'token_type_ids', 'attention_mask')
    logits_list = []
    labels_list = []
    # Dropout must be off while logits are cached, whatever mode the caller
    # left the model in; the previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in valid_loader:
                # Forward only the inputs that are present in the batch.
                enc = {key: batch[key].to(run_device) for key in input_keys if key in batch}
                logits_list.append(model(**enc).logits)
                labels_list.append(batch['label'])
    finally:
        model.train(was_training)
    logits = torch.cat(logits_list).to(run_device)
    labels = torch.cat(labels_list).to(run_device)

    # Calculate NLL and ECE before temperature scaling
    before_temperature_nll = nll_criterion(logits, labels).item()
    before_temperature_ece = ece_criterion(logits, labels).item()
    print('Before temperature - NLL: %.3f, ECE: %.3f' % (before_temperature_nll, before_temperature_ece))

    # Next: optimize the temperature w.r.t. NLL
    optimizer = optim.LBFGS([temp], lr=0.01, max_iter=max_iter)

    def closure():
        # L-BFGS re-evaluates the loss several times per step, so it needs a
        # callable that clears the gradients, recomputes the loss and
        # backpropagates.
        optimizer.zero_grad()
        loss = nll_criterion(temperature_scale(logits, temp), labels)
        loss.backward()
        return loss
    optimizer.step(closure)

    # Calculate NLL and ECE after temperature scaling (no graph is needed here)
    with torch.no_grad():
        scaled_logits = temperature_scale(logits, temp)
        after_temperature_nll = nll_criterion(scaled_logits, labels).item()
        after_temperature_ece = ece_criterion(scaled_logits, labels).item()
    print('Optimal temperature: %.3f' % temp.item())
    print('After temperature - NLL: %.3f, ECE: %.3f' % (after_temperature_nll, after_temperature_ece))

    return temp.item()

In [10]:
# Keep the fitted temperature in a variable so later cells can reuse it
# without re-running the optimization; the bare name still displays the value.
optimal_temperature = find_optimal_temperature(model, dataloader)
optimal_temperature

Before temperature - NLL: 0.508, ECE: 0.067
Optimal temperature: 1.393
After temperature - NLL: 0.491, ECE: 0.039


1.3929405212402344