This notebook was executed on Google Colab.

In [1]:
# Colab setup: quietly (-q) install the two packages used below.
# `transformers` provides the tokenizer/model classes imported in the next cell;
# `sentencepiece` is installed alongside it (the tokenizer download below fetches
# an `spm.model` file, i.e. a SentencePiece vocabulary).
!pip install sentencepiece -q
!pip install transformers -q

In [2]:
import os
import gc
import random
import warnings
import multiprocessing as mp
from tqdm import tqdm
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
from sklearn.model_selection import KFold


from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)

from kaggle_local_utils import get_dataloader1, set_seed, fit, get_optimizer

# Silence library warnings for the whole notebook (one call is enough; the
# original repeated it) and turn off the tokenizers' internal parallelism.
# NOTE(review): presumably set because the DataLoaders use worker processes
# (cfg.num_workers) -- confirm against kaggle_local_utils.
warnings.filterwarnings('ignore')
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
# Raw competition files, read from the Colab working directory.
TRAIN_CSV = '/content/train.csv'
TEST_CSV = '/content/test.csv'

train = pd.read_csv(TRAIN_CSV)
test = pd.read_csv(TEST_CSV)

# Preprocess Text

In [4]:
# Every missing cell (in any column) is replaced by the same placeholder string,
# so later string concatenation never meets a NaN.
MISSING_TEXT = 'Missing Response or Question'

train = train.fillna(value=MISSING_TEXT)
test = test.fillna(value=MISSING_TEXT)

In [5]:
# Collapse the long table (one row per response) into one row per question:
#   responses         -> "<question>[SEP][START]r1[END][START]r2[END]..."
#   original_response -> list of the raw response strings
#   target            -> list of the per-response targets, in the same order
questions = train['Question'].unique()

original_responses = []
responses = []
targets = []

# A single groupby pass replaces re-filtering the whole frame once per question.
# sort=False keeps groups in order of first appearance (the same order as
# `unique()` above) and rows inside each group keep their original order.
for question, group in train.groupby('Question', sort=False):
    answers = group['Response'].tolist()
    wrapped = ''.join('[START]' + answer + '[END]' for answer in answers)

    responses.append(question + '[SEP]' + wrapped)
    original_responses.append(answers)
    targets.append(group['target'].tolist())

# NOTE: `train` is rebound to the collapsed frame, so this cell cannot be re-run
# without first re-running the loading cells above.
train = pd.DataFrame({'original_response': original_responses, 'responses': responses, 'target': targets})
train.head()

Unnamed: 0,original_response,responses,target
0,[Coevolution is a biological process that occu...,Explain the concept of coevolution.[SEP][START...,"[3, 6, 1, 5, 4, 0, 2]"
1,"[Yes, recurring fever and chills can be a symp...",Is it possible that recurring fever and chills...,"[4, 2, 5, 6, 1, 3, 0]"
2,[The expression 3! represents the factorial of...,Evaluate the expression 3![SEP][START]The expr...,"[1, 2, 3, 6, 0, 4, 5]"
3,[1. Messenger RNA (mRNA): mRNA carries genetic...,What are the roles of different types of RNA i...,"[3, 4, 5, 6, 1, 2, 0]"
4,[Gene flow refers to the movement of individua...,What is the role of gene flow in population ge...,"[3, 4, 2, 0, 6, 5, 1]"


# Config

In [6]:
# All tunable settings in one attribute-style namespace (cfg.epochs, cfg.device, ...).
cfg = SimpleNamespace(
    # cross-validation / reproducibility
    num_folds=5,
    seeds=[252],
    # optimisation
    batch_size=1,
    epochs=2,
    weight_decay=0.02,
    learning_rate=2e-4,
    differential_learning_rate=2e-5,
    warmup_steps=0,
    # tokenisation / padding
    max_length=2000,  # not used, because dynamic padding is applied
    intermediate_dropout=0.0,
    padding_quantile=1.0,
    # hardware
    device='cuda',
    # model
    num_classes=7,
    embedding_size=1024,
    num_workers=mp.cpu_count(),
    backbone='microsoft/deberta-v3-large',
)

# Tokenizer

In [7]:
# Load the backbone's tokenizer and register the markers used to delimit
# responses (plus the newline) as additional special tokens.
tokenizer = AutoTokenizer.from_pretrained(cfg.backbone)

special_tokens = ['\n', '[START]', '[END]']
tokenizer.add_tokens(special_tokens, special_tokens=True)

# Map each marker to the id the tokenizer now produces for it; the pooler looks
# spans up by these ids.
special_token_ids = {
    token: tokenizer(token, add_special_tokens=False).input_ids[0]
    for token in special_tokens
}

special_token_ids

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

{'\n': 128001, '[START]': 128002, '[END]': 128003}

In [8]:
# Sanity check: tokenize one collapsed example and decode it back to text to
# see how the [SEP]/[START]/[END] markers survive the round trip.
sample_ids = tokenizer(train['responses'][1]).input_ids
tokenizer.decode(sample_ids)

"[CLS] Is it possible that recurring fever and chills could indicate malaria?[SEP][START] Yes, recurring fever and chills can be a symptom of malaria, especially in areas where the disease is common. Malaria is caused by a parasite that is transmitted through the bite of an infected mosquito, and it can cause a range of symptoms including fever, chills, headache, muscle and joint pain, and fatigue. If you are experiencing recurring fever and chills, especially in combination with other symptoms such as nausea, vomiting, or diarrhea, it is important to see a healthcare provider for proper diagnosis and treatment. They may perform tests such as a blood smear or PCR to confirm the presence of malaria parasites in your blood, and they may prescribe antimalarial medication to treat the infection.[END][START] No, recurring fever and chills alone cannot confirm malaria. Malaria is a complex disease caused by a parasite and requires a comprehensive diagnosis, including physical examination, me

# Define a custom pooler

In [9]:
def custom_pooler(embeddings, input_ids):
    """Mean-pool the token embeddings of every [START]...[END] span.

    Only the first sequence of the batch (index 0) is read, so this pooler is
    meant for batch size 1.

    Args:
        embeddings: tensor indexed as ``embeddings[0, token, feature]``.
        input_ids: tensor whose first row holds the token ids of that sequence.

    Returns:
        A ``(num_spans, features)`` tensor on the same device as ``embeddings``.
        Row ``i`` is the mean over the tokens strictly between the i-th
        ``[START]`` and the i-th ``[END]`` marker. If no complete span is found
        the result has shape ``(0, features)``.

    Notes:
        * ``[START]`` and ``[END]`` positions are paired in order; if their
          counts differ, the unmatched trailing markers are ignored.
        * A span with no tokens between its markers averages an empty slice,
          which yields NaN for that row.
    """
    token_ids = input_ids[0]
    start_indices = torch.where(token_ids == special_token_ids['[START]'])[0].tolist()
    end_indices = torch.where(token_ids == special_token_ids['[END]'])[0].tolist()

    # Collect the per-span means and stack once, instead of re-allocating the
    # result with torch.cat on every iteration.
    pooled = [
        embeddings[0, start_idx + 1:end_idx, :].mean(0)
        for start_idx, end_idx in zip(start_indices, end_indices)
    ]

    if not pooled:
        # Keep the feature dimension so downstream layers still get a 2-D input.
        return embeddings.new_zeros((0, embeddings.size(-1)))

    return torch.stack(pooled)

# Architecture

In [10]:
class Net(nn.Module):
    """Transformer backbone + span pooler + linear head with multi-sample dropout.

    The backbone is loaded from ``cfg.backbone`` with both of its dropout
    probabilities set to ``cfg.intermediate_dropout``; its embedding matrix is
    resized to the module-level ``tokenizer`` and gradient checkpointing is
    switched on. ``custom_pooler`` turns the token embeddings into one vector
    per response span, and a single linear layer maps each vector to
    ``cfg.num_classes`` logits.
    """

    def __init__(self, cfg):
        super().__init__()
        self.num_classes = cfg.num_classes
        self.embedding_size = cfg.embedding_size

        # Backbone with its internal dropout overridden from the config.
        backbone_config = AutoConfig.from_pretrained(cfg.backbone)
        backbone_config.attention_probs_dropout_prob = cfg.intermediate_dropout
        backbone_config.hidden_dropout_prob = cfg.intermediate_dropout

        self.backbone = AutoModel.from_pretrained(cfg.backbone, config=backbone_config)
        # The tokenizer gained extra tokens, so the embedding table must grow too.
        self.backbone.resize_token_embeddings(len(tokenizer))
        self.backbone.gradient_checkpointing_enable()

        self.head = nn.Linear(self.embedding_size, cfg.num_classes)

        self.pool = custom_pooler

        # Five dropout rates used only while training (see forward).
        self.dropout_1 = nn.Dropout(0.1)
        self.dropout_2 = nn.Dropout(0.2)
        self.dropout_3 = nn.Dropout(0.3)
        self.dropout_4 = nn.Dropout(0.4)
        self.dropout_5 = nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask):
        """Return the logits for every pooled span of the input sequence.

        In eval mode the head is applied to the pooled vectors directly. In
        training mode the head is applied to five differently-dropped-out
        copies of the pooled vectors and the five logit tensors are averaged.
        """
        hidden_states = self.backbone(input_ids, attention_mask).last_hidden_state
        pooled = self.pool(hidden_states, input_ids)

        if not self.training:
            return self.head(pooled)

        dropouts = (
            self.dropout_1,
            self.dropout_2,
            self.dropout_3,
            self.dropout_4,
            self.dropout_5,
        )

        logits = None
        for dropout in dropouts:
            sample_logits = self.head(dropout(pooled))
            logits = sample_logits if logits is None else logits + sample_logits

        return logits / len(dropouts)

# Split folds

In [11]:
# Assign every question (row) to one of cfg.num_folds validation folds,
# numbered 1..cfg.num_folds. The fixed random_state keeps the split reproducible.
kf = KFold(
    n_splits=cfg.num_folds,
    random_state=1,
    shuffle=True
)

# KFold yields *positional* row indices, so write them with iloc rather than
# matching them against index labels (which only agrees for a default
# RangeIndex). Starting from an integer column also avoids the float/NaN
# round-trip followed by astype(int).
train['fold'] = 0
fold_position = train.columns.get_loc('fold')

for fold, (_, val_idx) in enumerate(kf.split(train.responses), start=1):
    train.iloc[val_idx, fold_position] = fold

train.head()

Unnamed: 0,original_response,responses,target,fold
0,[Coevolution is a biological process that occu...,Explain the concept of coevolution.[SEP][START...,"[3, 6, 1, 5, 4, 0, 2]",1
1,"[Yes, recurring fever and chills can be a symp...",Is it possible that recurring fever and chills...,"[4, 2, 5, 6, 1, 3, 0]",3
2,[The expression 3! represents the factorial of...,Evaluate the expression 3![SEP][START]The expr...,"[1, 2, 3, 6, 0, 4, 5]",5
3,[1. Messenger RNA (mRNA): mRNA carries genetic...,What are the roles of different types of RNA i...,"[3, 4, 5, 6, 1, 2, 0]",1
4,[Gene flow refers to the movement of individua...,What is the role of gene flow in population ge...,"[3, 4, 2, 0, 6, 5, 1]",1


# Cross-validation

In [12]:
# Cross-validation driver: for every fold, build the train/validation loaders,
# then train one fresh model per seed and record its validation score.
features = 'responses'
target = 'target'

val_scores = []

# Fold ids were assigned as 1..cfg.num_folds, so derive the range from the
# config instead of hard-coding five folds.
for fold in range(1, cfg.num_folds + 1):
    print('Fold', fold)

    is_val = train.fold == fold

    X_train = train.loc[~is_val, features]
    y_train = train.loc[~is_val, target]
    X_val = train.loc[is_val, features]
    y_val = train.loc[is_val, target]

    print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

    train_dataloader = get_dataloader1(
        tokenizer=tokenizer,
        X=X_train,
        y=y_train,
        mode='train',
        batch_size=cfg.batch_size,
        max_length=cfg.max_length,
        num_workers=cfg.num_workers
    )

    val_dataloader = get_dataloader1(
        tokenizer=tokenizer,
        X=X_val,
        y=y_val,
        mode='val',
        batch_size=cfg.batch_size,
        max_length=cfg.max_length,
        num_workers=cfg.num_workers
    )

    for seed in cfg.seeds:
        print('Seed', seed)
        set_seed(seed=seed)
        model = Net(cfg)
        model.to(cfg.device)

        optimizer = get_optimizer(
            model=model,
            learning_rate=cfg.learning_rate,
            diff_lr=cfg.differential_learning_rate,
            weight_decay=cfg.weight_decay
        )

        # One scheduler step per batch over all epochs.
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=cfg.warmup_steps,
            num_training_steps=len(train_dataloader) * cfg.epochs
        )

        trained_model, val_score = fit(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            epochs=cfg.epochs,
            cfg=cfg,
            train_dataloader=train_dataloader,
            val_dataloader=val_dataloader
        )

        val_scores.append(val_score)

        # The scheduler keeps a reference to the optimizer (and through it to the
        # model parameters), so it has to be dropped as well; otherwise the old
        # model stays on the GPU while the next one is being created.
        del model, trained_model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

print('Mean CV:', np.mean(val_scores))
print('Std:', np.std(val_scores))

Fold 1
(454,) (454,) (114,) (114,)
Seed 252


Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/2 | Batch: 0/454 | Loss: 2.0114
Epoch: 1/2 | Batch: 10/454 | Loss: 1.7718
Epoch: 1/2 | Batch: 20/454 | Loss: 1.6359
Epoch: 1/2 | Batch: 30/454 | Loss: 1.6748
Epoch: 1/2 | Batch: 40/454 | Loss: 1.5728
Epoch: 1/2 | Batch: 50/454 | Loss: 1.2039
Epoch: 1/2 | Batch: 60/454 | Loss: 1.2712
Epoch: 1/2 | Batch: 70/454 | Loss: 1.4844
Epoch: 1/2 | Batch: 80/454 | Loss: 1.1040
Epoch: 1/2 | Batch: 90/454 | Loss: 1.3315
Epoch: 1/2 | Batch: 100/454 | Loss: 1.1306
Epoch: 1/2 | Batch: 110/454 | Loss: 1.1360
Epoch: 1/2 | Batch: 120/454 | Loss: 1.0954
Epoch: 1/2 | Batch: 130/454 | Loss: 1.4754
Epoch: 1/2 | Batch: 140/454 | Loss: 1.3812
Epoch: 1/2 | Batch: 150/454 | Loss: 1.6391
Epoch: 1/2 | Batch: 160/454 | Loss: 1.4070
Epoch: 1/2 | Batch: 170/454 | Loss: 0.9044
Epoch: 1/2 | Batch: 180/454 | Loss: 1.3168
Epoch: 1/2 | Batch: 190/454 | Loss: 0.7566
Epoch: 1/2 | Batch: 200/454 | Loss: 1.2301
Epoch: 1/2 | Batch: 210/454 | Loss: 0.8392
Epoch: 1/2 | Batch: 220/454 | Loss: 1.0314
Epoch: 1/2 | Batch: 23

  0%|          | 0/114 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 114/114 [00:50<00:00,  2.25it/s]

Validation score (Multi-class log loss): 0.9509434103965759



You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 2/2 | Batch: 0/454 | Loss: 1.0020
Epoch: 2/2 | Batch: 10/454 | Loss: 0.7234
Epoch: 2/2 | Batch: 20/454 | Loss: 0.4303
Epoch: 2/2 | Batch: 30/454 | Loss: 0.7663
Epoch: 2/2 | Batch: 40/454 | Loss: 0.4598
Epoch: 2/2 | Batch: 50/454 | Loss: 0.4153
Epoch: 2/2 | Batch: 60/454 | Loss: 0.6909
Epoch: 2/2 | Batch: 70/454 | Loss: 0.2604
Epoch: 2/2 | Batch: 80/454 | Loss: 0.6224
Epoch: 2/2 | Batch: 90/454 | Loss: 0.7041
Epoch: 2/2 | Batch: 100/454 | Loss: 0.4701
Epoch: 2/2 | Batch: 110/454 | Loss: 0.7863
Epoch: 2/2 | Batch: 120/454 | Loss: 0.8456
Epoch: 2/2 | Batch: 130/454 | Loss: 0.6487
Epoch: 2/2 | Batch: 140/454 | Loss: 0.5259
Epoch: 2/2 | Batch: 150/454 | Loss: 0.6829
Epoch: 2/2 | Batch: 160/454 | Loss: 0.6614
Epoch: 2/2 | Batch: 170/454 | Loss: 1.4023
Epoch: 2/2 | Batch: 180/454 | Loss: 2.5870
Epoch: 2/2 | Batch: 190/454 | Loss: 0.5975
Epoch: 2/2 | Batch: 200/454 | Loss: 0.5093
Epoch: 2/2 | Batch: 210/454 | Loss: 0.8005
Epoch: 2/2 | Batch: 220/454 | Loss: 0.3321
Epoch: 2/2 | Batch: 23

  0%|          | 0/114 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 114/114 [00:50<00:00,  2.25it/s]


Validation score (Multi-class log loss): 0.818980872631073
Total training time: 26.394 min
Fold 2
(454,) (454,) (114,) (114,)
Seed 252


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/2 | Batch: 0/454 | Loss: 2.0086
Epoch: 1/2 | Batch: 10/454 | Loss: 1.9768
Epoch: 1/2 | Batch: 20/454 | Loss: 1.9537
Epoch: 1/2 | Batch: 30/454 | Loss: 1.8629
Epoch: 1/2 | Batch: 40/454 | Loss: 1.8128
Epoch: 1/2 | Batch: 50/454 | Loss: 1.8252
Epoch: 1/2 | Batch: 60/454 | Loss: 3.2711
Epoch: 1/2 | Batch: 70/454 | Loss: 1.5087
Epoch: 1/2 | Batch: 80/454 | Loss: 1.4637
Epoch: 1/2 | Batch: 90/454 | Loss: 1.6313
Epoch: 1/2 | Batch: 100/454 | Loss: 1.7527
Epoch: 1/2 | Batch: 110/454 | Loss: 1.1814
Epoch: 1/2 | Batch: 120/454 | Loss: 1.2307
Epoch: 1/2 | Batch: 130/454 | Loss: 1.1089
Epoch: 1/2 | Batch: 140/454 | Loss: 0.9909
Epoch: 1/2 | Batch: 150/454 | Loss: 1.3522
Epoch: 1/2 | Batch: 160/454 | Loss: 1.0989
Epoch: 1/2 | Batch: 170/454 | Loss: 2.3057
Epoch: 1/2 | Batch: 180/454 | Loss: 1.4532
Epoch: 1/2 | Batch: 190/454 | Loss: 1.0742
Epoch: 1/2 | Batch: 200/454 | Loss: 1.4421
Epoch: 1/2 | Batch: 210/454 | Loss: 1.7229
Epoch: 1/2 | Batch: 220/454 | Loss: 0.8250
Epoch: 1/2 | Batch: 23

  0%|          | 0/114 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 114/114 [00:46<00:00,  2.46it/s]

Validation score (Multi-class log loss): 0.8730031847953796



You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 2/2 | Batch: 0/454 | Loss: 0.6815
Epoch: 2/2 | Batch: 10/454 | Loss: 0.4506
Epoch: 2/2 | Batch: 20/454 | Loss: 0.4218
Epoch: 2/2 | Batch: 30/454 | Loss: 0.4245
Epoch: 2/2 | Batch: 40/454 | Loss: 0.5856
Epoch: 2/2 | Batch: 50/454 | Loss: 0.7355
Epoch: 2/2 | Batch: 60/454 | Loss: 0.8676
Epoch: 2/2 | Batch: 70/454 | Loss: 0.6329
Epoch: 2/2 | Batch: 80/454 | Loss: 0.7078
Epoch: 2/2 | Batch: 90/454 | Loss: 0.4704
Epoch: 2/2 | Batch: 100/454 | Loss: 0.9057
Epoch: 2/2 | Batch: 110/454 | Loss: 0.3063
Epoch: 2/2 | Batch: 120/454 | Loss: 0.6935
Epoch: 2/2 | Batch: 130/454 | Loss: 0.2113
Epoch: 2/2 | Batch: 140/454 | Loss: 0.4835
Epoch: 2/2 | Batch: 150/454 | Loss: 0.4376
Epoch: 2/2 | Batch: 160/454 | Loss: 0.9830
Epoch: 2/2 | Batch: 170/454 | Loss: 0.4837
Epoch: 2/2 | Batch: 180/454 | Loss: 0.3916
Epoch: 2/2 | Batch: 190/454 | Loss: 0.2177
Epoch: 2/2 | Batch: 200/454 | Loss: 0.2933
Epoch: 2/2 | Batch: 210/454 | Loss: 0.4984
Epoch: 2/2 | Batch: 220/454 | Loss: 0.2980
Epoch: 2/2 | Batch: 23

  0%|          | 0/114 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 114/114 [00:46<00:00,  2.46it/s]


Validation score (Multi-class log loss): 0.7709437608718872
Total training time: 26.667 min
Fold 3
(454,) (454,) (114,) (114,)
Seed 252


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/2 | Batch: 0/454 | Loss: 2.2790
Epoch: 1/2 | Batch: 10/454 | Loss: 1.9665
Epoch: 1/2 | Batch: 20/454 | Loss: 1.8048
Epoch: 1/2 | Batch: 30/454 | Loss: 1.7372
Epoch: 1/2 | Batch: 40/454 | Loss: 1.9654
Epoch: 1/2 | Batch: 50/454 | Loss: 1.6161
Epoch: 1/2 | Batch: 60/454 | Loss: 1.4990
Epoch: 1/2 | Batch: 70/454 | Loss: 1.2013
Epoch: 1/2 | Batch: 80/454 | Loss: 1.2481
Epoch: 1/2 | Batch: 90/454 | Loss: 1.4706
Epoch: 1/2 | Batch: 100/454 | Loss: 1.6602
Epoch: 1/2 | Batch: 110/454 | Loss: 1.0650
Epoch: 1/2 | Batch: 120/454 | Loss: 0.8782
Epoch: 1/2 | Batch: 130/454 | Loss: 1.4137
Epoch: 1/2 | Batch: 140/454 | Loss: 2.0039
Epoch: 1/2 | Batch: 150/454 | Loss: 1.2699
Epoch: 1/2 | Batch: 160/454 | Loss: 1.1022
Epoch: 1/2 | Batch: 170/454 | Loss: 0.8610
Epoch: 1/2 | Batch: 180/454 | Loss: 1.0043
Epoch: 1/2 | Batch: 190/454 | Loss: 1.7381
Epoch: 1/2 | Batch: 200/454 | Loss: 1.1266
Epoch: 1/2 | Batch: 210/454 | Loss: 1.4798
Epoch: 1/2 | Batch: 220/454 | Loss: 1.3993
Epoch: 1/2 | Batch: 23

  0%|          | 0/114 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 114/114 [00:50<00:00,  2.28it/s]

Validation score (Multi-class log loss): 1.0362141132354736



You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 2/2 | Batch: 0/454 | Loss: 0.6259
Epoch: 2/2 | Batch: 10/454 | Loss: 0.8703
Epoch: 2/2 | Batch: 20/454 | Loss: 0.4499
Epoch: 2/2 | Batch: 30/454 | Loss: 0.9788
Epoch: 2/2 | Batch: 40/454 | Loss: 0.2919
Epoch: 2/2 | Batch: 50/454 | Loss: 0.4229
Epoch: 2/2 | Batch: 60/454 | Loss: 0.6589
Epoch: 2/2 | Batch: 70/454 | Loss: 0.4961
Epoch: 2/2 | Batch: 80/454 | Loss: 0.4687
Epoch: 2/2 | Batch: 90/454 | Loss: 0.5790
Epoch: 2/2 | Batch: 100/454 | Loss: 0.9981
Epoch: 2/2 | Batch: 110/454 | Loss: 0.3950
Epoch: 2/2 | Batch: 120/454 | Loss: 0.8371
Epoch: 2/2 | Batch: 130/454 | Loss: 0.2825
Epoch: 2/2 | Batch: 140/454 | Loss: 0.4801
Epoch: 2/2 | Batch: 150/454 | Loss: 0.2630
Epoch: 2/2 | Batch: 160/454 | Loss: 0.4306
Epoch: 2/2 | Batch: 170/454 | Loss: 0.7698
Epoch: 2/2 | Batch: 180/454 | Loss: 0.3090
Epoch: 2/2 | Batch: 190/454 | Loss: 0.3034
Epoch: 2/2 | Batch: 200/454 | Loss: 0.5724
Epoch: 2/2 | Batch: 210/454 | Loss: 0.9051
Epoch: 2/2 | Batch: 220/454 | Loss: 0.5450
Epoch: 2/2 | Batch: 23

  0%|          | 0/114 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 114/114 [00:49<00:00,  2.28it/s]


Validation score (Multi-class log loss): 0.9493780732154846
Total training time: 26.401 min
Fold 4
(455,) (455,) (113,) (113,)
Seed 252


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 1/2 | Batch: 0/455 | Loss: 2.0534
Epoch: 1/2 | Batch: 10/455 | Loss: 1.9090
Epoch: 1/2 | Batch: 20/455 | Loss: 1.7312
Epoch: 1/2 | Batch: 30/455 | Loss: 1.9915
Epoch: 1/2 | Batch: 40/455 | Loss: 1.9488
Epoch: 1/2 | Batch: 50/455 | Loss: 1.8262
Epoch: 1/2 | Batch: 60/455 | Loss: 1.9768
Epoch: 1/2 | Batch: 70/455 | Loss: 1.9330
Epoch: 1/2 | Batch: 80/455 | Loss: 1.9402
Epoch: 1/2 | Batch: 90/455 | Loss: 1.9657
Epoch: 1/2 | Batch: 100/455 | Loss: 2.0187
Epoch: 1/2 | Batch: 110/455 | Loss: 1.9761
Epoch: 1/2 | Batch: 120/455 | Loss: 1.9930
Epoch: 1/2 | Batch: 130/455 | Loss: 1.9798
Epoch: 1/2 | Batch: 140/455 | Loss: 2.0303
Epoch: 1/2 | Batch: 150/455 | Loss: 2.0804
Epoch: 1/2 | Batch: 160/455 | Loss: 2.0205
Epoch: 1/2 | Batch: 170/455 | Loss: 1.8718
Epoch: 1/2 | Batch: 180/455 | Loss: 1.9033
Epoch: 1/2 | Batch: 190/455 | Loss: 1.9556
Epoch: 1/2 | Batch: 200/455 | Loss: 1.9636
Epoch: 1/2 | Batch: 210/455 | Loss: 1.9195
Epoch: 1/2 | Batch: 220/455 | Loss: 1.9254
Epoch: 1/2 | Batch: 23

  0%|          | 0/113 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 113/113 [00:49<00:00,  2.30it/s]

Validation score (Multi-class log loss): 1.9459179639816284



You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch: 2/2 | Batch: 0/455 | Loss: 2.0254
Epoch: 2/2 | Batch: 10/455 | Loss: 1.8969
Epoch: 2/2 | Batch: 20/455 | Loss: 1.8993
Epoch: 2/2 | Batch: 30/455 | Loss: 1.9837
Epoch: 2/2 | Batch: 40/455 | Loss: 1.9188
Epoch: 2/2 | Batch: 50/455 | Loss: 1.9785
Epoch: 2/2 | Batch: 60/455 | Loss: 1.9223
Epoch: 2/2 | Batch: 70/455 | Loss: 1.9367
Epoch: 2/2 | Batch: 80/455 | Loss: 1.8545
Epoch: 2/2 | Batch: 90/455 | Loss: 1.9544
Epoch: 2/2 | Batch: 100/455 | Loss: 1.9708
Epoch: 2/2 | Batch: 110/455 | Loss: 1.9583
Epoch: 2/2 | Batch: 120/455 | Loss: 1.8920
Epoch: 2/2 | Batch: 130/455 | Loss: 1.9191


KeyboardInterrupt: ignored