In [1]:
%pip install torch
%pip install transformers[torch]
%pip install ipywidgets
%pip install datasets
%pip install -U scikit-learn
%pip install -U ipywidgets
%pip install 'accelerate>=1.1.0'

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import torch.nn as nn
import pandas as pd
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from itertools import product
from collections import defaultdict

pd.set_option('display.width',1000)


In [2]:
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

EPOCHS = 3
MAX_LENGTH = 128
K = 5


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
from load_data import DontPatronizeMe

DATA_DIR = 'data/'
TEST_PATH = f'{DATA_DIR}task4_test.tsv'

dpm = DontPatronizeMe(DATA_DIR, TEST_PATH)
dpm.load_task1()

data = dpm.train_task1_df

trids = pd.read_csv(f'{DATA_DIR}train_semeval_parids-labels.csv')
valids = pd.read_csv(f'{DATA_DIR}dev_semeval_parids-labels.csv')

trids['par_id'] = trids.par_id.astype(str)
valids['par_id'] = valids.par_id.astype(str)

cols = ['par_id', 'text', 'label_y']

trdf = trids.merge(data, on='par_id', how='left')[cols]
valdf = valids.merge(data, on='par_id', how='left')[cols]

# rename label_y to label for Trainer
trdf.rename(columns={"label_y": "labels"}, inplace=True)
valdf.rename(columns={"label_y": "labels"}, inplace=True)


print(trdf.head())
print()

print(valdf.head())
print()

trds = Dataset.from_pandas(trdf)
print(trds)


  par_id                                               text  labels
0   4341  The scheme saw an estimated 150,000 children f...       1
1   4136  Durban 's homeless communities reconciliation ...       1
2  10352  The next immediate problem that cropped up was...       1
3   8279  Far more important than the implications for t...       1
4   1164  To strengthen child-sensitive social protectio...       1

  par_id                                               text  labels
0   4046  We also know that they can benefit by receivin...       1
1   1279  Pope Francis washed and kissed the feet of Mus...       1
2   8330  Many refugees do n't want to be resettled anyw...       1
3   4063  "Budding chefs , like "" Fred "" , "" Winston ...       1
4   4089  "In a 90-degree view of his constituency , one...       1

Dataset({
    features: ['par_id', 'text', 'labels'],
    num_rows: 8375
})


In [4]:
counts = trdf['labels'].value_counts()
print("Original class distribution:")
print(counts)
print()

weights = torch.tensor([counts[0]/counts[0], counts[0]/counts[1]], dtype=torch.float).to(device)
print("Class weights:", weights)
print()


Original class distribution:
labels
0    7581
1     794
Name: count, dtype: int64

Class weights: tensor([1.0000, 9.5479], device='cuda:0')



In [5]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=RANDOM_SEED)

X = trdf["text"].values
y = trdf["labels"].values

folds = list(skf.split(X, y))


In [6]:
from sklearn.metrics import f1_score

class BalancedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels").view(-1)

        outputs = model(**inputs)
        logits = outputs.logits.view(-1, 2)

        loss_fn = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    f1 = f1_score(labels, preds, pos_label=1)
    return { "f1": f1 }

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

rob_checkpoint = "FacebookAI/roberta-base"
deb_checkpoint = "microsoft/deberta-base"

rob_tokenizer = AutoTokenizer.from_pretrained(rob_checkpoint)
deb_tokenizer = AutoTokenizer.from_pretrained(deb_checkpoint)

def tokenize(tokenizer, batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

architectures = [("Roberta", rob_checkpoint, rob_tokenizer), ("Deberta", deb_checkpoint, deb_tokenizer)]




In [8]:
LRS = [2e-5, 3e-5]
BSZS = [16, 32]

num_epochs = len(folds) * len(architectures) * len(LRS) * len(BSZS) * EPOCHS
print(f"Total training epochs: {num_epochs}")
print()

# Determine best hyperparameters using k-fold cross-validation
def hyperparameter_search():
    hp_metrics = {}
    hp_metrics["Roberta"] = defaultdict(float)
    hp_metrics["Deberta"] = defaultdict(float)

    for name, checkpoint, tokenizer in architectures:
        print(f"Training {name}...")
        print()

        for fold, (train_idx, val_idx) in enumerate(folds):
            print(f"Fold {fold + 1}/{K}")

            trds = Dataset.from_pandas(trdf.iloc[train_idx])
            valds = Dataset.from_pandas(trdf.iloc[val_idx])

            trds = trds.map(lambda batch: tokenize(tokenizer, batch), batched=True)
            valds = valds.map(lambda batch: tokenize(tokenizer, batch), batched=True)

            trds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
            valds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

            for lr, bsz in product(LRS, BSZS):
                model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)
                training_args = TrainingArguments(
                    learning_rate=lr,
                    per_device_train_batch_size=bsz,
                    per_device_eval_batch_size=bsz,
                    num_train_epochs=EPOCHS,
                    logging_strategy="epoch",
                    eval_strategy="epoch",
                    save_strategy="no",
                    # save_strategy="epoch",
                    # save_total_limit=1,
                    # load_best_model_at_end=True,
                    # metric_for_best_model="f1",
                    remove_unused_columns=False,
                )

                trainer = BalancedTrainer(
                    model=model,
                    args=training_args,
                    train_dataset=trds,
                    eval_dataset=valds,
                    compute_metrics=compute_metrics
                )

                trainer.train()
                metrics = trainer.evaluate()

                hp_metrics[name][(lr, bsz)] += metrics["eval_f1"]
            
    res = {}
    for name in hp_metrics:
        best_hp = max(hp_metrics[name], key=hp_metrics[name].get)
        res[name] = {
            "best_hps": best_hp,
            "avg_f1": hp_metrics[name][best_hp] / K
        }

    return res

Total training epochs: 120



In [44]:
tuning_res = hyperparameter_search()
print(tuning_res)

Training Roberta...

Fold 1/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.653023,0.684731,0.125
2,0.536443,0.265858,0.454212
3,0.521442,0.296016,0.50165


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.561457,0.256276,0.465649
2,0.358944,0.22456,0.545455
3,0.261012,0.272728,0.55409


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.650717,0.578916,0.0125
2,0.552264,0.305134,0.459677
3,0.445533,0.280158,0.547231


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.576968,0.251539,0.39834
2,0.401458,0.241181,0.486111
3,0.301805,0.282075,0.561934


Fold 2/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.585191,0.221789,0.534591
2,0.442349,0.23991,0.502024
3,0.33854,0.297082,0.588235


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.526299,0.245521,0.549865
2,0.370247,0.253792,0.5625
3,0.233162,0.290084,0.560209


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.602557,0.237626,0.380531
2,0.441589,0.255082,0.456897
3,0.35775,0.319995,0.583893


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.549193,0.22815,0.549254
2,0.364181,0.229441,0.540059
3,0.214177,0.291018,0.54386


Fold 3/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.608562,0.301465,0.501053
2,0.453702,0.234336,0.571429
3,0.33189,0.316216,0.568862


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.565889,0.396072,0.535135
2,0.362182,0.202648,0.577778
3,0.248378,0.260279,0.58221


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.625365,0.401285,0.469298
2,0.451189,0.235495,0.573529
3,0.332042,0.331026,0.575851


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.56734,0.405836,0.535519
2,0.408803,0.227893,0.574648
3,0.262383,0.29941,0.582633


Fold 4/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.595188,0.247671,0.492857
2,0.444413,0.275868,0.5
3,0.323813,0.359225,0.506329


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.550034,0.319244,0.510345
2,0.374634,0.28954,0.518703
3,0.240601,0.304312,0.518717


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.618028,0.275064,0.510029
2,0.465135,0.273531,0.510791
3,0.326335,0.355554,0.528662


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.574846,0.287093,0.48254
2,0.414097,0.300937,0.548148
3,0.253172,0.301918,0.548913


Fold 5/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.601328,0.22634,0.48
2,0.43893,0.256,0.504202
3,0.313651,0.327651,0.542056


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.546106,0.225451,0.51505
2,0.390929,0.238764,0.521246
3,0.265216,0.297513,0.555


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.618155,0.246743,0.457516
2,0.472141,0.257355,0.503704
3,0.345035,0.357207,0.540373


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.558159,0.241899,0.461017
2,0.394859,0.217909,0.525
3,0.274334,0.289332,0.541436


Training Deberta...

Fold 1/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.640362,0.252675,0.4
2,0.432678,0.224661,0.505747
3,0.297076,0.28688,0.561514


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.582546,0.266149,0.522013
2,0.371616,0.232371,0.562319
3,0.23877,0.245645,0.565598


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.699522,0.96672,0.170262
2,0.703424,0.628728,0.0125
3,0.68513,0.637917,0.117647


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.569855,0.269402,0.492754
2,0.369737,0.238685,0.548476
3,0.198963,0.246403,0.586826


Fold 2/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.706355,0.77058,0.179157
2,0.571512,0.24742,0.5625
3,0.462759,0.270995,0.559322


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.547305,0.287314,0.502392
2,0.374984,0.304882,0.539379
3,0.247194,0.298766,0.537859


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.60863,0.25317,0.485356
2,0.453889,0.2632,0.528986
3,0.282931,0.331991,0.55


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.539258,0.248009,0.52568
2,0.391293,0.257332,0.579387
3,0.217048,0.288746,0.536986


Fold 3/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.603286,0.284084,0.483951
2,0.406834,0.231592,0.51938
3,0.271454,0.307045,0.567073


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.56475,0.300509,0.531707
2,0.348576,0.204554,0.595469
3,0.221536,0.262597,0.597333


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.626001,0.323549,0.465374
2,0.477659,0.249016,0.474308
3,0.382433,0.314167,0.519481


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.608071,0.365809,0.507317
2,0.379896,0.211216,0.555957
3,0.232405,0.262205,0.586895


Fold 4/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.613819,0.229964,0.450593
2,0.463934,0.250815,0.527076
3,0.325602,0.313227,0.527607


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.55044,0.314677,0.503597
2,0.352363,0.254729,0.554348
3,0.222264,0.285404,0.559078


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.709617,0.545898,0.0
2,0.664039,0.347247,0.36962
3,0.524604,0.367139,0.466667


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.552532,0.271543,0.482955
2,0.363501,0.229156,0.51634
3,0.213356,0.275991,0.515337


Fold 5/5


Map:   0%|          | 0/6700 [00:00<?, ? examples/s]

Map:   0%|          | 0/1675 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.619364,0.292903,0.459459
2,0.482848,0.227002,0.489451
3,0.352226,0.297262,0.548673


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.577434,0.243409,0.5
2,0.393531,0.247552,0.503067
3,0.277445,0.317008,0.521519


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.636734,0.276559,0.456338
2,0.517149,0.242057,0.37963
3,0.413638,0.289767,0.5


Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,F1
1,0.645945,0.361185,0.430155
2,0.429827,0.238447,0.375546
3,0.317793,0.305161,0.506394


{'Roberta': {'best_hps': (3e-05, 32), 'avg_f1': 0.5557551489307914}, 'Deberta': {'best_hps': (2e-05, 32), 'avg_f1': 0.5562773611889518}}


In [9]:
tuning_res = {
    "Roberta": {
        "best_hps": (3e-5, 32),
        "avg_f1": 0.5557551489307914
    },
    "Deberta": {
        "best_hps": (2e-5, 32),
        "avg_f1": 0.5562773611889518
    }
}

In [10]:
from sklearn.model_selection import train_test_split

FINAL_EPOCHS = 5 
VAL_SIZE = 0.1
WEIGHT_DECAY = 0.05
DROPOUT = 0.2

# Split into train and val sets for monitoring best checkpoint
trdf_train, trdf_val = train_test_split(trdf, test_size=VAL_SIZE, random_state=RANDOM_SEED, stratify=trdf['labels'])

# Retrain best models on full training set
def train_ensemble_models(tuning_results):
    ensemble_models = {}

    for name, checkpoint, tokenizer in architectures:
        print(f"Retraining {name} on full training set...")

        trds = Dataset.from_pandas(trdf_train)
        valds = Dataset.from_pandas(trdf_val)

        trds = trds.map(lambda batch: tokenize(tokenizer, batch), batched=True)
        valds = valds.map(lambda batch: tokenize(tokenizer, batch), batched=True)
        trds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        valds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

        lr, bsz = tuning_results[name]["best_hps"]

        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2,
                                                                   hidden_dropout_prob=DROPOUT,
                                                                   attention_probs_dropout_prob=DROPOUT).to(device)
        training_args = TrainingArguments(
            output_dir=f'./final_results/{name}',
            learning_rate=lr,
            per_device_train_batch_size=bsz,
            per_device_eval_batch_size=bsz,
            num_train_epochs=FINAL_EPOCHS,
            logging_dir=f'./final_logs/{name}',
            eval_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            weight_decay=WEIGHT_DECAY
        )

        trainer = BalancedTrainer(
            model=model,
            args=training_args,
            train_dataset=trds,
            eval_dataset=valds,
            compute_metrics=compute_metrics
        )

        trainer.train()
        metrics = trainer.evaluate()

        model = model.to("cpu")

        ensemble_models[name] = (model, tokenizer, metrics["eval_f1"])
    
    tot_f1 = sum(m[2] for m in ensemble_models.values())
    for name in ensemble_models:
        model, tokenizer, f1 = ensemble_models[name]
        ensemble_weight = f1 / tot_f1
        ensemble_models[name] = (model, tokenizer, ensemble_weight)
    
    return ensemble_models


In [11]:
ensemble_models = train_ensemble_models(tuning_res)
print(ensemble_models)

Retraining Roberta on full training set...


Map:   0%|          | 0/7537 [00:00<?, ? examples/s]

Map:   0%|          | 0/838 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.weight         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBO

Epoch,Training Loss,Validation Loss,F1
1,0.586013,0.556028,0.357955
2,0.453254,0.455058,0.455696
3,0.34382,0.570429,0.506912
4,0.27521,0.547164,0.511416
5,0.20914,0.681463,0.512077


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Retraining Deberta on full training set...


Map:   0%|          | 0/7537 [00:00<?, ? examples/s]

Map:   0%|          | 0/838 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/196 [00:00<?, ?it/s]

[1mDebertaForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
classifier.bias                         | MISSING    | 
pooler.dense.weight                     | MISSING    | 
classifier.weight                       | MISSING    | 
pooler.dense.bias                       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m
`logging_dir` is deprecated and will b

Epoch,Training Loss,Validation Loss,F1
1,0.665891,0.584425,0.309623
2,0.462348,0.472078,0.495495
3,0.345894,0.667231,0.486772
4,0.274227,0.721025,0.525253
5,0.203023,0.800883,0.489583


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.LayerNorm.weight', 'deberta.encoder.layer.2.output.LayerNorm.bias', 'deberta.encoder.layer.3.attention.output.LayerNorm.weight', 'deberta.encoder.layer.3.attention.output.LayerNorm.bias', 'deberta.encoder.layer.3.output.LayerNorm.weight', 'deberta.encoder.layer.3.output.Laye

{'Roberta': (RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.2, inplace=False)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)


In [56]:
# Move ensemble models to CPU
for name in ensemble_models:
    model, tokenizer, weight = ensemble_models[name]
    ensemble_models[name] = (model.to("cpu"), tokenizer, weight)

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

BSZ=16

def ensemble_predict(dataset, ensemble_models):
    dataloader = DataLoader(dataset, batch_size=BSZ)
    all_preds = []

    for batch in dataloader:
        texts = batch["text"]
        batch_probs = None

        for model, tokenizer, weight in ensemble_models.values():
            # Tokenize this batch for this model
            encodings = tokenizer(
                texts,
                padding=True,
                truncation=True,
                return_tensors="pt"
            )
            input_ids = encodings["input_ids"].to(device)
            attention_mask = encodings["attention_mask"].to(device)

            model.to(device)
            model.eval()
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                probs = F.softmax(outputs.logits, dim=-1) * weight

            # Accumulate weighted probabilities
            if batch_probs is None:
                batch_probs = probs
            else:
                batch_probs += probs

            # Move model back to CPU to free GPU memory
            model.to("cpu")

        # Final predictions for this batch
        preds = torch.argmax(batch_probs, dim=-1)
        all_preds.extend(preds.cpu().numpy())

    return all_preds

In [28]:
# Helper function to save predictions to an output file
def labels2file(p, outf_path):
    with open(outf_path, "w") as f:
        for pred in p:
            f.write(f"{pred}\n")

In [None]:
import os

valds = Dataset.from_pandas(trdf_val)

# Generate training dev set predictions using ensemble
preds = ensemble_predict(valds, ensemble_models)
labels2file(preds, os.path.join('res/', 'tval.txt'))

# Save reference labels for training dev set
labels2file(trdf_val['labels'].tolist(), os.path.join('ref/', 'tval.txt'))

!python3 evaluation.py . . tval.txt
!cat scores.txt


In [None]:
import os

valds = Dataset.from_pandas(valdf)

# Generate dev set predictions using ensemble
preds = ensemble_predict(valds, ensemble_models)
labels2file(preds, os.path.join('res/', 'dev.txt'))

# Save reference labels for dev set
labels2file(valdf['labels'].tolist(), os.path.join('ref/', 'dev.txt'))


In [None]:
!python3 evaluation.py . . dev.txt
!cat scores.txt

In [36]:
dpm.load_test()

cols = ['par_id', 'text', 'label']
testdf = dpm.test_set_df.copy()
testdf = testdf[cols]
testdf.rename(columns={"label": "labels"}, inplace=True)
print(testdf.head())

testds = Dataset.from_pandas(testdf)

  par_id                                               text  labels
0    t_4  Members of the church , which is part of Ken C...       1
1    t_5  "To ensure that "" Priority Agriculture Progra...       1
2    t_6  The deportees stepped off their flight from El...       1
3    t_7  PIMS staffer who raped disabled girl at ICU wa...       1
4    t_9  I conclude , Yes , the general FEELING generat...       1


In [None]:
# Generate test set predictions
preds = ensemble_predict(testds, ensemble_models)
labels2file(preds, os.path.join('res/', 'test.txt'))