# Notebook for DeBERTa-v3-base

In [53]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
from conllu import TokenList
import polars as pl
from stanza.models.common.doc import Token
from label_legends.preprocess import create_conllu, holdout, load_conllu, load_data, load_train, transform, load_vectorizer, reverse_vocabulary, vocabulary, ids_to_tokens, tokens_to_ids, vectorize_tokens, strip_stopwords
import logging

logging.basicConfig(level=logging.INFO)

# Importing dependencies for this specific model

In [55]:
import torch
from torch.utils.data import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from time import time


In [56]:
from transformers import AutoModelForSequenceClassification, AutoConfig, DebertaV2Tokenizer, Trainer, TrainingArguments
MODEL_NAME = 'microsoft/deberta-v3-base'
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
config = AutoConfig.from_pretrained(MODEL_NAME)
tokenizer =DebertaV2Tokenizer.from_pretrained(MODEL_NAME)
#DO I need the tokenizer here?

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
load_data().collect()

id,rewire_id,text,annotator,label_sexist,label_category,label_vector,split,tokens
i64,str,str,i64,str,str,str,str,list[str]
0,"""sexism2022_english-0""",""" I wonder what keeps that witc…",17,"""sexist""","""2. derogation""","""2.2 aggressive and emotive att…","""train""","[""i"", ""wonder"", … ""😄""]"
1,"""sexism2022_english-0""",""" I wonder what keeps that witc…",2,"""sexist""","""2. derogation""","""2.2 aggressive and emotive att…","""train""","[""i"", ""wonder"", … ""😄""]"
10,"""sexism2022_english-100""","""Good for her! My grandson had …",3,"""not sexist""","""none""","""none""","""train""","[""good"", ""for"", … ""!""]"
100,"""sexism2022_english-10026""","""It is not insulting, it's **ex…",14,"""sexist""","""2. derogation""","""2.1 descriptive attacks""","""test""","[""it"", ""be"", … "".**""]"
1000,"""sexism2022_english-10297""","""The article said Madonna offer…",5,"""sexist""","""2. derogation""","""2.3 dehumanising attacks & ove…","""train""","[""the"", ""article"", … "".""]"
…,…,…,…,…,…,…,…,…
9995,"""sexism2022_english-12996""","""Shudder.. if you had to have s…",17,"""sexist""","""2. derogation""","""2.3 dehumanising attacks & ove…","""test""","[""shudder"", "".."", … "".""]"
9996,"""sexism2022_english-12997""","""You mean one that forces women…",6,"""not sexist""","""none""","""none""","""train""","[""you"", ""mean"", … ""?""]"
9997,"""sexism2022_english-12997""","""You mean one that forces women…",4,"""not sexist""","""none""","""none""","""train""","[""you"", ""mean"", … ""?""]"
9998,"""sexism2022_english-12997""","""You mean one that forces women…",2,"""sexist""","""3. animosity""","""3.2 immutable gender differenc…","""train""","[""you"", ""mean"", … ""?""]"


In [58]:
val, tra = holdout()
tra = transform(tra)
val = transform(val)

# Convert 'label' column from string '0' and '1' to integer 0 and 1
tra = tra.with_columns(
    polars.col("label").cast(polars.Int32)
)
val = val.with_columns(
    polars.col("label").cast(polars.Int32)
)

train_texts = tra["text"].to_list()
train_labels = tra["label"].to_list()
val_texts = val["text"].to_list()
val_labels = val["label"].to_list()


In [59]:
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=128,  # Adjust this as needed
    return_tensors="pt",
)
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors="pt",
)


 33%|███▎      | 3675/11025 [15:43:17<31:26:34, 15.40s/it]

[A

In [60]:
class SexistDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

    
    
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [61]:
train_dataset = SexistDataset(train_encodings, train_labels)
val_dataset = SexistDataset(val_encodings, val_labels)

In [62]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=2,
    evaluation_strategy="epoch",  # Keep this for evaluation at the end of each epoch
    logging_steps=100,  # Log less frequently than every step, but more often than every epoch
    save_strategy="epoch",  # Save checkpoints at the end of each epoch
    load_best_model_at_end=True,  # Load best model based on evaluation loss
    metric_for_best_model='eval_loss',  # Track evaluation loss for best model
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

#trainer.train()


  trainer = Trainer(
  1%|          | 119/11025 [12:16:45<1125:22:25, 371.48s/it]


In [63]:
#trainer.evaluate()


In [64]:

train_output = trainer.train()  # Returns a TrainOutput object
train_metrics = train_output.metrics  # Contains train_runtime, train_loss, etc.

# Assume trainer.evaluate() has been run and returned results
eval_results = trainer.evaluate()  # Contains eval_runtime, eval_accuracy, etc.

# Combine training and evaluation metrics
#all_metrics = {**train_metrics, **eval_results}
metrics = {**eval_results}

# Create a Polars DataFrame
df = pl.DataFrame(metrics)
df

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  1%|          | 100/11025 [00:36<1:06:37,  2.73it/s]

{'loss': 0.5642, 'grad_norm': 2.3384995460510254, 'learning_rate': 1.981859410430839e-05, 'epoch': 0.03}


  2%|▏         | 200/11025 [01:13<1:06:03,  2.73it/s]

{'loss': 0.5272, 'grad_norm': 15.648771286010742, 'learning_rate': 1.963718820861678e-05, 'epoch': 0.05}


  3%|▎         | 300/11025 [01:49<1:05:22,  2.73it/s]

{'loss': 0.5822, 'grad_norm': 1.468311071395874, 'learning_rate': 1.945578231292517e-05, 'epoch': 0.08}


  4%|▎         | 400/11025 [02:26<1:04:53,  2.73it/s]

{'loss': 0.5128, 'grad_norm': 1.9529191255569458, 'learning_rate': 1.9274376417233563e-05, 'epoch': 0.11}


  5%|▍         | 500/11025 [03:03<1:04:26,  2.72it/s]

{'loss': 0.4616, 'grad_norm': 3.1888108253479004, 'learning_rate': 1.9092970521541953e-05, 'epoch': 0.14}


  5%|▌         | 600/11025 [03:39<1:03:42,  2.73it/s]

{'loss': 0.4432, 'grad_norm': 12.392617225646973, 'learning_rate': 1.8911564625850343e-05, 'epoch': 0.16}


  6%|▋         | 700/11025 [04:16<1:03:10,  2.72it/s]

{'loss': 0.453, 'grad_norm': 2.466106414794922, 'learning_rate': 1.8730158730158732e-05, 'epoch': 0.19}


  7%|▋         | 800/11025 [04:53<1:02:33,  2.72it/s]

{'loss': 0.4374, 'grad_norm': 11.236799240112305, 'learning_rate': 1.8548752834467122e-05, 'epoch': 0.22}


  8%|▊         | 900/11025 [05:30<1:01:58,  2.72it/s]

{'loss': 0.436, 'grad_norm': 6.2112321853637695, 'learning_rate': 1.836734693877551e-05, 'epoch': 0.24}


  9%|▉         | 1000/11025 [06:06<1:01:17,  2.73it/s]

{'loss': 0.4448, 'grad_norm': 10.407325744628906, 'learning_rate': 1.81859410430839e-05, 'epoch': 0.27}


 10%|▉         | 1100/11025 [06:43<1:00:38,  2.73it/s]

{'loss': 0.4364, 'grad_norm': 3.7003865242004395, 'learning_rate': 1.8004535147392294e-05, 'epoch': 0.3}


 11%|█         | 1200/11025 [07:20<1:00:05,  2.72it/s]

{'loss': 0.3919, 'grad_norm': 1.5171334743499756, 'learning_rate': 1.782312925170068e-05, 'epoch': 0.33}


 12%|█▏        | 1300/11025 [07:56<1:00:10,  2.69it/s]

{'loss': 0.434, 'grad_norm': 8.173460960388184, 'learning_rate': 1.7641723356009073e-05, 'epoch': 0.35}


 13%|█▎        | 1400/11025 [08:33<58:49,  2.73it/s]  

{'loss': 0.4301, 'grad_norm': 3.084192991256714, 'learning_rate': 1.7460317460317463e-05, 'epoch': 0.38}


 14%|█▎        | 1500/11025 [09:10<58:15,  2.73it/s]

{'loss': 0.4271, 'grad_norm': 4.467639923095703, 'learning_rate': 1.7278911564625852e-05, 'epoch': 0.41}


 15%|█▍        | 1600/11025 [09:46<57:39,  2.72it/s]

{'loss': 0.4014, 'grad_norm': 4.073261260986328, 'learning_rate': 1.7097505668934242e-05, 'epoch': 0.44}


 15%|█▌        | 1700/11025 [19:27<3:49:59,  1.48s/it]   

{'loss': 0.4436, 'grad_norm': 5.532982349395752, 'learning_rate': 1.691609977324263e-05, 'epoch': 0.46}


 16%|█▋        | 1800/11025 [20:04<56:55,  2.70it/s]  

{'loss': 0.4133, 'grad_norm': 4.643570423126221, 'learning_rate': 1.673469387755102e-05, 'epoch': 0.49}


 17%|█▋        | 1900/11025 [20:41<55:35,  2.74it/s]  

{'loss': 0.385, 'grad_norm': 2.6342008113861084, 'learning_rate': 1.655328798185941e-05, 'epoch': 0.52}


 18%|█▊        | 2000/11025 [21:18<55:01,  2.73it/s]

{'loss': 0.4044, 'grad_norm': 13.557908058166504, 'learning_rate': 1.63718820861678e-05, 'epoch': 0.54}


 19%|█▉        | 2100/11025 [21:56<54:47,  2.72it/s]  

{'loss': 0.3847, 'grad_norm': 4.300295352935791, 'learning_rate': 1.6190476190476193e-05, 'epoch': 0.57}


 20%|█▉        | 2200/11025 [22:32<54:23,  2.70it/s]

{'loss': 0.4002, 'grad_norm': 3.9787437915802, 'learning_rate': 1.6009070294784583e-05, 'epoch': 0.6}


 21%|██        | 2300/11025 [23:09<53:35,  2.71it/s]

{'loss': 0.4527, 'grad_norm': 8.525747299194336, 'learning_rate': 1.5827664399092972e-05, 'epoch': 0.63}


 22%|██▏       | 2400/11025 [23:46<53:06,  2.71it/s]

{'loss': 0.435, 'grad_norm': 8.526215553283691, 'learning_rate': 1.5646258503401362e-05, 'epoch': 0.65}


 23%|██▎       | 2500/11025 [24:23<52:22,  2.71it/s]

{'loss': 0.3802, 'grad_norm': 28.95743751525879, 'learning_rate': 1.546485260770975e-05, 'epoch': 0.68}


 24%|██▎       | 2600/11025 [25:00<51:44,  2.71it/s]

{'loss': 0.4411, 'grad_norm': 7.04342794418335, 'learning_rate': 1.528344671201814e-05, 'epoch': 0.71}


 24%|██▍       | 2700/11025 [25:37<51:14,  2.71it/s]

{'loss': 0.3894, 'grad_norm': 6.365671157836914, 'learning_rate': 1.510204081632653e-05, 'epoch': 0.73}


 25%|██▌       | 2800/11025 [26:14<50:30,  2.71it/s]

{'loss': 0.4057, 'grad_norm': 3.6063196659088135, 'learning_rate': 1.4920634920634922e-05, 'epoch': 0.76}


 26%|██▋       | 2900/11025 [26:51<50:00,  2.71it/s]

{'loss': 0.405, 'grad_norm': 4.066201210021973, 'learning_rate': 1.4739229024943311e-05, 'epoch': 0.79}


 27%|██▋       | 3000/11025 [27:28<49:15,  2.71it/s]

{'loss': 0.3891, 'grad_norm': 2.5872621536254883, 'learning_rate': 1.4557823129251703e-05, 'epoch': 0.82}


 28%|██▊       | 3100/11025 [28:05<48:49,  2.71it/s]

{'loss': 0.3744, 'grad_norm': 1.2128922939300537, 'learning_rate': 1.4376417233560092e-05, 'epoch': 0.84}


 29%|██▉       | 3200/11025 [28:42<48:09,  2.71it/s]

{'loss': 0.3647, 'grad_norm': 3.478449821472168, 'learning_rate': 1.4195011337868484e-05, 'epoch': 0.87}


 30%|██▉       | 3300/11025 [29:19<47:48,  2.69it/s]

{'loss': 0.3944, 'grad_norm': 4.648478984832764, 'learning_rate': 1.4013605442176872e-05, 'epoch': 0.9}


 31%|███       | 3400/11025 [29:56<46:52,  2.71it/s]

{'loss': 0.4202, 'grad_norm': 2.7207529544830322, 'learning_rate': 1.3832199546485261e-05, 'epoch': 0.93}


 32%|███▏      | 3500/11025 [30:33<46:13,  2.71it/s]

{'loss': 0.3414, 'grad_norm': 0.6616837978363037, 'learning_rate': 1.3650793650793652e-05, 'epoch': 0.95}


 33%|███▎      | 3600/11025 [31:09<45:40,  2.71it/s]

{'loss': 0.4137, 'grad_norm': 8.376431465148926, 'learning_rate': 1.3469387755102042e-05, 'epoch': 0.98}


 33%|███▎      | 3675/11025 [31:37<45:16,  2.71it/s]
 33%|███▎      | 3675/11025 [34:02<45:16,  2.71it/s]

{'eval_loss': 0.34722602367401123, 'eval_accuracy': 0.8602380952380952, 'eval_f1': 0.8602380952380952, 'eval_precision': 0.8602380952380952, 'eval_recall': 0.8602380952380952, 'eval_runtime': 145.2868, 'eval_samples_per_second': 86.725, 'eval_steps_per_second': 10.841, 'epoch': 1.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 34%|███▎      | 3700/11025 [34:14<46:07,  2.65it/s]   

{'loss': 0.4049, 'grad_norm': 1.6981748342514038, 'learning_rate': 1.3287981859410433e-05, 'epoch': 1.01}


 34%|███▍      | 3800/11025 [34:51<44:27,  2.71it/s]

{'loss': 0.3576, 'grad_norm': 0.6789193153381348, 'learning_rate': 1.3106575963718821e-05, 'epoch': 1.03}


 35%|███▌      | 3900/11025 [35:28<43:42,  2.72it/s]

{'loss': 0.3267, 'grad_norm': 11.099424362182617, 'learning_rate': 1.2925170068027212e-05, 'epoch': 1.06}


 36%|███▋      | 4000/11025 [36:05<43:10,  2.71it/s]

{'loss': 0.3286, 'grad_norm': 4.872696399688721, 'learning_rate': 1.2743764172335602e-05, 'epoch': 1.09}


 37%|███▋      | 4100/11025 [36:42<42:38,  2.71it/s]

{'loss': 0.3947, 'grad_norm': 6.930663108825684, 'learning_rate': 1.2562358276643992e-05, 'epoch': 1.12}


 38%|███▊      | 4200/11025 [37:18<41:51,  2.72it/s]

{'loss': 0.3097, 'grad_norm': 7.130961894989014, 'learning_rate': 1.2380952380952383e-05, 'epoch': 1.14}


 39%|███▉      | 4300/11025 [37:55<41:15,  2.72it/s]

{'loss': 0.3374, 'grad_norm': 2.6293985843658447, 'learning_rate': 1.219954648526077e-05, 'epoch': 1.17}


 40%|███▉      | 4400/11025 [38:32<40:38,  2.72it/s]

{'loss': 0.3487, 'grad_norm': 10.807917594909668, 'learning_rate': 1.2018140589569162e-05, 'epoch': 1.2}


 41%|████      | 4500/11025 [39:09<40:10,  2.71it/s]

{'loss': 0.2714, 'grad_norm': 13.368829727172852, 'learning_rate': 1.1836734693877552e-05, 'epoch': 1.22}


 42%|████▏     | 4600/11025 [39:46<39:24,  2.72it/s]

{'loss': 0.3488, 'grad_norm': 12.660006523132324, 'learning_rate': 1.1655328798185943e-05, 'epoch': 1.25}


 43%|████▎     | 4700/11025 [40:23<38:53,  2.71it/s]

{'loss': 0.3389, 'grad_norm': 3.8340837955474854, 'learning_rate': 1.1473922902494332e-05, 'epoch': 1.28}


 44%|████▎     | 4800/11025 [41:00<38:17,  2.71it/s]

{'loss': 0.3701, 'grad_norm': 5.896657466888428, 'learning_rate': 1.1292517006802722e-05, 'epoch': 1.31}


 44%|████▍     | 4900/11025 [41:37<37:39,  2.71it/s]

{'loss': 0.3346, 'grad_norm': 3.722095489501953, 'learning_rate': 1.1111111111111113e-05, 'epoch': 1.33}


 45%|████▌     | 5000/11025 [42:14<37:00,  2.71it/s]

{'loss': 0.3358, 'grad_norm': 5.105600833892822, 'learning_rate': 1.0929705215419501e-05, 'epoch': 1.36}


 46%|████▋     | 5100/11025 [42:50<36:22,  2.72it/s]

{'loss': 0.3445, 'grad_norm': 14.764114379882812, 'learning_rate': 1.0748299319727893e-05, 'epoch': 1.39}


 47%|████▋     | 5200/11025 [43:27<36:11,  2.68it/s]

{'loss': 0.3287, 'grad_norm': 17.771944046020508, 'learning_rate': 1.0566893424036282e-05, 'epoch': 1.41}


 48%|████▊     | 5300/11025 [44:04<35:08,  2.72it/s]

{'loss': 0.3749, 'grad_norm': 0.7998957633972168, 'learning_rate': 1.0385487528344672e-05, 'epoch': 1.44}


 49%|████▉     | 5400/11025 [44:41<34:45,  2.70it/s]

{'loss': 0.2946, 'grad_norm': 23.48276138305664, 'learning_rate': 1.0204081632653063e-05, 'epoch': 1.47}


 50%|████▉     | 5500/11025 [45:18<33:57,  2.71it/s]

{'loss': 0.3394, 'grad_norm': 9.033584594726562, 'learning_rate': 1.0022675736961451e-05, 'epoch': 1.5}


 51%|█████     | 5600/11025 [45:55<33:18,  2.71it/s]

{'loss': 0.3193, 'grad_norm': 5.216367721557617, 'learning_rate': 9.841269841269842e-06, 'epoch': 1.52}


 52%|█████▏    | 5700/11025 [46:32<32:40,  2.72it/s]

{'loss': 0.3435, 'grad_norm': 4.628591060638428, 'learning_rate': 9.659863945578232e-06, 'epoch': 1.55}


 53%|█████▎    | 5800/11025 [47:09<32:03,  2.72it/s]

{'loss': 0.3136, 'grad_norm': 5.819119930267334, 'learning_rate': 9.478458049886621e-06, 'epoch': 1.58}


 54%|█████▎    | 5900/11025 [47:45<31:29,  2.71it/s]

{'loss': 0.3571, 'grad_norm': 2.2997682094573975, 'learning_rate': 9.297052154195013e-06, 'epoch': 1.61}


 54%|█████▍    | 6000/11025 [48:22<30:53,  2.71it/s]

{'loss': 0.3386, 'grad_norm': 4.412736892700195, 'learning_rate': 9.115646258503402e-06, 'epoch': 1.63}


 55%|█████▌    | 6100/11025 [48:59<30:24,  2.70it/s]

{'loss': 0.3089, 'grad_norm': 3.178201675415039, 'learning_rate': 8.934240362811792e-06, 'epoch': 1.66}


 56%|█████▌    | 6200/11025 [49:36<29:35,  2.72it/s]

{'loss': 0.3738, 'grad_norm': 4.705409049987793, 'learning_rate': 8.752834467120183e-06, 'epoch': 1.69}


 57%|█████▋    | 6300/11025 [50:13<29:06,  2.71it/s]

{'loss': 0.3306, 'grad_norm': 4.535904407501221, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.71}


 58%|█████▊    | 6400/11025 [50:50<28:42,  2.69it/s]

{'loss': 0.3291, 'grad_norm': 3.2317323684692383, 'learning_rate': 8.390022675736962e-06, 'epoch': 1.74}


 59%|█████▉    | 6500/11025 [51:28<27:47,  2.71it/s]

{'loss': 0.3297, 'grad_norm': 0.8524833917617798, 'learning_rate': 8.208616780045352e-06, 'epoch': 1.77}


 60%|█████▉    | 6600/11025 [52:05<27:54,  2.64it/s]

{'loss': 0.3172, 'grad_norm': 10.946956634521484, 'learning_rate': 8.027210884353741e-06, 'epoch': 1.8}


 61%|██████    | 6700/11025 [52:43<26:39,  2.70it/s]

{'loss': 0.3468, 'grad_norm': 4.7197418212890625, 'learning_rate': 7.845804988662133e-06, 'epoch': 1.82}


 62%|██████▏   | 6800/11025 [53:20<25:52,  2.72it/s]

{'loss': 0.3357, 'grad_norm': 6.027444839477539, 'learning_rate': 7.664399092970522e-06, 'epoch': 1.85}


 63%|██████▎   | 6900/11025 [53:57<25:14,  2.72it/s]

{'loss': 0.3185, 'grad_norm': 0.7744088172912598, 'learning_rate': 7.482993197278913e-06, 'epoch': 1.88}


 63%|██████▎   | 7000/11025 [54:33<24:44,  2.71it/s]

{'loss': 0.328, 'grad_norm': 11.245203971862793, 'learning_rate': 7.301587301587301e-06, 'epoch': 1.9}


 64%|██████▍   | 7100/11025 [55:10<24:03,  2.72it/s]

{'loss': 0.3518, 'grad_norm': 1.8758736848831177, 'learning_rate': 7.120181405895692e-06, 'epoch': 1.93}


 65%|██████▌   | 7200/11025 [55:47<23:27,  2.72it/s]

{'loss': 0.3562, 'grad_norm': 11.788519859313965, 'learning_rate': 6.938775510204082e-06, 'epoch': 1.96}


 66%|██████▌   | 7300/11025 [56:24<23:13,  2.67it/s]

{'loss': 0.3439, 'grad_norm': 0.6973892450332642, 'learning_rate': 6.757369614512473e-06, 'epoch': 1.99}


 67%|██████▋   | 7350/11025 [56:43<22:32,  2.72it/s]
 67%|██████▋   | 7350/11025 [59:08<22:32,  2.72it/s]

{'eval_loss': 0.37213829159736633, 'eval_accuracy': 0.8688888888888889, 'eval_f1': 0.8688888888888889, 'eval_precision': 0.8688888888888889, 'eval_recall': 0.8688888888888889, 'eval_runtime': 145.534, 'eval_samples_per_second': 86.578, 'eval_steps_per_second': 10.822, 'epoch': 2.0}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
 67%|██████▋   | 7400/11025 [59:29<22:10,  2.72it/s]   

{'loss': 0.3208, 'grad_norm': 1.186288833618164, 'learning_rate': 6.575963718820862e-06, 'epoch': 2.01}


 68%|██████▊   | 7500/11025 [1:00:06<21:35,  2.72it/s]

{'loss': 0.2767, 'grad_norm': 0.14747169613838196, 'learning_rate': 6.394557823129253e-06, 'epoch': 2.04}


 69%|██████▉   | 7600/11025 [1:00:43<20:59,  2.72it/s]

{'loss': 0.2686, 'grad_norm': 27.704206466674805, 'learning_rate': 6.2131519274376415e-06, 'epoch': 2.07}


 70%|██████▉   | 7700/11025 [1:01:20<20:20,  2.72it/s]

{'loss': 0.3182, 'grad_norm': 0.252187579870224, 'learning_rate': 6.031746031746032e-06, 'epoch': 2.1}


 71%|███████   | 7800/11025 [1:01:57<19:45,  2.72it/s]

{'loss': 0.2844, 'grad_norm': 15.414360046386719, 'learning_rate': 5.850340136054422e-06, 'epoch': 2.12}


 72%|███████▏  | 7900/11025 [1:02:33<19:07,  2.72it/s]

{'loss': 0.2705, 'grad_norm': 5.5390238761901855, 'learning_rate': 5.668934240362812e-06, 'epoch': 2.15}


 73%|███████▎  | 8000/11025 [1:03:10<18:34,  2.71it/s]

{'loss': 0.2916, 'grad_norm': 15.586450576782227, 'learning_rate': 5.487528344671202e-06, 'epoch': 2.18}


 73%|███████▎  | 8100/11025 [1:03:47<17:57,  2.72it/s]

{'loss': 0.3024, 'grad_norm': 21.718416213989258, 'learning_rate': 5.306122448979593e-06, 'epoch': 2.2}


 74%|███████▍  | 8200/11025 [1:04:24<17:18,  2.72it/s]

{'loss': 0.2579, 'grad_norm': 5.794596195220947, 'learning_rate': 5.124716553287983e-06, 'epoch': 2.23}


 75%|███████▌  | 8300/11025 [1:05:00<16:42,  2.72it/s]

{'loss': 0.3339, 'grad_norm': 4.677666664123535, 'learning_rate': 4.943310657596373e-06, 'epoch': 2.26}


 76%|███████▌  | 8400/11025 [1:05:37<16:01,  2.73it/s]

{'loss': 0.3004, 'grad_norm': 5.421273708343506, 'learning_rate': 4.761904761904762e-06, 'epoch': 2.29}


 77%|███████▋  | 8500/11025 [1:06:14<15:35,  2.70it/s]

{'loss': 0.2872, 'grad_norm': 9.169305801391602, 'learning_rate': 4.580498866213152e-06, 'epoch': 2.31}


 78%|███████▊  | 8600/11025 [1:06:51<14:59,  2.70it/s]

{'loss': 0.3187, 'grad_norm': 0.3572438657283783, 'learning_rate': 4.399092970521542e-06, 'epoch': 2.34}


 79%|███████▉  | 8700/11025 [1:07:28<14:20,  2.70it/s]

{'loss': 0.2935, 'grad_norm': 4.875082015991211, 'learning_rate': 4.217687074829933e-06, 'epoch': 2.37}


 80%|███████▉  | 8800/11025 [1:08:05<13:50,  2.68it/s]

{'loss': 0.2992, 'grad_norm': 8.535686492919922, 'learning_rate': 4.036281179138322e-06, 'epoch': 2.39}


 81%|████████  | 8900/11025 [1:08:42<13:04,  2.71it/s]

{'loss': 0.2633, 'grad_norm': 6.415994644165039, 'learning_rate': 3.854875283446712e-06, 'epoch': 2.42}


 82%|████████▏ | 9000/11025 [1:09:19<12:26,  2.71it/s]

{'loss': 0.2803, 'grad_norm': 7.441958427429199, 'learning_rate': 3.6734693877551024e-06, 'epoch': 2.45}


 83%|████████▎ | 9100/11025 [1:09:56<11:49,  2.71it/s]

{'loss': 0.2915, 'grad_norm': 23.626115798950195, 'learning_rate': 3.492063492063492e-06, 'epoch': 2.48}


 83%|████████▎ | 9200/11025 [1:10:33<11:12,  2.72it/s]

{'loss': 0.3585, 'grad_norm': 0.6630282998085022, 'learning_rate': 3.3106575963718824e-06, 'epoch': 2.5}


 84%|████████▍ | 9300/11025 [1:11:10<10:34,  2.72it/s]

{'loss': 0.2724, 'grad_norm': 1.598374366760254, 'learning_rate': 3.1292517006802725e-06, 'epoch': 2.53}


 85%|████████▌ | 9400/11025 [1:11:46<09:57,  2.72it/s]

{'loss': 0.2619, 'grad_norm': 12.990674018859863, 'learning_rate': 2.947845804988662e-06, 'epoch': 2.56}


 86%|████████▌ | 9500/11025 [1:12:23<09:28,  2.68it/s]

{'loss': 0.2849, 'grad_norm': 3.587864637374878, 'learning_rate': 2.7664399092970525e-06, 'epoch': 2.59}


 87%|████████▋ | 9600/11025 [1:13:00<08:42,  2.73it/s]

{'loss': 0.2996, 'grad_norm': 0.6649580597877502, 'learning_rate': 2.5850340136054425e-06, 'epoch': 2.61}


 88%|████████▊ | 9700/11025 [1:13:37<08:08,  2.71it/s]

{'loss': 0.3211, 'grad_norm': 0.2486201524734497, 'learning_rate': 2.4036281179138325e-06, 'epoch': 2.64}


 89%|████████▉ | 9800/11025 [1:14:14<07:30,  2.72it/s]

{'loss': 0.3019, 'grad_norm': 6.439983367919922, 'learning_rate': 2.222222222222222e-06, 'epoch': 2.67}


 90%|████████▉ | 9900/11025 [1:14:50<06:53,  2.72it/s]

{'loss': 0.2892, 'grad_norm': 0.2587137222290039, 'learning_rate': 2.0408163265306125e-06, 'epoch': 2.69}


 91%|█████████ | 10000/11025 [1:15:28<06:17,  2.72it/s]

{'loss': 0.2977, 'grad_norm': 1.3160762786865234, 'learning_rate': 1.8594104308390023e-06, 'epoch': 2.72}


 92%|█████████▏| 10100/11025 [1:16:04<05:39,  2.72it/s]

{'loss': 0.2561, 'grad_norm': 3.244680166244507, 'learning_rate': 1.6780045351473925e-06, 'epoch': 2.75}


 93%|█████████▎| 10200/11025 [1:16:41<05:03,  2.72it/s]

{'loss': 0.3216, 'grad_norm': 10.350027084350586, 'learning_rate': 1.4965986394557825e-06, 'epoch': 2.78}


 93%|█████████▎| 10300/11025 [1:17:18<04:26,  2.72it/s]

{'loss': 0.2802, 'grad_norm': 17.438474655151367, 'learning_rate': 1.3151927437641723e-06, 'epoch': 2.8}


 94%|█████████▍| 10400/11025 [1:17:55<03:49,  2.72it/s]

{'loss': 0.2922, 'grad_norm': 11.688016891479492, 'learning_rate': 1.1337868480725626e-06, 'epoch': 2.83}


 95%|█████████▌| 10500/11025 [1:18:31<03:13,  2.72it/s]

{'loss': 0.3016, 'grad_norm': 22.057931900024414, 'learning_rate': 9.523809523809525e-07, 'epoch': 2.86}


 96%|█████████▌| 10600/11025 [1:19:08<02:35,  2.72it/s]

{'loss': 0.2495, 'grad_norm': 14.829246520996094, 'learning_rate': 7.709750566893425e-07, 'epoch': 2.88}


 97%|█████████▋| 10700/11025 [1:19:45<01:59,  2.72it/s]

{'loss': 0.3276, 'grad_norm': 16.814266204833984, 'learning_rate': 5.895691609977325e-07, 'epoch': 2.91}


 98%|█████████▊| 10800/11025 [1:20:22<01:22,  2.73it/s]

{'loss': 0.2694, 'grad_norm': 0.498172402381897, 'learning_rate': 4.0816326530612243e-07, 'epoch': 2.94}


 99%|█████████▉| 10900/11025 [1:20:58<00:46,  2.71it/s]

{'loss': 0.3488, 'grad_norm': 6.004455089569092, 'learning_rate': 2.267573696145125e-07, 'epoch': 2.97}


100%|█████████▉| 11000/11025 [1:21:35<00:09,  2.72it/s]

{'loss': 0.2798, 'grad_norm': 1.0642284154891968, 'learning_rate': 4.53514739229025e-08, 'epoch': 2.99}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

100%|██████████| 11025/11025 [1:24:12<00:00,  2.72it/s]

{'eval_loss': 0.46143072843551636, 'eval_accuracy': 0.8668253968253968, 'eval_f1': 0.8668253968253968, 'eval_precision': 0.8668253968253968, 'eval_recall': 0.8668253968253968, 'eval_runtime': 145.0987, 'eval_samples_per_second': 86.837, 'eval_steps_per_second': 10.855, 'epoch': 3.0}


100%|██████████| 11025/11025 [1:24:15<00:00,  2.18it/s]
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'train_runtime': 5055.2793, 'train_samples_per_second': 17.447, 'train_steps_per_second': 2.181, 'train_loss': 0.3528726281060113, 'epoch': 3.0}


100%|██████████| 1575/1575 [02:25<00:00, 10.81it/s]


eval_loss,eval_accuracy,eval_f1,eval_precision,eval_recall,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
f64,f64,f64,f64,f64,f64,f64,f64,f64
0.347226,0.860238,0.860238,0.860238,0.860238,145.968,86.32,10.79,3.0
