## Training DeBERTa 4 over 90% of transformed SciFact dataset

In [None]:
import numpy as np
import pandas as pd
import time

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

import datasets
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback, IntervalStrategy

In [7]:
with open('./SciFact_train.csv', 'rb') as fh:
    df_train = pd.read_csv(fh)
df_train.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
with open('./SciFact_valid.csv', 'rb') as fh:
    df_valid = pd.read_csv(fh)
df_valid.drop('Unnamed: 0', axis=1, inplace=True)

In [9]:
with open('./SciFact_test.csv', 'rb') as fh:
    df_test = pd.read_csv(fh)
df_test.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
df_train_90 = pd.concat([df_train, df_test], ignore_index=True)
df_train_90

Unnamed: 0,claim,abstract,label
0,Angiotensin converting enzyme inhibitors are a...,Renal considerations in angiotensin converting...,1
1,Reducing H3k4me3 methylation induces mouse epi...,MLL1 Inhibition Reprograms Epiblast Stem Cells...,1
2,Expression of oncolytic virus antigens as pept...,Detecting and targeting tumor relapse by its r...,0
3,Varenicline monotherapy is more effective afte...,Combination varenicline and bupropion SR for t...,2
4,Acute ablation of Snail in the embryonic corte...,Control of Apoptosis by Asymmetric Cell Divisi...,0
...,...,...,...
1088,Charcoal shows no benefit for acute paraquat p...,Effect of activated charcoal hemoperfusion on ...,1
1089,The risk of breast cancer among parous women d...,Pregnancy characteristics and maternal risk of...,2
1090,Inositol lipid 3-phosphatase PTEN converts Ptd...,"PTEN Regulates PI(3,4)P2 Signaling Downstream ...",1
1091,Combination nicotine replacement therapies wit...,Combination varenicline and bupropion SR for t...,1


In [11]:
df_valid['text'] = tokenizer.cls_token + df_valid['claim'] + tokenizer.sep_token + df_valid['abstract'] + tokenizer.sep_token
df_valid.drop(columns=['claim', 'abstract'], inplace=True)

df_train_90['text'] = tokenizer.cls_token + df_train_90['claim'] + tokenizer.sep_token + df_train_90['abstract'] + tokenizer.sep_token
df_train_90.drop(columns=['claim', 'abstract'], inplace=True)

In [12]:
konacno_train = Dataset.from_pandas(df_train_90)
konacno_valid = Dataset.from_pandas(df_valid)

In [13]:
dataset_dict_90 = datasets.DatasetDict({
    'train': konacno_train,
    'valid': konacno_valid
})

In [15]:
tokenized_90 = dataset_dict_90.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
tokenized_90

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1093
    })
    valid: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120
    })
})

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
def compute_metrics(p):    
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='weighted')
    precision = precision_score(y_true=labels, y_pred=pred, average='weighted')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')    
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [17]:
training_args = TrainingArguments(
    output_dir="./DeBERTa_4_90_new",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    save_total_limit = 5,
    weight_decay=0.01,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = 50,
    #save_strategy="epoch",
    metric_for_best_model = 'f1',
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_90['train'],
    eval_dataset=tokenized_90['valid'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

In [None]:
start_time = time.time()
trainer.train()
total_time = time.time()-start_time

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
50,No log,1.066213,0.366667,0.134444,0.366667,0.196748
100,No log,0.988397,0.483333,0.513614,0.483333,0.369491
150,No log,1.057728,0.575,0.631869,0.575,0.557665
200,No log,0.777534,0.708333,0.56119,0.708333,0.624557
250,No log,0.923023,0.616667,0.55904,0.616667,0.543764
300,No log,0.770434,0.625,0.490835,0.625,0.541522
350,No log,0.640513,0.75,0.595095,0.75,0.66185
400,No log,0.727374,0.65,0.6415,0.65,0.638998
450,No log,0.706437,0.691667,0.720506,0.691667,0.649547
500,0.859000,0.635349,0.8,0.813652,0.8,0.792877


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
trainer.save_model("DeBERTa_early4_90_new")