In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
from datasets import load_dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [5]:
# Checkpoint directory that both the model and the tokenizer are loaded from.
# NOTE(review): absolute, machine-specific path — the notebook cannot run elsewhere without editing it.
# NOTE(review): TrainingArguments.output_dir further down uses the same directory name
# ("bge-small-en-finetuned-oversampled"); if the notebook runs from the "training" folder,
# newly saved checkpoints may overwrite this one — confirm.
model_ckpt = "/teamspace/studios/this_studio/Automatic-exam-scoring/training/bge-small-en-finetuned-oversampled/checkpoint-1000"

In [6]:
# CSV file backing each split. Every file is read through the "csv" builder,
# which exposes its rows under a split called "train"; the rows are then
# re-keyed here under the role they actually play.
split_files = {
    "train": "../data/train_oversampled.csv",
    "validation": "../data/val_oversampled.csv",
    "test": "../data/test.csv",
}

data = DatasetDict(
    {
        split_name: load_dataset("csv", data_files=csv_path, split="train")
        for split_name, csv_path in split_files.items()
    }
)

In [7]:
# Inspect the freshly loaded splits: column names and row counts per split.
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'corrected_text', 'length', 'ratio_err', 'labels', 'is_generated'],
        num_rows: 14803
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'corrected_text', 'length', 'ratio_err', 'labels', 'is_generated'],
        num_rows: 3701
    })
    test: Dataset({
        features: ['corrected_text', 'length', 'ratio_err', 'labels'],
        num_rows: 3462
    })
})

In [8]:
# Columns that are dropped before training. Not every split has all of them
# (the test split shown above lacks "Unnamed: 0" and "is_generated").
UNUSED_COLUMNS = ["Unnamed: 0", "length", "ratio_err", "is_generated"]

# Drop only the columns a split actually contains: `remove_columns` raises on
# unknown names, so filtering first keeps this cell safe to re-run.
for split_name in list(data):
    present = [
        column for column in UNUSED_COLUMNS
        if column in data[split_name].column_names
    ]
    if present:
        data[split_name] = data[split_name].remove_columns(present)

In [9]:
# Confirm that every split now exposes the same columns.
data

DatasetDict({
    train: Dataset({
        features: ['corrected_text', 'labels'],
        num_rows: 14803
    })
    validation: Dataset({
        features: ['corrected_text', 'labels'],
        num_rows: 3701
    })
    test: Dataset({
        features: ['corrected_text', 'labels'],
        num_rows: 3462
    })
})

In [10]:
# Rename the essay column to "text", the name the `tokenize` cell reads.
# `rename_column` only rewrites column metadata, instead of pushing every row
# through a Python lambda as a per-example `map` would.
# The guard makes the cell a no-op when it is re-run after the rename.
if "corrected_text" in data["train"].column_names:
    data = data.rename_column("corrected_text", "text")

In [11]:
# Check the column names after the rename.
data

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 14803
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 3701
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 3462
    })
})

In [12]:
# Load the checkpoint with a 6-class sequence-classification head.
# No explicit `.to("cuda")`: Trainer moves the model to the device selected by
# its TrainingArguments, so this cell also works on machines without a GPU.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=6,
)

In [13]:
# Load the tokenizer from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [14]:
def tokenize(batch):
    """
    Tokenize the "text" column of a batch of examples.

    Args:
        batch: a batch as passed by `Dataset.map(..., batched=True)`; only
            its "text" entry is read.
    Returns:
        The tokenizer output for the batch, truncated but not padded.
    """
    # Padding is deliberately left out. Padding here would stretch every
    # example to the longest one in the whole map batch; the
    # DataCollatorWithPadding given to the Trainer pads each training batch
    # to its own longest example instead.
    return tokenizer(batch["text"], truncation=True)

In [15]:
# Run `tokenize` over every split in batches; the tokenizer outputs are added
# as new columns next to the existing ones.
data = data.map(tokenize, batched=True)

In [16]:
def _shift_label(example):
    """Return the example's "labels" value decreased by one."""
    return {"labels": example["labels"] - 1}


# NOTE(review): presumably maps scores 1..6 onto the class ids 0..5 expected by
# a head built with num_labels=6 — confirm against the raw data.
# Re-running this cell shifts the labels a second time.
data = data.map(_shift_label)

In [17]:
# Collator handed to the Trainer; it pads the examples of each batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
def compute_metrics(preds: EvalPrediction):
    """
    Compute classification metrics for one evaluation pass.

    Args:
        preds {EvalPrediction}: the predictions from the model; `predictions`
            holds the per-class scores (or a tuple whose first element does)
            and `label_ids` the reference class ids.
    Returns:
        dict: accuracy, MCC, and precision / recall / F1 under "weighted",
            "macro" and "micro" averaging, keyed "<average>_<metric>".
    """

    labels = preds.label_ids

    # EvalPrediction.predictions may be a tuple of arrays; the class scores
    # are taken from its first element.
    scores = preds.predictions
    if isinstance(scores, tuple):
        scores = scores[0]
    predicted = scores.argmax(-1)

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    def averaged(average):
        # zero_division=0 yields the value sklearn already falls back to when
        # a class is never predicted, without warning on every evaluation.
        return {
            f"{average}_{metric}": scorer(
                labels, predicted, average=average, zero_division=0
            )
            for metric, scorer in scorers
        }

    # Insertion order matches the original so the columns of the training
    # log table stay in the same order.
    metrics = {"accuracy": accuracy_score(labels, predicted)}
    metrics.update(averaged("weighted"))
    metrics["mcc"] = matthews_corrcoef(labels, predicted)
    metrics.update(averaged("macro"))
    metrics.update(averaged("micro"))
    return metrics

In [19]:
# Hyperparameters consumed by the TrainingArguments cell.
BATCH_SIZE = 32  # used for both the per-device train and eval batch size
LR = 2e-5  # passed as TrainingArguments.learning_rate

In [20]:
# NOTE(review): output_dir has the same directory name as the parent of the
# checkpoint in `model_ckpt`; if both resolve to the same folder, checkpoints
# saved during this run may overwrite the one being fine-tuned — confirm.
training_args = TrainingArguments(
    # Output location and logging.
    output_dir="./bge-small-en-finetuned-oversampled",
    report_to="tensorboard",
    logging_steps=100,
    # Optimisation schedule.
    num_train_epochs=4,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes.
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate every 200 steps.
    evaluation_strategy="steps",
    eval_steps=200,
)

In [21]:
# Wire the model, data and metrics together. The validation split is what the
# periodic evaluations during training run on.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    compute_metrics=compute_metrics,
    args=training_args,
)

In [22]:
# Run fine-tuning; the table below reports the compute_metrics values at each evaluation step.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Weighted Precision,Weighted Recall,Weighted F1,Mcc,Macro Precision,Macro Recall,Macro F1,Micro Precision,Micro Recall,Micro F1
200,0.5372,0.557486,0.782761,0.792233,0.782761,0.784528,0.74089,0.79061,0.784143,0.784285,0.782761,0.782761,0.782761
400,0.4851,0.540861,0.777087,0.778463,0.777087,0.776563,0.733026,0.777842,0.777497,0.776432,0.777087,0.777087,0.777087
600,0.4541,0.55585,0.778168,0.79869,0.778168,0.784984,0.735069,0.797096,0.778401,0.784368,0.778168,0.778168,0.778168
800,0.4425,0.557812,0.775466,0.789165,0.775466,0.780305,0.731298,0.787799,0.775847,0.779864,0.775466,0.775466,0.775466
1000,0.3818,0.564485,0.781681,0.79247,0.781681,0.784987,0.739014,0.790954,0.782419,0.784555,0.781681,0.781681,0.781681
1200,0.4214,0.554898,0.782221,0.794114,0.782221,0.786613,0.739362,0.792882,0.78234,0.786038,0.782221,0.782221,0.782221
1400,0.3953,0.544715,0.780059,0.78612,0.780059,0.780801,0.737153,0.785034,0.780555,0.780448,0.780059,0.780059,0.780059
1600,0.3684,0.550533,0.783572,0.788468,0.783572,0.785473,0.740526,0.787275,0.783684,0.784919,0.783572,0.783572,0.783572
1800,0.3616,0.55297,0.783842,0.790361,0.783842,0.786218,0.741018,0.789315,0.784251,0.785859,0.783842,0.783842,0.783842


TrainOutput(global_step=1852, training_loss=0.43635281218823296, metrics={'train_runtime': 1888.5596, 'train_samples_per_second': 31.353, 'train_steps_per_second': 0.981, 'total_flos': 3900736220258304.0, 'train_loss': 0.43635281218823296, 'epoch': 4.0})

In [23]:
# Score the trained model on the test split.
# NOTE(review): train/validation are read from "*_oversampled.csv" files while
# the test split is read from "test.csv", so validation and test metrics may
# not be directly comparable — confirm how the splits were built.
test_metrics = trainer.evaluate(eval_dataset=data["test"])
test_metrics

{'eval_loss': 0.9609768390655518,
 'eval_accuracy': 0.6031195840554593,
 'eval_weighted_precision': 0.6019493329938492,
 'eval_weighted_recall': 0.6031195840554593,
 'eval_weighted_f1': 0.5983203265683172,
 'eval_mcc': 0.4617024523430584,
 'eval_macro_precision': 0.5363805989455412,
 'eval_macro_recall': 0.4865621870958263,
 'eval_macro_f1': 0.4964570410129227,
 'eval_micro_precision': 0.6031195840554593,
 'eval_micro_recall': 0.6031195840554593,
 'eval_micro_f1': 0.6031195840554593,
 'eval_runtime': 31.123,
 'eval_samples_per_second': 111.236,
 'eval_steps_per_second': 3.502,
 'epoch': 4.0}