In [72]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
from datasets import load_dataset, DatasetDict
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [53]:
model_ckpt = "BAAI/bge-small-en"

In [54]:
data = DatasetDict(
    {
        "train": load_dataset("csv", data_files="../data/train_oversampled.csv", split="train"),
        "validation": load_dataset("csv", data_files="../data/val_oversampled.csv", split="train"),
        "test": load_dataset("csv", data_files="../data/test.csv", split="train")
    }
)

In [55]:
data

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'corrected_text', 'length', 'ratio_err', 'labels', 'is_generated'],
        num_rows: 14803
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'corrected_text', 'length', 'ratio_err', 'labels', 'is_generated'],
        num_rows: 3701
    })
    test: Dataset({
        features: ['corrected_text', 'length', 'ratio_err', 'labels'],
        num_rows: 3462
    })
})

In [56]:
data["train"] = data["train"].remove_columns(["Unnamed: 0", "length", "ratio_err", "is_generated"])
data["validation"] = data["validation"].remove_columns(["Unnamed: 0", "length", "ratio_err", "is_generated"])
data["test"] = data["test"].remove_columns(["length", "ratio_err"])

In [57]:
data

DatasetDict({
    train: Dataset({
        features: ['corrected_text', 'labels'],
        num_rows: 14803
    })
    validation: Dataset({
        features: ['corrected_text', 'labels'],
        num_rows: 3701
    })
    test: Dataset({
        features: ['corrected_text', 'labels'],
        num_rows: 3462
    })
})

In [58]:
data = data.map(lambda x: 
    {
        "text": x["corrected_text"]
    },
    remove_columns=["corrected_text"]
)

Map:   0%|          | 0/14803 [00:00<?, ? examples/s]

Map:   0%|          | 0/3701 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

In [59]:
data

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 14803
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 3701
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 3462
    })
})

In [60]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           num_labels=6).to("mps")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at BAAI/bge-small-en and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [62]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding=True)

In [63]:
data = data.map(tokenize, batched=True)

Map:   0%|          | 0/14803 [00:00<?, ? examples/s]

Map:   0%|          | 0/3701 [00:00<?, ? examples/s]

Map:   0%|          | 0/3462 [00:00<?, ? examples/s]

In [64]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [73]:
def compute_metrics(preds: EvalPrediction):
    """
    Compute metrics for the task

    Args:
        preds {EvalPrediction}: the predictions from the model
    Returns:
        dict: a dictionary of metrics
    """
    
    labels = preds.label_ids
    preds = preds.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    weighted_precision = precision_score(labels, preds, average="weighted")
    weighted_recall = recall_score(labels, preds, average="weighted")
    weighted_f1 = f1_score(labels, preds, average="weighted")
    mcc = matthews_corrcoef(labels, preds)

    macro_precision = precision_score(labels, preds, average="macro")
    macro_recall = recall_score(labels, preds, average="macro")
    macro_f1 = f1_score(labels, preds, average="macro")

    micro_precision = precision_score(labels, preds, average="micro")
    micro_recall = recall_score(labels, preds, average="micro")
    micro_f1 = f1_score(labels, preds, average="micro")

    return {
        "accuracy": acc,
        "weighted_precision": weighted_precision,
        "weighted_recall": weighted_recall,
        "weighted_f1": weighted_f1,
        "mcc": mcc,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1
    }

In [95]:
BATCH_SIZE = 32
LS = len(data["train"]) // BATCH_SIZE
LR = 2e-5

In [97]:
training_args = TrainingArguments(
    output_dir="./bge-small-en-finetuned-oversampled",
    num_train_epochs=6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=LS,
    learning_rate=LR,
    evaluation_strategy="steps",
    eval_steps=LS,
    # load_best_model_at_end=True,
    report_to="tensorboard",
    lr_scheduler_type="cosine"
)

In [98]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    eval_dataset=data["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [100]:
trainer.train()