In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
from datasets import load_dataset, DatasetDict
import torch
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

In [2]:
model_ckpt = "Alibaba-NLP/gte-base-en-v1.5"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
print(device)

cuda


In [6]:
dataset = DatasetDict({
    "train": load_dataset("csv", data_files="data/train_under.csv", split="train"),
    "validation": load_dataset("csv", data_files="data/val_under.csv", split="train"),
    "test": load_dataset("csv", data_files="data/test.csv", split="train"),
})

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['corrected_text', 'length', 'ratio_err', 'labels'],
        num_rows: 8923
    })
    validation: Dataset({
        features: ['corrected_text', 'length', 'ratio_err', 'labels'],
        num_rows: 2231
    })
    test: Dataset({
        features: ['corrected_text', 'length', 'ratio_err', 'labels'],
        num_rows: 3462
    })
})


In [8]:
dataset = dataset.map(
    lambda x: {
        "text" : x["corrected_text"]
    },
    remove_columns = ["corrected_text", "length", "ratio_err"]
)

In [9]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['labels', 'text'],
        num_rows: 8923
    })
    validation: Dataset({
        features: ['labels', 'text'],
        num_rows: 2231
    })
    test: Dataset({
        features: ['labels', 'text'],
        num_rows: 3462
    })
})


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [11]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [12]:
dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/2231 [00:00<?, ? examples/s]

In [13]:
tokenizer.tokenize("Hello world, how are u ?", add_special_tokens=True)

['[CLS]', 'hello', 'world', ',', 'how', 'are', 'u', '?', '[SEP]']

In [14]:
tokenizer.pad_token

'[PAD]'

In [15]:
dataset = dataset.map(lambda x: {"labels": x["labels"] - 1})

Map:   0%|          | 0/2231 [00:00<?, ? examples/s]

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           trust_remote_code=True,
                                                           num_labels=6).to(device)

Some weights of NewForSequenceClassification were not initialized from the model checkpoint at Alibaba-NLP/gte-base-en-v1.5 and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
print(model)

NewForSequenceClassification(
  (new): NewModel(
    (embeddings): NewEmbeddings(
      (word_embeddings): Embedding(30528, 768, padding_idx=0)
      (rotary_emb): NTKScalingRotaryEmbedding()
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): NewEncoder(
      (layer): ModuleList(
        (0-11): 12 x NewLayer(
          (attention): NewAttention(
            (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (o_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (mlp): NewGatedMLP(
            (up_gate_proj): Linear(in_features=768, out_features=6144, bias=False)
            (down_proj): Linear(in_features=3072, out_features=768, bias=True)
            (act_fn): GELUActivation()
            (hidden_dropout): Dropout(p=0.1, inplace=False)
          )
          (attn_ln): LayerNorm((768,), 

In [18]:
def compute_metrics(preds: EvalPrediction):
    """
    Compute metrics for the task

    Args:
        preds {EvalPrediction}: the predictions from the model
    Returns:
        dict: a dictionary of metrics
    """
    
    labels = preds.label_ids
    preds = preds.predictions.argmax(-1)

    acc = accuracy_score(labels, preds)
    weighted_precision = precision_score(labels, preds, average="weighted")
    weighted_recall = recall_score(labels, preds, average="weighted")
    weighted_f1 = f1_score(labels, preds, average="weighted")
    mcc = matthews_corrcoef(labels, preds)

    macro_precision = precision_score(labels, preds, average="macro")
    macro_recall = recall_score(labels, preds, average="macro")
    macro_f1 = f1_score(labels, preds, average="macro")

    micro_precision = precision_score(labels, preds, average="micro")
    micro_recall = recall_score(labels, preds, average="micro")
    micro_f1 = f1_score(labels, preds, average="micro")

    return {
        "accuracy": acc,
        "weighted_precision": weighted_precision,
        "weighted_recall": weighted_recall,
        "weighted_f1": weighted_f1,
        "mcc": mcc,
        "macro_precision": macro_precision,
        "macro_recall": macro_recall,
        "macro_f1": macro_f1,
        "micro_precision": micro_precision,
        "micro_recall": micro_recall,
        "micro_f1": micro_f1
    }

In [33]:
BATCH_SIZE = 32
LR = 2e-5

training_args = TrainingArguments(
    output_dir="./training/gte-base-rating-fine-tuned",
    num_train_epochs=6,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    learning_rate=LR,
    evaluation_strategy="steps",
    eval_steps=200,
    report_to="tensorboard",
    lr_scheduler_type="cosine"
)

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [30]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.55 GiB. GPU 0 has a total capacity of 22.17 GiB of which 462.38 MiB is free. Process 14916 has 21.71 GiB memory in use. Of the allocated memory 20.14 GiB is allocated by PyTorch, and 1.35 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)