In [1]:
import math

import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from transformers.trainer_utils import EvalPrediction
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from lib.trainer import get_trainer
from utils import (
    prepare_starncoder_tokenizer,
    get_latest_checkpoint,
    concat_tokens_to_chunks,
)
from config import BaseConfig, MLMConfig

In [None]:
# Checkpoints: the classifier weights and the tokenizer are loaded from the
# same Hub repository.
MODEL_CHECKPOINT = "neuralsentry/starencoder-finetuned-class"
TOKENIZER_CHECKPOINT = "neuralsentry/starencoder-finetuned-class"

# Prepare Models
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
tokenizer = prepare_starncoder_tokenizer(TOKENIZER_CHECKPOINT)

# Prepare Config
model_name = "starencoder-finetuned-class"
# NOTE(review): HUB_MODEL_ID below resolves to the same repo id as
# MODEL_CHECKPOINT, so any training run with PUSH_TO_HUB=True would push to
# the repo the weights were loaded from -- confirm this is intended.
# NOTE(review): MLMConfig is used here for a sequence-classification setup;
# confirm it carries the intended defaults (LEARNING_RATE, TRAIN_SIZE, ...).
config = MLMConfig(
    NUM_TRAIN_EPOCHS=20,
    BATCH_SIZE=32,
    MAX_INPUT_LENGTH=512,
    PUSH_TO_HUB=True,
    HUB_MODEL_ID=f"neuralsentry/{model_name}",
)

# Prepare Datasets
def _tokenize_commit_messages(batch):
    """Tokenize a batch of commit messages, truncating to the configured length."""
    return tokenizer(
        batch["commit_msg"],
        truncation=True,
        max_length=config.MAX_INPUT_LENGTH,
    )


raw_dataset = load_dataset("neuralsentry/git-commits-labelled", split="train")

# Tokenize in batches and drop the raw text / metadata columns listed below.
tokenized_dataset = raw_dataset.map(
    _tokenize_commit_messages,
    batched=True,
    remove_columns=["commit_msg", "sha", "remote_url", "date"],
)

# Fixed seed keeps the train/eval partition identical across runs.
split_dataset = tokenized_dataset.train_test_split(
    train_size=config.TRAIN_SIZE,
    test_size=config.EVAL_SIZE,
    seed=420,
)
train_dataset, eval_dataset = split_dataset["train"], split_dataset["test"]

# Prepare Training
# Pads each batch dynamically using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Logging interval = full batches per epoch scaled by config.LOGGING_STEPS
# (presumably a fraction of an epoch -- confirm against config.py).
steps_per_epoch = len(train_dataset) // config.BATCH_SIZE
# Clamp to at least 1: on a small training split the product floors to 0, and
# TrainingArguments rejects logging_steps=0 with the default "steps" strategy.
logging_steps = max(1, math.floor(steps_per_epoch * config.LOGGING_STEPS))

training_args = TrainingArguments(
    output_dir=f"./models/{model_name}",
    overwrite_output_dir=True,
    learning_rate=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    fp16=config.FP16,
    evaluation_strategy="epoch",
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    logging_steps=logging_steps,
    save_strategy="epoch",
    push_to_hub=config.PUSH_TO_HUB,
    hub_model_id=config.HUB_MODEL_ID,
)


def compute_metrics(pred: EvalPrediction):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (the logits, or a
            tuple whose first element is the logits) and ``label_ids``.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``, each
        computed by scikit-learn with its default averaging.
    """
    logits = pred.predictions
    # Predictions arrive as a tuple only when the model emits extra outputs
    # besides the logits; otherwise they are a bare (samples, classes) array.
    # Indexing [0] unconditionally would pick the first sample's row instead.
    if isinstance(logits, tuple):
        logits = logits[0]

    preds = np.argmax(logits, axis=-1)
    labels = pred.label_ids

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=preds),
        "precision": precision_score(y_true=labels, y_pred=preds),
        "recall": recall_score(y_true=labels, y_pred=preds),
        "f1": f1_score(y_true=labels, y_pred=preds),
    }


# Gather everything the Trainer needs in one place, then build it.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "data_collator": data_collator,
    "tokenizer": tokenizer,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

In [5]:
# Evaluate the loaded checkpoint on eval_dataset; the returned metrics dict is
# the cell's displayed output (loss plus the compute_metrics values).
trainer.evaluate()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 0.4350101947784424,
 'eval_accuracy': 0.8961038961038961,
 'eval_precision': 0.8604651162790697,
 'eval_recall': 0.9487179487179487,
 'eval_f1': 0.9024390243902439,
 'eval_runtime': 0.676,
 'eval_samples_per_second': 113.905,
 'eval_steps_per_second': 4.438}