In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from datasets import load_dataset


# One checkpoint supplies both the classifier weights and its tokenizer.
CHECKPOINT = "MateuszW/classifier-distilbert"

# Sequence classifier configured with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)

# Matching tokenizer, with its model_max_length set to 512.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT, model_max_length=512)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook-level tokenizer.

    The tokenizer is called with truncation enabled and the ``"max_length"``
    padding strategy, so it is meant to be used with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch object exposing a ``"text"`` entry.

    Returns:
        The object returned by the tokenizer call for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# CSV files inside the dataset repository, keyed by split name.
clf_data_files = {
    "train": "clf_data/train.csv",
    "validation": "clf_data/val.csv",
}

# Load both splits, then tokenize them batch-wise with tokenize_function.
raw_dataset = load_dataset("MateuszW/spoiler_generation", data_files=clf_data_files)
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)

In [None]:
# Inference-only Trainer: no training or evaluation dataset is attached
# (both constructor parameters default to None); it is used only for predict().
trainer = Trainer(model=model)

In [None]:
# Number of validation examples. Taking len() of the split itself avoids
# reading the whole "label" column just to count it.
len(tokenized_dataset["validation"])

In [None]:
# Run the model over the tokenized validation split.
validation_split = tokenized_dataset["validation"]
pred = trainer.predict(test_dataset=validation_split)

In [None]:
from sklearn.metrics import balanced_accuracy_score

# Compare the reference labels with the highest-scoring class index per row.
validation_labels = tokenized_dataset["validation"]["label"]
predicted_classes = pred.predictions.argmax(axis=1)
balanced_accuracy_score(validation_labels, predicted_classes)

In [None]:
from collections import Counter

# Count how often each label value occurs in the validation split.
label_column = tokenized_dataset["validation"]["label"]
Counter(label_column)