In [None]:
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from utils.base_directory import BASE_DIRECTORY
from utils.datasets import get_datasets, systematic_sample
from utils.models import DATASET_BENCHMARK_MODEL_NAMES, POLITICAL_LEANING_LABEL_MAPPING, \
    POLITICAL_LEANING_NO_CENTER_LABEL_MAPPING

In [None]:
# Upper bound on the number of rows kept from each dataset.
DATASET_SAMPLE_SIZE = 1_000

datasets = []
dataset_names = []
for dataset in get_datasets():
    dataframe = dataset.dataframe
    dataset_names.append(dataset.name)

    # Request the same number of rows for every observed leaning. Rounding up can overshoot the target by a
    # few rows, which `head` trims off again (the trimmed rows come from the last group).
    # NOTE(review): assumes `systematic_sample(group, n)` returns at most `n` rows of `group` -- confirm.
    n_samples = int(np.ceil(DATASET_SAMPLE_SIZE / dataframe["leaning"].nunique()))
    dataframe = (dataframe.groupby("leaning", group_keys=False, observed=True)[["body", "leaning"]]
                 .apply(lambda group: systematic_sample(group, n_samples))
                 .head(DATASET_SAMPLE_SIZE))

    dataframe = dataframe.rename(columns={"leaning": "label"})
    # Samples with three distinct leanings use the mapping that includes a center class, all others the
    # mapping without it.
    label_mapping = POLITICAL_LEANING_LABEL_MAPPING if len(dataframe["label"].unique()) == 3 \
        else POLITICAL_LEANING_NO_CENTER_LABEL_MAPPING
    dataframe["label"] = dataframe["label"].cat.rename_categories(label_mapping)

    # The sampled frame still carries the row labels of the source frame. By default `from_pandas` would store
    # such a non-range index as an extra `__index_level_0__` column; drop it so the dataset only holds
    # "body" and "label".
    datasets.append(Dataset.from_pandas(dataframe, preserve_index=False))

In [None]:
# Token length every document is truncated or padded to before training.
MODELS_MAX_LENGTH = 512

for model_name in DATASET_BENCHMARK_MODEL_NAMES:
    print(f"fine-tuning {model_name} on:")

    # The tokenizer depends only on the base model, so it is loaded once and shared across all datasets.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    for dataset, dataset_name in zip(datasets, dataset_names):
        # Reload the pre-trained weights for every dataset so each fine-tuning run starts from the same state.
        # The classification head gets one output per distinct label in the dataset.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(dataset.unique("label")))
        print(f"  {dataset_name}")
        # <base>/models_custom/dataset_benchmark/<model name without organisation prefix>/<dataset name>
        output_directory = (BASE_DIRECTORY
                            / "models_custom"
                            / "dataset_benchmark"
                            / model_name.split("/")[-1]
                            / dataset_name)

        tokenized_dataset = dataset.map(
            lambda batch: tokenizer(batch["body"], max_length=MODELS_MAX_LENGTH, truncation=True, padding="max_length"),
            batched=True
        )
        training_arguments = TrainingArguments(
            auto_find_batch_size=True,
            save_strategy="no",  # no intermediate checkpoints; the final model is saved explicitly below
            output_dir=output_directory
        )
        trainer = Trainer(
            model=model,
            args=training_arguments,
            train_dataset=tokenized_dataset
        )
        trainer.train()
        trainer.save_model(output_directory)
        # The Trainer was not given the tokenizer, so `save_model` only writes the model files. Store the
        # tokenizer next to them so the output directory can be loaded on its own.
        tokenizer.save_pretrained(output_directory)