In [None]:
from pathlib import Path

import numpy as np
from datasets import Dataset, DatasetInfo, concatenate_datasets

from utils.dataset_utils import get_datasets_for_leave_one_out_benchmark, systematic_sample
from utils.model_utils import POLITICAL_LEANING_WITH_CENTER_LABEL_MAPPING, finetune_custom_models

In [None]:
# Upper bound on the number of rows kept from each source dataset.
DATASET_SAMPLE_SIZE = 1_000


def _to_sampled_hf_dataset(source):
    """Turn one benchmark dataset into a sampled, label-encoded Hugging Face ``Dataset``.

    Every leaning group is asked for ``ceil(DATASET_SAMPLE_SIZE / number of leanings)``
    rows via ``systematic_sample``; the concatenated groups are then cut to at most
    ``DATASET_SAMPLE_SIZE`` rows. Only the ``body`` and ``leaning`` columns are kept,
    and ``leaning`` is renamed to ``label``.

    :param source: Object exposing ``.dataframe`` (a pandas frame with ``body`` and a
        categorical ``leaning`` column) and ``.name``.
    :return: ``Dataset`` whose ``info.dataset_name`` is ``source.name``.
    """
    articles = source.dataframe

    # The per-group quota is rounded up, so the groups together may slightly exceed
    # DATASET_SAMPLE_SIZE; the ``head`` call below trims the overshoot.
    leaning_count = articles["leaning"].nunique()
    per_leaning_quota = int(np.ceil(DATASET_SAMPLE_SIZE / leaning_count))

    def _draw(leaning_group):
        # presumably returns up to ``per_leaning_quota`` rows of the group -- TODO confirm
        return systematic_sample(leaning_group, per_leaning_quota)

    grouped = articles.groupby("leaning", group_keys=False, observed=True)[["body", "leaning"]]
    sampled = grouped.apply(_draw).head(DATASET_SAMPLE_SIZE)

    # Always map with center, otherwise the non-center datasets would produce wrong labels.
    labelled = sampled.rename(columns={"leaning": "label"}).assign(
        label=lambda frame: frame["label"].cat.rename_categories(POLITICAL_LEANING_WITH_CENTER_LABEL_MAPPING)
    )

    # NOTE(review): the sampled frame likely carries a non-range index into the Dataset
    # as an extra column -- check whether ``preserve_index=False`` is wanted here.
    return Dataset.from_pandas(labelled, info=DatasetInfo(dataset_name=source.name))


datasets_separate = [_to_sampled_hf_dataset(source) for source in get_datasets_for_leave_one_out_benchmark()]

# Leave-one-out: for each source dataset, concatenate every *other* dataset into one
# Dataset. The result carries the info (and thus the name) of the dataset left out.
datasets = []
for left_out_dataset in datasets_separate:
    held_out_name = left_out_dataset.info.dataset_name
    remaining = [
        candidate for candidate in datasets_separate
        if candidate.info.dataset_name != held_out_name
    ]
    datasets.append(concatenate_datasets(remaining, info=left_out_dataset.info))

In [None]:
# Seeds forwarded to the fine-tuning helper; kept as separate constants so the
# training seed and the data seed can be varied independently.
TRAINING_SEED = 37
DATA_SEED = 37

# presumably the directory the fine-tuning helper works in -- confirm against utils.model_utils
benchmark_path = Path("dataset_benchmark") / "leave_one_out"

finetune_custom_models(benchmark_path, datasets, TRAINING_SEED, DATA_SEED)