In [None]:
from pathlib import Path

import numpy as np
from datasets import Dataset, DatasetInfo

from utils.dataset_utils import get_datasets, systematic_sample
from utils.model_utils import POLITICAL_LEANING_NO_CENTER_LABEL_MAPPING, \
    POLITICAL_LEANING_WITH_CENTER_LABEL_MAPPING, finetune_custom_models

In [None]:
# Upper bound on the number of rows kept from each source dataset.
DATASET_SAMPLE_SIZE = 1_000


def _to_labelled_dataset(source):
    """Turn one entry yielded by ``get_datasets()`` into a Hugging Face ``Dataset``.

    Steps, in order:
      1. Group ``source.dataframe`` by its ``leaning`` column and pass each
         group (restricted to ``body`` and ``leaning``) to ``systematic_sample``
         together with an equal per-class quota.
      2. Keep at most ``DATASET_SAMPLE_SIZE`` rows of the combined result.
      3. Rename ``leaning`` to ``label`` and rename its categories through the
         with-center mapping when exactly three distinct labels are present,
         otherwise through the no-center mapping.

    :param source: object exposing ``.dataframe`` (a pandas DataFrame with
        ``body`` and a categorical ``leaning`` column) and ``.name``.
    :return: ``Dataset`` built from the sampled frame, with
        ``DatasetInfo.dataset_name`` set to ``source.name``.
    """
    frame = source.dataframe

    # Split the overall budget evenly across the classes; rounding up means the
    # combined sample can overshoot the budget, which ``head`` trims below.
    class_count = frame["leaning"].nunique()
    per_class_quota = int(np.ceil(DATASET_SAMPLE_SIZE / class_count))

    by_leaning = frame.groupby("leaning", group_keys=False, observed=True)[["body", "leaning"]]
    sampled = by_leaning.apply(lambda rows: systematic_sample(rows, per_class_quota))
    sampled = sampled.head(DATASET_SAMPLE_SIZE)

    labelled = sampled.rename(columns={"leaning": "label"})

    # The mapping is chosen from the labels actually present after sampling.
    if len(labelled["label"].unique()) == 3:
        mapping = POLITICAL_LEANING_WITH_CENTER_LABEL_MAPPING
    else:
        mapping = POLITICAL_LEANING_NO_CENTER_LABEL_MAPPING
    labelled["label"] = labelled["label"].cat.rename_categories(mapping)

    return Dataset.from_pandas(labelled, info=DatasetInfo(dataset_name=source.name))


datasets = []
for source in get_datasets():
    datasets.append(_to_labelled_dataset(source))

In [None]:
# Seeds passed to finetune_custom_models as its third and fourth arguments.
TRAINING_SEED = 37
DATA_SEED = 37

# Relative path handed to finetune_custom_models as its first argument.
benchmark_dir = Path("dataset_benchmark") / "leave_one_in"
finetune_custom_models(benchmark_dir, datasets, TRAINING_SEED, DATA_SEED)