In [None]:
from pathlib import Path

from transformers import IntervalStrategy

from utils.dataset_utils import get_leaning_datasets
from utils.model_utils import finetune_models

In [None]:
# --- Configuration ---------------------------------------------------------

# Sub-directory name used for this benchmark's results.
RESULT_SUBDIRECTORY_NAME = "political_leaning"
# Zero-argument callable returning the datasets to split. The loader is
# referenced directly rather than wrapped in an equivalent lambda.
GET_DATASETS = get_leaning_datasets
# Fractions handed to `take_even_class_sample_by_fraction` for the held-out
# test and eval samples, and the size handed to
# `take_even_class_sample_by_size` for the training sample.
TEST_DATASET_SAMPLE_FRACTION = 0.15
EVAL_DATASET_SAMPLE_FRACTION = 0.15
TRAIN_DATASET_SAMPLE_SIZE = 2_000


def _drop_sampled_rows(dataset, sample):
    """Remove the rows of `sample` from `dataset.dataframe` in place.

    Rows are matched by index label, so `sample.dataframe` must still carry
    the index labels it had when it was drawn from `dataset.dataframe`.
    """
    sampled_labels = sample.dataframe.index
    dataset.dataframe = dataset.dataframe.loc[~dataset.dataframe.index.isin(sampled_labels)]


# --- Split -----------------------------------------------------------------

# Materialise the datasets once; both loops below mutate these same objects.
whole_datasets = list(GET_DATASETS())

# NOTE(review): no random seed is set in the visible code; if the sampling
# helpers are stochastic, confirm that they are seeded elsewhere.

# Pass 1: carve a test sample out of every dataset so that none of its rows can
# be drawn for eval or training below. The sample itself is not used further in
# this cell.
for dataset in whole_datasets:
    test_dataset = dataset.take_even_class_sample_by_fraction(TEST_DATASET_SAMPLE_FRACTION)
    _drop_sampled_rows(dataset, test_dataset)

# Pass 2: build the paired eval/train datasets. Because pass 1 already shrank
# each dataframe, the eval fraction applies to the rows that remain after the
# test split, not to the original dataset.
train_datasets = []
eval_datasets = []
for dataset in whole_datasets:
    eval_sample = dataset.take_even_class_sample_by_fraction(EVAL_DATASET_SAMPLE_FRACTION)

    # Remove the eval rows using the *untransformed* sample, exactly as the
    # test split does. `transform_for_inference` is defined elsewhere; taking
    # the index before calling it means the exclusion cannot be thrown off if
    # that transform re-indexes the frame.
    _drop_sampled_rows(dataset, eval_sample)
    eval_dataset = eval_sample.transform_for_inference()

    train_dataset = (
        dataset.take_even_class_sample_by_size(TRAIN_DATASET_SAMPLE_SIZE)
        .transform_for_inference()
    )

    # Skip datasets whose training sample contains fewer than two classes. The
    # eval dataset is dropped together with it so both lists stay aligned
    # index-for-index.
    if train_dataset.dataframe["label"].nunique() >= 2:
        train_datasets.append(train_dataset.to_huggingface())
        eval_datasets.append(eval_dataset.to_huggingface())

In [None]:
# Results directory: dataset_benchmark/leave_one_in/<RESULT_SUBDIRECTORY_NAME>.
results_directory = Path("dataset_benchmark") / "leave_one_in" / RESULT_SUBDIRECTORY_NAME

# Hand the results directory and the paired train/eval dataset lists to the
# project's fine-tuning helper.
finetune_models(results_directory, train_datasets, eval_datasets)