In [None]:

from pathlib import Path

import evaluate
from transformers import IntervalStrategy

from utils.dataset_utils import get_leaning_datasets
from utils.model_utils import finetune_models

In [None]:
# --- Configuration -----------------------------------------------------------
# Name of the sub-directory that identifies this benchmark's results.
RESULT_SUBDIRECTORY_NAME = "political_leaning"
# Zero-argument callable returning the datasets to process. Bound directly to
# the loader rather than wrapped in a lambda that only forwards the call.
GET_DATASETS = get_leaning_datasets
# Sizes requested from take_even_class_distribution_sample() for each split.
TRAIN_DATASET_SAMPLE_SIZE = 1_000
EVAL_DATASET_SAMPLE_SIZE = 100

# --- Build per-dataset train/eval splits -------------------------------------
# The two lists stay index-aligned: entry i of each comes from the same dataset.
train_datasets = []
eval_datasets = []
for dataset in GET_DATASETS():
    eval_sample = dataset.take_even_class_distribution_sample(EVAL_DATASET_SAMPLE_SIZE)
    # Record which source rows were held out *before* transforming the sample,
    # so the exclusion below does not rely on transform_for_inference()
    # preserving the dataframe index.
    held_out_index = eval_sample.dataframe.index
    eval_dataset = eval_sample.transform_for_inference()

    # Remove the eval sample from the source dataframe so the training sample
    # drawn next cannot contain any of the held-out rows.
    # NOTE(review): assumes the source dataframe index is unique -- confirm.
    is_held_out = dataset.dataframe.index.isin(held_out_index)
    dataset.dataframe = dataset.dataframe.loc[~is_held_out]

    train_dataset = dataset.take_even_class_distribution_sample(TRAIN_DATASET_SAMPLE_SIZE)
    train_dataset = train_dataset.transform_for_inference()

    # Skip datasets containing only one class.
    if train_dataset.dataframe["label"].nunique() >= 2:
        train_datasets.append(train_dataset.to_huggingface())
        eval_datasets.append(eval_dataset.to_huggingface())

In [None]:
# Seeds and evaluation schedule handed to finetune_models() below.
TRAINING_SEED = 37
DATA_SEED = 37
EVAL_STRATEGY = IntervalStrategy.EPOCH

# Path passed as the first argument: dataset_benchmark/leave_one_in/<benchmark name>.
output_directory = Path("dataset_benchmark") / "leave_one_in" / RESULT_SUBDIRECTORY_NAME

# Arguments are passed positionally, in the same order as before.
finetune_models(
    output_directory,
    train_datasets,
    eval_datasets,
    EVAL_STRATEGY,
    TRAINING_SEED,
    DATA_SEED,
)