In [None]:
from pathlib import Path

import evaluate
from datasets import Dataset, DatasetInfo
from transformers import IntervalStrategy

from utils.dataset_utils import get_datasets, take_sample, transform_train_labels
from utils.model_utils import POLITICAL_LEANING_NO_CENTER_LABEL_MAPPING, \
    POLITICAL_LEANING_WITH_CENTER_LABEL_MAPPING, finetune_custom_models

In [None]:
# Number of rows sampled per source dataset for training and for evaluation.
TRAIN_DATASET_SAMPLE_SIZE = 1_000
EVAL_DATASET_SAMPLE_SIZE = 100

train_datasets = []
eval_datasets = []
for dataset in get_datasets():
    dataframe = dataset.dataframe

    # Use the with-center mapping only when the "leaning" column holds exactly
    # three distinct values; otherwise fall back to the no-center mapping.
    distinct_leanings = dataframe["leaning"].unique()
    if len(distinct_leanings) == 3:
        label_mapping = POLITICAL_LEANING_WITH_CENTER_LABEL_MAPPING
    else:
        label_mapping = POLITICAL_LEANING_NO_CENTER_LABEL_MAPPING

    # The eval sample is drawn first, from the full dataframe.
    eval_sample = take_sample(dataframe, EVAL_DATASET_SAMPLE_SIZE)
    eval_dataframe = transform_train_labels(eval_sample, label_mapping)

    # Drop every row whose index appears in the eval sample before drawing the
    # train sample, so the two samples do not share index values.
    # NOTE(review): relies on the helpers keeping the original index — confirm.
    is_eval_row = dataframe.index.isin(eval_dataframe.index)
    dataframe = dataframe.loc[~is_eval_row]

    train_sample = take_sample(dataframe, TRAIN_DATASET_SAMPLE_SIZE)
    train_dataframe = transform_train_labels(train_sample, label_mapping)

    # Both splits carry the source dataset's name in their DatasetInfo.
    train_split = Dataset.from_pandas(train_dataframe, info=DatasetInfo(dataset_name=dataset.name))
    train_datasets.append(train_split)
    eval_split = Dataset.from_pandas(eval_dataframe, info=DatasetInfo(dataset_name=dataset.name))
    eval_datasets.append(eval_split)

In [None]:
# Evaluation configuration handed to the fine-tuning helper.
# NOTE(review): presumably accuracy is computed once per epoch — confirm in
# utils.model_utils.finetune_custom_models.
EVAL_STRATEGY = IntervalStrategy.EPOCH
METRIC = evaluate.load("accuracy")

# Fixed seeds (same value for both) passed through to the helper.
DATA_SEED = 37
TRAINING_SEED = 37

# First argument is the path dataset_benchmark/leave_one_in
# (presumably the output location — verify against the helper).
finetune_custom_models(
    Path("dataset_benchmark") / "leave_one_in",
    train_datasets,
    eval_datasets,
    METRIC,
    EVAL_STRATEGY,
    TRAINING_SEED,
    DATA_SEED,
)