In [None]:
from itertools import chain
from pathlib import Path

import datasets
from datasets import DatasetInfo, concatenate_datasets
from transformers import IntervalStrategy

from utils import dataset_utils
from utils.dataset_utils import get_politicalness_datasets, \
    get_politicalness_datasets_from_leaning_datasets_for_leave_one_out_benchmark, \
    leaning_with_center_label_mapping
from utils.model_utils import finetune_models

In [None]:
# Task switch. When True, datasets are transformed with `leaning_with_center_label_mapping` and the training
# samples are taken with `take_balanced_class_distribution_sample`; when False, no label mapping is passed and
# samples are taken with `take_even_class_distribution_sample`.
TRAINING_POLITICAL_LEANING = False
# Last component of the path handed to `finetune_models`. It is derived from the task flag so that a
# politicalness run is not filed under the "political_leaning" directory.
RESULT_SUBDIRECTORY_NAME = "political_leaning" if TRAINING_POLITICAL_LEANING else "politicalness"
# Zero-argument callable yielding every dataset that takes part in the leave-one-out benchmark.
GET_DATASETS = lambda: chain(
    get_politicalness_datasets(),
    get_politicalness_datasets_from_leaning_datasets_for_leave_one_out_benchmark()
)
# Fraction of each dataset's rows that is sampled out and removed before evaluation/training data is drawn.
TEST_DATASET_SAMPLE_PART = 0.15
# Sample sizes requested for each evaluation dataset and for each dataset contributing to a training set.
EVAL_DATASET_SAMPLE_SIZE = 10_000
TRAIN_DATASET_SAMPLE_SIZE = 10_000

# Second argument of `take_balanced_class_distribution_sample`, looked up by the name of the left-out dataset.
CENTER_LEANING_CLASS_SIZE_MULTIPLIERS = {
    "article_bias_prediction": 5,
    "bignewsbln": 10,  # There are not enough center-leaning examples to balance this out.
    "commoncrawl_news_articles": 5.5,
    "gpt4_political_bias": 3.25,
    "gpt4_political_ideologies": 3,
    "qbias": 3.25,
}
# Used when the left-out dataset has no entry in the mapping above.
CENTER_LEANING_CLASS_SIZE_MULTIPLIER_DEFAULT = 2.6

# Materialize the datasets once; every later step iterates over this list.
whole_datasets = list(GET_DATASETS())

# Take a test portion out of every dataset so the evaluation and training samples drawn afterwards
# cannot contain those rows.
for whole_dataset in whole_datasets:
    test_sample_size = round(len(whole_dataset.dataframe) * TEST_DATASET_SAMPLE_PART)
    print(whole_dataset.name, test_sample_size)

    test_sample_index = whole_dataset.take_even_class_distribution_sample(test_sample_size).dataframe.index
    # Keep only the rows whose index label does not occur in the test sample.
    is_in_test_sample = whole_dataset.dataframe.index.isin(test_sample_index)
    whole_dataset.dataframe = whole_dataset.dataframe.loc[~is_in_test_sample]

# The label mapping is only passed when training political leaning; otherwise None is passed.
eval_label_mapping = leaning_with_center_label_mapping if TRAINING_POLITICAL_LEANING else None

# One evaluation dataset per entry of `whole_datasets`, in the same order:
# sample -> transform for inference -> convert to a Hugging Face dataset.
eval_datasets = [
    whole_dataset
    .take_even_class_distribution_sample(EVAL_DATASET_SAMPLE_SIZE)
    .transform_for_inference(eval_label_mapping)
    .to_huggingface()
    for whole_dataset in whole_datasets
]


def get_train_dataset(left_out_dataset: dataset_utils.Dataset) -> datasets.Dataset:
    """Build the training set for one leave-one-out fold.

    Every dataset in `whole_datasets` whose name differs from `left_out_dataset.name` is sampled,
    transformed for inference and converted to a Hugging Face dataset; the parts are then concatenated.
    The result's `DatasetInfo.dataset_name` is set to the left-out dataset's name. The left-out name and
    the per-label row counts of the result are printed.

    :param left_out_dataset: The dataset excluded from this fold's training data.
    :return: The concatenation of the sampled remaining datasets.
    """
    # The multiplier is selected by the name of the left-out dataset, so it is the same for the whole fold.
    center_class_size_multiplier = CENTER_LEANING_CLASS_SIZE_MULTIPLIERS.get(
        left_out_dataset.name,
        CENTER_LEANING_CLASS_SIZE_MULTIPLIER_DEFAULT,
    )

    train_datasets_separate = []
    for dataset in whole_datasets:
        if dataset.name == left_out_dataset.name:
            continue

        if TRAINING_POLITICAL_LEANING:
            dataset = dataset.take_balanced_class_distribution_sample(
                TRAIN_DATASET_SAMPLE_SIZE,
                center_class_size_multiplier,
            )
            dataset = dataset.transform_for_inference(leaning_with_center_label_mapping)
        else:
            # The sample is the return value (it is used that way elsewhere in this notebook), so it has
            # to be assigned. Discarding it would train on the whole dataset and ignore
            # TRAIN_DATASET_SAMPLE_SIZE.
            dataset = dataset.take_even_class_distribution_sample(TRAIN_DATASET_SAMPLE_SIZE)
            dataset = dataset.transform_for_inference()

        train_datasets_separate.append(dataset.to_huggingface())

    train_dataset = concatenate_datasets(
        train_datasets_separate,
        info=DatasetInfo(dataset_name=left_out_dataset.name)
    )
    # Show which fold this is and how many rows each label has.
    print(left_out_dataset.name)
    print(train_dataset.to_pandas().groupby("label").count())
    return train_dataset


# One training set per leave-one-out fold, in the same order as `whole_datasets`.
train_datasets = list(map(get_train_dataset, whole_datasets))

In [None]:
# Seeds and evaluation strategy handed to `finetune_models`.
TRAINING_SEED = 37
DATA_SEED = 37
EVAL_STRATEGY = IntervalStrategy.EPOCH

# Path passed as the first argument; presumably where `finetune_models` stores its results — TODO confirm.
benchmark_path = Path("dataset_benchmark") / "leave_one_out" / RESULT_SUBDIRECTORY_NAME

finetune_models(
    benchmark_path,
    train_datasets,
    eval_datasets,
    EVAL_STRATEGY,
    TRAINING_SEED,
    DATA_SEED,
)