In [1]:
from itertools import chain
from pathlib import Path

import datasets
from datasets import DatasetInfo, concatenate_datasets
from transformers import IntervalStrategy

from utils import dataset_utils
from utils.dataset_utils import get_politicalness_datasets, \
    get_politicalness_datasets_from_leaning_datasets_for_leave_one_out_benchmark, \
    leaning_with_center_label_mapping
from utils.model_utils import finetune_models

In [2]:
# Selects the sampling / label-mapping path used by get_eval_dataset and
# get_train_dataset below; False means the politicalness path.
TRAINING_POLITICAL_LEANING = False
# Subdirectory of "training" that is passed to finetune_models.
RESULT_SUBDIRECTORY_NAME = "politicalness"


def GET_DATASETS():
    """Return a lazy iterator over every source dataset used in this run.

    The datasets from ``get_politicalness_datasets()`` come first, followed by
    those from
    ``get_politicalness_datasets_from_leaning_datasets_for_leave_one_out_benchmark()``.
    A fresh ``itertools.chain`` is built on every call, so the result can be
    consumed only once per call.
    """
    # A named function (rather than a lambda bound to a name) shows up as
    # GET_DATASETS in tracebacks and can carry this docstring.
    return chain(
        get_politicalness_datasets(),
        get_politicalness_datasets_from_leaning_datasets_for_leave_one_out_benchmark()
    )


# Passed to take_even_class_sample_by_fraction for every dataset; the rows of
# that sample are removed before any eval/train sampling happens.
TEST_DATASET_SAMPLE_FRACTION = 0.15
# Passed to take_even_class_sample_by_size once per dataset (eval sample).
EVAL_DATASET_SAMPLE_SIZE = 100
# Passed to the train sampling call once per dataset.
TRAIN_DATASET_SAMPLE_SIZE = 10_000

# Second argument of take_balanced_class_sample_by_size; only used when
# TRAINING_POLITICAL_LEANING is True.
CENTER_LEANING_CLASS_TRAIN_SIZE_MULTIPLIER = 4.5

def _without_rows(frame, excluded_index):
    """Return the rows of ``frame`` whose index label is not in ``excluded_index``."""
    return frame.loc[~frame.index.isin(excluded_index)]


# Materialise the iterator once: the same dataset objects are narrowed in place
# here and then reused for the eval and train sampling further down.
whole_datasets = [*GET_DATASETS()]

for dataset in whole_datasets:
    # Draw the test sample only to learn which rows to hold out; the sample
    # itself is not used anywhere else in the visible cells.
    test_dataset = dataset.take_even_class_sample_by_fraction(TEST_DATASET_SAMPLE_FRACTION)
    # Remove the held-out rows so later sampling cannot pick them.
    dataset.dataframe = _without_rows(dataset.dataframe, test_dataset.dataframe.index)


def get_eval_dataset(whole_dataset: dataset_utils.Dataset) -> datasets.Dataset:
    """Draw the evaluation sample from ``whole_dataset`` and return it as a Hugging Face dataset.

    The sample comes from ``take_even_class_sample_by_size`` with
    ``EVAL_DATASET_SAMPLE_SIZE``. Side effect: ``whole_dataset.dataframe`` is
    reassigned to a frame without the sampled rows, so later sampling from the
    same object cannot return them again.

    :param whole_dataset: Source dataset wrapper; its ``dataframe`` attribute is replaced.
    :return: The sample after ``transform_for_inference(...).to_huggingface()``.
    """
    sample = whole_dataset.take_even_class_sample_by_size(EVAL_DATASET_SAMPLE_SIZE)

    # Drop the sampled rows from the source, matching on index labels.
    source_frame = whole_dataset.dataframe
    in_sample = source_frame.index.isin(sample.dataframe.index)
    whole_dataset.dataframe = source_frame.loc[~in_sample]

    # The label mapping is only supplied on the political-leaning path.
    if TRAINING_POLITICAL_LEANING:
        label_mapping = leaning_with_center_label_mapping
    else:
        label_mapping = None

    return sample.transform_for_inference(label_mapping).to_huggingface()


# One eval sample per source dataset, joined into a single dataset.
# get_eval_dataset also removes each sample's rows from its source dataset.
eval_dataset = concatenate_datasets(
    list(map(get_eval_dataset, whole_datasets)),
    info=DatasetInfo(dataset_name="eval"),
)


def get_train_dataset(whole_dataset: dataset_utils.Dataset) -> datasets.Dataset:
    """Draw the training sample from ``whole_dataset`` and return it as a Hugging Face dataset.

    With ``TRAINING_POLITICAL_LEANING`` unset, the sample comes from
    ``take_even_class_sample_by_size(TRAIN_DATASET_SAMPLE_SIZE)`` and is
    transformed without a label mapping. Otherwise it comes from
    ``take_balanced_class_sample_by_size`` (size plus
    ``CENTER_LEANING_CLASS_TRAIN_SIZE_MULTIPLIER``) and is transformed with
    ``leaning_with_center_label_mapping``.

    Unlike ``get_eval_dataset``, this function does not reassign
    ``whole_dataset.dataframe``.

    :param whole_dataset: Source dataset wrapper to sample from.
    :return: The transformed sample after ``to_huggingface()``.
    """
    if not TRAINING_POLITICAL_LEANING:
        # Politicalness path: no label mapping is passed.
        return (
            whole_dataset
            .take_even_class_sample_by_size(TRAIN_DATASET_SAMPLE_SIZE)
            .transform_for_inference()
            .to_huggingface()
        )

    # Political-leaning path.
    leaning_sample = whole_dataset.take_balanced_class_sample_by_size(
        TRAIN_DATASET_SAMPLE_SIZE,
        CENTER_LEANING_CLASS_TRAIN_SIZE_MULTIPLIER,
    )
    return (
        leaning_sample
        .transform_for_inference(leaning_with_center_label_mapping)
        .to_huggingface()
    )


# One training sample per source dataset, joined into a single dataset.
train_dataset = concatenate_datasets(
    list(map(get_train_dataset, whole_datasets)),
    info=DatasetInfo(dataset_name="train"),
)

# Nothing in the visible cells below reads the source datasets again, so drop
# the list reference.
del whole_datasets

# Cell output: non-null count of every column per label value.
train_dataset.to_pandas().groupby("label").count()

Unnamed: 0_level_0,text,__index_level_0__
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,151776,151776
1,93660,93660


In [None]:
# Both seeds are forwarded to finetune_models as separate positional arguments.
# NOTE(review): how finetune_models uses each seed is not visible here; confirm
# in utils.model_utils.
TRAINING_SEED = DATA_SEED = 37
# Forwarded to finetune_models as the evaluation strategy argument.
EVAL_STRATEGY = IntervalStrategy.EPOCH

# Arguments are passed positionally because the parameter names of
# finetune_models are not visible in this notebook.
finetune_models(
    Path("training") / RESULT_SUBDIRECTORY_NAME,
    [train_dataset],
    [eval_dataset],
    EVAL_STRATEGY,
    TRAINING_SEED,
    DATA_SEED,
)