In [1]:
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

from utils.base_directory import BASE_DIRECTORY
from utils.datasets import get_datasets
from utils.models import dataset_benchmark_model_names

In [2]:
# Maximum number of rows kept from each dataset after per-leaning sampling.
DATASET_SAMPLE_SIZE = 1_000
# Integer class id assigned to each political leaning label.
POLITICAL_LEANING_MAPPING = dict(left=0, center=1, right=2)


def systematic_sample(group, n):
    """Return ``n`` rows of ``group`` taken at evenly spaced positions.

    Rows are picked at positions ``floor(i * len(group) / n)`` for
    ``i = 0 .. n-1``, so the selection always spans the whole group. A plain
    integer step of ``len(group) // n`` would be rounded down and, once
    truncated to ``n`` rows, could leave the tail of the group unsampled
    (e.g. only the first ``n`` rows when ``len(group) == 2 * n - 1``).

    Args:
        group: Object supporting ``len()`` and positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        n: Number of rows to select; must be positive.

    Returns:
        ``group`` itself when it has at most ``n`` rows, otherwise the ``n``
        selected rows in their original order.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError("the sample size must be positive")
    size = len(group)
    if n >= size:
        return group
    # size > n here, so consecutive positions differ by more than one before
    # flooring: the indexes are strictly increasing and all below ``size``.
    indexes = [i * size // n for i in range(n)]
    return group.iloc[indexes]


# Build one sampled, integer-labelled Hugging Face dataset per source dataset,
# keeping `datasets` and `dataset_names` aligned by position.
datasets = []
dataset_names = []
for dataset in get_datasets():
    dataframe = dataset.dataframe
    dataset_names.append(dataset.name)

    # Split the sample budget evenly across the leanings present, rounding up.
    n_samples = int(np.ceil(DATASET_SAMPLE_SIZE / dataframe["leaning"].nunique()))
    grouped_by_leaning = dataframe.groupby("leaning", group_keys=False, observed=True)[["body", "leaning"]]
    dataframe = grouped_by_leaning.apply(systematic_sample, n_samples)
    # Rounding up may overshoot the budget, so cap the total number of rows.
    dataframe = dataframe.head(DATASET_SAMPLE_SIZE)

    # Rename the target column and replace leaning names by their class ids
    # (the `.cat` accessor requires the column to be categorical).
    dataframe = dataframe.rename(columns={"leaning": "label"}).assign(
        label=lambda frame: frame["label"].cat.rename_categories(POLITICAL_LEANING_MAPPING)
    )

    datasets.append(Dataset.from_pandas(dataframe))

In [3]:
# Maximum number of tokens per example; longer bodies are truncated and
# shorter ones padded to this length.
MODELS_MAX_LENGTH = 512

# Fine-tune every benchmark model on every dataset and save each result under
# models_custom/dataset_benchmark/<model>/<dataset>.
for model_name in dataset_benchmark_model_names:
    print(f"fine-tuning {model_name} on:")

    # The tokenizer is not modified by training, so it is shared across datasets.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    for dataset, dataset_name in zip(datasets, dataset_names):
        print(f"  {dataset_name}")
        output_directory = (BASE_DIRECTORY
                            / "models_custom"
                            / "dataset_benchmark"
                            / model_name.split("/")[-1]
                            / dataset_name)

        # Reload the pretrained checkpoint for every dataset. Reusing a single
        # model instance would keep training the same weights, so the model
        # saved for one dataset would also have been trained on all the
        # datasets processed before it.
        # num_labels=3 matches the three classes of POLITICAL_LEANING_MAPPING.
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

        tokenized_dataset = dataset.map(
            lambda batch: tokenizer(batch["body"], max_length=MODELS_MAX_LENGTH, truncation=True, padding="max_length"),
            batched=True
        )
        training_arguments = TrainingArguments(
            auto_find_batch_size=True,
            # No intermediate checkpoints; the final model is saved explicitly below.
            save_strategy="no",
            output_dir=output_directory
        )
        trainer = Trainer(
            model=model,
            args=training_arguments,
            train_dataset=tokenized_dataset
        )
        trainer.train()
        trainer.save_model(output_directory)

fine-tuning FacebookAI/roberta-base on:


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  article_bias_prediction


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Step,Training Loss


KeyboardInterrupt: 