In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from utils.datasets import get_datasets
from utils.models import get_existing_models

In [2]:
DATASET_SAMPLE_SIZE = 10


def _subsample(dataframe, sample_size=DATASET_SAMPLE_SIZE):
    """Return at most `sample_size` rows of `dataframe`, taken at a fixed stride.

    A frame with `sample_size` rows or fewer is returned unchanged. Otherwise
    every `len(dataframe) // sample_size`-th row is selected positionally,
    starting at the first row, and the selection is capped at `sample_size`
    rows (the stride is floored, so the slice alone can yield more).

    Parameters:
        dataframe: The pandas DataFrame to thin out.
        sample_size: Maximum number of rows to keep.

    Returns:
        The original frame, or a strided subset of it.
    """
    if len(dataframe) <= sample_size:
        return dataframe

    stride = len(dataframe) // sample_size
    return dataframe.iloc[::stride].head(sample_size)


# Loaded twice so the "center"-free variant can be filtered without touching
# `datasets`. NOTE(review): this assumes get_datasets() returns fresh objects
# on every call -- confirm.
datasets = get_datasets()
datasets_without_center = get_datasets()

for dataset in datasets:
    dataset.dataframe = _subsample(dataset.dataframe)

for dataset in datasets_without_center:
    # Filter before sampling so that up to DATASET_SAMPLE_SIZE non-center rows
    # remain, rather than sampling first and losing rows to the filter.
    dataset.dataframe = _subsample(
        dataset.dataframe[dataset.dataframe["leaning"] != "center"]
    )

In [3]:
TRUNCATE_TOKENS = True
# Kept as a callable and invoked wherever the models are needed: per the
# original note, get_existing_models() yields a generator, which can only be
# consumed once.
GET_MODELS = get_existing_models

# One inner list per model (in GET_MODELS() order), holding one formatted
# "correct/total (percent %)" string per dataset (in `datasets` order).
accuracy_results = []

for model in GET_MODELS():
    print(f"evaluating {model.name} on:")
    accuracy_results.append([])
    for full_dataset, centerless_dataset in zip(datasets, datasets_without_center):
        # Models that cannot predict "center" are scored on the variant with
        # the "center" rows removed.
        dataset = full_dataset if model.supports_center_leaning else centerless_dataset
        print(f"  {dataset.name}")

        predictions = []
        for body in tqdm(dataset.dataframe["body"]):
            try:
                predictions.append(model.predict(body, TRUNCATE_TOKENS))
            except RuntimeError:
                # NOTE(review): presumably raised for inputs exceeding the
                # model's token limit -- confirm. With truncation enabled the
                # error is unexpected, so it is re-raised; otherwise the sample
                # is recorded as None and excluded from the score below.
                if TRUNCATE_TOKENS:
                    raise
                predictions.append(None)

        # Positions of the samples that produced a prediction; used to pick the
        # matching ground-truth labels positionally.
        valid_indices = [i for i, prediction in enumerate(predictions) if prediction is not None]
        predictions = [predictions[i].value for i in valid_indices]
        # 0.0 (a float) keeps the formatted percentage consistent ("0.0 %")
        # when no sample could be scored.
        accuracy = accuracy_score(
            dataset.dataframe["leaning"].iloc[valid_indices].tolist(),
            predictions
        ) if predictions else 0.0
        accuracy_results[-1].append(
            f"{len(valid_indices) * accuracy:.0f}/{len(valid_indices)} ({np.round(accuracy * 100, 2)} %)"
        )

evaluating PoliticalBiasBert on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 23.33it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 36.67it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 103.12it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 37.78it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 34.20it/s]


evaluating PoliticalBiasPredictionAllsidesDeberta on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:01<00:00,  9.09it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:01<00:00,  5.05it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 56.13it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:01<00:00,  6.84it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:01<00:00,  6.61it/s]


evaluating DistilBertPoliticalBias on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 52.02it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 49.20it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 186.62it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 53.91it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 53.81it/s]


evaluating BertPoliticalBiasFinetune on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 36.24it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 30.39it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 119.88it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 32.71it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 33.42it/s]


evaluating DistilBertPoliticalFinetune on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 70.59it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 61.23it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 200.32it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 64.96it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 64.58it/s]


In [5]:
# Row labels: model names, obtained from a fresh GET_MODELS() call.
# Column labels: dataset names, in the order of `datasets`.
model_names = [model.name for model in GET_MODELS()]
dataset_names = [dataset.name for dataset in datasets]

# Tabulate the per-model, per-dataset accuracy strings.
results_df = pd.DataFrame(accuracy_results, index=model_names, columns=dataset_names)

results_df

Unnamed: 0,commoncrawl_news_articles,article_bias_prediction,qbias,webis_news_bias_20,webis_bias_flipper_18
PoliticalBiasBert,1/10 (10.0 %),7/10 (70.0 %),4/10 (40.0 %),6/10 (60.0 %),6/10 (60.0 %)
PoliticalBiasPredictionAllsidesDeberta,6/10 (60.0 %),1/10 (10.0 %),7/10 (70.0 %),8/10 (80.0 %),7/10 (70.0 %)
DistilBertPoliticalBias,1/10 (10.0 %),1/10 (10.0 %),2/10 (20.0 %),4/10 (40.0 %),4/10 (40.0 %)
BertPoliticalBiasFinetune,4/10 (40.0 %),7/10 (70.0 %),6/10 (60.0 %),5/10 (50.0 %),5/10 (50.0 %)
DistilBertPoliticalFinetune,5/10 (50.0 %),2/10 (20.0 %),5/10 (50.0 %),3/10 (30.0 %),4/10 (40.0 %)
