In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from utils.datasets import get_datasets
from utils.models import get_existing_models, get_dataset_benchmark_models

In [2]:
DATASET_SAMPLE_SIZE = 10


def _subsample(dataframe, sample_size):
    """Return at most `sample_size` evenly spaced rows of `dataframe`.

    Frames that already hold `sample_size` rows or fewer are returned unchanged.
    Otherwise every `len(dataframe) // sample_size`-th row is taken, starting
    with the first row, and the result is capped at `sample_size` rows.

    Args:
        dataframe: The pandas DataFrame to thin out.
        sample_size: Maximum number of rows to keep.

    Returns:
        The (possibly unchanged) DataFrame with at most `sample_size` rows.
        The original index labels are kept; no reset is performed.
    """
    if len(dataframe) <= sample_size:
        return dataframe

    step = len(dataframe) // sample_size
    # Integer division rounds the step down, so the strided slice can contain
    # more than `sample_size` rows; `head` trims the surplus.
    return dataframe.iloc[::step].head(sample_size)


# The dataset list is loaded twice so that one copy can have its "center" rows
# removed while the other keeps them.
# NOTE(review): this assumes get_datasets() returns fresh, independent objects
# on every call -- confirm, otherwise the two loops below mutate the same data.
datasets = get_datasets()
datasets_without_center = get_datasets()

for dataset in datasets:
    dataset.dataframe = _subsample(dataset.dataframe, DATASET_SAMPLE_SIZE)

for dataset in datasets_without_center:
    # Filter before sampling so that the sample itself contains no "center" rows.
    non_center_rows = dataset.dataframe[dataset.dataframe["leaning"] != "center"]
    dataset.dataframe = _subsample(non_center_rows, DATASET_SAMPLE_SIZE)

In [3]:
TRUNCATE_TOKENS = True
# Keep a callable (not its result) so that every GET_MODELS() call produces a
# fresh iterable of models; the original note says the models come from a
# generator, which could only be consumed once.
GET_MODELS = get_dataset_benchmark_models

# One row per model, one formatted "correct/total (percent %)" cell per dataset.
accuracy_results = []

for model in GET_MODELS():
    print(f"evaluating {model.name} on:")
    # Appended up front so that partial results stay visible in
    # `accuracy_results` if the evaluation is interrupted.
    model_results = []
    accuracy_results.append(model_results)

    for full_dataset, no_center_dataset in zip(datasets, datasets_without_center):
        # Models that cannot predict "center" are scored on the filtered variant.
        dataset = full_dataset if model.supports_center_leaning else no_center_dataset
        print(f"  {dataset.name}")

        predictions = []
        for body in tqdm(dataset.dataframe["body"]):
            try:
                predictions.append(model.predict(body, TRUNCATE_TOKENS))
            except RuntimeError:
                # With truncation enabled a RuntimeError is unexpected and is
                # re-raised; without it the sample is recorded as unpredictable.
                # NOTE(review): presumably the error stems from over-long
                # inputs -- confirm against model.predict.
                if TRUNCATE_TOKENS:
                    raise
                predictions.append(None)

        # Score only the samples for which the model produced a prediction.
        valid_indices = [i for i, prediction in enumerate(predictions) if prediction is not None]
        predicted_labels = [predictions[i].value for i in valid_indices]

        if predicted_labels:
            true_labels = dataset.dataframe["leaning"].iloc[valid_indices].tolist()
            # normalize=False yields the number of correct predictions directly,
            # instead of reconstructing it from a float accuracy.
            correct = accuracy_score(true_labels, predicted_labels, normalize=False)
            accuracy = correct / len(predicted_labels)
        else:
            # Float zero keeps the percentage rendering ("0.0 %") consistent
            # with the non-empty case.
            correct, accuracy = 0, 0.0

        model_results.append(
            f"{correct:.0f}/{len(valid_indices)} ({np.round(accuracy * 100, 2)} %)"
        )

evaluating dataset_benchmark/roberta-base/article_bias_prediction on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 23.53it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 31.96it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 98.71it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 146.53it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 32.90it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 32.22it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 154.92it/s]


evaluating dataset_benchmark/roberta-base/commoncrawl_news_articles on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 40.95it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 34.98it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 122.83it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 146.31it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 37.37it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 36.46it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 175.52it/s]


evaluating dataset_benchmark/roberta-base/gpt4_political_bias on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 37.43it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 35.77it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 121.54it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 129.24it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 29.90it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 35.92it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 168.20it/s]


evaluating dataset_benchmark/roberta-base/polistance_issue_tweets on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 39.33it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 35.41it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 119.38it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 141.01it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 37.20it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 32.71it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 156.90it/s]


evaluating dataset_benchmark/roberta-base/qbias on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 39.48it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 35.28it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 115.99it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 140.59it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 31.05it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 33.99it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 161.01it/s]


evaluating dataset_benchmark/roberta-base/webis_bias_flipper_18 on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 36.41it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 34.84it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 119.22it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 146.26it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 36.73it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 31.35it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 147.75it/s]


evaluating dataset_benchmark/roberta-base/webis_news_bias_20 on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 40.21it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 34.97it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 118.33it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 137.78it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 35.83it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 34.14it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 162.57it/s]


evaluating dataset_benchmark/bert-base-cased/article_bias_prediction on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 40.56it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 34.85it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 120.21it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 140.11it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 35.61it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 36.33it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 173.40it/s]


evaluating dataset_benchmark/bert-base-cased/commoncrawl_news_articles on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 40.66it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 35.24it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 117.65it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 120.72it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 33.81it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 35.42it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 170.71it/s]


evaluating dataset_benchmark/bert-base-cased/gpt4_political_bias on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 40.88it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 36.08it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 132.37it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 160.04it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 35.66it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 34.80it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 158.69it/s]


evaluating dataset_benchmark/bert-base-cased/polistance_issue_tweets on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 39.73it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 33.90it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 107.94it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 138.70it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 30.78it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 35.01it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 181.07it/s]


evaluating dataset_benchmark/bert-base-cased/qbias on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 41.38it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 35.00it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 122.17it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 142.95it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 36.82it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 37.12it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 181.58it/s]


evaluating dataset_benchmark/bert-base-cased/webis_bias_flipper_18 on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 38.79it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 30.13it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 113.33it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 138.09it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 37.74it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 35.66it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 177.72it/s]


evaluating dataset_benchmark/bert-base-cased/webis_news_bias_20 on:
  commoncrawl_news_articles


100%|██████████| 10/10 [00:00<00:00, 41.82it/s]


  article_bias_prediction


100%|██████████| 10/10 [00:00<00:00, 35.47it/s]


  qbias


100%|██████████| 10/10 [00:00<00:00, 121.65it/s]


  polistance_issue_tweets


100%|██████████| 10/10 [00:00<00:00, 144.59it/s]


  webis_news_bias_20


100%|██████████| 10/10 [00:00<00:00, 36.73it/s]


  webis_bias_flipper_18


100%|██████████| 10/10 [00:00<00:00, 30.88it/s]


  gpt4_political_bias


100%|██████████| 10/10 [00:00<00:00, 159.01it/s]


In [4]:
# Tabulate the formatted accuracies: one row per model, one column per dataset.
# GET_MODELS() is invoked again here only to obtain the model names for the index.
results_df = pd.DataFrame(
    data=accuracy_results,
    index=[model.name for model in GET_MODELS()],
    columns=[dataset.name for dataset in datasets],
)

results_df

Unnamed: 0,commoncrawl_news_articles,article_bias_prediction,qbias,polistance_issue_tweets,webis_news_bias_20,webis_bias_flipper_18,gpt4_political_bias
dataset_benchmark/roberta-base/article_bias_prediction,3/10 (30.0 %),6/10 (60.0 %),2/10 (20.0 %),0/10 (0.0 %),2/10 (20.0 %),2/10 (20.0 %),5/10 (50.0 %)
dataset_benchmark/roberta-base/commoncrawl_news_articles,6/10 (60.0 %),2/10 (20.0 %),6/10 (60.0 %),4/10 (40.0 %),4/10 (40.0 %),3/10 (30.0 %),1/10 (10.0 %)
dataset_benchmark/roberta-base/gpt4_political_bias,3/10 (30.0 %),7/10 (70.0 %),2/10 (20.0 %),0/10 (0.0 %),2/10 (20.0 %),2/10 (20.0 %),5/10 (50.0 %)
dataset_benchmark/roberta-base/polistance_issue_tweets,4/10 (40.0 %),4/10 (40.0 %),5/10 (50.0 %),6/10 (60.0 %),5/10 (50.0 %),6/10 (60.0 %),6/10 (60.0 %)
dataset_benchmark/roberta-base/qbias,3/10 (30.0 %),7/10 (70.0 %),2/10 (20.0 %),0/10 (0.0 %),2/10 (20.0 %),2/10 (20.0 %),5/10 (50.0 %)
dataset_benchmark/roberta-base/webis_bias_flipper_18,3/10 (30.0 %),7/10 (70.0 %),2/10 (20.0 %),0/10 (0.0 %),2/10 (20.0 %),2/10 (20.0 %),5/10 (50.0 %)
dataset_benchmark/roberta-base/webis_news_bias_20,3/10 (30.0 %),7/10 (70.0 %),2/10 (20.0 %),0/10 (0.0 %),2/10 (20.0 %),2/10 (20.0 %),5/10 (50.0 %)
dataset_benchmark/bert-base-cased/article_bias_prediction,2/10 (20.0 %),2/10 (20.0 %),7/10 (70.0 %),4/10 (40.0 %),2/10 (20.0 %),3/10 (30.0 %),1/10 (10.0 %)
dataset_benchmark/bert-base-cased/commoncrawl_news_articles,6/10 (60.0 %),2/10 (20.0 %),6/10 (60.0 %),4/10 (40.0 %),4/10 (40.0 %),3/10 (30.0 %),1/10 (10.0 %)
dataset_benchmark/bert-base-cased/gpt4_political_bias,6/10 (60.0 %),2/10 (20.0 %),5/10 (50.0 %),4/10 (40.0 %),4/10 (40.0 %),3/10 (30.0 %),3/10 (30.0 %)
