In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from tqdm import tqdm

from utils.datasets import get_datasets
from utils.models import get_dataset_benchmark_models

In [None]:
DATASET_SAMPLE_SIZE = 1_000


def _subsample(dataframe: pd.DataFrame, sample_size: int) -> pd.DataFrame:
    """Return at most ``sample_size`` rows of ``dataframe``, picked at a fixed stride.

    Frames with ``sample_size`` rows or fewer are returned unchanged. Larger
    frames are sliced with a stride of ``len(dataframe) // sample_size`` and the
    slice is then trimmed to its first ``sample_size`` rows.

    NOTE: for frames with fewer than ``2 * sample_size`` rows the stride is 1,
    so the result is simply the first ``sample_size`` rows of the frame.
    """
    if len(dataframe) <= sample_size:
        return dataframe

    stride = len(dataframe) // sample_size
    return dataframe.iloc[::stride].head(sample_size)


# All benchmark datasets, each capped at DATASET_SAMPLE_SIZE rows.
datasets = list(get_datasets())
for dataset in datasets:
    dataset.dataframe = _subsample(dataset.dataframe, DATASET_SAMPLE_SIZE)

# A second, independently loaded list of the datasets with the "center" rows
# removed before sampling. It is indexed in parallel with `datasets` by the
# evaluation loop, so both lists are expected to have the same order.
datasets_without_center = list(get_datasets())
for dataset in datasets_without_center:
    without_center = dataset.dataframe[dataset.dataframe["leaning"] != "center"]
    dataset.dataframe = _subsample(without_center, DATASET_SAMPLE_SIZE)

In [None]:
TRUNCATE_TOKENS = True


def GET_MODELS():
    """Return a fresh iterable of the benchmark models.

    Wrapped in a callable so that the underlying generator can be obtained
    again whenever the models need to be iterated another time.
    """
    return get_dataset_benchmark_models()


# One row per model: one formatted cell per dataset, followed by the overall cell.
accuracy_results = []

for model in GET_MODELS():
    print(f"evaluating {model.name} on:")

    # The choice of dataset list does not depend on the dataset, so make it once
    # per model: models without "center" support get the center-free datasets.
    model_datasets = datasets if model.supports_center_leaning else datasets_without_center

    # Registered up front so that a partially evaluated model still leaves its
    # row in `accuracy_results` if the evaluation is interrupted.
    model_results = []
    accuracy_results.append(model_results)
    total_predictions_count = 0
    total_correct_predictions_count = 0

    for dataset in model_datasets:
        print(f"  {dataset.name}")

        predictions = []
        for body in tqdm(dataset.dataframe["body"]):
            try:
                predictions.append(model.predict(body, TRUNCATE_TOKENS))
            except RuntimeError:
                # With truncation enabled the error is propagated. With it
                # disabled the sample is recorded as missing and left out of the
                # score (presumably over-long inputs; confirm in model.predict).
                if TRUNCATE_TOKENS:
                    raise
                predictions.append(None)

        # Score only the samples for which the model produced a prediction.
        valid_indices = [i for i, prediction in enumerate(predictions) if prediction is not None]
        predicted_leanings = [predictions[i].value for i in valid_indices]
        predictions_count = len(valid_indices)

        # normalize=False yields the number of correct predictions directly,
        # instead of reconstructing it from the accuracy fraction.
        correct_predictions_count = accuracy_score(
            dataset.dataframe["leaning"].iloc[valid_indices].tolist(),
            predicted_leanings,
            normalize=False,
        ) if predictions_count > 0 else 0
        accuracy = correct_predictions_count / predictions_count if predictions_count > 0 else 0

        model_results.append(
            f"{correct_predictions_count:.0f} / {predictions_count} ({accuracy * 100:.0f} %)"
        )
        # Datasets named like the last path segment of the model name are kept out
        # of the overall score (presumably the model's own training data; confirm).
        if model.name.split("/")[-1] != dataset.name:
            total_predictions_count += predictions_count
            total_correct_predictions_count += correct_predictions_count

    average_accuracy = total_correct_predictions_count / total_predictions_count if total_predictions_count > 0 else 0
    model_results.append(
        f"{total_correct_predictions_count:.0f} / {total_predictions_count} ({average_accuracy * 100:.0f} %)"
    )

In [None]:
# Row labels are the model names; column labels are the dataset names plus a
# final "average" column holding each model's overall cell.
model_names = [model.name for model in GET_MODELS()]
column_names = [dataset.name for dataset in datasets] + ["average"]

results_df = pd.DataFrame(accuracy_results, index=model_names, columns=column_names)
results_df