In [1]:
%reload_ext autoreload
%autoreload 2

import datetime
import gc
import os

import pandas as pd

from evaluation.template_list import template_list

import utils
from utils import logger

In [2]:
batch_size = 32
dataset_cutoff = 200
checkpoints = ['bigscience/T0_3B', 'bigscience/T0', 'bigscience/T0pp', 'google/flan-t5-xl', 'google/flan-t5-xxl']
# checkpoints = ['bigscience/T0']

today = datetime.datetime.now().strftime("%Y%m%d")
output_dir = f'./evaluation_result/{today}'

os.makedirs(output_dir, exist_ok=True)

In [3]:
for checkpoint in checkpoints:
    t2t = utils.build_t2t(checkpoint)
    results = []

    for (dataset_name, subset_name), prompts in template_list.items():
        raw_dataset = utils.load_raw_dataset(dataset_name, subset_name)

        for prompt_name in prompts:
            prompt = utils.get_prompt(dataset_name, subset_name, prompt_name)

            input_text, target_text = utils.preprocess_dataset(
                raw_dataset, prompt, cutoff=dataset_cutoff
            )
            test_size = len(input_text)

            accuracy, t_lapse, failed_cases = utils.eval(
                t2t, input_text, target_text, batch_size=batch_size
            )

            logger.info(
                f"{checkpoint}: {dataset_name}/{subset_name} - {prompt_name}: {accuracy}"
            )

            result = utils.Result(
                checkpoint,
                dataset_name,
                subset_name,
                test_size,
                t_lapse,
                prompt_name,
                accuracy,
            )
            results.append(result)

            utils.dump_failed_cases_as_json(
                failed_cases,
                checkpoint,
                dataset_name,
                subset_name,
                prompt_name,
                output_dir,
            )

    utils.dump_result_as_csv(results, checkpoint, output_dir)

    del t2t

[2023-01-14 05:20:14,322] [benchmark] [utils.py:25] loading model from bigscience/T0_3B...
[2023-01-14 05:20:30,004] [datasets.builder] [builder.py:785] Found cached dataset super_glue (/workspaces/seed/cache/hf_dataset/super_glue/rte/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed)
[2023-01-14 05:20:30,018] [datasets.arrow_dataset] [arrow_dataset.py:3930] Loading cached shuffled indices for dataset at /workspaces/seed/cache/hf_dataset/super_glue/rte/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed/cache-ddc6f475405b3d20.arrow
[2023-01-14 05:20:33,264] [benchmark] [3198601099.py:20] bigscience/T0_3B: super_glue/rte - MNLI crowdsource: 0.66
[2023-01-14 05:20:33,265] [benchmark] [utils.py:187] dumping failed cases to ./evaluation_result/20230114/T0_3B/super_glue/rte/MNLI crowdsource.json...
[2023-01-14 05:20:33,279] [datasets.arrow_dataset] [arrow_dataset.py:3930] Loading cached shuffled indices for dataset at /workspaces/seed/cache/hf_dataset