In [3]:
import gc
import json
import logging
import os

import datasets
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, load_metric
from promptsource.templates import DatasetTemplates
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration
)

from evaluation.template_list import template_list

# Notebook-wide logger used by the evaluation cells; INFO and above is emitted.
logger = logging.getLogger("benchmark")
logger.setLevel(logging.INFO)

# Root logging format: timestamp, logger name, source file and line, then the message.
logging.basicConfig(format="[%(asctime)s] [%(name)s] [%(filename)s:%(lineno)d] %(message)s")

In [4]:
# Run configuration: sequence-length limits and the number of prompts per batch.
max_length, target_max_length, batch_size = 1024, 256, 8

# Model checkpoints to evaluate, in the order they are listed.
checkpoints = [
    'bigscience/T0_3B',
    'bigscience/T0',
    'bigscience/T0pp',
]

# Directory that receives the result files; created up front if missing.
output_dir = './evaluation_result'
os.makedirs(output_dir, exist_ok=True)


In [5]:
def t2t(batch, model, tokenizer):
    """Generate one decoded output string per prompt in ``batch``.

    Args:
        batch: list of prompt strings to feed to the model.
        model: model whose ``generate`` method produces the output token ids.
        tokenizer: tokenizer used both to encode ``batch`` and decode the outputs.

    Returns:
        List of decoded strings (special tokens removed), in the order of ``batch``.

    The model and tokenizer are explicit arguments rather than default arguments
    so that this function never holds a reference to a loaded model; otherwise
    ``del model`` in the evaluation loop could not actually free it.
    """
    inputs = tokenizer(
        batch,
        # Pad only to the longest prompt in the batch instead of always to
        # ``max_length``; prompts longer than ``max_length`` are still truncated.
        padding='longest',
        max_length=max_length,
        truncation=True,
        # NOTE(review): with add_special_tokens=False the tokenizer adds no
        # special tokens to the prompt -- confirm this is intended for these
        # checkpoints.
        add_special_tokens=False,
        return_tensors="pt",
    )

    outputs = model.generate(
        input_ids=inputs['input_ids'].cuda(),
        attention_mask=inputs['attention_mask'].cuda(),
        max_length=target_max_length,
        # Greedy decoding, stated explicitly (replaces the former temperature=0).
        do_sample=False,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Quick-run settings: datasets with at least this many rows are shuffled with a
# fixed seed and cut down to this many rows.
eval_sample_size = 100
eval_sample_seed = 42

for checkpoint in checkpoints:
    model = T5ForConditionalGeneration.from_pretrained(
        checkpoint,
        low_cpu_mem_usage=True,
        torch_dtype=torch.bfloat16,
    )
    model.parallelize()

    tokenizer = AutoTokenizer.from_pretrained(checkpoint, padding_side='left')

    # One CSV per checkpoint, named after the last path component of its id.
    report_name = checkpoint.split('/')[-1]
    report_path = os.path.join(output_dir, f"{report_name}_results.csv")

    results = []

    for (dataset_name, dataset_subset), prompts in template_list.items():
        raw_dataset = load_dataset(
            dataset_name,
            dataset_subset,
            split='dev_r1' if dataset_name == 'anli' else 'validation',
        )

        if len(raw_dataset) >= eval_sample_size:
            # quick run through all models and datasets
            raw_dataset = raw_dataset.shuffle(seed=eval_sample_seed).select(range(eval_sample_size))

        # The template collection depends only on the dataset, so build it once
        # per dataset rather than once per prompt.
        templates = DatasetTemplates(f"{dataset_name}/{dataset_subset}" if dataset_subset else dataset_name)

        for prompt_name in prompts:
            logger.info(f"Starting {dataset_name}/{dataset_subset} - {prompt_name}")

            prompt = templates[prompt_name]

            input_text = []
            target_text = []
            for example in raw_dataset:
                try:
                    prompt_input, prompt_target = prompt.apply(example)
                except Exception as e:
                    # log the error and continue with the next example
                    logger.error(f"Error when applying {prompt_name} on {example}")
                    logger.error(e)
                    continue
                input_text.append(prompt_input)
                target_text.append(prompt_target)

            data_size = len(input_text)

            if data_size == 0:
                # Every example failed templating (or the dataset is empty):
                # record NaN instead of dividing by zero.
                logger.warning(f"No usable examples for {dataset_name}/{dataset_subset} - {prompt_name}")
                accuracy = float("nan")
            else:
                correct = 0
                for start in range(0, data_size, batch_size):
                    batch_output = t2t(input_text[start:start + batch_size], model, tokenizer)
                    batch_target = target_text[start:start + batch_size]
                    # Exact string match between generated text and target text.
                    correct += sum(
                        predicted == expected
                        for predicted, expected in zip(batch_output, batch_target)
                    )
                accuracy = correct / data_size

            logger.info(f"{checkpoint}: {dataset_name}/{dataset_subset} - {prompt_name}: {accuracy}")
            results.append({
                "model_name": checkpoint,
                "dataset_name": dataset_name,
                "dataset_subset_name": dataset_subset,
                "dataset_size": data_size,
                "prompt_name": prompt_name,
                "accuracy": accuracy,
            })

            # write to csv per prompt, which makes sense when we want to debug
            pd.DataFrame(results).to_csv(report_path, index=False)

    # Release the model from GPU memory before the next checkpoint is loaded:
    # drop the reference, collect garbage, then release PyTorch's cached CUDA
    # memory.
    del model
    gc.collect()
    torch.cuda.empty_cache()

[2023-01-12 08:35:42,199] [datasets.builder] [builder.py:785] Found cached dataset super_glue (/workspaces/seed/cache/hf_dataset/super_glue/rte/1.0.3/bb9675f958ebfee0d5d6dc5476fafe38c79123727a7258d515c450873dbdbbed)
[2023-01-12 08:35:42,207] [benchmark] [2532015062.py:39] Starting super_glue/rte - MNLI crowdsource
[2023-01-12 08:35:53,741] [benchmark] [2532015062.py:72] bigscience/T0_3B: super_glue/rte - MNLI crowdsource: 0.61
[2023-01-12 08:35:53,744] [benchmark] [2532015062.py:39] Starting super_glue/rte - guaranteed true
[2023-01-12 08:36:05,178] [benchmark] [2532015062.py:72] bigscience/T0_3B: super_glue/rte - guaranteed true: 0.65
[2023-01-12 08:36:05,180] [benchmark] [2532015062.py:39] Starting super_glue/rte - can we infer
[2023-01-12 08:36:16,635] [benchmark] [2532015062.py:72] bigscience/T0_3B: super_glue/rte - can we infer: 0.54
[2023-01-12 08:36:16,636] [benchmark] [2532015062.py:39] Starting super_glue/rte - GPT-3 style
[2023-01-12 08:36:28,611] [benchmark] [2532015062.py:7

KeyboardInterrupt: 