In [5]:
from typing import List, Union
from separability import Model
from datasets import load_dataset, get_dataset_config_names

# Load the model to be evaluated through the `Model` wrapper imported from
# `separability`. The name `m` is what the evaluation loops in later cells use.
m = Model("facebook/opt-125m")

In [60]:
import torch
import numpy as np
from welford_torch import Welford

# Labels for multiple-choice options: the i-th choice (and an integer answer
# index) is rendered as the i-th letter of this string.
mcq_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def format_mmlu_question(datum, include_answer=False):
    """Render one MMLU record as a plain-text multiple-choice prompt.

    The layout is a ``Question:`` header followed by the question text, a
    ``Choices:`` header followed by one lettered line per choice, and a final
    ``Answer: `` line.

    Args:
        datum: Mapping with a ``"question"`` string, a ``"choices"`` sequence
            and (only read when ``include_answer`` is true) an ``"answer"``
            index into ``mcq_letters``.
        include_answer: When true, the letter of the correct choice is
            appended after ``Answer: ``; otherwise the prompt ends right
            after ``Answer: `` (including its trailing space).

    Returns:
        The formatted prompt string.
    """
    pieces = ["Question:\n", datum["question"], "\n\n", "Choices:"]
    pieces.extend(
        f"\n{mcq_letters[position]}) {option}"
        for position, option in enumerate(datum["choices"])
    )
    pieces += ["\n\n", "Answer: "]
    if include_answer:
        pieces.append(mcq_letters[datum["answer"]])
    return "".join(pieces)

def mmlu_configs():
    """Return the config names available for the ``tasksource/mmlu`` dataset.

    The value is whatever ``datasets.get_dataset_config_names`` reports for
    that dataset; it is looked up on every call.
    """
    config_names = get_dataset_config_names("tasksource/mmlu")
    return config_names

def _fit_mmlu_prompt(opt, shots, question, max_tokens):
    """Tokenise ``shots + question`` so the result fits in ``max_tokens``.

    With ``max_tokens=None`` this is a single ``opt.get_ids`` call on the full
    prompt. Otherwise leading few-shot examples are dropped one at a time
    until the tokenised prompt fits. If even the bare question is too long,
    the token ids are truncated from the left so that the trailing answer
    token (the one being scored) is kept.
    """
    first_shot = 0
    while True:
        input_ids = opt.get_ids("".join(shots[first_shot:]) + question)
        if max_tokens is None or input_ids.shape[-1] <= max_tokens:
            return input_ids
        if first_shot >= len(shots):
            # Zero-shot prompt still too long: keep only the tail.
            return input_ids[:, -max_tokens:]
        first_shot += 1


def evaluate_mmlu(
        opt: Model,
        config: Union[str, List[str], None] = None,
        n_shot: int = 5,
        max_tokens: Union[int, None] = None,
    ):
    """Evaluate a model on one or more MMLU configs with few-shot prompting.

    For each config the first ``n_shot`` records of the ``test`` split are
    formatted (with answers) into a prompt prefix and removed from the
    evaluated set. Every remaining record is appended to that prefix *with*
    its answer letter, the model is run once, and the prediction made at the
    second-to-last position (i.e. the prediction for the final, answer token)
    is compared with the true letter.

    Args:
        opt: Model wrapper providing ``get_ids``, ``get_all_logits``,
            ``evaluate_ce_loss`` and ``tokenizer.decode``.
        config: ``"all"``, a single config name, or an iterable of config
            names from ``mmlu_configs()``.
        n_shot: Number of leading test records used as in-context examples.
        max_tokens: Optional upper bound on the tokenised prompt length.
            ``None`` (default) applies no bound. When set, leading few-shot
            examples are dropped (and, as a last resort, the prompt is
            left-truncated) so the prompt fits.
            NOTE(review): the notebook's recorded run aborts on
            ``formal_logic`` with a CUDA ``indexSelect`` out-of-range
            assertion, which is consistent with the prompt exceeding the
            model's context window - pass the model's context size here;
            confirm the exact limit for the model in use.

    Returns:
        dict with the counters ``num_predictions`` and ``num_accurate``
        (the remaining ``num_*`` counters and ``token_counts`` are only
        initialised here, never updated), plus
        ``accuracy["percentage_correct"]`` (a fraction in [0, 1], not a
        percentage) and ``loss_data`` holding the mean loss and its log.
        If no record was evaluated, these derived values are NaN.

    Raises:
        ValueError: if ``config`` is missing or names an unknown config, if
            ``n_shot`` is negative or larger than a config's test split, or
            if ``max_tokens`` is smaller than 2.
    """
    config_options = mmlu_configs()
    error_string = f" Config must be either 'all' or one/list of {config_options}"

    if config is None:
        raise ValueError("Must specify a config." + error_string)

    if isinstance(config, str):
        if config == "all":
            config = config_options
        elif config in config_options:
            config = [config]
        else:
            raise ValueError(f"Invalid config: {config}." + error_string)
    else:
        # Validate list input up front instead of failing inside load_dataset.
        config = list(config)
        invalid = [c for c in config if c not in config_options]
        if invalid:
            raise ValueError(f"Invalid config: {invalid}." + error_string)

    if n_shot < 0:
        raise ValueError(f"n_shot must be non-negative, got {n_shot}.")
    if max_tokens is not None and max_tokens < 2:
        # Two positions are needed: the answer token and the one predicting it.
        raise ValueError(f"max_tokens must be at least 2, got {max_tokens}.")

    tasks = []

    for c in config:
        _dataset = load_dataset("tasksource/mmlu", c)["test"]

        if n_shot > len(_dataset):
            raise ValueError(
                f"n_shot={n_shot} exceeds the {len(_dataset)} test examples "
                f"available in config '{c}'."
            )

        # Few-shot examples are kept as separate strings so that individual
        # examples can be dropped if the prompt has to be shortened.
        shots = [
            format_mmlu_question(_dataset[i], include_answer=True) + "\n\n"
            for i in range(n_shot)
        ]

        # Remove the first n_shot examples from the dataset
        indices = list(range(n_shot, len(_dataset)))
        _dataset = _dataset.select(indices=indices)

        tasks.append((shots, _dataset, c))

    # Store accuracy and losses
    out = {
        "num_predictions": 0,
        "num_accurate": 0,
        "num_skip_predictions": 0,
        "num_skip_accurate": 0,
        "num_topk_accurate": 0,
        "num_topk_skip_accurate": 0,
        "token_counts": None,
    }
    loss_tracker = Welford()

    for shots, _dataset, dataset_name in tasks:
        print(dataset_name)
        for datum in _dataset:
            # Prepare text; the answer letter is included so that it is the
            # final token and the position before it predicts it.
            question = format_mmlu_question(datum, True)
            input_ids = _fit_mmlu_prompt(opt, shots, question, max_tokens)

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                all_logits = opt.get_all_logits(input_ids)
                # Logits at the second-to-last position, i.e. the model's
                # prediction for the last (answer) token.
                logits = all_logits[0, -2]

                # Loss restricted to the final two positions
                loss = opt.evaluate_ce_loss(
                    input_ids=input_ids[0, -2:].unsqueeze(0),
                    logits=all_logits[0, -2:].unsqueeze(0),
                )

            # Compare to ground truth
            guess = opt.tokenizer.decode(logits.argmax()).strip()
            truth = mcq_letters[datum["answer"]]

            out["num_predictions"] += 1
            out["num_accurate"] += int(guess == truth)
            loss_tracker.add(loss.detach())

    if out["num_predictions"] > 0:
        loss = loss_tracker.mean.cpu().numpy()
        percentage_correct = out["num_accurate"] / out["num_predictions"]
    else:
        # Nothing was evaluated (e.g. n_shot consumed the whole split).
        loss = np.array(np.nan)
        percentage_correct = float("nan")

    out.update({
        "accuracy": {
            "percentage_correct": percentage_correct,
        },
        "loss_data": {
            "loss": loss,
            "log_loss": np.log(loss),
        },
    })

    return out

# Evaluate `m` on every MMLU config, one config per call, printing each
# config's accuracy dict (the config name itself is printed inside
# `evaluate_mmlu`).
# NOTE(review): the recorded output of this cell stops at `formal_logic` with
# a CUDA `indexSelect` "srcIndex < srcSelectDimSize" assertion, i.e. an
# out-of-range index in a lookup on the GPU. Presumably the few-shot prompt
# for that config is longer than the model can index - confirm and bound the
# prompt length before re-running.
for c in mmlu_configs():
    out = evaluate_mmlu(m, c)
    print(out["accuracy"])

abstract_algebra
{'percentage_correct': 0.24210526315789474}
anatomy
{'percentage_correct': 0.3384615384615385}
astronomy
{'percentage_correct': 0.29931972789115646}
business_ethics
{'percentage_correct': 0.05263157894736842}
clinical_knowledge
{'percentage_correct': 0.2653846153846154}
college_biology
{'percentage_correct': 0.19424460431654678}
college_chemistry
{'percentage_correct': 0.12631578947368421}
college_computer_science
{'percentage_correct': 0.21052631578947367}
college_mathematics
{'percentage_correct': 0.30526315789473685}
college_medicine
{'percentage_correct': 0.23809523809523808}
college_physics
{'percentage_correct': 0.1958762886597938}
computer_security
{'percentage_correct': 0.2631578947368421}
conceptual_physics
{'percentage_correct': 0.30869565217391304}
econometrics
{'percentage_correct': 0.12844036697247707}
electrical_engineering
{'percentage_correct': 0.3}
elementary_mathematics
{'percentage_correct': 0.23324396782841822}
formal_logic
{'percentage_correct': 0.

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,0,0], thread: [69,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [526,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [61]:
# Rebind `m` to a second model and repeat the per-config MMLU evaluation.
# NOTE(review): the recorded output shows the same "device-side assert
# triggered" RuntimeError with no model printout and no config name printed.
# Presumably the CUDA context was still unusable after the assertion in the
# previous cell, so this run says nothing about the model itself - restart
# the kernel before re-running; confirm.
m = Model("openlm-research/open_llama_3b_v2")
print(m)

for c in mmlu_configs():
    out = evaluate_mmlu(m, c)
    print(out["accuracy"])

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
