In [1]:
import os
from helper import init_ipynb
envfound = init_ipynb()

# Read the project directory and compute device from the environment, but only
# when init_ipynb() reported that the environment was found; otherwise both
# settings stay unset (None).
if envfound:
    DIR = os.environ["DIR_PATH"]
    DEVICE = os.environ["DEVICE"]
else:
    DIR = None
    DEVICE = None

In [2]:
from models import (OpenAIGPT, HF_LLM, GenerationArg, Model)
from evaluation import MCQBenchmark
import gc
from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel
import torch.distributed
from typing import List, Callable
import pandas as pd
from models.qa_prompts import QA_PROMPTS


import nltk
# Download the "punkt" sentence-tokenizer data; csa2() below splits model
# outputs into sentences with nltk.sent_tokenize.
nltk.download('punkt')

def _render_prompt(template_name: str, n_shots: int, q: str) -> str:
    """Build a prompt from the ``QA_PROMPTS`` entry named ``template_name``.

    The prompt is made of, in order: the ``-system:`` header, the template's
    ``system`` text, the first ``n_shots`` entries of the template's ``shots``
    joined by newlines, and the template's ``q_form`` with ``q`` substituted.

    Args:
        template_name: key of the template in ``QA_PROMPTS``.
        n_shots: number of few-shot examples to include (slices ``shots``).
        q: the question text inserted into ``q_form``.

    Returns:
        The fully formatted prompt string.

    Raises:
        KeyError: if ``template_name`` is not a key of ``QA_PROMPTS``.
    """
    template = QA_PROMPTS[template_name]
    shots = '\n'.join(template['shots'][:n_shots])
    return f"""-system:\n{template['system']}\n{shots}{template['q_form'].format(q=q)}"""


def cot_prompt_template(n_shots: int, q: str) -> str:
    """Prompt for question ``q`` using the "cot_answer_align" template with ``n_shots`` examples."""
    return _render_prompt("cot_answer_align", n_shots, q)


def direct_prompt_template(n_shots: int, q: str) -> str:
    """Prompt for question ``q`` using the "direct_answer_align" template with ``n_shots`` examples."""
    return _render_prompt("direct_answer_align", n_shots, q)


def cot1_prompt_template(n_shots: int, q: str) -> str:
    """Prompt for question ``q`` using the "1cot_answer_align" template with ``n_shots`` examples."""
    return _render_prompt("1cot_answer_align", n_shots, q)

# Marker words scored by match_score(); csa2() uses the score to choose which
# sentence of a model output to read the answer from.
FLAGS = ["Therefore", "correct", "answer"]
def match_score(sentence):
    """Return the fraction of ``FLAGS`` entries that occur as substrings of ``sentence``.

    The check is case-sensitive; the result lies between 0.0 and 1.0.
    """
    hits = sum(1 for flag in FLAGS if flag in sentence)
    return hits / len(FLAGS)

def simple_extract(ans_sentence):
    """Return the first standalone option letter (A-E) found in ``ans_sentence``.

    The sentence is split into alphanumeric tokens and the first token that is
    exactly one of "A".."E" is returned, in reading order. Matching whole
    tokens (instead of substrings) avoids picking up capitals that merely
    start a word, e.g. the "A" of "Answer: C", and returning the first letter
    in reading order avoids preferring "A" over the letter actually stated.

    Args:
        ans_sentence: sentence expected to state the chosen option.

    Returns:
        The option letter, or "-1" when no standalone letter is present.
    """
    # Turn every non-alphanumeric character into a separator so that
    # punctuation such as "(B)", "B." or "B," still yields the token "B".
    normalized = "".join(ch if ch.isalnum() else " " for ch in ans_sentence)
    for token in normalized.split():
        if token in ("A", "B", "C", "D", "E"):
            return token
    return "-1"

def csa2(pred):
    """Extract the answer letter from a model output.

    ``pred`` is either the generated text itself or an object whose text is
    reachable as ``pred.outputs[0].text``. Newlines are replaced by ";", the
    text is split into sentences, and the sentence with the highest
    ``match_score`` (the earliest one on ties) is handed to ``simple_extract``.

    Returns:
        Whatever ``simple_extract`` returns for the selected sentence, or "-1"
        when there is no sentence or no sentence contains any flag word.
    """
    text = pred if type(pred) == str else pred.outputs[0].text
    sentences = nltk.sent_tokenize(text.replace("\n", ";"))
    if not sentences:
        return "-1"

    # max() keeps the first of equally scored sentences, like a stable
    # descending sort followed by taking the head.
    scored = [(match_score(candidate), candidate) for candidate in sentences]
    best_score, best_sentence = max(scored, key=lambda pair: pair[0])
    if best_score == 0:
        return "-1"
    return simple_extract(best_sentence)


def write_cache(cache_file:str,
                res,
                targets:List[str],
                cache_dir:str="docs/benchmarks_results/full/"):
    """Dump raw model outputs next to their expected answers in a text file.

    Each entry is written as the generated text, a blank line, the target
    answer, a blank line and a line of 100 asterisks; entries are separated by
    a blank line. An existing file with the same name is overwritten.

    Args:
        cache_file: name of the file to create inside ``cache_dir``.
        res: sequence of generation results; the text of entry ``i`` is read
            from ``res[i].outputs[0].text``.
        targets: expected answers, aligned with ``res``.
        cache_dir: directory receiving the file; created if missing.
    """
    total_cache_file = os.path.join(cache_dir, cache_file)
    parent_dir = os.path.dirname(total_cache_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    singleqa = [
        f"{output.outputs[0].text}\n\n{target}\n\n{'*'*100}"
        for output, target in zip(res, targets)
    ]
    # The file must be opened for writing: the default mode is read-only and
    # makes f.write() fail.
    with open(total_cache_file, "w", encoding="utf-8") as f:
        f.write("\n\n".join(singleqa))
    print(f"Cached full results on : {cache_file}")


def benchmark(llm: Model,
              benchnames: List[str],
              pt: Callable[[str], str],
              search_ans: Callable[[str], str],
              cache_file:str=None):
    """Run ``llm`` on each named benchmark and return one accuracy per benchmark.

    Args:
        llm: model to test.
        benchnames: names of the benchmarks to run; each must be a key of
            ``BENCHMARKS_PATHS``.
        pt: prompt template that takes a question and returns the prompt
            given to the model.
        search_ans: function that finds the answer to the MCQ in one
            generation result.
        cache_file: when not None, the raw generations and expected answers
            are written with ``write_cache`` under this file name.
            NOTE(review): the same name is reused for every benchmark of the
            loop, so later benchmarks replace earlier dumps - confirm intent.

    Returns:
        A list of accuracies aligned with ``benchnames``. A question counts as
        correct when its ``"answer"`` is contained in the extracted answer.
        A benchmark that produced no result yields ``nan``.
    """
    model_res = []
    for benchname in benchnames:
        print("Evaluation on", benchname)
        # Named mcq_bench so the local does not shadow this function's name.
        mcq_bench = MCQBenchmark(
            BENCHMARKS_PATHS[benchname],
            pt  # call with the specific prompt template
        )
        res = mcq_bench.assess(llm)

        if cache_file is not None:
            # a cache file was requested => dump the raw generations
            write_cache(
                cache_file,
                res,
                [q["answer"] for q in mcq_bench.mcq]
            )

        answ = [search_ans(r) for r in res]
        if not answ:
            # No generation to score: report nan rather than dividing by zero.
            model_res.append(float("nan"))
            continue

        n_correct = sum(q["answer"] in ans for ans, q in zip(answ, mcq_bench.mcq))
        model_res.append(n_correct / len(answ))
    return model_res


def testbench(
    benchnames:List[str]=["AES7", "AES8"],
    modnames:List[str]=["epfl-llm/meditron-7b", "meta-llama/Llama-2-7b-hf"],
    runargs:List[GenerationArg]=[],
    prompt_template:Callable[[str], str]=lambda x : x,
    search_ans:Callable[[str], str]=lambda x : x,
    use_vllm:bool=True,
    cache_file:str=None
    ) -> pd.DataFrame:
    """Benchmark every model under every generation setting.

    For each model name an ``HF_LLM`` is created and loaded once, then each
    entry of ``runargs`` is applied with ``set_arg`` and ``benchmark`` is run.
    After each model (whether it succeeded or failed) the model-parallel state
    is destroyed, the model reference dropped, and the CUDA cache emptied.

    Args:
        benchnames: benchmarks to run (keys of ``BENCHMARKS_PATHS``).
        modnames: model names passed to ``HF_LLM``.
        runargs: generation settings to evaluate; none means no run at all.
        prompt_template: maps a question to the prompt given to the model.
        search_ans: extracts the answer from one generation result.
        use_vllm: forwarded to ``HF_LLM`` and to its initial ``GenerationArg``.
        cache_file: forwarded to ``benchmark``.

    Returns:
        One row per (model, runarg) with the columns ``mod``, the keys of
        ``runargs[0].attr`` and one accuracy column per benchmark. With empty
        ``runargs`` the frame is empty and has no generation-setting columns.

    Raises:
        Any exception raised while loading or evaluating a model, re-raised
        unchanged after cleanup.
    """
    all_res = []
    for mod in modnames:
        print("<" + "-" * 100 + ">")
        # Bound before the try so cleanup never hits an undefined name when
        # the constructor itself fails.
        llm = None
        try:
            ## creating and loading the model
            llm = HF_LLM(
                        mod,
                        device=DEVICE,
                        use_vllm=use_vllm,
                        arg=GenerationArg(use_vllm=use_vllm)
                    )
            llm.load()

            for runarg in runargs:
                ## update the model with current runargs
                llm.set_arg(runarg)

                ## run the benchmarks
                scores = benchmark(llm, benchnames, prompt_template, search_ans, cache_file)
                all_res.append([mod] + list(runarg.attr.values()) + scores)
        except Exception:
            print("Unloading the model")
            # Bare raise keeps the original exception type and traceback.
            raise
        finally:
            ## unload the model, on success as well as on failure
            destroy_model_parallel()
            llm = None
            gc.collect()
            torch.cuda.empty_cache()
            # torch.distributed.destroy_process_group()
    # Column names come from the first runarg; none were requested if empty.
    arg_cols = list(runargs[0].attr.keys()) if runargs else []
    cols = ["mod"] + arg_cols + list(benchnames)
    return pd.DataFrame(all_res, columns=cols)

# Benchmark name -> path of its processed JSON file; benchmark() looks names up here.
BENCHMARKS_PATHS = dict(
    MCQ="docs/benchmarks/mcq40/processed.json",
    AES7="docs/benchmarks/self_assessment/aes7_processed.json",
    AES8="docs/benchmarks/self_assessment/aes8_processed.json",
)

We've set API key :  <redacted>


[nltk_data] Downloading package punkt to
[nltk_data]     /home/antoinemagron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Ablation studies


### Ablation on number of shots for COT1

In [3]:
# Model names compared in the ablation; each is passed to HF_LLM by testbench().
modnames = ["epfl-llm/meditron-7b", "meta-llama/Meta-Llama-3-8B", "meta-llama/Llama-2-7b-hf"]
# Benchmarks to evaluate; each name must be a key of BENCHMARKS_PATHS.
benchnames = ["AES7", "AES8"]
# Generation settings shared by every run: near-zero temperature with topk=1,
# presumably to make decoding (near-)deterministic as the name suggests.
# Generation is capped at 512 new tokens; "###" is passed as stop_seq.
DETERMINISTIC = GenerationArg(
        temperature=0.00001,
        use_vllm=True,
        topk=1,
        topp=1,
        max_new_token=512,
        stop_seq="###"
    )

In [4]:
# Run the full test bench once per number of few-shot examples (0 to 3) and
# tag each result frame with the shot count it was produced with.
results = []
for shots in range(4):
    resdf = testbench(
        modnames=modnames,
        benchnames=benchnames,
        runargs=[DETERMINISTIC],
        # Bind the current shot count at creation time of the template.
        prompt_template=lambda q, n_shots=shots: cot1_prompt_template(n_shots, q),
        search_ans=csa2,
        use_vllm=True,
    )
    resdf["shots"] = shots
    results.append(resdf)

<---------------------------------------------------------------------------------------------------->
INFO 05-13 13:57:59 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='epfl-llm/meditron-7b', tokenizer='epfl-llm/meditron-7b', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 13:57:59 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 05-13 13:57:59 selector.py:25] Using XFormers backend.
INFO 05-13 13:58:00 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 13:58:01 model_runner.py:104] Loading model weights took 12.5527 GB
INFO 05-13 13:58:02 gpu_executor.py:94] # GPU blocks: 3797, # CPU blocks: 512
INFO 05-13 13:58:03 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 13:58:03 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 13:58:08 model_runner.py:867] G

Processed prompts: 100%|██████████| 100/100 [00:47<00:00,  2.12it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:48<00:00,  2.07it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 13:59:44 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Meta-Llama-3-8B', tokenizer='meta-llama/Meta-Llama-3-8B', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 13:59:45 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 13:59:46 model_runner.py:104] Loading model weights took 14.9595 GB
INFO 05-13 13:59:48 gpu_executor.py:94] # GPU blocks: 13450, # CPU blocks: 2048
INFO 05-13 13:59:48 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 13:59:48 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 13:59:53 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:19<00:00,  5.09it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:20<00:00,  4.85it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:00:34 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 05-13 14:00:34 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 05-13 14:00:34 selector.py:25] Using XFormers backend.
INFO 05-13 14:00:34 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:00:36 model_runner.py:104] Loading model weights took 12.5523 GB
INFO 05-13 14:00:37 gpu_executor.py:94] # GPU blocks: 3819, # 

Processed prompts: 100%|██████████| 100/100 [00:47<00:00,  2.10it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:48<00:00,  2.06it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:02:19 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='epfl-llm/meditron-7b', tokenizer='epfl-llm/meditron-7b', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 14:02:19 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:02:21 model_runner.py:104] Loading model weights took 12.5523 GB
INFO 05-13 14:02:21 gpu_executor.py:94] # GPU blocks: 3826, # CPU blocks: 512
INFO 05-13 14:02:21 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 14:02:21 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 14:02:26 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:37<00:00,  2.68it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:37<00:00,  2.66it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:03:42 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Meta-Llama-3-8B', tokenizer='meta-llama/Meta-Llama-3-8B', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 14:03:42 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:03:44 model_runner.py:104] Loading model weights took 14.9575 GB
INFO 05-13 14:03:45 gpu_executor.py:94] # GPU blocks: 13450, # CPU blocks: 2048
INFO 05-13 14:03:45 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 14:03:45 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 14:03:51 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:30<00:00,  3.28it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:04:54 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 05-13 14:04:54 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:04:55 model_runner.py:104] Loading model weights took 12.5513 GB
INFO 05-13 14:04:56 gpu_executor.py:94] # GPU blocks: 3819, # CPU blocks: 512
INFO 05-13 14:04:56 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set

Processed prompts: 100%|██████████| 100/100 [01:14<00:00,  1.35it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [01:15<00:00,  1.33it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:07:32 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='epfl-llm/meditron-7b', tokenizer='epfl-llm/meditron-7b', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 14:07:32 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:07:33 model_runner.py:104] Loading model weights took 12.5523 GB
INFO 05-13 14:07:34 gpu_executor.py:94] # GPU blocks: 3826, # CPU blocks: 512
INFO 05-13 14:07:34 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 14:07:34 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 14:07:39 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:45<00:00,  2.18it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:47<00:00,  2.12it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:09:13 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Meta-Llama-3-8B', tokenizer='meta-llama/Meta-Llama-3-8B', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 14:09:13 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:09:15 model_runner.py:104] Loading model weights took 14.9575 GB
INFO 05-13 14:09:16 gpu_executor.py:94] # GPU blocks: 13450, # CPU blocks: 2048
INFO 05-13 14:09:16 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 14:09:16 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 14:09:22 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:42<00:00,  2.35it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:42<00:00,  2.34it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:10:48 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 05-13 14:10:48 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:10:50 model_runner.py:104] Loading model weights took 12.5513 GB
INFO 05-13 14:10:51 gpu_executor.py:94] # GPU blocks: 3819, # CPU blocks: 512
INFO 05-13 14:10:51 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set

Processed prompts: 100%|██████████| 100/100 [01:28<00:00,  1.14it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [01:28<00:00,  1.13it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:13:53 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='epfl-llm/meditron-7b', tokenizer='epfl-llm/meditron-7b', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 14:13:53 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:13:55 model_runner.py:104] Loading model weights took 12.5523 GB
INFO 05-13 14:13:56 gpu_executor.py:94] # GPU blocks: 3826, # CPU blocks: 512
INFO 05-13 14:13:56 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 14:13:56 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 14:14:01 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:47<00:00,  2.12it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:45<00:00,  2.21it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:15:34 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Meta-Llama-3-8B', tokenizer='meta-llama/Meta-Llama-3-8B', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 05-13 14:15:34 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:15:36 model_runner.py:104] Loading model weights took 14.9575 GB
INFO 05-13 14:15:37 gpu_executor.py:94] # GPU blocks: 13450, # CPU blocks: 2048
INFO 05-13 14:15:37 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 05-13 14:15:37 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 05-13 14:15:43 model_runner.py:867] Graph capturing finished in 5 secs.
Evaluation on AES7


Processed prompts: 100%|██████████| 100/100 [00:47<00:00,  2.11it/s]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [00:48<00:00,  2.07it/s]


<---------------------------------------------------------------------------------------------------->
INFO 05-13 14:17:19 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='meta-llama/Llama-2-7b-hf', tokenizer='meta-llama/Llama-2-7b-hf', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)
INFO 05-13 14:17:20 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 05-13 14:17:21 model_runner.py:104] Loading model weights took 12.5513 GB
INFO 05-13 14:17:22 gpu_executor.py:94] # GPU blocks: 3819, # CPU blocks: 512
INFO 05-13 14:17:22 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set

Processed prompts: 100%|██████████| 100/100 [01:51<00:00,  1.12s/it]


Evaluation on AES8


Processed prompts: 100%|██████████| 100/100 [01:52<00:00,  1.13s/it]


In [7]:
# Persist all per-shot result frames as one CSV. The DataFrame index is written
# as well (no index=False), so the file starts with an unnamed index column.
pd.concat(results).to_csv("docs/benchmarks_results/ablation_study.csv")

## Study the results

In [19]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

# Benchmark name -> path of its processed JSON file.
BENCHMARKS_PATHS = {
    "MCQ" : "docs/benchmarks/mcq40/processed.json",
    "AES7" : "docs/benchmarks/self_assessment/aes7_processed.json",
    "AES8" :  "docs/benchmarks/self_assessment/aes8_processed.json",
}


def _mean_human_accuracy(benchname: str) -> float:
    """Average the ``human_accuracy`` field over all questions of one benchmark.

    The denominator is the actual number of questions rather than a fixed 100,
    so the value stays a mean if a benchmark changes size.
    """
    questions = MCQBenchmark(BENCHMARKS_PATHS[benchname], lambda x : x).mcq
    return sum(question["human_accuracy"] for question in questions) / len(questions)


# Mean human score per benchmark, used as reference lines in the plots.
human_accuracies = {benchname: _mean_human_accuracy(benchname) for benchname in ("AES7", "AES8")}

# "AES" is the unweighted mean of the per-benchmark human scores; the right-hand
# side is evaluated before the key is added.
human_accuracies["AES"] = sum(human_accuracies.values()) / len(human_accuracies)

# Reference accuracies plotted as the GPT3.5 line in perf_plot.
GPT_RES_AES7 = 0.41
GPT_RES_AES8 = 0.43

In [11]:
# Reload the saved ablation results and discard the index column that was
# written out with them, then show the number of non-null values per column.
df = (
    pd.read_csv("docs/benchmarks_results/ablation_study.csv")
    .drop(columns="Unnamed: 0")
)
df.count()

mod                  12
temperature          12
top_k                12
top_p                12
max_tokens           12
presence_penalty     12
frequency_penalty    12
use_beam_search      12
logprobs             12
best_of              12
stop                 12
use_vllm             12
AES7                 12
AES8                 12
shots                12
dtype: int64

In [20]:
def perf_plot(of=None, against="temperature", benchnames=("AES7", "AES8")):
    """Plot benchmark accuracy of every model against one column of the results.

    One line is drawn per (model, benchmark) pair. Dashed horizontal lines show
    the human score of each benchmark and the mean of the two GPT3.5 reference
    accuracies.

    Args:
        of: results frame with the columns ``mod``, ``against`` and one
            accuracy column per entry of ``benchnames``. Defaults to the
            notebook-level ``df``, looked up when the function is called.
        against: name of the column used for the x axis.
        benchnames: benchmarks to plot; each must be a column of ``of`` and a
            key of ``human_accuracies``.
    """
    colors = ["#a1dab4", "#41b6c4", "#2c7fb8", "#253494", "#ffffcc"]
    # `df` is never assigned in this function, so this reads the notebook-level frame.
    scores = df if of is None else of

    # Reshape to long form: one row per (model, x value, benchmark).
    per_bench = [
        scores[["mod", against, bench]]
        .rename(columns={bench: "accuracy"})
        .assign(bench=bench)
        for bench in benchnames
    ]
    long_df = pd.concat(per_bench, ignore_index=True)
    long_df["lines"] = long_df["mod"].apply(process_mod_name) + " on " + long_df["bench"]

    fig, ax = plt.subplots(figsize=(10, 5))
    for bench, color in zip(benchnames, colors):
        ax.axhline(human_accuracies[bench], linestyle="--", label=f"human score {bench} ", color=color)
    # This line is the mean over both AES benchmarks, so it is labelled as such
    # rather than with whichever benchmark the loop above ended on.
    ax.axhline((GPT_RES_AES7 + GPT_RES_AES8) / 2, linestyle="--", label="GPT3.5 AES7/AES8 mean", color="r")
    sns.lineplot(data=long_df, x=against, y="accuracy", hue="lines", ax=ax)
    ax.set_xlabel("Generation temperature" if against == "temperature" else against)
    ax.set_ylabel("Accuracy on benchmark")

def process_mod_name(modname):
    """Shorten an "owner/name" model identifier for plot legends.

    Keeps the part after the first "/" and, of that, only the first two
    "-"-separated pieces, e.g. "meta-llama/Llama-2-7b-hf" -> "Llama-2".
    """
    repo_name = modname.split("/")[1]
    leading_pieces = repo_name.split("-")[:2]
    return "-".join(leading_pieces)

In [None]:
# Plot accuracy against the number of few-shot examples, the variable of this
# ablation. The plotting helper defined in this notebook is perf_plot.
perf_plot(df, against="shots")