# Qualitative review

In [5]:
import os
from helper import init_ipynb
envfound = init_ipynb()

DIR = os.environ["DIR_PATH"] if envfound else None
DEVICE = os.environ["DEVICE"] if envfound else None
API_KEY = os.environ["API_KEY"] if envfound else None
PLATFORM = os.environ["OS_TYPE"] if envfound else None

if(PLATFORM == "Darwin"):
    os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'
else :
    import vllm
%load_ext autoreload
%autoreload 2

In [6]:
from transformers import (AutoModelForCausalLM
                         ,LlamaForCausalLM
                         ,AutoTokenizer)
from models import (HF_LLM, GenerationArg, Model, OpenAIGPT)
import torch
import gc
from evaluation import MCQBenchmark
from models.qa_prompts import QA_PROMPTS
from typing import List
from vllm.distributed.parallel_state import destroy_model_parallel

import nltk
nltk.download('punkt')

We've set API key :  f0555591d0410671711554a53411c6d7


  warn(
[nltk_data] Downloading package punkt to
[nltk_data]     /home/antoinemagron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
MODELS = [
    "tiiuae/falcon-7b",
    "mistralai/Mistral-7B-v0.1",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "epfl-llm/meditron-7b",
]


OPENAI_MODELS = [
    "gpt-3.5-turbo",
    "gpt4"
]

In [4]:
MARKER = "X00A"

def cot1_prompt_template_support(q: str, shots: List[str], stop_token:str="<|STOP|>") -> str:
    template = QA_PROMPTS["1cot_answer_align"]
    shots = '\n'.join(shots)
    return f"""-system:\n{template['system']}\n{shots}{template['q_form'].format(q=q)}""".replace("###", stop_token)

def cot1_prompt_template_gpt(q: str, shots: List[str]) -> List[str]:
    template = QA_PROMPTS["1cot_answer_align"]
    messages = [
        {
            "role": "system",
            "content": template["system"]
        }
    ]
    for shot in shots:
        contents = shot.replace("-user:", MARKER).replace("-assistant:", MARKER).split(MARKER)
        contents = [x.strip() for x in contents if x != ""]
        messages.append({
            "role": "user",
            "content": contents[0]
        })
        messages.append({
            "role": "assistant",
            "content": contents[1]
        })
    messages.append({
        "role": "user",
        "content": ("Question : " + q)
    })
    return messages





In [6]:
aes = MCQBenchmark(
    "docs/benchmarks/self_assessment/final_processed.json",
    cot1_prompt_template_support,
    support_type="kNN",
    n_shots=3
)

aes_gpt = MCQBenchmark(
    "docs/benchmarks/self_assessment/final_processed.json",
    cot1_prompt_template_gpt,
    support_type="kNN",
    n_shots=3
)

### Models from 🤗

In [6]:
all_generations = {}
for model in MODELS:
    llm = HF_LLM(
        model_name=model,
        arg=GenerationArg(
            temperature=0.001,
            topp=1,
            max_new_token=512,
            stop_seq="<|STOP|>",
            topk=1,
            use_vllm=True
        ),
        device=DEVICE,
        use_vllm=True
    )
    llm.load()
    all_generations[model] = aes.assess(llm)
    print("Unloading the model")
    destroy_model_parallel()
    del llm
    gc.collect()
    torch.cuda.empty_cache()
    torch.distributed.destroy_process_group()


INFO 06-05 16:47:07 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='tiiuae/falcon-7b', speculative_config=None, tokenizer='tiiuae/falcon-7b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=tiiuae/falcon-7b)




INFO 06-05 16:47:07 utils.py:660] Found nccl from library /home/antoinemagron/.config/vllm/nccl/cu12/libnccl.so.2.18.1
INFO 06-05 16:47:07 selector.py:27] Using FlashAttention-2 backend.
INFO 06-05 16:47:07 weight_utils.py:199] Using model weights format ['*.bin']


pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

INFO 06-05 16:51:28 model_runner.py:175] Loading model weights took 12.9420 GB
INFO 06-05 16:51:29 gpu_executor.py:114] # GPU blocks: 235592, # CPU blocks: 32768
INFO 06-05 16:51:31 model_runner.py:937] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-05 16:51:31 model_runner.py:941] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-05 16:51:39 model_runner.py:1017] Graph capturing finished in 8 secs.
[['-user:\nQuestion : Deep brain stimulation for drug-resistant epilepsy targets which of the following structures?\n        A. anterior nucleus of the thalamus\n        B. centromedian nucleus of the thalamus\n        C. cingulate gyrus\n    

Processed prompts: 100%|██████████| 224/224 [00:39<00:00,  5.63it/s]


Unloading the model
INFO 06-05 16:53:34 llm_engine.py:100] Initializing an LLM engine (v0.4.2) with config: model='mistralai/Mistral-7B-v0.1', speculative_config=None, tokenizer='mistralai/Mistral-7B-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=mistralai/Mistral-7B-v0.1)


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 

In [10]:
to_store = aes.mcq.copy()
for gen, q in zip(all_generations["gpt-3.5-turbo"], to_store):
    q["prediction"] = gen

In [12]:
import json
with open("docs/final_results_store/gpt-3.5-turbo.json", "w") as f:
    json.dump(to_store, f)

### Creating Review file

In [7]:
all_generations = {}
for model in OPENAI_MODELS:
    llm = OpenAIGPT(model, temperature=0)
    all_generations[model] = aes_gpt.assess(llm)




[['-user:\nQuestion : Deep brain stimulation for drug-resistant epilepsy targets which of the following structures?\n        A. anterior nucleus of the thalamus\n        B. centromedian nucleus of the thalamus\n        C. cingulate gyrus\n        D. hippocampus\n        E. internal segment of the globus pallidus\n-assistant:\nAnswer :  The anterior nucleus of the thalamus is part of the Papez circuit and is therefore believed to be a relay station for information passing from the amygdala and hippocampus to the cerebral cortex. Data have shown that inhibition of the anterior nucleus may result in prevention or cessation of seizures. \nTherefore, the correct answer is A.###', '-user:\nQuestion : According to the International League Against Epilepsy’s 2010 revision of its seizure classification system, which of the following factors is most helpful in initially focusing the scope of diagnostic possibilities for specific electroclinical syndromes?\n        A. age at onset\n        B. fam

  0%|          | 0/224 [00:00<?, ?it/s]

Timed out HTTP code 502 from API (<html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>cloudflare</center>
</body>
</html>
). Waiting for 10 seconds.
[['-user:\nQuestion : Deep brain stimulation for drug-resistant epilepsy targets which of the following structures?\n        A. anterior nucleus of the thalamus\n        B. centromedian nucleus of the thalamus\n        C. cingulate gyrus\n        D. hippocampus\n        E. internal segment of the globus pallidus\n-assistant:\nAnswer :  The anterior nucleus of the thalamus is part of the Papez circuit and is therefore believed to be a relay station for information passing from the amygdala and hippocampus to the cerebral cortex. Data have shown that inhibition of the anterior nucleus may result in prevention or cessation of seizures. \nTherefore, the correct answer is A.###', '-user:\nQuestion : According to the International League Against Epilepsy’s 2010 revision of its seizure c

  0%|          | 0/224 [00:00<?, ?it/s]

InvalidRequestError: The model `gpt4` does not exist or you do not have access to it.

In [8]:
all_generations

{'gpt-3.5-turbo': [{'index': 'AES7-Question 1',
   'question': 'Based on the ILAE consensus classification, which of the following findings are seen in the most common histopathological subtype of hippocampal sclerosis?\nA. Neuronal cell loss in the CA2 and CA3 regions\nB. Neuronal cell loss in the CA1 and CA3 regions\nC. Neuronal cell loss in the CA1 and CA4 regions\nD. Neuronal cell loss in the dentate regions',
   'answer': 'C',
   'human_accuracy': 0.365,
   'difficulty': 'medium',
   'contains_media': False,
   'topic': 'Mechanisms of the epilepsies',
   'answer_full': 'The most common type of pyramidal cell loss in patients with temporal lobe epilepsy is Type 1, affecting both the CA4 and CA1 sectors. This type is more often associated with a history of initial precipitating injuries before 5 years of age, with early seizure onset and favorable postsurgical seizure control. Damage to sectors CA3 and CA2 is more variable but frequently visible. Type 2 involves predominantly CA1, w

In [8]:
def search_ans(pred):
    if(type(pred) != str):
        pred = pred.outputs[0].text
    ans_sentence = [x for x in pred.split(".") if "Therefore" in x]
    

    if(ans_sentence == []):
        return "-1"
    sel = [l for l in ["A", "B", "C", "D", "E"] if l in ans_sentence[0]]
    return sel[0] if sel != [] else "-1"


MODELS = [
    "meditron-7b",
    "Llama-2-7b-hf",
    "Meta-Llama-3-8B",
    "gpt-3.5",
    "gpt4"
]

gens = {}


for model in MODELS:
    if(os.path.exists(f"final_results_store/{model}.json")):
        print(f"- loading results for {model}")
        with open(f"final_results_store/{model}.json", "r") as f:
            gens[model] = json.load(f)
    else :
        print(f"- no stored results for {model}")

- no stored results for meditron-7b
- no stored results for Llama-2-7b-hf
- no stored results for Meta-Llama-3-8B
- no stored results for gpt-3.5
- no stored results for gpt4


## Generate the questionnaire :

In [37]:
import random

MODELS = [
    "meditron_7b",
    "Llama-2-7b-hf",
    "Llama-3-8B",
    # "gpt-3.5-turbo",
    "gpt4",
    "Mistral-7B-v0.1",
    "falcon-7b"
]

question_samples = random.sample(range(224), 10)
orderings = []
for i in range(10):
    orderings.append(
        random.sample(range(len(MODELS)), len(MODELS))
    )

In [38]:
orderings

[[1, 3, 0, 2, 5, 4],
 [5, 1, 2, 4, 0, 3],
 [5, 3, 0, 1, 4, 2],
 [2, 1, 3, 5, 0, 4],
 [5, 4, 2, 3, 0, 1],
 [2, 5, 1, 3, 0, 4],
 [3, 0, 5, 1, 4, 2],
 [2, 4, 3, 5, 0, 1],
 [0, 1, 5, 4, 2, 3],
 [1, 3, 2, 0, 5, 4]]

In [39]:
import json


gens = {}


for model in MODELS:
    if(os.path.exists(f"docs/final_results_store/{model}.json")):
        print(f"- loading results for {model}")
        with open(f"docs/final_results_store/{model}.json", "r") as f:
            gens[model] = json.load(f)
    else :
        print(f"- no stored results for {model}")



import random

question_samples = random.sample(range(224), 10)

selected = {}
for model in MODELS:
    selected[model] = []
    for sample in question_samples:
        selected[model].append(gens[model][sample])


sample = []

for i in range(10):
    ordering = orderings[i]
    for modi in ordering:
        mod = MODELS[modi]
        sample.append(
            f"{'*'*300}\n**QUESTION** : {selected[mod][i]['question']}\n\n>{selected[mod][i]['prediction']}"
        )

        sample.append(
            """
- [ ] Correct
- [ ] Convincing
- [ ] ...?



            """
        )




- loading results for meditron_7b
- loading results for Llama-2-7b-hf
- loading results for Llama-3-8B
- loading results for gpt4
- loading results for Mistral-7B-v0.1
- loading results for falcon-7b


In [40]:
with open("docs/questionnaire.md", "w") as f:
    f.write("\n".join(sample))

In [33]:
print(selected.keys())

dict_keys(['meditron_7b', 'Llama-2-7b-hf', 'Llama-3-8B', 'gpt4', 'Mistral-7B-v0.1', 'falcon-7b'])
