In [1]:
%set_env VLLM_ATTENTION_BACKEND=FLASHINFER

env: VLLM_ATTENTION_BACKEND=FLASHINFER


In [2]:
from vllm import LLM, SamplingParams
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import torch
import gc
import pickle
import re

  from .autonotebook import tqdm as notebook_tqdm
2024-07-24 20:32:30,748	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Compute the disjointness

In [3]:
df = pd.read_csv("../data/pairs.csv", header=0, names=["c1", "c2", "disjoint"])
df["reason"] = df.disjoint.apply(lambda x: x if pd.isna(x) else eval(x)[1])
df["disjoint"] = df.disjoint.apply(lambda x: x if pd.isna(x) else eval(x)[0])

df_inverse = df.copy()
df_inverse["c1"] = df["c2"]
df_inverse["c2"] = df["c1"]
df = pd.concat([df, df_inverse]).reset_index(drop=True)

df["c1_name"] = df.c1.str.replace("http://dbpedia.org/ontology/", "")
df["c2_name"] = df.c2.str.replace("http://dbpedia.org/ontology/", "")

In [4]:
SYSTEM_PROMPTS = {
    "naive": "Answer only \"yes\" or \"no\".",
    "task_description": "This is a question about ontological disjointness, answer only with \"yes\" or \"no\"",
    "few_shot": "This is a question about ontological disjointness, answer only with \"yes\" or \"no\"\nExamples of disjoint are: 'person' and 'file system', 'tower' and 'person', 'place' and 'agent', 'continent' and 'sea', 'baseball league' and 'bowling league', 'planet' and 'star'.\nExamples of not disjoint are: 'basketball player' and 'baseball player', 'means of transportation' and 'reptile', 'garden' and 'historic place', 'president' and 'beauty queen', 'castle' and 'prison'.",
}

PROMPT = {
    #"can_a_question": ("Can a %s be a %s?", lambda a: 0 if a == "yes" else 1),
    "can_a_question": ("Can a %s be a %s?", lambda a: False if re.match(r"^\s*[Yy]es", a) else True),
    #"are_disjoint": ("Is the class %s disjoint from %s?", lambda a: 1 if a == "yes" else 0)
    "are_disjoint": ("Is the class %s disjoint from %s?", lambda a: True if re.match(r"^\s*[Yy]es", a) else False),
}

LLMS_MAP = {
    "meta-llama/Meta-Llama-3-8B-Instruct": "<|start_header_id|>system<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n%s\n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
    "google/gemma-2-9b-it": "<start_of_turn>user\n%s\n%s<end_of_turn>\n<start_of_turn>model\n",
    "mistralai/Mistral-7B-Instruct-v0.3": "[INST] %s \n%s [/INST]",
    "Qwen/Qwen2-7B-Instruct": "{{ if .System }}<|im_start|>system\n %s<|im_end|>\n<|im_start|>user\n%s<|im_end|>\n<|im_start|>assistant\n",

}

In [5]:
samples = df[df.disjoint.notna()]
y = np.array(list(map(int, samples.disjoint)))

In [6]:
sampling_params = SamplingParams(temperature=0.0)
results = []

for llm_k, llm_template in LLMS_MAP.items():
    try:
        del llm
    except:
        pass
    finally:
        torch.cuda.empty_cache()
        gc.collect()

    llm = LLM(model=llm_k, quantization="fp8")

    for sp, system_prompt in SYSTEM_PROMPTS.items():
        for p_k, (prompt, parse_fn) in PROMPT.items():
            prompts = samples.apply(lambda r: prompt % (r.c1_name, r.c2_name), axis=1).to_list()
            prompts = [llm_template % (system_prompt, p) for p in prompts]
            
            outputs = llm.generate(prompts, sampling_params)
            output = [o.outputs[0].text for o in outputs]
            pred = np.array(list(map(parse_fn, output)))

            samples["prompt"] = prompts
            samples["output"] = output
            samples["llm"] = llm_k
            samples["system_prompt"] = sp
            samples["prompt"] = p_k
            results.append(samples.copy())

INFO 07-24 20:32:32 llm_engine.py:169] Initializing an LLM engine (v0.5.1) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct, use_v2_block_manager=False, enable_prefix_caching=False)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 07-24 20:32:33 selector.py:79] Using Flashinfer backend.
INFO 07-24 20:32:33 selector.py:79] Using Flashinfer backend.
INFO 07-24 20:32:34 weight_utils.py:218] Using model weights format ['*.safetensors']
INFO 07-24 20:32:37 model_runner.py:255] Loading model weights took 8.4633 GB
INFO 07-24 20:32:39 gpu_executor.py:84] # GPU blocks: 5262, # CPU blocks: 2048
INFO 07-24 20:32:40 model_runner.py:924] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-24 20:32:40 model_runner.py:928] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-24 20:32:48 model_runner.py:1117] Graph capturing finished in 8 secs.


Processed prompts: 100%|██████████| 2296/2296 [00:20<00:00, 112.62it/s, est. speed input: 3784.44 toks/s, output: 225.24 toks/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples["prompt"] = prompts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples["output"] = output
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samples["llm"] = llm_

INFO 07-24 20:38:30 llm_engine.py:169] Initializing an LLM engine (v0.5.1) with config: model='google/gemma-2-9b-it', speculative_config=None, tokenizer='google/gemma-2-9b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=google/gemma-2-9b-it, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 07-24 20:38:31 selector.py:79] Using Flashinfer backend.
INFO 07-24 20:38:31 selector.py:79] Using Flashinfer backend.
INFO 07-24 20:38:31 weight_

Processed prompts: 100%|██████████| 2296/2296 [00:27<00:00, 83.77it/s, est. speed input: 2376.01 toks/s, output: 352.10 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [00:32<00:00, 70.42it/s, est. speed input: 2067.96 toks/s, output: 294.82 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [00:41<00:00, 54.94it/s, est. speed input: 2162.81 toks/s, output: 264.97 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [00:42<00:00, 53.53it/s, est. speed input: 2160.64 toks/s, output: 305.20 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [02:23<00:00, 15.97it/s, est. speed input: 2225.86 toks/s, output: 81.93 toks/s]
Processed prompts:  16%|█▌        | 371/2296 [00:33<02:09, 14.90it/s, est. speed input: 1537.35 toks/s, output: 60.01 toks/s]



Processed prompts: 100%|██████████| 2296/2296 [02:21<00:00, 16.19it/s, est. speed input: 2273.14 toks/s, output: 85.92 toks/s]


INFO 07-24 20:46:00 llm_engine.py:169] Initializing an LLM engine (v0.5.1) with config: model='mistralai/Mistral-7B-Instruct-v0.3', speculative_config=None, tokenizer='mistralai/Mistral-7B-Instruct-v0.3', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=mistralai/Mistral-7B-Instruct-v0.3, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 07-24 20:46:01 weight_utils.py:218] Using model weights format ['*.safetensors']
INFO 07-24 20:46:03 mo

Processed prompts: 100%|██████████| 2296/2296 [00:25<00:00, 90.87it/s, est. speed input: 2376.59 toks/s, output: 1055.40 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [00:32<00:00, 71.15it/s, est. speed input: 2003.24 toks/s, output: 1067.35 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [00:36<00:00, 63.05it/s, est. speed input: 2405.72 toks/s, output: 603.52 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [00:37<00:00, 61.68it/s, est. speed input: 2476.63 toks/s, output: 240.02 toks/s] 
Processed prompts: 100%|██████████| 2296/2296 [02:17<00:00, 16.69it/s, est. speed input: 2623.55 toks/s, output: 87.23 toks/s]
Processed prompts: 100%|██████████| 2296/2296 [02:09<00:00, 17.70it/s, est. speed input: 2816.42 toks/s, output: 38.03 toks/s]


INFO 07-24 20:53:04 llm_engine.py:169] Initializing an LLM engine (v0.5.1) with config: model='Qwen/Qwen2-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Qwen/Qwen2-7B-Instruct, use_v2_block_manager=False, enable_prefix_caching=False)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 07-24 20:53:04 selector.py:79] Using Flashinfer backend.
INFO 07-24 20:53:05 selector.py:79] Using Flashinfer backend.
INFO 07-24 20:53:05 weight_utils.py:218] Using model weights format ['*.safetensors']
INFO 07-24 20:53:08 model_runner.py:255] Loading model weights took 8.1381 GB
INFO 07-24 20:53:16 gpu_executor.py:84] # GPU blocks: 8714, # CPU blocks: 4681
INFO 07-24 20:53:18 model_runner.py:924] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 07-24 20:53:18 model_runner.py:928] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 07-24 20:53:26 model_runner.py:1117] Graph capturing finished in 8 secs.


Processed prompts: 100%|██████████| 2296/2296 [00:25<00:00, 90.30it/s, est. speed input: 3316.82 toks/s, output: 180.60 toks/s]
Processed prompts: 100%|██████████| 2296/2296 [00:27<00:00, 82.57it/s, est. speed input: 3115.59 toks/s, output: 165.15 toks/s]
Processed prompts: 100%|██████████| 2296/2296 [00:38<00:00, 60.27it/s, est. speed input: 2876.78 toks/s, output: 120.54 toks/s]
Processed prompts: 100%|██████████| 2296/2296 [00:36<00:00, 62.71it/s, est. speed input: 3055.83 toks/s, output: 125.42 toks/s]
Processed prompts: 100%|██████████| 2296/2296 [01:57<00:00, 19.58it/s, est. speed input: 3030.00 toks/s, output: 39.16 toks/s]
Processed prompts: 100%|██████████| 2296/2296 [01:54<00:00, 19.99it/s, est. speed input: 3112.68 toks/s, output: 39.98 toks/s]


In [7]:
pd.concat(results).to_csv("results.csv")

## Evaluate

In [10]:
results = pd.read_csv("./results.csv", index_col=0)

In [13]:
agg_results = []
for (sys, prompt, llm), data in results.groupby(["system_prompt", "prompt", "llm"]):
    # compute disjoint recall
    idxs = (data.disjoint == True)
    y = data[idxs].disjoint.astype(float).values
    pred = data[idxs].output.apply(lambda s: PROMPT[prompt][1](s)).astype(float).values
    dr = recall_score(y, pred, zero_division=0)

    # compute non-disjoint accuracy
    idxs = (data.disjoint == False)
    y = 1 - data[idxs].disjoint.astype(float).values
    pred = 1 - data[idxs].output.apply(lambda s: PROMPT[prompt][1](s)).astype(float).values
    ndf1 = f1_score(y, pred)
    
    # end2end f1
    y = data.disjoint.astype(float).values
    pred = data.output.apply(lambda s: PROMPT[prompt][1](s)).astype(float).values
    f1 = f1_score(y, pred, zero_division=0)

    # symmetry respected
    a = data.iloc[:data.shape[0] // 2].output.apply(lambda s: PROMPT[prompt][1](s)).astype(float).values
    b = data.iloc[data.shape[0] // 2:].output.apply(lambda s: PROMPT[prompt][1](s)).astype(float).values
    sc = (a == b).sum() / a.shape[0]

    # compute metrics
    agg_results.append({
        "Prompt": sys,
        "QA": prompt,
        "LLM": llm,
        "DR": dr,
        "NDF1": ndf1,
        "F1": f1,
        "SC": sc
    })
    
aggregated_results = pd.DataFrame(agg_results)
aggregated_results["mu"] = aggregated_results.iloc[:, -4:].mean(axis=1)

aggregated_results.LLM = aggregated_results.LLM.map({
    "Qwen/Qwen2-7B-Instruct": "Qwen 2",
    "google/gemma-2-9b-it": "Gemma 2",
    "meta-llama/Meta-Llama-3-8B-Instruct": "LLama 3",
    "mistralai/Mistral-7B-Instruct-v0.3": "Mistral 0.3",
})

aggregated_results.Prompt = aggregated_results.Prompt.map({"few_shot": "Few shot", "naive": "Naive", "task_description": "Task description"})
aggregated_results.Prompt = aggregated_results.Prompt.astype("category")
aggregated_results.Prompt = aggregated_results.Prompt.cat.set_categories(["Naive", "Task description", "Few shot"])

aggregated_results.QA = aggregated_results.QA.map({"are_disjoint": "Positive", "can_a_question": "Negative"})
aggregated_results.QA = aggregated_results.QA.astype("category")
aggregated_results.QA = aggregated_results.QA.cat.set_categories(["Positive", "Negative"])


aggregated_results.sort_values(["Prompt", "QA", "LLM"])

Unnamed: 0,Prompt,QA,LLM,DR,NDF1,F1,SC,mu
9,Naive,Positive,Gemma 2,0.994595,0.257559,0.525339,0.893728,0.667805
10,Naive,Positive,LLama 3,0.193243,0.629075,0.165797,0.652439,0.410139
11,Naive,Positive,Mistral 0.3,1.0,0.027883,0.491042,0.980836,0.62494
8,Naive,Positive,Qwen 2,0.002703,0.987309,0.005122,0.964286,0.489855
13,Naive,Negative,Gemma 2,0.708108,0.90807,0.686763,0.848432,0.787843
14,Naive,Negative,LLama 3,0.9,0.858819,0.743719,0.889373,0.847978
15,Naive,Negative,Mistral 0.3,0.85,0.814476,0.677802,0.808362,0.78766
12,Naive,Negative,Qwen 2,0.921622,0.795199,0.699129,0.83885,0.8137
17,Task description,Positive,Gemma 2,0.993243,0.353439,0.54505,0.861498,0.688308
18,Task description,Positive,LLama 3,0.863514,0.079012,0.445141,0.898084,0.571438


: 