In [1]:
# uncomment and run to install requirements:
# %pip install evalplus requests numpy tqdm transformers

In [2]:
import concurrent.futures
import os
import requests
import time

import numpy as np

from tqdm import tqdm
from transformers import AutoTokenizer
from typing import Callable
from evalplus.data import get_human_eval_plus, get_human_eval_plus_hash, write_jsonl
from evalplus.evaluate import check_correctness, get_groundtruth
from evalplus.sanitize import sanitize

os.environ["TOKENIZERS_PARALLELISM"] = "false"

## API Adapters

In [3]:
# Generation-length limit sent by every adapter below
# (as "n_predict" for llama.cpp, as "max_tokens" for the other endpoints).
MAX_TOKENS = 512
# Sampling temperature sent with every request by the adapters below.
TEMPERATURE = 0.0


def llamacpp_completion(text: str, wrap: bool = True, **kwargs) -> tuple[str, float]:
    """Request a completion from a local llama.cpp server and time the round trip.

    Args:
        text: Prompt text to send.
        wrap: When true, embed ``text`` in the ``<|im_start|>``/``<|im_end|>``
            system/user/assistant template below; when false, send it verbatim.
        **kwargs: Extra fields merged into the JSON request body; they override
            the defaults set here. ``collect_completions`` passes ``model=...``,
            which therefore ends up in the request body as well.

    Returns:
        ``(completion, dt)``: the stripped ``"content"`` field of the response
        and the elapsed wall time in seconds (request plus parsing).

    Raises:
        Any exception raised while decoding the response or extracting the
        completion; it is re-raised after a diagnostic line is printed.
    """
    st = time.monotonic()
    url = "http://localhost:8080/completion"
    prompt = f"""<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n""" if wrap else text
    data = {
        "prompt": prompt,
        "n_predict": MAX_TOKENS,
        "temperature": TEMPERATURE,
    }
    data.update(kwargs)
    response = requests.post(url, json=data)
    try:
        completion = response.json()["content"].strip()
    except Exception as e:
        # Show the raw body rather than calling response.json() again: when
        # JSON decoding is what failed, a second .json() call would raise
        # inside this handler and the diagnostic would never be printed.
        print("FAILURE:", e, response.text)
        raise
    dt = time.monotonic() - st
    return completion, dt


def vllm_completion(text: str, model: str, wrap: bool = True, **kwargs) -> tuple[str, float]:
    """Request a completion from a ``/v1/completions`` endpoint and time the round trip.

    The ``token`` and ``url`` values below are placeholders that must be
    filled in before this adapter can be used.

    Args:
        text: Prompt text to send.
        model: Model identifier placed in the request body.
        wrap: When true, embed ``text`` as ``<s> [INST] ... [/INST] ``;
            when false, send it verbatim.
        **kwargs: Extra fields merged into the JSON request body; they override
            the defaults set here.

    Returns:
        ``(completion, dt)``: the stripped ``choices[0].text`` of the response
        and the elapsed wall time in seconds (request plus parsing).

    Raises:
        Any exception raised while decoding the response or extracting the
        completion; it is re-raised after a diagnostic line is printed.
    """
    st = time.monotonic()
    token = "YOUR_API_TOKEN"
    url = "YOUR_URL_HERE/v1/completions"
    prompt = f"""<s> [INST] {text} [/INST] """ if wrap else text
    data = {
        "model": model,
        "prompt": prompt,
        "max_tokens": MAX_TOKENS,
        "temperature": TEMPERATURE,
    }
    data.update(kwargs)
    response = requests.post(url, json=data, headers={"Authorization": f"Bearer {token}"})
    try:
        completion = response.json()["choices"][0]["text"].strip()
    except Exception as e:
        # Show the raw body rather than calling response.json() again: when
        # JSON decoding is what failed, a second .json() call would raise
        # inside this handler and the diagnostic would never be printed.
        print("FAILURE:", e, response.text)
        raise
    dt = time.monotonic() - st
    return completion, dt


def deepinfra_completion(text: str, model: str, wrap: bool = True, **kwargs) -> tuple[str, float]:
    """Request a completion from the DeepInfra inference endpoint and time the round trip.

    The ``token`` value below is a placeholder that must be filled in before
    this adapter can be used.

    Args:
        text: Prompt text to send.
        model: Model identifier; it is appended to the endpoint URL rather than
            placed in the request body.
        wrap: When true, embed ``text`` as ``<s> [INST] ... [/INST] ``;
            when false, send it verbatim.
        **kwargs: Extra fields merged into the JSON request body; they override
            the defaults set here.

    Returns:
        ``(completion, dt)``: the stripped ``results[0].generated_text`` of the
        response and the elapsed wall time in seconds (request plus parsing).

    Raises:
        Any exception raised while decoding the response or extracting the
        completion; it is re-raised after a diagnostic line is printed.
    """
    st = time.monotonic()
    token = "YOUR_API_TOKEN"
    url = f"https://api.deepinfra.com/v1/inference/{model}"
    prompt = f"""<s> [INST] {text} [/INST] """ if wrap else text
    data = {
        "input": prompt,
        "max_tokens": MAX_TOKENS,
        "temperature": TEMPERATURE,
    }
    data.update(kwargs)
    response = requests.post(url, json=data, headers={"Authorization": f"Bearer {token}"})
    try:
        completion = response.json()["results"][0]["generated_text"].strip()
    except Exception as e:
        # Show the raw body rather than calling response.json() again: when
        # JSON decoding is what failed, a second .json() call would raise
        # inside this handler and the diagnostic would never be printed.
        print("FAILURE:", e, response.text)
        raise
    dt = time.monotonic() - st
    return completion, dt

    
def together_completion(text: str, model: str, wrap: bool = True, **kwargs) -> tuple[str, float]:
    """Request a completion from the Together ``/v1/completions`` endpoint and time the round trip.

    The ``token`` value below is a placeholder that must be filled in before
    this adapter can be used.

    Args:
        text: Prompt text to send.
        model: Model identifier placed in the request body.
        wrap: When true, embed ``text`` as ``<s> [INST] ... [/INST] ``;
            when false, send it verbatim.
        **kwargs: Extra fields merged into the JSON request body; they override
            the defaults set here.

    Returns:
        ``(completion, dt)``: the stripped ``choices[0].text`` of the response
        and the elapsed wall time in seconds (request plus parsing).

    Raises:
        Any exception raised while decoding the response or extracting the
        completion; it is re-raised after a diagnostic line is printed.
    """
    st = time.monotonic()
    token = "YOUR_API_TOKEN"
    url = "https://api.together.xyz/v1/completions"
    prompt = f"""<s> [INST] {text} [/INST] """ if wrap else text
    data = {
        "model": model,
        "prompt": prompt,
        "max_tokens": MAX_TOKENS,
        "temperature": TEMPERATURE,
    }
    data.update(kwargs)
    response = requests.post(url, json=data, headers={"Authorization": f"Bearer {token}"})
    try:
        completion = response.json()["choices"][0]["text"].strip()
    except Exception as e:
        # Show the raw body rather than calling response.json() again: when
        # JSON decoding is what failed, a second .json() call would raise
        # inside this handler and the diagnostic would never be printed.
        print("FAILURE:", e, response.text)
        raise
    dt = time.monotonic() - st
    return completion, dt

In [4]:
# Smoke test: send one prompt to the llama.cpp server at localhost:8080 and show
# only the completion text (element 0 of the result; element 1 is the elapsed time).
print(llamacpp_completion("Write a fizzbuzz in python")[0])

Sure, here's a simple implementation of the FizzBuzz problem in Python:

```python
for i in range(1, 101):
    if i % 3 == 0 and i % 5 == 0:
        print("FizzBuzz")
    elif i % 3 == 0:
        print("Fizz")
    elif i % 5 == 0:
        print("Buzz")
    else:
        print(i)
```

This program will print out the numbers from 1 to 100, but for multiples of 3, it will print "Fizz" instead of the number, for multiples of 5, it will print "Buzz", and for multiples of both 3 and 5, it will print "FizzBuzz".


## Prepare Dataset

In [None]:
def build_humaneval_instruction(language: str, question: str) -> str:
    """Render the code-completion instruction prompt for one problem.

    Args:
        language: Programming language name; it is lower-cased and used both
            as the code-fence tag and in the instruction sentence.
        question: The code to be completed; surrounding whitespace is removed
            before it is placed inside the first code fence.

    Returns:
        The filled-in prompt, ending with the ``@@ Response`` marker and no
        trailing newline.
    """
    lang = language.lower()
    code = question.strip()
    template = '''You are an exceptionally intelligent coding assistant that consistently delivers accurate and reliable responses to user instructions.

@@ Instruction
Here is the given code to do completion:
```{}
{}
```
Please continue to complete the function with {} programming language. You are not allowed to modify the given code and do the completion only.

Please return all completed codes in one code block.
This code block should be in the following format:
```{}
# Your codes here
```

@@ Response
'''
    # Placeholders are positional: fence tag, code, language name, fence tag.
    return template.strip().format(lang, code, lang, lang)


# Load the benchmark problems and the tokenizer whose chat template is applied
# to every instruction prompt below.
dataset = get_human_eval_plus()
tokenizer = AutoTokenizer.from_pretrained("m-a-p/OpenCodeInterpreter-DS-6.7B")
print("{} examples for evaluation.".format(len(dataset)))

# One enriched record per problem: the original fields plus
#   "humaneval_prompt": the plain instruction text, and
#   "humaneval_inputs": that instruction rendered through the chat template.
# NOTE(review): "humaneval_inputs" is what collect_completions sends to the
# adapters, and those wrap the text in their own template again by default
# (wrap=True) — confirm this double wrapping is intended.
examples = []
for task_id, problem in tqdm(dataset.items(), desc='Generating'):
    instruction = build_humaneval_instruction("python", problem['prompt'])
    chat_messages = [{'role': 'user', 'content': instruction}]
    enriched = problem.copy()
    enriched["humaneval_prompt"] = instruction
    enriched["humaneval_inputs"] = tokenizer.apply_chat_template(
        chat_messages,
        tokenize=False
    )
    examples.append(enriched)

# Lookup table used by measure_completions to recover a record from its task id.
examples_by_task_id = {}
for enriched in examples:
    examples_by_task_id[enriched["task_id"]] = enriched

# Reference outputs, keyed by task id when read in check_response.
dataset_hash = get_human_eval_plus_hash()
expected_output = get_groundtruth(dataset, dataset_hash, [])

## Evaluator

In [6]:
def check_response(example: dict, plus: bool = False) -> int:
    """Run the benchmark tests against one candidate solution.

    Args:
        example: Problem record; must contain ``"task_id"`` and the candidate
            code under ``"solution"``. The whole record is also handed to
            ``check_correctness`` as the problem definition.
        plus: When true, run the extended tests as well and score on the
            ``"plus"`` result; otherwise only the ``"base"`` result is produced
            and scored.

    Returns:
        1 if at least one test result was recorded and every recorded result
        is truthy, otherwise 0.
    """
    result = check_correctness(
        "humaneval",
        0,
        example,
        example["solution"],
        expected_output[example["task_id"]],
        base_only=not plus,
        fast_check=False,
    )
    test_passes = result["plus"] if plus else result["base"]
    # Element 1 holds the per-test outcomes.
    # NOTE(review): element 0 looks like an overall status string; consider
    # checking it as well — confirm against the installed evalplus version.
    details = test_passes[1]
    # all([]) is True, so an empty outcome list (e.g. the solution failed
    # before any test ran) must not be counted as a pass.
    ok = int(len(details) > 0 and all(details))
    return ok


def collect_completions(
    endpoint_callback: Callable,
    model: str,
    examples: list[dict],
    workers: int = 64
) -> list[tuple[str, str]]:
    """Query an endpoint for every example concurrently and report throughput.

    Args:
        endpoint_callback: Adapter called as ``endpoint_callback(text, model=model)``;
            it must return ``(completion, seconds)``.
        model: Model identifier forwarded to the adapter.
        examples: Records providing ``"humaneval_inputs"`` (the text to send)
            and ``"task_id"``.
        workers: Size of the thread pool issuing the requests.

    Returns:
        ``(raw_response, task_id)`` pairs in the order the requests finished,
        which is not necessarily the order of ``examples``.
    """

    def timed_request(text: str, task_id: str) -> tuple[str, float, str]:
        # Carry the task id alongside the reply so results can be matched up
        # regardless of completion order.
        reply, latency = endpoint_callback(text, model=model)
        return reply, latency, task_id

    st = time.monotonic()
    completed = []
    latencies = []
    tok_count = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        pending = [
            pool.submit(timed_request, text=item["humaneval_inputs"], task_id=item["task_id"])
            for item in examples
        ]
        finished = concurrent.futures.as_completed(pending)
        for done in tqdm(finished, total=len(pending), desc="Collecting responses"):
            reply, latency, task_id = done.result()
            completed.append((reply, task_id))
            latencies.append(latency)
            # Replies are tokenized only to measure throughput.
            tok_count += len(tokenizer.encode(reply))

    dt = time.monotonic() - st
    tm = np.mean(latencies)
    print(f'Requests: {len(examples)}\tTime: {dt:0.2f} sec\tSpeed {tok_count/dt:0.2f} tokens/sec\tAvg. response time {tm:.2f} sec.')
    return completed


def measure_completions(completed: list[tuple[str, str]], name: str, plus: bool = False):
    """Score collected completions and save the per-task records.

    Args:
        completed: ``(raw_completion, task_id)`` pairs, as returned by
            ``collect_completions``.
        name: Label inserted into the output file name
            ``humaneval_solutions_{name}.jsonl`` (an existing file with that
            name is handed to ``write_jsonl`` again).
        plus: Forwarded to ``check_response`` to select which test result is scored.

    Each saved record is a copy of the task's example extended with
    ``"raw_completion"``, the sanitized ``"solution"`` and the 0/1 ``"pass"`` flag.
    """
    passes = []
    solved = []

    def summary() -> str:
        # Running pass rate and failure count over the tasks scored so far.
        return f"Pass@1 {np.mean(passes):.3f}, fails {len(passes)-sum(passes)}"

    progress_bar = tqdm(completed, total=len(completed))
    for raw_completion, task_id in progress_bar:
        # Work on a copy so the shared example table is left untouched.
        record = examples_by_task_id[task_id].copy()
        record["raw_completion"] = raw_completion
        record["solution"] = sanitize(raw_completion, entry_point=record["entry_point"])
        verdict = check_response(record, plus=plus)
        record["pass"] = verdict
        passes.append(verdict)
        solved.append(record)
        progress_bar.set_description(summary())

    output_file = f"humaneval_solutions_{name}.jsonl"
    write_jsonl(output_file, solved)
    print(summary())
    print(f"Saved to {output_file}")

## Evals

In [7]:
# HumanEval (base tests) via the Together adapter: mistralai/Mixtral-8x7B-Instruct-v0.1
completed = collect_completions(
    endpoint_callback=together_completion,
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    examples=examples,
)
measure_completions(completed=completed, name="together_Mixtral-8x7B-Instruct-v0.1")

Collecting responses: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [01:33<00:00,  1.75it/s]


Requests: 164	Time: 93.73 sec	Speed 550.97 tokens/sec	Avg. response time 15.87 sec.


Pass@1 0.585, fails 68: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:18<00:00,  9.02it/s]

Pass@1 0.585, fails 68
Saved to humaneval_solutions_together_Mixtral-8x7B-Instruct-v0.1.jsonl





In [8]:
# HumanEval (base tests) via the DeepInfra adapter: mistralai/Mixtral-8x7B-Instruct-v0.1
completed = collect_completions(
    endpoint_callback=deepinfra_completion,
    model="mistralai/Mixtral-8x7B-Instruct-v0.1",
    examples=examples,
)
measure_completions(completed=completed, name="deepinfra_Mixtral-8x7B-Instruct-v0.1")

Collecting responses: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:27<00:00,  6.07it/s]


Requests: 164	Time: 27.05 sec	Speed 1852.20 tokens/sec	Avg. response time 8.14 sec.


Pass@1 0.561, fails 72: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:19<00:00,  8.38it/s]

Pass@1 0.561, fails 72
Saved to humaneval_solutions_deepinfra_Mixtral-8x7B-Instruct-v0.1.jsonl





In [9]:
# HumanEval (base tests) via the DeepInfra adapter: microsoft/WizardLM-2-8x22B
completed = collect_completions(
    endpoint_callback=deepinfra_completion,
    model="microsoft/WizardLM-2-8x22B",
    examples=examples,
)
measure_completions(completed=completed, name="deepinfra_WizardLM-2-8x22B")

Collecting responses: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:30<00:00,  5.31it/s]


Requests: 164	Time: 30.92 sec	Speed 1469.15 tokens/sec	Avg. response time 9.22 sec.


Pass@1 0.750, fails 41: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:18<00:00,  8.97it/s]

Pass@1 0.750, fails 41
Saved to humaneval_solutions_deepinfra_WizardLM-2-8x22B.jsonl





In [10]:
# HumanEval+ scoring of the completions already held in `completed`
# (DeepInfra, microsoft/WizardLM-2-8x22B); no new requests are sent.
measure_completions(
    completed=completed,
    name="deepinfra_WizardLM-2-8x22B_plus",
    plus=True,
)

Pass@1 0.695, fails 50: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [01:56<00:00,  1.40it/s]

Pass@1 0.695, fails 50
Saved to humaneval_solutions_deepinfra_WizardLM-2-8x22B_plus.jsonl





In [11]:
# HumanEval (base tests) via the DeepInfra adapter: mistralai/Mixtral-8x22B-Instruct-v0.1
completed = collect_completions(
    endpoint_callback=deepinfra_completion,
    model="mistralai/Mixtral-8x22B-Instruct-v0.1",
    examples=examples,
)
measure_completions(completed=completed, name="deepinfra_Mixtral-8x22B-Instruct-v0.1")

Collecting responses: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:32<00:00,  5.07it/s]


Requests: 164	Time: 32.35 sec	Speed 1193.92 tokens/sec	Avg. response time 9.87 sec.


Pass@1 0.756, fails 40: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:19<00:00,  8.62it/s]

Pass@1 0.756, fails 40
Saved to humaneval_solutions_deepinfra_Mixtral-8x22B-Instruct-v0.1.jsonl





In [12]:
# Humaneval-plus Deepinfra mistralai/Mixtral-8x22B-Instruct-v0.1
# The name carries a "_plus" suffix (as the WizardLM plus run does) so this
# run writes its own file instead of overwriting the base-test results that the
# preceding cell saved under the unsuffixed name.
measure_completions(completed, "deepinfra_Mixtral-8x22B-Instruct-v0.1_plus", plus=True)

Pass@1 0.707, fails 48: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [01:22<00:00,  2.00it/s]

Pass@1 0.707, fails 48
Saved to humaneval_solutions_deepinfra_Mixtral-8x22B-Instruct-v0.1.jsonl





In [13]:
# HumanEval (base tests) via the Together adapter: mistralai/Mixtral-8x22B-Instruct-v0.1
completed = collect_completions(
    endpoint_callback=together_completion,
    model="mistralai/Mixtral-8x22B-Instruct-v0.1",
    examples=examples,
)
measure_completions(completed=completed, name="together_Mixtral-8x22B-Instruct-v0.1")

Collecting responses: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [01:04<00:00,  2.55it/s]


Requests: 164	Time: 64.24 sec	Speed 587.30 tokens/sec	Avg. response time 20.60 sec.


Pass@1 0.750, fails 41: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:19<00:00,  8.57it/s]


Pass@1 0.750, fails 41
Saved to humaneval_solutions_together_Mixtral-8x22B-Instruct-v0.1.jsonl


In [14]:
# Humaneval-plus Together mistralai/Mixtral-8x22B-Instruct-v0.1
# The name carries a "_plus" suffix (as the WizardLM plus run does) so this
# run writes its own file instead of overwriting the base-test results that the
# preceding cell saved under the unsuffixed name.
measure_completions(completed, "together_Mixtral-8x22B-Instruct-v0.1_plus", plus=True)

Pass@1 0.701, fails 49: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [01:23<00:00,  1.97it/s]

Pass@1 0.701, fails 49
Saved to humaneval_solutions_together_Mixtral-8x22B-Instruct-v0.1.jsonl





In [15]:
# HumanEval (base tests) via the local llama.cpp adapter: codeqwen-1_5-7b-chat-q5_k_m.gguf
# A single worker is used, so requests are issued one at a time.
completed = collect_completions(
    endpoint_callback=llamacpp_completion,
    model="codeqwen-1_5-7b-chat-q5_k_m.gguf",
    examples=examples,
    workers=1,
)
measure_completions(completed=completed, name="llamacpp_codeqwen-1_5-7b-chat-q5_k_m.gguf")

Collecting responses: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [1:10:18<00:00, 25.72s/it]


Requests: 164	Time: 4218.70 sec	Speed 6.76 tokens/sec	Avg. response time 25.72 sec.


Pass@1 0.805, fails 32: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 164/164 [00:21<00:00,  7.63it/s]


Pass@1 0.805, fails 32
Saved to humaneval_solutions_llamacpp_codeqwen-1_5-7b-chat-q5_k_m.gguf.jsonl
