In [None]:
####################################################################################################
############################ Try SGLang and Test Llama3 on MATH  ###################################
####################################################################################################

In [31]:
from sglang.utils import (
    execute_shell_command,
    wait_for_server,
    terminate_process,
    print_highlight,
)

import requests

In [30]:
# Smoke-test the local server's chat-completions endpoint with a raw HTTP request.
url = "http://localhost:10086/v1/chat/completions"

data = {
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "messages": [
        {"role": "user", "content": "List 3 countries and their capitals."}
    ]
}

# Bound the wait so the cell cannot hang forever when the server is not
# reachable, and fail loudly on an HTTP error status instead of printing
# (or failing to decode) an error payload.
response = requests.post(url, json=data, timeout=120)
response.raise_for_status()
print(response.json())

{'id': '8ae7cd6a70e243779ec3a7b55e90a910', 'object': 'chat.completion', 'created': 1731544482, 'model': 'meta-llama/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'Here are 3 countries and their capitals:\n\n1. United States - Washington D.C.\n2. Japan - Tokyo\n3. Australia - Canberra'}, 'logprobs': None, 'finish_reason': 'stop', 'matched_stop': 128009}], 'usage': {'prompt_tokens': 43, 'total_tokens': 73, 'completion_tokens': 30, 'prompt_tokens_details': None}}


In [None]:
import openai

# Same request as the raw HTTP cell, but issued through the OpenAI Python
# client pointed at the local server's /v1 base URL. The api_key is the
# placeholder string "None".
client = openai.Client(base_url="http://127.0.0.1:10086/v1", api_key="None")

# Adds a system message and caps the completion at 128 tokens with temperature 0.
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=128,
)

print(response)

ChatCompletion(id='e100c05c69d14502ad6af3c2f9205d31', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='Here are 3 countries and their capitals:\n\n1. Country: Japan\n   Capital: Tokyo\n\n2. Country: Australia\n   Capital: Canberra\n\n3. Country: Brazil\n   Capital: Brasília', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), matched_stop=128009)], created=1731542263, model='meta-llama/Meta-Llama-3.1-8B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=43, prompt_tokens=48, total_tokens=91, completion_tokens_details=None, prompt_tokens_details=None))


In [4]:
# load math500-test.jsonl
import json

# Read math500-test.jsonl: one JSON object per line.
with open("math500-test.jsonl", "r", encoding="utf-8") as f:
    raw_data = f.readlines()

# Skip blank lines (e.g. a trailing empty line at the end of the file);
# json.loads would raise on them.
data = [json.loads(line) for line in raw_data if line.strip()]

In [7]:
# Need to import an evaluator from dart_math
# to compare math expressions with ground truth answers.
# `math_evaluator` is a notebook-level global: the scoring code in this
# notebook calls `math_evaluator.eq(ground_truth, extracted_answer)`.

from dart_math.eval import EvaluatorMath
math_evaluator = EvaluatorMath()

In [None]:
# from simple_eval by OpenAI

import random
import re
from typing import Literal

import blobfile as bf
import pandas

import common
from common import ANSWER_PATTERN, HTML_JINJA, check_equality
from eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

# Prompt used for every problem. `{Question}` is filled in with str.format from
# a dataset row, and the requested final "Answer: ..." line is what
# ANSWER_PATTERN is searched for in the model response.
# The doubled backslash renders as a single one in the prompt ("\boxed").
QUERY_TEMPLATE = """
Solve the following math problem step by step. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.

{Question}

Remember to put your answer on its own line after "Answer:", and you do not need to use a \\boxed command.
""".strip()


class MathEval(Eval):
    """MATH evaluation adapted from OpenAI's simple-evals.

    Problems are read from the local file ``{split}.csv``; each row is used
    through its ``Question`` and ``Answer`` entries. Extracted answers are
    compared with the notebook-global ``math_evaluator`` (``.eq``), not with
    ``equality_checker``, which is only stored on the instance.
    """

    def __init__(
        self,
        equality_checker: SamplerBase,
        num_examples: int | None = None,
        n_repeats: int = 1,
        split: Literal["math_test", "math_500_test", "my_math_500_test"] = "my_math_500_test",  # see readme.md
    ):
        """Load the problem set.

        Args:
            equality_checker: Sampler kept as ``self.equality_checker``; the
                scoring in ``__call__`` does not use it.
            num_examples: If truthy, draw this many rows with a fixed-seed RNG
                (``random.Random(0)``). ``random.sample`` raises ValueError if
                this exceeds the number of rows.
            n_repeats: How many times the example list is repeated; must be 1
                when ``num_examples`` is given.
            split: Stem of the CSV file to read (``{split}.csv``).
        """
        # Open the file in a context manager so the handle is closed once
        # pandas has consumed it.
        with bf.BlobFile(f"{split}.csv") as csv_file:
            df = pandas.read_csv(csv_file)
        examples = [row.to_dict() for _, row in df.iterrows()]
        if num_examples:
            assert n_repeats == 1, "n_repeats only supported for num_examples = None"
            rng = random.Random(0)
            examples = rng.sample(examples, num_examples)
        self.examples = examples * n_repeats
        self.equality_checker = equality_checker

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        """Query ``sampler`` on every example and aggregate the per-example scores."""

        def fn(row: dict):
            prompt_messages = [
                sampler._pack_message(content=QUERY_TEMPLATE.format(**row), role="user")
            ]
            # A sampler may hand back None (e.g. a completion with no text
            # content); treat it as an empty response so re.search does not raise.
            response_text = sampler(prompt_messages) or ""
            match = re.search(ANSWER_PATTERN, response_text)
            extracted_answer = match.group(1) if match else None
            # Always produce a float score: a missing answer counts as 0.0 and
            # the evaluator's boolean verdict is converted explicitly.
            if extracted_answer is None:
                score = 0.0
            else:
                score = float(math_evaluator.eq(row["Answer"], extracted_answer))

            html = common.jinja_env.from_string(HTML_JINJA).render(
                prompt_messages=prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=row["Answer"],
                extracted_answer=extracted_answer,
            )
            convo = prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(html=html, score=score, convo=convo)

        results = common.map_with_progress(fn, self.examples)
        return common.aggregate_results(results)

In [9]:
# also from simple_eval by OpenAI

import base64
import time
from typing import Any

import openai
from openai import OpenAI

from eval_types import MessageList, SamplerBase

# Ready-made system prompts. Neither constant is referenced in the visible
# part of this notebook; pass one as `system_message` to a sampler to use it.
OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
OPENAI_SYSTEM_MESSAGE_CHATGPT = (
    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture."
    + "\nKnowledge cutoff: 2023-12\nCurrent date: 2024-04-01"
)


class ChatCompletionSampler(SamplerBase):
    """
    Sample from a chat completion API through an OpenAI-style client.

    Any client exposing ``chat.completions.create`` can be injected via
    ``client``; when none is given a default ``OpenAI()`` client is created.
    """

    def __init__(
        self,
        model: str = "gpt-3.5-turbo",
        system_message: str | None = None,
        temperature: float = 0.5,
        max_tokens: int = 1024,
        client = None,
        return_full_response: bool = False,
        max_retries: int | None = None,
    ):
        """
        Args:
            model: Model name sent with every request.
            system_message: Optional system prompt prepended to each message list.
            temperature: Sampling temperature sent with every request.
            max_tokens: Completion token limit sent with every request.
            client: Pre-built client; defaults to ``OpenAI()``.
            return_full_response: When True, the prompt messages and the
                response text are printed for each call. The return value of
                ``__call__`` is the response text either way.
            max_retries: Maximum number of retries after a non-BadRequest
                failure before the exception is re-raised. ``None`` (default)
                keeps retrying indefinitely.
        """
        self.api_key_name = "OPENAI_API_KEY"
        self.client = client or OpenAI()
        # using api_key=os.environ.get("OPENAI_API_KEY")  # please set your API_KEY
        self.model = model
        self.system_message = system_message
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.image_format = "url"
        self.return_full_response = return_full_response
        self.max_retries = max_retries

    def _handle_image(
        self, image: str, encoding: str = "base64", format: str = "png", fovea: int = 768
    ):
        """Wrap encoded image data as an ``image_url`` content part (data URL).

        ``fovea`` is accepted but not used.
        """
        new_image = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/{format};{encoding},{image}",
            },
        }
        return new_image

    def _handle_text(self, text: str):
        """Wrap plain text as a ``text`` content part."""
        return {"type": "text", "text": text}

    def _pack_message(self, role: str, content: Any):
        """Build a single chat message dict."""
        return {"role": str(role), "content": content}

    def __call__(self, message_list: MessageList) -> str:
        """Send ``message_list`` and return the first choice's message content.

        Returns ``""`` on ``openai.BadRequestError``. Any other exception is
        retried with exponential backoff (1, 2, 4, ... seconds); once
        ``max_retries`` retries have been used (if it is set) the exception
        is re-raised.
        """
        if self.system_message:
            message_list = [self._pack_message("system", self.system_message)] + message_list
        trial = 0
        while True:
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=message_list,
                    temperature=self.temperature,
                    max_tokens=self.max_tokens,
                )
                content = response.choices[0].message.content
                if self.return_full_response:
                    print("message", message_list, "response", content)
                return content

            # NOTE: BadRequestError is triggered once for MMMU, please uncomment if you are reruning MMMU
            except openai.BadRequestError as e:
                print("Bad Request Error", e)
                return ""
            except Exception as e:
                # Every other failure (not only rate limits) lands here; give
                # up once the optional retry budget is spent.
                if self.max_retries is not None and trial >= self.max_retries:
                    raise
                exception_backoff = 2**trial  # exponential back off
                print(
                    f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec",
                    e,
                )
                time.sleep(exception_backoff)
                trial += 1

In [10]:
# Use our sglang client as OpenAI client:
# the OpenAI client is pointed at the local server's /v1 base URL with a
# placeholder api_key.
client = openai.Client(base_url="http://127.0.0.1:10086/v1", api_key="None")

# Sampler used for the MATH evaluation: temperature 0.0, up to 2048 completion
# tokens, no system message.
sampler = ChatCompletionSampler(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
                                temperature=0.0,
                                max_tokens=2048,
                                client=client)

In [12]:
# Set debug = True to evaluate 10 sampled problems instead of 500; it also
# adds a "_DEBUG" suffix to the report file names written by the driver cell.
debug = False
# equality_checker = sampler
# NOTE: MathEval only stores equality_checker; scoring goes through math_evaluator.
matheval = MathEval(
     equality_checker=sampler, num_examples=10 if debug else 500
)

In [13]:
# Samplers to evaluate, keyed by the name that ends up in the report/result file names.
samplers = {"llama3": sampler}

In [14]:
# also from simple_eval by OpenAI

import pandas as pd

# Evals to run, keyed by the name used in report/result file names.
evals = {
    "matheval": matheval
}
print(evals)
debug_suffix = "_DEBUG" if debug else ""
print(debug_suffix)

# Run every (sampler, eval) pair; write an HTML report and a JSON metrics file
# for each, and remember where the metrics went.
mergekey2resultpath = {}
for sampler_name, sampler in samplers.items():
    for eval_name, eval_obj in evals.items():
        result = eval_obj(sampler)
        # ^^^ how to use a sampler
        file_stem = f"{eval_name}_{sampler_name}"
        report_filename = f"/tmp/{file_stem}{debug_suffix}.html"
        print(f"Writing report to {report_filename}")
        with open(report_filename, "w") as fh:
            fh.write(common.make_report(result))
        metrics = result.metrics | {"score": result.score}
        print(metrics)
        result_filename = f"/tmp/{file_stem}{debug_suffix}.json"
        with open(result_filename, "w") as f:
            f.write(json.dumps(metrics, indent=2))
        print(f"Writing results to {result_filename}")
        mergekey2resultpath[f"{file_stem}"] = result_filename

# Read the metrics files back and pivot them into a sampler x eval table.
merge_metrics = []
for eval_sampler_name, result_filename in mergekey2resultpath.items():
    try:
        # Read-only access is enough, and the context manager closes the handle.
        with open(result_filename, "r") as f:
            result = json.load(f)
    except Exception as e:
        print(e, result_filename)
        continue
    result = result.get("f1_score", result.get("score", None))
    # Keys are "<eval_name>_<sampler_name>"; split on the first underscore only.
    eval_name, sampler_name = eval_sampler_name.split("_", 1)
    merge_metrics.append(
        {"eval_name": eval_name, "sampler_name": sampler_name, "metric": result}
    )
merge_metrics_df = pd.DataFrame(merge_metrics).pivot(
    index=["sampler_name"], columns="eval_name"
)
print("\nAll results: ")
print(merge_metrics_df.to_markdown())

{'matheval': <__main__.MathEval object at 0x7f46d1e51c00>}



100%|██████████| 500/500 [02:59<00:00,  2.79it/s]

Writing report to /tmp/matheval_llama3.html
{'score:std': 0.4976906669810074, 'score': 0.452}
Writing results to /tmp/matheval_llama3.json

All results: 
| sampler_name   |   ('metric', 'matheval') |
|:---------------|-------------------------:|
| llama3         |                    0.452 |





In [None]:
####################################################################################################
###################################### SGLang for Tree Search ######################################
####################################################################################################

In [16]:
import random
import re
from typing import Literal

import blobfile as bf
import pandas

import common
from common import ANSWER_PATTERN, HTML_JINJA, check_equality
from eval_types import Eval, EvalResult, SamplerBase, SingleEvalResult

# Load the problem CSV directly so the rows can be fed to the sglang programs.
# Each row becomes a dict; later cells read its "Question" and "Answer" entries.
split = "my_math_500_test"
df = pandas.read_csv(
    # bf.BlobFile(f"https://openaipublic.blob.core.windows.net/simple-evals/{split}.csv")
    bf.BlobFile(f"{split}.csv")
)
examples = [row.to_dict() for _, row in df.iterrows()]

In [22]:
## try to use sglang to sequentially generate next steps.

import sglang as sgl
import argparse
from sglang.test.test_utils import (
    add_common_sglang_args_and_parse,
    select_sglang_backend,
)
import time


# Upper bound on the number of follow-up generation steps per problem in search_try.
max_steps = 30


@sgl.function
def search_try(s, question):
    """Generate a solution one step at a time.

    Each step is a separate generation of at most 256 tokens that stops at a
    blank line ("\\n\\n"). Generation ends as soon as the last line of the
    conversation text contains "Answer:", or after one initial step plus
    ``max_steps`` (notebook global) follow-up steps.
    """
    s += sgl.user(
        f"""Solve the following math problem step by step. Steps should be separated with two new lines. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
        
{question}

Remember to separate steps with two new lines, and finally put your answer on its own line after "Answer:", and you do not need to use a \\boxed command.
"""
    )

    s += sgl.assistant_begin()

    # The answer check runs after every step, including the very first one:
    # otherwise a one-step solution would keep generating and the "Answer:"
    # line would no longer be the last line of the text.
    for step_idx in range(max_steps + 1):
        if step_idx > 0:
            # Re-insert the step separator that the stop string consumed.
            s += "\n\n"
        s += sgl.gen(max_tokens=256, stop=["\n\n"])
        if "Answer:" in s.text().split("\n")[-1]:
            break

In [19]:
# Settings handed to select_sglang_backend, built by hand as a Namespace
# instead of being parsed from the command line.
args = argparse.Namespace(
    # data_path="sglang/benchmark/tree_of_thought_v0/test.jsonl",
    # num_questions=2,
    port=10086,
    parallel=16,
    backend='srt',
    host="http://127.0.0.1",
    result_file="results.txt"
)

# q = """If $x^3$ is a positive factor of $10!,$ how many possible integer values of $x$ are there?  (Reminder: For a positive integer $n$, the expression $n!$ stands for the product of the integers from 1 up to (and including) $n$.)"""

# One argument dict per problem; only the first 100 examples are used.
arguments = [{"question": d['Question']} for d in examples[:100]]

# Select backend
backend = select_sglang_backend(args)

# Run requests (wall-clock time of the whole batch is kept in `latency`)
tic = time.time()
states = search_try.run_batch(
    arguments,
    temperature=0,
    backend=backend,
    num_threads=args.parallel,
    progress_bar=True,
)
latency = time.time() - tic

100%|██████████| 100/100 [01:15<00:00,  1.32it/s]


In [21]:
# for i, state in enumerate(states):
#     print(f"Question: {arguments[i]['question']}")
#     print(f"Answer: {state.text()}")

# Score the step-by-step generations: `states[i]` is paired by position with
# `examples[i]`, so `states` must come from a batch built from the leading
# slice of `examples`.
scores = 0
for i, state in enumerate(states):
    response_text = state.text()
    # Only the last line of the text is searched for the answer pattern.
    match = re.search(ANSWER_PATTERN, response_text.split("\n")[-1])
    extracted_answer = match.group(1) if match else None
    # A missing answer scores 0; otherwise math_evaluator.eq decides
    # (the printed outputs show it yielding True/False).
    answer = examples[i]["Answer"]
    score = 0 if extracted_answer is None else math_evaluator.eq(answer, extracted_answer)
    
    # One line per problem: extracted answer || ground truth || score
    print(f"{extracted_answer} || {answer} || {score}")
    scores += score
    
# Fraction of problems scored as correct.
print(scores/len(states))

$\left(3, \frac{\pi}{2}\right)$ || \left( 3, \frac{\pi}{2} \right) || True
None || p - q || 0
$\frac{14}{3}$ || \frac{14}{3} || True
9 || 9 || True
Evelyn || \text{Evelyn} || True
126 || 42 || False
9 || 27 || False
$20$ || 90^\circ || False
3√13 || 3\sqrt{13} || False
None || 4 || 0
None || 2220 || 0
$\frac{1}{8}$ || \frac{3}{56} || False
284 || 284 || True
$5$ || 5 || True
$10$ || \sqrt{51} || False
$2 - \frac{11 \sqrt{2}}{2} + ( -3 + 2 \sqrt{2} )i$ || 6 - 5i || False
-50 || -50 || True
$\pi$ || \pi || True
$292$ || 28 || False
None || 3 || 0
6 + 9i || 6+9i || True
None || 13535 || 0
None || 5 || 0
None || x=5 || 0
0.6 || 10 || False
$\boxed{\text{There are no solutions.}}$ || 1,-2 || False
20 || 144 || False
$78 || 78 || True
-2 + 7i || -2 + 7i || True
None || 225 || 0
$52_8$ || 52_8 || True
11$\sqrt{2}$ || 11\sqrt2 || True
None || 720 || 0
None || \frac{243}{625} || 0
-125 || -125 || True
3 || 3 || True
$2, 5$ || 3, 5, 7 || False
72 || 72 || True
2000 || 2000 || True
23 || 23 || Tr

In [None]:
import random

In [None]:
## Try beam search

import sglang as sgl
# Upper bound on the number of expansion rounds per problem in beam_search.
max_steps = 30

# BEAM_SIZE: number of continuations sampled for the first step, and the
#            target number of candidates produced by each expansion round.
# BEAM_WIDTH: number of candidates kept (picked at random) before each expansion.
BEAM_SIZE = 4
BEAM_WIDTH = 2
assert BEAM_SIZE % BEAM_WIDTH == 0

@sgl.function
def beam_search(s, question):
    """Step-wise search with random pruning.

    ``BEAM_SIZE`` continuations are sampled for the first step. In each round
    the candidates are first checked for a finished answer (last line
    contains "Answer:"); if none is finished, ``BEAM_WIDTH`` candidates are
    picked at random and each is forked so that roughly ``BEAM_SIZE`` new
    candidates are generated. At most ``max_steps`` expansion rounds are run.

    Returns:
        The list of candidate states whose last line contains "Answer:"
        (empty if no candidate finished within the step budget).
    """
    # Local import so the function does not depend on a separate import cell
    # having been executed.
    import random

    s += sgl.user(
        f"""Solve the following math problem step by step. Steps should be separated with two new lines. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
        
{question}

Remember to separate steps with two new lines, and finally put your answer on its own line after "Answer:", and you do not need to use a \\boxed command."""
    )

    s += sgl.assistant_begin()
    forks = s.fork(BEAM_SIZE)
    forks += sgl.gen(max_tokens=256, stop=["\n\n"], temperature=0.5)
    cur_states = list(forks)

    def finished(state):
        # A candidate is done once its last line carries the answer marker.
        return "Answer:" in state.text().split("\n")[-1]

    answer_states = []

    for _ in range(max_steps):
        # Check every candidate *before* pruning, so a finished answer is
        # never thrown away by the random selection below.
        answer_states = [state for state in cur_states if finished(state)]
        if answer_states or not cur_states:
            break

        # Randomly keep BEAM_WIDTH candidates.
        cur_beam_width = min(BEAM_WIDTH, len(cur_states))
        kept_states = random.sample(cur_states, cur_beam_width)

        # Expand back to (about) BEAM_SIZE candidates: ceil(BEAM_SIZE / width) forks each.
        forks_per_state = (BEAM_SIZE - 1) // cur_beam_width + 1
        cur_states = []
        for state in kept_states:
            forked_states = state.fork(forks_per_state)
            forked_states += "\n\n" + sgl.gen(max_tokens=256, stop=["\n\n"], temperature=0.5)
            cur_states.extend(forked_states)
    else:
        # Step budget exhausted: the candidates from the final round have
        # not been checked yet.
        answer_states = [state for state in cur_states if finished(state)]

    return answer_states


In [25]:
# Run beam_search on the first 10 problems.
# NOTE(review): temperature=0 is passed here while every gen call inside
# beam_search sets temperature=0.5; presumably the per-call value wins — confirm.
bs_states = beam_search.run_batch(
    arguments[:10],
    temperature=0,
    backend=backend,
    num_threads=args.parallel,
    progress_bar=True,
)

100%|██████████| 10/10 [00:35<00:00,  3.54s/it]


In [None]:
# Score the beam-search results. `bs_states[i]` is paired by position with
# `examples[i]`; problems for which beam_search returned no answer state
# contribute 0 to the total.
scores = 0

# The loop variable is deliberately NOT called `states`: that name holds the
# list of search_try results that other scoring cells iterate over.
for i, bs_state in enumerate(bs_states):
    if len(bs_state.ret_value) == 0:
        continue

    # Only the first returned answer state is scored.
    response_text = bs_state.ret_value[0].text()
    match = re.search(ANSWER_PATTERN, response_text.split("\n")[-1])
    extracted_answer = match.group(1) if match else None
    answer = examples[i]["Answer"]
    score = 0 if extracted_answer is None else math_evaluator.eq(answer, extracted_answer)

    print(f"{extracted_answer} || {answer} || {score}")
    scores += score

print(scores/len(bs_states))

(3, \frac{\pi}{2}) || \left( 3, \frac{\pi}{2} \right) || True
p q (1 - 3 p^2 + 3 p r) - s p^3 || p - q || False
$\frac{14}{3}$ || \frac{14}{3} || True
9 || 9 || True
Angela || \text{Evelyn} || False
42 || 42 || True
27 || 27 || True
98.04 || 90^\circ || False
$3\sqrt{13}$ || 3\sqrt{13} || True
32 || 4 || False
0.6


In [29]:
# Display the first beam-search result (the state of the first problem's program).
bs_states[0]

ProgramState(<|start_header_id|>user<|end_header_id|>

Solve the following math problem step by step. Steps should be separated with two new lines. The last line of your response should be of the form Answer: $ANSWER (without quotes) where $ANSWER is the answer to the problem.
        
Convert the point $(0,3)$ in rectangular coordinates to polar coordinates.  Enter your answer in the form $(r,\theta),$ where $r > 0$ and $0 \le \theta < 2 \pi.$

Remember to separate steps with two new lines, and finally put your answer on its own line after "Answer:", and you do not need to use a \boxed command.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

)

In [None]:
# Re-run search_try on the first 100 problems. This overwrites the
# notebook-level `arguments`, `backend`, `states` and `latency`.
arguments = [{"question": d['Question']} for d in examples[:100]]

# Select backend
backend = select_sglang_backend(args)

# Run requests (wall-clock time of the whole batch is kept in `latency`)
tic = time.time()
states = search_try.run_batch(
    arguments,
    temperature=0,
    backend=backend,
    num_threads=args.parallel,
    progress_bar=True,
)
latency = time.time() - tic

100%|██████████| 100/100 [01:48<00:00,  1.08s/it]


In [19]:
# for i, state in enumerate(states):
#     print(f"Question: {arguments[i]['question']}")
#     print(f"Answer: {state.text()}")

# Score the step-by-step generations: `states[i]` is paired by position with
# `examples[i]`.
# NOTE(review): this cell repeats the earlier scoring loop verbatim; a shared
# helper function would avoid the duplication.
scores = 0
for i, state in enumerate(states):
    response_text = state.text()
    # Only the last line of the text is searched for the answer pattern.
    match = re.search(ANSWER_PATTERN, response_text.split("\n")[-1])
    extracted_answer = match.group(1) if match else None
    # A missing answer scores 0; otherwise math_evaluator.eq decides.
    answer = examples[i]["Answer"]
    score = 0 if extracted_answer is None else math_evaluator.eq(answer, extracted_answer)
    
    # One line per problem: extracted answer || ground truth || score
    print(f"{extracted_answer} || {answer} || {score}")
    scores += score
    
# Fraction of problems scored as correct.
print(scores/len(states))

(3, \frac{\pi}{2}) || \left( 3, \frac{\pi}{2} \right) || True
None || p - q || 0
$\frac{14}{3}$ || \frac{14}{3} || True
9 || 9 || True
Angela || \text{Evelyn} || False
126 || 42 || False
None || 27 || 0
$20$ || 90^\circ || False
3√13 || 3\sqrt{13} || False
None || 4 || 0
None || 2220 || 0
125/168 || \frac{3}{56} || False
284 || 284 || True
$5$ || 5 || True
$10$ || \sqrt{51} || False
None || 6 - 5i || 0
-50 || -50 || True
$\pi$ || \pi || True
112 || 28 || False
None || 3 || 0
6 + 9i || 6+9i || True
None || 13535 || 0
None || 5 || 0
5 || x=5 || True
10 || 10 || True
$\boxed{}$ || 1,-2 || False
12 || 144 || False
$78 || 78 || True
-2 + 7i || -2 + 7i || True
112 || 225 || False
$2_8$ || 52_8 || False
11$\sqrt{2}$ || 11\sqrt2 || True
None || 720 || 0
None || \frac{243}{625} || 0
$-\frac{1}{32}$ || -125 || False
3 || 3 || True
$2, 5$ || 3, 5, 7 || False
360 || 72 || False
2000 || 2000 || True
23 || 23 || True
12 || 12 || True
17 || 17 || True
4 || 4 || True
None || 70 \sqrt{2} || 0
1.25 || 1