In [1]:
import ast
import json
import re

from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm

import utils

# Shared OpenAI client used by every cell below; the API key comes from the
# project-local `utils` helper rather than being written into the notebook.
client = OpenAI(
    api_key=utils.get_OPENAI_API_KEY_DJ(),
)

# Load the "main" configuration of GSM8K.
gsm8k_dataset = load_dataset(path="gsm8k", name="main")

# Last expression of the cell: shows the dataset summary as the cell output.
gsm8k_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [2]:
# Show one reference solution from the train split (row 10); the final answer
# is the text that follows the "####" marker on its last line.
print(gsm8k_dataset["train"][10]["answer"])

Let S be the number of people on the first hundred years’ ship.
The second hundred years’ ship had twice as many as the first, so it had 2S people.
The third hundred years’ ship had twice as many as the second, so it had 2 * 2S = <<2*2=4>>4S people.
All the ships had S + 2S + 4S = 7S = 847 people.
Thus, the ship that the monster ate in the first hundred years had S = 847 / 7 = <<847/7=121>>121 people on it.
#### 121


### Checker Function

In [3]:
def check(actual, expected):
    """
    Checks whether the actual answer is correct by asking the chat model to grade it.

    Only the text after the last "####" marker of `expected` (stripped) is sent
    to the model as the expected answer.

    @param actual:      The actual answer
    @param expected:    The expected answer; may be a full reference solution
                        whose last line is "#### <answer>"
    @return:            1 if the actual answer is judged correct, 0 otherwise
    @raise:             json.JSONDecodeError if the reply is not valid JSON,
                        KeyError if the reply has no "isCorrect" field
    """

    # This prompt is sent verbatim (it is never passed through str.format), so
    # the braces of the example JSON are written singly.
    system_prompt = """
        Compare the actual answer with the expected answer. 
        If the actual answer is correct, respond with "1". 
        If the actual answer is incorrect, respond with "0". 
        Respond with a json object with "answer" and "isCorrect" fields.

        Example Prompt:
        Actual answer: The answer is 42.
        Expected answer: 42

        Let's think step by step. 
        The actual answer is 42. The expected answer is also 42. Thus, the actual answer is correct!

        Response: {
            "answer": "The answer is 42.",
            "isCorrect": 1
        }
    """

    user_prompt = """
        Actual answer: {actual}
        Expected answer: {expected}
    """

    # Keep only the final answer that follows the "####" marker.
    expected_answer = expected.split("####")[-1].strip()

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": user_prompt.format(actual=actual, expected=expected_answer),
            },
        ],
        model="gpt-3.5-turbo-0125",
        max_tokens=4096,
        temperature=0,
        response_format={"type": "json_object"},
    )
    result = chat_completion.choices[0].message.content

    # The reply is requested as a JSON object, so parse it as JSON: a Python
    # literal parser rejects the JSON tokens true / false / null.
    verdict = json.loads(result)["isCorrect"]

    # Normalise the verdict to the documented 0/1 contract. A string such as
    # "0" would otherwise be truthy for callers that test `if not res`.
    if isinstance(verdict, str):
        verdict = verdict.strip().lower() in ("1", "true")
    return int(bool(verdict))

def check_fast(actual, expected):
    """
    Faster and simpler version of the check function (no model call).

    The expected answer is the stripped text after the last "####" marker of
    `expected`. When it is a number, `actual` is accepted only if it contains
    that value as a complete number: "5" no longer matches inside "15", and
    "1,200" or "18.00" in `actual` match an expected "1200" or "18".
    A non-numeric (or empty) expected answer keeps the plain substring test.

    @param actual:      The actual answer text
    @param expected:    The expected answer, optionally a full reference
                        solution ending in "#### <answer>"
    @return:            True if the expected answer is found, False otherwise
    """
    # Integer part with optional thousands separators, optional decimals; the
    # trailing look-ahead stops a match from ending in the middle of a digit run.
    number = r"-?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?!\d)"

    target = expected.split("####")[-1].strip()
    if re.fullmatch(number, target) is None:
        return target in actual

    def to_value(token):
        return float(token.replace(",", ""))

    wanted = to_value(target)
    # The look-behind keeps a match from starting inside a number and stops the
    # "-" of an expression such as "10-5" from being read as a minus sign.
    candidates = re.findall(r"(?<![\d.])" + number, actual)
    return any(to_value(token) == wanted for token in candidates)

In [4]:
# # Check the check function
# total = 0
# max_tests = 10
# for i in tqdm(range(max_tests)):
#     res = check(gsm8k_dataset["train"]["answer"][i], gsm8k_dataset["train"]["answer"][i])
#     if not res:
#         print(gsm8k_dataset["train"]["answer"][i])
#     total += res

# total/max_tests * 100

### Optimizer

In [5]:
def opt_llm(instruction_score_pairs, training_sample):
    """
    Asks the optimizer model for a new instruction, given past instructions
    with their scores and three question/answer exemplars.

    @param instruction_score_pairs: Previously scored instructions, either a
                                    mapping {instruction: score} or an iterable
                                    of (instruction, score) pairs; may be empty
    @param training_sample:         Object whose "question" and "answer" entries
                                    are sequences with at least three items;
                                    only the first three of each are used
    @return:                        The proposed instruction: the text between
                                    the first "[" and the last "]" of the reply,
                                    or the whole stripped reply when the reply
                                    has no such bracket pair
    """
    # A mapping has to be walked through .items(): iterating it directly yields
    # only the keys, and indexing a key string gives single characters.
    if hasattr(instruction_score_pairs, "items"):
        scored = list(instruction_score_pairs.items())
    else:
        scored = [(pair[0], pair[1]) for pair in instruction_score_pairs]

    # The prompt below states that the texts are in ascending score order.
    scored.sort(key=lambda pair: pair[1])
    pairs_str = "".join(f"input:\n{text}\nscore:\n{score}\n" for text, score in scored)

    system_prompt = """
        You are an optimization expert. The user has some texts along with their corresponding scores.
        Your task is to generate a new piece of text in square brackets that scores as high as possible.
    """

    user_prompt = """
        I have some texts along with their corresponding scores. The texts are arranged in ascending order
        based on their scores, where higher scores indicate better quality.

        {pairs_str}

        The following exemplars show how to apply your text: you replace <INS> in each input with your
        text, then read the input and give an output. We say your output is wrong if your output is different
        from the given output, and we say your output is correct if they are the same.
                
        input:
        Q: {q1}
        A: <INS>
        output:
        {a1}

        input:
        Q: {q2}
        A: <INS>
        output:
        {a2}

        input:
        Q: {q3}
        A: <INS>
        output:
        {a3}
        
        Write your new text that is different from the old ones and has a score as high as possible. Write the
        text in square brackets.
    """
    questions = training_sample["question"]
    answers = training_sample["answer"]
    q1, q2, q3 = questions[0], questions[1], questions[2]
    a1, a2, a3 = answers[0], answers[1], answers[2]

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt.format(pairs_str=pairs_str, q1=q1, q2=q2, q3=q3, a1=a1, a2=a2, a3=a3)},
        ],
        model="gpt-3.5-turbo-0125",
        max_tokens=4096,
        temperature=1,
    )
    result = chat_completion.choices[0].message.content or ""

    # Locate the bracketed text instead of blindly dropping the first and last
    # characters: the reply may carry surrounding words or whitespace, or no
    # brackets at all.
    start = result.find("[")
    end = result.rfind("]")
    if start != -1 and end > start:
        return result[start + 1:end].strip()
    return result.strip()

### Scorer

In [6]:
def scorer_lm(instruction, training_sample):
    """
    Scores an instruction: the percentage of sample questions for which the
    model's reply passes `check_fast` against the reference answer.

    One chat completion is requested per question, with the instruction placed
    after "A:" in the prompt.

    @param instruction:     The instruction text to evaluate
    @param training_sample: Object whose "question" and "answer" entries are
                            equally long sequences
    @return:                Accuracy as a percentage (0-100); 0.0 when the
                            sample has no questions
    """
    user_prompt = """
        Q: {question}
        A: {instruction}
    """

    # Read each column once instead of re-indexing it on every iteration, and
    # count rows from the column itself so a dict-of-lists sample works too.
    questions = training_sample["question"]
    answers = training_sample["answer"]
    total = len(questions)
    if total == 0:
        # Nothing to score; avoids dividing by zero below.
        return 0.0

    correct = 0
    for question, answer in tqdm(zip(questions, answers), total=total):
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": user_prompt.format(
                        question=question, instruction=instruction
                    ),
                },
            ],
            model="gpt-3.5-turbo-0125",
            max_tokens=4096,
            temperature=0,
        )
        # Treat a missing reply as an empty (hence wrong) answer.
        result = chat_completion.choices[0].message.content or ""
        correct += check_fast(result, answer)

    return correct / total * 100

### Combining the Optimizer with the Scorer

In [7]:
# Optimization settings.
# SAMPLE_PERCENTAGE is 0.35% of the training split (26 of the 7473 rows):
# every candidate instruction costs one model call per sampled question.
SEED = 42
INS_PER_STEP = 8
EXEMPLARS_PER_STEP = 3
MAX_INS_SCORE_PAIRS = 20
SAMPLE_PERCENTAGE = 0.35/100
alt_percentage = 0.05/100
training_sample = gsm8k_dataset["train"].shuffle(seed=SEED).select(range(int(len(gsm8k_dataset["train"]) * SAMPLE_PERCENTAGE)))
# ins_score_pairs = {"Let's think step by step to solve this problem.": scorer_lm("Let's think step by step to solve this problem.", training_sample)}
ins_score_pairs = {}

steps = 1
for i in tqdm(range(steps)):
    # Optimizer LLM
    # Seed the exemplar draw per step so a re-run picks the same exemplars.
    exemplars = training_sample.shuffle(seed=SEED + i)[:EXEMPLARS_PER_STEP]
    # Hand the optimizer explicit (instruction, score) pairs in ascending score
    # order, which is the order its prompt describes.
    scored_so_far = sorted(ins_score_pairs.items(), key=lambda x: x[1])
    instructions = [opt_llm(scored_so_far, exemplars) for _ in range(INS_PER_STEP)]

    # Scoring the new instructions
    # Score each distinct, not-yet-scored instruction once; duplicates would
    # repeat a full pass of model calls over the sample.
    new_ins_score_pairs = {
        ins: scorer_lm(ins, training_sample)
        for ins in dict.fromkeys(instructions)
        if ins not in ins_score_pairs
    }
    ins_score_pairs.update(new_ins_score_pairs)
    # Keep only the best MAX_INS_SCORE_PAIRS instructions, highest score first.
    ins_score_pairs = dict(sorted(ins_score_pairs.items(), key=lambda x: x[1], reverse=True)[:MAX_INS_SCORE_PAIRS])


100%|██████████| 26/26 [01:27<00:00,  3.38s/it]
100%|██████████| 26/26 [01:19<00:00,  3.04s/it]
100%|██████████| 26/26 [01:35<00:00,  3.67s/it]
100%|██████████| 26/26 [01:14<00:00,  2.87s/it]
100%|██████████| 26/26 [01:11<00:00,  2.74s/it]
100%|██████████| 26/26 [01:22<00:00,  3.16s/it]
100%|██████████| 26/26 [01:23<00:00,  3.20s/it]
100%|██████████| 26/26 [01:09<00:00,  2.66s/it]
100%|██████████| 1/1 [10:53<00:00, 653.50s/it]


In [8]:
ins_score_pairs

{'By implementing advanced optimization techniques, we can significantly enhance the efficiency and performance of the system, resulting in optimal outcomes.': 88.46153846153845,
 'The solution to this problem lies in optimizing the algorithm to efficiently calculate the total score.': 84.61538461538461,
 'By perfecting his aim, Thomas achieved a perfect stack of 100 blocks, impressing everyone with his precision and dedication.': 84.61538461538461,
 'Based on various scientific studies, experts have concluded that implementing regular exercise routines leads to numerous health benefits, such as improved cardiovascular health, increased energy levels, and enhanced mental well-being.]': 80.76923076923077,
 'I calculated the sum of the first 10 prime numbers which are 2, 3, 5, 7, 11, 13, 17, 19, 23, 29. The total sum is 129. This demonstrates the importance of prime numbers in mathematics.': 80.76923076923077,
 'To maximize your score, focus on providing clear, concise, and accurate info