In [1]:
import ast
import json
import re
import time

import google.generativeai as genai
from datasets import load_dataset
from langchain_community.llms import Ollama
from requests import ReadTimeout
from tqdm import tqdm

import utils

# Setting llms
genai.configure(api_key=utils.get_GEMINI_API_KEY())
model = genai.GenerativeModel('gemini-pro')  # for optimizerllm
llm = Ollama(model="gemma:2b", temperature=0, num_gpu = 40, timeout=30)  # for scorerllm

# load gsm8k dataset
gsm8k_dataset = load_dataset("gsm8k", "main")
gsm8k_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

### Checker Function

In [2]:
def check(actual, expected):
    """
    Checkes if the actual answer is correct or not

    @param actual:      The actual answer
    @param expected:    The expected answer
    @return:            1 if the actual answer is correct, 0 otherwise
    """
    # Set up the model
    generation_config = {
        "temperature": 0.0,
    }

    prompt = """
        Compare the actual answer with the expected answer. 
        If the actual answer is correct, respond with "1". 
        If the actual answer is incorrect, respond with "0". 
        Respond with a json object with "answer" and "isCorrect" fields.

        Example Prompt:
        Actual answer: The answer is 42.
        Expected answer: 42

        Let's think step by step. 
        The actual answer is 42. The expected answer is also 42. Thus, the actual answer is correct!

        Response: 
        {{
            "answer": "The answer is 42.",
            "isCorrect": 1
        }}

        User Prompt:
        Actual answer: {actual}
        Expected answer: {expected}
    """

    response = model.generate_content(prompt.format(actual=actual, expected=expected.split("####")[-1].strip()), generation_config=generation_config)

    return ast.literal_eval(response.text)["isCorrect"]

def check_fast(actual, expected):
    """
    Faster and simpler version of the check function.
    """
    expected = expected.split("####")[-1].strip()
    return expected in actual

In [3]:
# # Check the check function
# total = 0
# max_tests = 10
# for i in tqdm(range(max_tests)):
#     res = check(gsm8k_dataset["train"]["answer"][i], gsm8k_dataset["train"]["answer"][i])
#     if not res:
#         print(gsm8k_dataset["train"]["answer"][i])
#     total += res

# total/max_tests * 100

### Optimizer

In [4]:
def opt_llm(instruction_score_pairs, training_sample):
    # Format the instruction and score pairs into a string
    pairs_str = ""
    for ins, score in instruction_score_pairs.items():
        pairs_str += f"text:\n{ins}\nscore:\n{score:.2f}\n\n"
    
    # Set up the model
    generation_config = {
        "temperature": 1.0,
    }

    prompt = """You are an optimization expert. The user has some texts along with their corresponding scores.
Your task is to generate a new piece of text that scores as high as possible. 
Generate the new unique text only, not its corresponding score.

I have some texts along with their corresponding scores. The texts are arranged in ascending order
based on their scores, where higher scores indicate better quality.

{pairs_str}

The following exemplars show how to apply your text: you replace <INS> in each input with your
text, then read the input and give an output. We say your output is wrong if your output is different
from the given output, and we say your output is correct if they are the same.

input:
Q: {q1}
A: <INS>
output:
{a1}

input:
Q: {q2}
A: <INS>
output:
{a2}

input:
Q: {q3}
A: <INS>
output:
{a3}

Write your new text that is different from the old ones and has a score as high as possible.
Generate the new unique text only, not its corresponding score.
New instruction:
"""

    # Passing 20 Best Instruction-Score Pairs
    # 3 randomly chosen questions/answers from the training sample
    cut = lambda x: x.split("####")[-1].strip()
    q1, q2, q3 = training_sample["question"][0], training_sample["question"][1], training_sample["question"][2]
    a1, a2, a3 = cut(training_sample["answer"][0]), cut(training_sample["answer"][1]), cut(training_sample["answer"][2])
    response = model.generate_content(prompt.format(pairs_str=pairs_str, q1=q1, q2=q2, q3=q3, a1=a1, a2=a2, a3=a3), 
                                      generation_config=generation_config)
    return response.text



### Scorer

In [5]:
def lm(prompt):
    temp = llm.timeout
    while True:
        try:
            res = llm.invoke(prompt)
            break
        except ReadTimeout:
            if llm.timeout > 120:
                print(f"Inference lasted for {llm.timeout} seconds. Stopping now.")
                break
            llm.timeout *= 2
            print(f"### ReadTimeout. Trying again with Timeout: {llm.timeout} seconds ###")
        except Exception as e:
            print(f"### {e} ###")
            break
    llm.timeout = temp
    return res
    
def scorer_lm(instruction, training_sample):
    # Scoring an instruction using an LM
    accuracy = 0
    prompt = """
        Q: {question}
        A: {instruction}
    """
    for i in range(len(training_sample)):
        res = lm(prompt.format(question=training_sample["question"][i], instruction=instruction))
        accuracy += check_fast(res, training_sample["answer"][i])

    accuracy = accuracy / len(training_sample) * 100

    return accuracy

### Combining the Optimizer with the Scorer

In [6]:
import os
import json

# Sample 3.5% of the training set
INS_PER_STEP = 8
EXEMPLARS_PER_STEP = 3
MAX_INS_SCORE_PAIRS = 20
SAMPLE_PERCENTAGE = 0.35/100
training_sample = gsm8k_dataset["train"].shuffle(seed=42).select(range(int(len(gsm8k_dataset["train"]) * SAMPLE_PERCENTAGE)))

# loading saved data
if os.path.exists("opro.json"):
    with open("opro.json", "r") as f:
        opro = json.load(f)
    step = len(opro) + 1
    ins_score_pairs = opro[str(len(opro))]
else:
    ins_lst = ["Solve this problem", 
               "Let's think step by step", 
                "Let’s solve this problem by splitting it into steps", 
                "Let’s think about this logically", 
                "Take a deep breath and think through this", 
                "Break this down"]
    ins_score_pairs = {ins:scorer_lm(ins, training_sample) for ins in tqdm(ins_lst)}
    step = 1
    opro = {step: ins_score_pairs}
    with open("opro.json", "w") as f:
        json.dump(opro, f)

# Each step takes aboy 5 to 10 minutes with gemma:2b
for i in range(step, step + 101):
    print(f"Step {i}")
    while True:
        try:
            # Optimizer LLM
            print("Optimizing instructions...")
            exemplars = training_sample.shuffle()[:EXEMPLARS_PER_STEP]
            instructions = [opt_llm(ins_score_pairs, exemplars) for _ in tqdm(range(INS_PER_STEP))]
            
            # Scoring the new instructions
            print("Scoring instructions...")
            new_ins_score_pairs = {ins: scorer_lm(ins, training_sample) for ins in tqdm(instructions)}
            combined_ins_score_pairs = {**ins_score_pairs, **new_ins_score_pairs}
            ins_score_pairs = dict(sorted(combined_ins_score_pairs.items(), key=lambda x: x[1], reverse=True)[:MAX_INS_SCORE_PAIRS])

            # Saving data
            opro[i] = ins_score_pairs
            with open("opro.json", "w") as f:
                json.dump(opro, f)
            
            break
        except ValueError as e:
            print(e)
        except Exception as e:
            print(e)

100%|██████████| 6/6 [04:30<00:00, 45.15s/it]


Step 1
Optimizing instructions...


100%|██████████| 8/8 [00:21<00:00,  2.65s/it]


Scoring instructions...


100%|██████████| 8/8 [06:49<00:00, 51.13s/it]


Step 2
Optimizing instructions...


100%|██████████| 8/8 [00:17<00:00,  2.21s/it]


Scoring instructions...


100%|██████████| 8/8 [07:39<00:00, 57.38s/it]


Step 3
Optimizing instructions...


100%|██████████| 8/8 [00:25<00:00,  3.19s/it]


Scoring instructions...


100%|██████████| 8/8 [08:29<00:00, 63.63s/it]


Step 4
Optimizing instructions...


100%|██████████| 8/8 [00:29<00:00,  3.66s/it]


Scoring instructions...


100%|██████████| 8/8 [09:11<00:00, 68.90s/it]


Step 5
Optimizing instructions...


100%|██████████| 8/8 [00:28<00:00,  3.62s/it]


Scoring instructions...


100%|██████████| 8/8 [10:17<00:00, 77.18s/it]


Step 6
Optimizing instructions...


100%|██████████| 8/8 [00:29<00:00,  3.69s/it]


Scoring instructions...


100%|██████████| 8/8 [11:13<00:00, 84.19s/it]


Step 7
Optimizing instructions...


100%|██████████| 8/8 [00:26<00:00,  3.32s/it]


Scoring instructions...


100%|██████████| 8/8 [11:15<00:00, 84.45s/it] 


Step 8
Optimizing instructions...


100%|██████████| 8/8 [00:23<00:00,  2.97s/it]


Scoring instructions...


100%|██████████| 8/8 [10:03<00:00, 75.38s/it]


Step 9
Optimizing instructions...


100%|██████████| 8/8 [00:41<00:00,  5.23s/it]


Scoring instructions...


100%|██████████| 8/8 [11:14<00:00, 84.26s/it]


Step 10
Optimizing instructions...


100%|██████████| 8/8 [00:27<00:00,  3.39s/it]


Scoring instructions...


100%|██████████| 8/8 [11:13<00:00, 84.16s/it]


Step 11
Optimizing instructions...


100%|██████████| 8/8 [00:25<00:00,  3.14s/it]


Scoring instructions...


100%|██████████| 8/8 [09:52<00:00, 74.07s/it]


Step 12
Optimizing instructions...


100%|██████████| 8/8 [00:35<00:00,  4.40s/it]


Scoring instructions...


100%|██████████| 8/8 [10:29<00:00, 78.66s/it]


Step 13
Optimizing instructions...


100%|██████████| 8/8 [00:27<00:00,  3.38s/it]


Scoring instructions...


100%|██████████| 8/8 [11:13<00:00, 84.22s/it]


Step 14
Optimizing instructions...


100%|██████████| 8/8 [00:35<00:00,  4.39s/it]


Scoring instructions...


100%|██████████| 8/8 [10:20<00:00, 77.52s/it]


Step 15
Optimizing instructions...


100%|██████████| 8/8 [00:24<00:00,  3.07s/it]


Scoring instructions...


100%|██████████| 8/8 [10:52<00:00, 81.59s/it]


Step 16
Optimizing instructions...


100%|██████████| 8/8 [00:30<00:00,  3.80s/it]


Scoring instructions...


100%|██████████| 8/8 [10:46<00:00, 80.75s/it]


Step 17
Optimizing instructions...


100%|██████████| 8/8 [00:25<00:00,  3.25s/it]


Scoring instructions...


100%|██████████| 8/8 [10:28<00:00, 78.53s/it]


Step 18
Optimizing instructions...


100%|██████████| 8/8 [00:29<00:00,  3.68s/it]


Scoring instructions...


100%|██████████| 8/8 [10:34<00:00, 79.29s/it]


Step 19
Optimizing instructions...


100%|██████████| 8/8 [00:24<00:00,  3.04s/it]


Scoring instructions...


100%|██████████| 8/8 [10:16<00:00, 77.12s/it]


Step 20
Optimizing instructions...


100%|██████████| 8/8 [00:31<00:00,  3.91s/it]


Scoring instructions...


100%|██████████| 8/8 [10:52<00:00, 81.54s/it]


Step 21
Optimizing instructions...


100%|██████████| 8/8 [00:32<00:00,  4.12s/it]


Scoring instructions...


 25%|██▌       | 2/8 [02:49<08:25, 84.25s/it]

KeyboardInterrupt: 

In [None]:
# Display the instruction -> score pairs currently held in memory.
ins_score_pairs