#### Set environment variables in [.env](.env) for LLM API calling

### Import Dependencies

In [None]:
import sys
import os
sys.path.insert(0, "../")
import promptwizard
import random
from promptwizard.glue.promptopt.instantiate import GluePromptOpt
from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing
from promptwizard.glue.common.utils.file import save_jsonlist
from typing import Any
from tqdm import tqdm
import json
import os
from azure.identity import get_bearer_token_provider, AzureCliCredential
from openai import AzureOpenAI
from openai import OpenAI

from dotenv import load_dotenv
load_dotenv(override = True)


True

### The code below can be used for LLM-as-a-judge evaluation

In [2]:
def extract_between(start, end, text):
    """
    Return the part of 'text' enclosed by the first 'start' marker and the
    first 'end' marker that follows it.

    Parameters:
    - start (str): Marker that opens the region of interest.
    - end (str): Marker that closes the region of interest.
    - text (str): The text to search within.

    Returns:
    - str: The text strictly between the two markers, or '' when either
      marker is missing (the 'end' marker is only searched for after 'start').
    """
    try:
        # str.index raises ValueError where str.find would return -1, so a
        # single handler covers both "start missing" and "end missing".
        content_begin = text.index(start) + len(start)
        content_end = text.index(end, content_begin)
    except ValueError:
        return ''
    return text[content_begin:content_end]

def call_api(messages):
    """
    Send chat 'messages' to the configured LLM backend and return the reply text.

    The backend is selected by the USE_OPENAI_API_KEY environment variable:
    the string "True" selects the OpenAI client (OPENAI_API_KEY,
    OPENAI_MODEL_NAME); any other value selects Azure OpenAI
    (OPENAI_API_VERSION, AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_DEPLOYMENT_NAME)
    authenticated through AzureCliCredential.

    Parameters:
    - messages (list): Chat messages passed unchanged to chat.completions.create.

    Returns:
    - The 'content' of the first choice in the response.

    Raises:
    - KeyError: If a required environment variable is not set.
    """
    use_openai = os.environ['USE_OPENAI_API_KEY'] == "True"

    if use_openai:
        client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
        model_name = os.environ["OPENAI_MODEL_NAME"]
    else:
        token_provider = get_bearer_token_provider(
            AzureCliCredential(), "https://cognitiveservices.azure.com/.default"
        )
        client = AzureOpenAI(
            api_version=os.environ["OPENAI_API_VERSION"],
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            azure_ad_token_provider=token_provider,
        )
        model_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

    # Both clients are driven through the same chat-completions call, so the
    # request is issued once after the backend has been chosen.
    completion = client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.0,
    )
    return completion.choices[0].message.content

def llm_eval(predicted_answer,gt_answer):
    """
    Ask the LLM judge whether 'predicted_answer' means the same as 'gt_answer'.

    The judge is instructed to put its verdict between <ANS_START> and
    <ANS_END> tags; the verdict is read back with extract_between.

    Parameters:
    - predicted_answer: Answer produced by the model under evaluation.
    - gt_answer: Reference (ground-truth) answer.

    Returns:
    - bool: True only when the tagged verdict is "True", ignoring surrounding
      whitespace and letter case. A missing verdict, a missing tag pair or an
      empty reply all yield False.
    """
    EVAL_PROMPT = f"""Given the Predicted_Answer and Reference_Answer, compare them and check they mean the same.
                    If they mean the same then return True between <ANS_START> and <ANS_END> tags , 
                    If they differ in the meaning then return False between <ANS_START> and <ANS_END> tags 
                    Following are the given :
                    Predicted_Answer: {predicted_answer}
                    Reference_Answer: {gt_answer}"""
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": EVAL_PROMPT}
    ]

    # The chat API can return a reply without text content (None); treat that
    # as an empty reply instead of letting the tag search fail on None.
    response = call_api(messages) or ""
    final_judgement = extract_between(start="<ANS_START>", end="<ANS_END>", text=response)
    # Judges often pad the verdict with spaces/newlines or change its case
    # (" True", "true\n"); an exact comparison would score those as mismatches.
    return final_judgement.strip().lower() == "true"

### Create a dataset-specific class and define the required functions

In [3]:

# Scoring mode read by BBH.access_answer: True scores answers with the
# llm_eval judge, False uses a case-insensitive exact string match.
llm_as_judge_eval = True

class BBH(DatasetSpecificProcessing):
    """
    Dataset-specific processing for the question/answer samples used in this notebook.

    Writes samples to JSONL keyed by the DatasetSpecificProcessing literals,
    extracts the tagged final answer from model output, and scores it against
    the ground truth.
    """

    def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:
        """
        Convert kwargs["dataset"] to example records and write them with save_jsonlist.

        Parameters:
        - dataset_jsonl (str): Destination path of the JSONL file.
        - dataset (keyword, required): Iterable of dicts with 'question' and
          'answer' keys.

        Raises:
        - KeyError: If the 'dataset' keyword or a sample key is missing.
        """
        def extract_answer_from_output(completion):
            # The stored answer is already the final answer, so it is used as-is.
            return completion

        # Iterating the dataset directly (rather than through enumerate) lets
        # tqdm see its length and report a total when one is available.
        examples_set = [
            {
                DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],
                DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],
                DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: extract_answer_from_output(sample["answer"]),
            }
            for sample in tqdm(kwargs["dataset"], desc="Evaluating samples")
        ]

        save_jsonlist(dataset_jsonl, examples_set, "w")

    def extract_final_answer(self, answer: str):
        """Return the text between <ANS_START> and <ANS_END> in 'answer' ('' if absent)."""
        return extract_between(text=answer, start="<ANS_START>", end="<ANS_END>")

    def access_answer(self, llm_output: str, gt_answer: str):
        """
        Score one model output against the ground-truth answer.

        The scoring mode comes from the module-level flag llm_as_judge_eval:
        when truthy the llm_eval judge decides, otherwise a case-insensitive
        exact match is used.

        Parameters:
        - llm_output (str): Raw model output expected to contain the tagged answer.
        - gt_answer (str): Ground-truth answer.

        Returns:
        - tuple: (is_correct, predicted_answer), where predicted_answer is the
          extracted tagged answer ('' when no tags were found).
        """
        predicted_answer = self.extract_final_answer(llm_output)

        if not predicted_answer:
            # No tagged answer was produced. The exact-match mode already
            # rejects this case; doing the same here avoids spending a judge
            # call on an empty prediction.
            return False, predicted_answer

        if llm_as_judge_eval:
            is_correct = bool(llm_eval(predicted_answer, gt_answer))
        else:
            is_correct = predicted_answer.lower() == gt_answer.lower()

        return is_correct, predicted_answer

In [4]:
# Single processor instance: used below to write the JSONL splits and passed
# to GluePromptOpt.
bbh_processor = BBH()

### Load and save the dataset.
Set the ```dataset_to_run``` variable to choose 1 among the 19 datasets of BBH to run the optimization on

In [5]:
# Step 1: Prepare directories
dataset_list = ['reasoning_about_colored_objects']
dataset_to_run = 'reasoning_about_colored_objects'
# Number of shuffled examples that go to the train split; the rest form the test split.
num_train_samples = 25

# makedirs(..., exist_ok=True) creates "data" and the dataset sub-directory in
# one call and is safe to re-run when they already exist.
os.makedirs(f"data/{dataset_to_run}", exist_ok=True)


def format_data(samples):
    """Map raw examples with "input"/"target" keys to {"question", "answer"} dicts."""
    return [
        {"question": f"{sample['input']}", "answer": f"{sample['target']}"}
        for sample in samples
    ]


# Step 2: Clone the repo
# NOTE(review): Step 3 reads ../BIG-Bench-Hard/bbh/<dataset>.json, but the
# commented-out clone below targets a repository named "bbeh" -- confirm the
# intended data source before re-enabling it.
#os.system("git clone https://github.com/google-deepmind/bbeh")

# Seed once so the shuffle, and therefore the train/test split, is reproducible.
random.seed(42)

# Step 3: Process each dataset
for dataset in dataset_list:
    if dataset != dataset_to_run:
        continue

    file_path = f'../BIG-Bench-Hard/bbh/{dataset}.json'
    # Explicit encoding so the JSON is decoded the same way on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    examples = data['examples']
    # Sampling the full length yields a shuffled copy and leaves `examples` untouched.
    shuffled = random.sample(examples, len(examples))

    # Split into train and test
    train_samples = shuffled[:num_train_samples]
    test_samples = shuffled[num_train_samples:]

    train_data = format_data(train_samples)
    test_data = format_data(test_samples)

    # Use bbh_processor to write JSONL files
    bbh_processor.dataset_to_jsonl(f"data/{dataset}/train.jsonl", dataset=train_data)
    bbh_processor.dataset_to_jsonl(f"data/{dataset}/test.jsonl", dataset=test_data)

# Step 4: Clean up
#os.system("rm -r BIG-Bench-Hard")



Evaluating samples: 25it [00:00, 59679.91it/s]
Evaluating samples: 225it [00:00, 318823.78it/s]


### Set paths

In [6]:
# Train/test splits live in the per-dataset directory under "data".
dataset_dir = f"data/{dataset_to_run}"
train_file_name = os.path.join(dataset_dir, "train.jsonl")
test_file_name = os.path.join(dataset_dir, "test.jsonl")

# YAML configuration files, all resolved relative to `path_to_config`.
path_to_config = "configs"
llm_config_path = os.path.join(path_to_config, "promptwizard/llm_config.yaml")
promptopt_config_path = os.path.join(
    path_to_config,
    "promptwizard/promptopt_config_reasoning_about_colored_objects.yaml",
)
setup_config_path = os.path.join(path_to_config, "promptwizard/setup_config.yaml")

### Create an object for calling prompt optimization and inference functionalities

In [7]:
# Build the optimizer from the prompt-optimization config, the setup config,
# the training JSONL file and the dataset-specific processor.
gp = GluePromptOpt(
    promptopt_config_path,
    setup_config_path,
    train_file_name,
    bbh_processor,
)

Setup configurations parameters: [('assistant_llm', AssistantLLM(prompt_opt='gpt-4o')), ('description', None), ('dir_info', Dir(base_dir='logs', log_dir_name='glue_logs')), ('experiment_name', 'bbh'), ('mode', 'offline')] 


Prompt Optimization parameters: [('answer_format', 'Wrap the final answer between <ANS_START> and <ANS_END>'), ('base_instruction', "Let's think step by step."), ('few_shot_count', 3), ('generate_expert_identity', True), ('generate_intent_keywords', False), ('generate_reasoning', True), ('max_eval_batches', 6), ('min_correct_count', 3), ('mutate_refine_iterations', 3), ('mutation_rounds', 3), ('num_train_examples', 20), ('prompt_technique_name', 'critique_n_refine'), ('questions_batch_size', 1), ('refine_instruction', True), ('refine_task_eg_iterations', 3), ('seen_set_size', 25), ('style_variation', 5), ('task_description', 'Answer extremely simple questions about the colors of objects on a surface.'), ('top_n', 1), ('unique_model_id', 'gpt-4o')] 




### Call prompt optimization function
1. ```use_examples``` can be used when there are training samples and a mixture of real and synthetic in-context examples are required in the final prompt. When set to ```False``` all the in-context examples will be real
2. ```generate_synthetic_examples``` can be used when there are no training samples and we want to generate synthetic examples 
3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt 

In [8]:
# Function call to generate optimal prompt and expert profile
best_prompt = gp.get_best_prompt(
    use_examples=False,
    run_without_train_examples=False,
    generate_synthetic_examples=False,
)


Mutating Task Description....


Iterations completed:   0%|          | 0/3 [00:00<?, ?it/s]
 + Starting iteration: 1 
 current_base_instruction: Let's think step by step.
mutation_round=0 mutated_sample_prompt=You are given a task description and a prompt instruction and different styles known as meta prompts:
[Task Description]: Answer extremely simple questions about the colors of objects on a surface.
[Meta Prompt]: How could I devise an experiment to help solve that problem?
Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made.
How could I measure progress on this problem?
How can I simplify the problem so that it is easier to solve?
What are the key assumptions underlying this problem?
Now you need to generate 5 variations of following Instruction adaptively mixing meta prompt while keeping similar semantic meaning.
Make sure to wrap each generated prompt with <START> and <END>
[Prompt Instruction]: Let's think step by step.
[Generated Prompts


Refining Task description and Examples iteratively....


100%|██████████| 3/3 [00:05<00:00,  2.00s/it]



Generating CoT Reasoning for In-Context Examples....


100%|██████████| 3/3 [00:04<00:00,  1.53s/it]



Generating Expert Identity....


Expert Identity: You are a color expert with a specialization in visual perception and color theory. Your expertise lies in understanding how colors interact, how they are perceived by the human eye, and how they can be described accurately. You have a keen eye for distinguishing between different shades and hues, and you can easily identify and categorize colors based on their properties such as hue, saturation, and brightness. Your knowledge of color psychology and symbolism allows you to interpret the meanings and associations of different colors. You are well-equipped to answer extremely simple questions about the colors of objects on a surface, providing clear and concise explanations that are easy to understand for individuals seeking basic information about color perception. Your ability to communicate complex concepts in a simple and accessible manner makes you an ideal agent for addressing inquiries related to colors.
Final best prompt:  Develop a systematic approach for analy

### Save the optimized prompt and expert profile

In [9]:
# The first element of the get_best_prompt result is the optimized prompt text.
print(best_prompt[0])

 Develop a systematic approach for analyzing the colors of objects on a surface by considering the spatial arrangement of objects relative to each other. Clearly define color identification methods and criteria to ensure objective interpretations. Address how to accurately handle scenarios involving multiple objects of the same color to enhance problem-solving accuracy. 

[Question] On the floor, you see several things arranged in a row: a blue crayon, a purple stress ball, and a burgundy dog leash. What is the color of the right-most thing?
Options:
(A) red
(B) orange
(C) yellow
(D) green
(E) blue
(F) brown
(G) magenta
(H) fuchsia
(I) mauve
(J) teal
(K) turquoise
(L) burgundy
(M) silver
(N) gold
(O) black
(P) grey
(Q) purple
(R) pink
[Answer] 1. Identify the objects on the floor: a blue crayon, a purple stress ball, and a burgundy dog leash.
2. Determine the spatial arrangement of the objects in a row.
3. Establish the position of the right-most object in the row.
4. Focus on the colo

In [10]:
import pickle

# Create the output directory in-process instead of shelling out to `mkdir`:
# this works on every platform, raises on failure instead of being silently
# ignored, and is a no-op when the directory already exists.
os.makedirs("results", exist_ok=True)

# Persist only the prompt text (first element of `best_prompt`).
with open("results/promptwizard.pkl", 'wb') as f:
    pickle.dump(best_prompt[0], f)

print(f"Best prompt: {best_prompt[0]}")

Best prompt:  Develop a systematic approach for analyzing the colors of objects on a surface by considering the spatial arrangement of objects relative to each other. Clearly define color identification methods and criteria to ensure objective interpretations. Address how to accurately handle scenarios involving multiple objects of the same color to enhance problem-solving accuracy. 

[Question] On the floor, you see several things arranged in a row: a blue crayon, a purple stress ball, and a burgundy dog leash. What is the color of the right-most thing?
Options:
(A) red
(B) orange
(C) yellow
(D) green
(E) blue
(F) brown
(G) magenta
(H) fuchsia
(I) mauve
(J) teal
(K) turquoise
(L) burgundy
(M) silver
(N) gold
(O) black
(P) grey
(Q) purple
(R) pink
[Answer] 1. Identify the objects on the floor: a blue crayon, a purple stress ball, and a burgundy dog leash.
2. Determine the spatial arrangement of the objects in a row.
3. Establish the position of the right-most object in the row.
4. Focu

### Evaluate the optimized prompt

In [11]:
# Hand the optimized prompt text to the optimizer object before evaluating.
# NOTE(review): only best_prompt[0] is passed on; confirm whether gp.evaluate
# also needs the expert profile generated by get_best_prompt.
gp.BEST_PROMPT = best_prompt[0]

# Function call to evaluate the prompt
accuracy = gp.evaluate(test_file_name)

print(f"Final Accuracy: {accuracy}")

Evaluation started 

{'accuracy': '0/1 : 0.0%', 'predicted': '(C)', 'actual': '(B)', 'llm_output': '1. Identify the objects on the desk: a blue pen, a burgundy notebook, and a pink stress ball.\n2. Determine the spatial arrangement of the objects in a row.\n3. Establish the position of the burgundy notebook in the row.\n4. Focus on the objects to the left of the burgundy notebook.\n5. Count the number of non-brown things to the left of the burgundy notebook.\n6. Recall that the blue pen and the pink stress ball are mentioned as objects in the row.\n7. Identify the blue pen and the pink stress ball as the non-brown things to the left of the burgundy notebook.\n8. Confirm that there are two non-brown things to the left of the burgundy notebook.\n9. Choose option (C) "two" as the final answer. <ANS_START>(C)<ANS_END>', 'question': 'On the desk, you see the following things arranged in a row: a blue pen, a burgundy notebook, and a pink stress ball. How many non-brown things do you see to t

Final Accuracy: 0.6266666666666667
