#### Set environment variables in [.env](.env) for LLM API calling

### Import Dependencies

In [None]:
import sys
sys.path.insert(0, "../")
import promptwizard
import random
from promptwizard.glue.promptopt.instantiate import GluePromptOpt
from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing
from promptwizard.glue.common.utils.file import save_jsonlist
from typing import Any
from tqdm import tqdm
import json
import os
from azure.identity import get_bearer_token_provider, AzureCliCredential
from openai import AzureOpenAI
from openai import OpenAI

from dotenv import load_dotenv
load_dotenv(override = True)


True

### The code below can be used for LLM-as-a-judge evaluation

In [2]:
def extract_between(start, end, text):
    """
    Return the part of 'text' enclosed by the first 'start' marker and the
    next 'end' marker that follows it.

    Parameters:
    - start (str): Opening delimiter.
    - end (str): Closing delimiter; only searched for after 'start'.
    - text (str): The text to scan.

    Returns:
    - str: The enclosed substring, or '' when either delimiter is missing.
    """
    try:
        # str.index raises ValueError in exactly the cases where str.find returns -1.
        begin = text.index(start) + len(start)
        stop = text.index(end, begin)
    except ValueError:
        return ''
    return text[begin:stop]

def call_api(messages):
    """
    Send a chat-completion request and return the text of the first choice.

    The backend is selected by the USE_OPENAI_API_KEY environment variable:
    the exact string "True" uses the OpenAI client with an API key; any other
    value uses Azure OpenAI, authenticated through the Azure CLI credential.

    Parameters:
    - messages (list): Chat messages passed unchanged to `chat.completions.create`.

    Returns:
    - The `content` of the first returned choice's message.
    """
    if os.environ['USE_OPENAI_API_KEY'] == "True":
        llm_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
        model_name = os.environ["OPENAI_MODEL_NAME"]
    else:
        # Build the Entra ID token provider first, then the Azure client that uses it.
        aad_token_provider = get_bearer_token_provider(
            AzureCliCredential(), "https://cognitiveservices.azure.com/.default"
        )
        llm_client = AzureOpenAI(
            api_version=os.environ["OPENAI_API_VERSION"],
            azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
            azure_ad_token_provider=aad_token_provider,
        )
        model_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]

    # Both backends expose the same call; temperature 0.0 is used for either one.
    completion = llm_client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.0,
    )
    return completion.choices[0].message.content

def llm_eval(predicted_answer,gt_answer):
    """
    Ask the LLM judge whether two answers mean the same thing.

    Parameters:
    - predicted_answer (str): Answer extracted from the model output.
    - gt_answer (str): Reference (ground-truth) answer.

    Returns:
    - bool: True only when the judge's verdict between <ANS_START> and <ANS_END>
      is "True" (ignoring surrounding whitespace and letter case); False for any
      other verdict, a missing tag pair, or an empty response.
    """
    EVAL_PROMPT = f"""Given the Predicted_Answer and Reference_Answer, compare them and check they mean the same.
                    If they mean the same then return True between <ANS_START> and <ANS_END> tags , 
                    If they differ in the meaning then return False between <ANS_START> and <ANS_END> tags 
                    Following are the given :
                    Predicted_Answer: {predicted_answer}
                    Reference_Answer: {gt_answer}"""
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": EVAL_PROMPT}
    ]

    response = call_api(messages)
    # Guard against a missing (None) response so the tag search gets a string.
    final_judgement = extract_between(start="<ANS_START>", end="<ANS_END>", text=response or "")
    # Judges often pad or re-case the verdict (" True ", "true", "True\n");
    # normalise before comparing so those still count as agreement.
    return final_judgement.strip().lower() == "true"

### Create a dataset-specific class and define the required functions

In [3]:

# Module-level switch read by BBH.access_answer:
# True  -> grade with the LLM judge (llm_eval);
# False -> case-insensitive exact string match.
llm_as_judge_eval = True

class BBH(DatasetSpecificProcessing):
    """Dataset processor for BBH: writes question/answer records to JSONL and grades model outputs."""

    def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:
        """
        Convert the records in kwargs["dataset"] and save them to `dataset_jsonl`.

        Each record must provide 'question' and 'answer' keys. The answer is stored
        verbatim under both the answer-with-reason key and the final-answer key
        (no separate final-answer extraction is applied for this dataset).

        Parameters:
        - dataset_jsonl (str): Destination path handed to `save_jsonlist`.
        - dataset (keyword, iterable of dict): Records to convert.
        """
        # Wrap the dataset itself rather than an enumerate object: the index was
        # unused, and tqdm can only show a total for iterables that have a length.
        examples_set = [
            {
                DatasetSpecificProcessing.QUESTION_LITERAL: sample['question'],
                DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL: sample['answer'],
                DatasetSpecificProcessing.FINAL_ANSWER_LITERAL: sample['answer'],
            }
            for sample in tqdm(kwargs["dataset"], desc="Evaluating samples")
        ]

        save_jsonlist(dataset_jsonl, examples_set, "w")

    def extract_final_answer(self, answer: str) -> str:
        """Return the text between <ANS_START> and <ANS_END> in `answer`, or '' if the tags are absent."""
        return extract_between(text=answer, start="<ANS_START>", end="<ANS_END>")

    def access_answer(self, llm_output: str, gt_answer: str):
        """
        Grade one model output against the reference answer.

        Parameters:
        - llm_output (str): Raw model output containing the tagged final answer.
        - gt_answer (str): Reference answer.

        Returns:
        - tuple: (is_correct, predicted_answer), where predicted_answer is the
          tagged text exactly as extracted from `llm_output`.
        """
        # Extraction is the same for both grading modes, so do it once.
        predicted_answer = self.extract_final_answer(llm_output)

        if llm_as_judge_eval:
            is_correct = bool(llm_eval(predicted_answer, gt_answer))
        else:
            # Compare ignoring case and surrounding whitespace, so "<ANS_START> (B) <ANS_END>"
            # still matches a reference of "(B)". An empty extraction is never correct.
            is_correct = bool(predicted_answer) and (
                predicted_answer.strip().lower() == gt_answer.strip().lower()
            )

        return is_correct, predicted_answer

In [4]:
# Single processor instance: used below to write the JSONL splits and passed to GluePromptOpt.
bbh_processor = BBH()

### Load and save the dataset
Set the ```dataset_to_run``` variable to choose 1 among the 19 datasets of BBII to run the optimization on

In [5]:
# Step 1: Choose the dataset and prepare its output directory
dataset_list = ['hyperbaton']
dataset_to_run = 'hyperbaton'

# Fail fast: otherwise the loop below would skip every dataset and write nothing,
# leaving later cells to read missing or stale train/test files.
if dataset_to_run not in dataset_list:
    raise ValueError(f"dataset_to_run={dataset_to_run!r} is not in dataset_list={dataset_list!r}")

# makedirs creates "data" and "data/<dataset>" in one call and is a no-op if they already exist.
os.makedirs(f"data/{dataset_to_run}", exist_ok=True)

# Number of shuffled examples kept for training; the remainder becomes the test split.
TRAIN_SIZE = 25

# Step 2: Dataset source
# The loader below reads ../BIG-Bench-Hard/bbh/<dataset>.json, so that checkout must already exist.
# NOTE(review): the commented-out clone below targets google-deepmind/bbeh, which does not
# match the path that is actually read — confirm the intended source before re-enabling it.
#os.system("git clone https://github.com/google-deepmind/bbeh")


def format_data(samples):
    """Map raw records with "input"/"target" fields to {"question", "answer"} dicts of strings."""
    return [
        {"question": f"{sample['input']}", "answer": f"{sample['target']}"}
        for sample in samples
    ]


# Seed the global RNG so the shuffle, and therefore the train/test split, is reproducible.
random.seed(42)

# Step 3: Process the selected dataset
for dataset in dataset_list:
    if dataset != dataset_to_run:
        continue

    file_path = f'../BIG-Bench-Hard/bbh/{dataset}.json'
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    examples = data['examples']
    # Sampling the full length returns a shuffled copy and leaves `examples` untouched.
    shuffled = random.sample(examples, len(examples))

    # Split into train and test
    train_samples = shuffled[:TRAIN_SIZE]
    test_samples = shuffled[TRAIN_SIZE:]

    train_data = format_data(train_samples)
    test_data = format_data(test_samples)

    # Use bbh_processor to write JSONL files
    bbh_processor.dataset_to_jsonl(f"data/{dataset}/train.jsonl", dataset=train_data)
    bbh_processor.dataset_to_jsonl(f"data/{dataset}/test.jsonl", dataset=test_data)

# Step 4: Clean up (disabled)
#os.system("rm -r BIG-Bench-Hard")



Evaluating samples: 25it [00:00, 51858.36it/s]
Evaluating samples: 225it [00:00, 407653.74it/s]


### Set paths

In [6]:
# Train/test JSONL files written by the dataset-preparation cell.
dataset_dir = f"data/{dataset_to_run}"
train_file_name, test_file_name = (
    os.path.join(dataset_dir, split_file) for split_file in ("train.jsonl", "test.jsonl")
)

# YAML configuration files for the heuristic technique.
path_to_config = "configs"
llm_config_path, promptopt_config_path, setup_config_path = (
    os.path.join(path_to_config, f"heuristic/{config_stem}.yaml")
    for config_stem in ("llm_config", "promptopt_config", "setup_config")
)

### Create an object for calling prompt optimization and inference functionalities

In [7]:
# Build the optimizer from the prompt-optimization config, the setup config,
# the training JSONL file and the dataset processor defined above.
gp = GluePromptOpt(promptopt_config_path,
                   setup_config_path,
                   train_file_name,
                   bbh_processor)

Setup configurations parameters: [('assistant_llm', AssistantLLM(prompt_opt='gpt-4o')), ('description', None), ('dir_info', Dir(base_dir='logs', log_dir_name='glue_logs')), ('experiment_name', 'bbh'), ('mode', 'offline')] 


Prompt Optimization parameters: [('answer_format', 'Wrap the final answer between <ANS_START> and <ANS_END>'), ('few_shot_count', 3), ('prompt_technique_name', 'heuristic'), ('seen_set_size', 25), ('validation_round', 7)] 




### Load PromptWizard Prompt

In [8]:
import pickle

# Load the pickled prompt object stored at results/promptwizard.pkl.
# NOTE: pickle.load can execute arbitrary code while deserializing — only open
# a results file you produced yourself or otherwise trust.
with open("results/promptwizard.pkl", "rb") as f:
    promptwizard_prompt = pickle.load(f)

print("promptwizard_prompt:", promptwizard_prompt)


promptwizard_prompt:  Develop a systematic approach for analyzing the colors of objects on a surface by considering the spatial arrangement of objects relative to each other. Clearly define color identification methods and criteria to ensure objective interpretations. Address how to accurately handle scenarios involving multiple objects of the same color to enhance problem-solving accuracy. 

[Question] On the floor, you see several things arranged in a row: a blue crayon, a purple stress ball, and a burgundy dog leash. What is the color of the right-most thing?
Options:
(A) red
(B) orange
(C) yellow
(D) green
(E) blue
(F) brown
(G) magenta
(H) fuchsia
(I) mauve
(J) teal
(K) turquoise
(L) burgundy
(M) silver
(N) gold
(O) black
(P) grey
(Q) purple
(R) pink
[Answer] 1. Identify the objects on the floor: a blue crayon, a purple stress ball, and a burgundy dog leash.
2. Determine the spatial arrangement of the objects in a row.
3. Establish the position of the right-most object in the row.

### Use MPIR to embed heuristics into PromptWizard.

In [9]:
# Pass the loaded PromptWizard prompt through gp.improve_prompt and keep the result
# for display and evaluation in the following cells.
MPIR_prompt = gp.improve_prompt(promptwizard_prompt)


evaluation:  1. Role Prompting – 2/5  
- Strength: The prompt assigns a task that involves analyzing colors and spatial arrangements, which is relevant to the task of identifying object colors.  
- Improvement: The prompt does not explicitly assign a role to guide the tone, accuracy, and perspective, such as "color analyst" or "visual data interpreter," which could help in setting a more precise context.  
- Rationale: While the task is clear, the lack of a defined role leaves the tone and perspective open to interpretation, which may affect the consistency and accuracy of responses.

2. Step Back – 3/5  
- Strength: The prompt attempts to address key concepts such as color identification and spatial arrangement.  
- Improvement: The key concepts are not explicitly extracted or stated, which could help in understanding the underlying principles more clearly.  
- Rationale: The prompt hints at important concepts but does not clearly articulate them, which could lead to varied interpreta

In [10]:
# print() shows the prompt with its line breaks rendered; a bare expression that
# is not the last line of a cell displays nothing, so it was dropped.
print(MPIR_prompt)



### Task Overview  
You are a **Color Analyst** tasked with developing a systematic approach for analyzing the colors of objects on a surface by considering the spatial arrangement of objects relative to each other. Your goal is to ensure objective interpretations by clearly defining color identification methods and criteria. Additionally, address how to accurately handle scenarios involving multiple objects of the same color to enhance problem-solving accuracy.

### Key Concepts  
- **Color Identification**: Methods and criteria for objectively determining the color of objects.
- **Spatial Arrangement**: Understanding the relative positions of objects to each other.
- **Multiple Objects of Same Color**: Strategies for distinguishing between objects of the same color.

### Instructions  
1. **Identify Objects**: List all objects and their colors as described in the scenario.
2. **Determine Spatial Arrangement**: Understand the relative positions of the objects.
3. **Focus on Specific

### Evaluate the optimized prompt

In [11]:
# Store the improved prompt on the optimizer before evaluating
# (presumably gp.evaluate reads BEST_PROMPT — confirm against GluePromptOpt).
gp.BEST_PROMPT = MPIR_prompt

# Function call to evaluate the prompt
accuracy = gp.evaluate(test_file_name)


Evaluation started 

{'accuracy': '1/1 : 100.0%', 'predicted': '(B)', 'actual': '(B)', 'llm_output': '[Question] Which sentence has the correct adjective order:  \nOptions:  \n(A) purple new enormous drinking awful motorcycle  \n(B) awful enormous new purple drinking motorcycle  \n[Answer]  \n1. Examine the two sentences provided to determine the correct adjective order.  \n2. Identify the sequence of adjectives in each sentence:  \n   - Sentence (A): purple new enormous drinking awful motorcycle  \n   - Sentence (B): awful enormous new purple drinking motorcycle  \n3. Recall the standard order of adjectives in English, which is generally:  \n   - Quantity or number  \n   - Quality or opinion  \n   - Size  \n   - Age  \n   - Shape  \n   - Color  \n   - Proper adjective (often nationality, other place of origin, or material)  \n   - Purpose or qualifier  \n4. Analyze the adjective order in each sentence based on the standard sequence.  \n5. Compare the adjectives in each sentence with t