### Import Dependencies

In [31]:
import sys
from pathlib import Path
sys.path.insert(0, str(Path().absolute().parent.parent / "PromptWizard"))

import promptwizard
from promptwizard.glue.promptopt.instantiate import GluePromptOpt
from promptwizard.glue.promptopt.techniques.common_logic import DatasetSpecificProcessing
from promptwizard.glue.common.utils.file import save_jsonlist
from typing import Any
from tqdm import tqdm
from re import compile, findall
import os
from datasets import load_dataset

from dotenv import load_dotenv
load_dotenv(override = True)

True

### Create a dataset specific class and define the required functions 

In [45]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full retrieval-results table (one JSON object per line).
df = pd.read_json("../../../data/results.jsonl", lines=True)

# Ground-truth judgment: default every row to "false", then mark as "true" the
# positive samples whose top-1 retrieved id equals their own question_id.
df['final_answer'] = "false"
positive = df['sample_class'] == 'positive'
df.loc[positive & (df['question_id'] == df['bge_retrieved_top1_id']), 'final_answer'] = "true"

# Keep only the rows tagged as the train split.
df_train = df[df['split'] == 'train'].reset_index(drop=True)

# Select the columns needed downstream and give the retrieved id a shorter name.
pw_df = df_train[['query_id', 'question_id', 'user_query', 'bge_retrieved_top1_id', 'final_answer']].copy()
pw_df = pw_df.rename(columns={'bge_retrieved_top1_id': 'retrieved_id'})

# Build an id -> {candidate_Q, candidate_A} lookup from the D1 passages file.
id_to_qa = {}
with open("../../../data/D1.jsonl", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        id_to_qa[obj['id']] = {'candidate_Q': obj['Q'], 'candidate_A': obj['A']}

# Attach the retrieved Q&A text; ids absent from D1 get the '[Missing]' placeholder.
pw_df['candidate_Q'] = pw_df['retrieved_id'].map(lambda x: id_to_qa.get(x, {}).get('candidate_Q', '[Missing]'))
pw_df['candidate_A'] = pw_df['retrieved_id'].map(lambda x: id_to_qa.get(x, {}).get('candidate_A', '[Missing]'))

# Single text field shown to the judge: user query plus the retrieved Q&A.
pw_df['question'] = (
    "User Query: " + pw_df['user_query'] + "\n" +
    "Retrieved Question: " + pw_df['candidate_Q'] + "\n" +
    "Retrieved Answer: " + pw_df['candidate_A']
)

# Final column order (formatted input first for clarity).
pw_df = pw_df[['question', 'query_id', 'question_id', 'user_query', 'retrieved_id',
               'candidate_Q', 'candidate_A', 'final_answer']]

# Draw fixed-size, seeded subsets: 50 rows for optimization, 30 held out.
train_pw, test_pw = train_test_split(pw_df, train_size=50, test_size=30, random_state=42, shuffle=True)
train_pw = train_pw.reset_index(drop=True)
test_pw = test_pw.reset_index(drop=True)

# Make sure the output directory exists; to_json does not create it and would
# otherwise fail on a fresh checkout.
os.makedirs("data", exist_ok=True)

# Save both subsets as JSON Lines.
train_pw.to_json("data/train.jsonl", orient="records", lines=True)
test_pw.to_json("data/test.jsonl", orient="records", lines=True)

print(f"Train: {len(train_pw)}, Test: {len(test_pw)}")

Train: 50, Test: 30


In [47]:
class PromptWizardProcessing(DatasetSpecificProcessing):
    """Dataset-specific hooks for the true/false retrieval-judgment task."""

    def dataset_to_jsonl(self, dataset_jsonl: str, **kwargs: Any) -> None:
        """Convert ``kwargs["dataset"]`` into question/answer records and save them.

        Args:
            dataset_jsonl: Destination path passed to ``save_jsonlist``.
            **kwargs: Must contain ``"dataset"``, an iterable of samples that are
                indexed by the keys ``"question"`` and ``"answer"``.

        Raises:
            KeyError: If ``"dataset"`` is not supplied in ``kwargs``.
        """
        def extract_answer_from_output(completion):
            # The stored answer is used as the final answer without post-processing.
            return completion

        # NOTE(review): the frames saved earlier in this notebook keep the label
        # under 'final_answer', not 'answer' -- confirm the expected sample schema
        # before feeding them to this method.
        examples_set = [
            {
                DatasetSpecificProcessing.QUESTION_LITERAL           : sample["question"],
                DatasetSpecificProcessing.ANSWER_WITH_REASON_LITERAL : sample["answer"],
                DatasetSpecificProcessing.FINAL_ANSWER_LITERAL        : extract_answer_from_output(sample["answer"])
            }
            for sample in tqdm(kwargs["dataset"], desc="Converting samples")
        ]
        save_jsonlist(dataset_jsonl, examples_set, "w")

    def extract_final_answer(self, answer: str):
        """Map a free-text judge reply onto ``"true"`` or ``"false"``.

        Matching is done on whole words, case-insensitively. Plain substring
        checks would misfire on ordinary text: "no" occurs inside "not", "know"
        and "cannot", and "yes" inside "eyes".

        Args:
            answer: Raw reply text; may be empty or ``None``.

        Returns:
            ``"true"`` if the reply contains the word "true" or "yes";
            otherwise ``"false"`` if it contains the word "false" or "no";
            otherwise ``self.INVALID_ANS`` (also returned for an empty reply).
        """
        if not answer:
            return self.INVALID_ANS
        # Split into alphabetic words so punctuation/markdown around the verdict
        # ("Yes.", "**No**") does not get in the way.
        words = set(findall(r"[a-z]+", answer.lower()))
        # Affirmative words take precedence when both kinds are present.
        if words & {"true", "yes"}:
            return "true"
        if words & {"false", "no"}:
            return "false"
        return self.INVALID_ANS

In [48]:
# Instantiate the dataset-specific processor; it is handed to GluePromptOpt below.
judge_processor = PromptWizardProcessing()

### Set paths

In [49]:
# Directory holding the two YAML configuration files.
path_to_config = "configs"

# Train/test splits live under ./data.
train_file_name, test_file_name = (
    os.path.join("data", split_file) for split_file in ("train.jsonl", "test.jsonl")
)

# Prompt-optimization settings and environment/setup settings, respectively.
promptopt_config_path, setup_config_path = (
    os.path.join(path_to_config, config_file)
    for config_file in ("promptopt_config.yaml", "setup_config.yaml")
)

### Create an object for calling prompt optimization and inference functionalities

In [50]:
# Build the prompt-optimization object from, in order: the prompt-optimization
# config path, the setup config path, the training JSONL path, and the
# dataset-specific processor instance.
gp = GluePromptOpt(promptopt_config_path,
                   setup_config_path,
                   train_file_name,
                   judge_processor)

Setup configurations parameters: [('assistant_llm', AssistantLLM(prompt_opt='gpt-4o-mini')), ('description', None), ('dir_info', Dir(base_dir='logs', log_dir_name='glue_logs')), ('experiment_name', 'retrieval_judgment'), ('mode', 'offline')] 


Setup configurations parameters: [('assistant_llm', AssistantLLM(prompt_opt='gpt-4o-mini')), ('description', None), ('dir_info', Dir(base_dir='logs', log_dir_name='glue_logs')), ('experiment_name', 'retrieval_judgment'), ('mode', 'offline')] 


Setup configurations parameters: [('assistant_llm', AssistantLLM(prompt_opt='gpt-4o-mini')), ('description', None), ('dir_info', Dir(base_dir='logs', log_dir_name='glue_logs')), ('experiment_name', 'retrieval_judgment'), ('mode', 'offline')] 


Setup configurations parameters: [('assistant_llm', AssistantLLM(prompt_opt='gpt-4o-mini')), ('description', None), ('dir_info', Dir(base_dir='logs', log_dir_name='glue_logs')), ('experiment_name', 'retrieval_judgment'), ('mode', 'offline')] 


Prompt Optimization 

### Call the prompt optimization function
1. ```use_examples``` can be used when training samples are available and a mixture of real and synthetic in-context examples is required in the final prompt. When set to ```False```, all the in-context examples will be real.
2. ```generate_synthetic_examples``` can be used when there are no training samples and synthetic examples should be generated.
3. ```run_without_train_examples``` can be used when there are no training samples and in-context examples are not required in the final prompt.

In [51]:
# Run the optimization and unpack its two results: the best prompt and the expert profile.
# use_examples=True relies on the training samples (the final prompt may mix real and
# synthetic in-context examples); the two no-training-data modes are switched off.
best_prompt, expert_profile = gp.get_best_prompt(use_examples=True,run_without_train_examples=False,generate_synthetic_examples=False)


Mutating Task Description....


Iterations completed:   0%|          | 0/3 [00:00<?, ?it/s]
 + Starting iteration: 1 
 current_base_instruction: Carefully judge the semantic and intention of both user query and the retrieved QA, decide whether the retrieved question exactly match with the user intent. Note that there is high compliance regulation, be caution with the decision.

 + Starting iteration: 1 
 current_base_instruction: Carefully judge the semantic and intention of both user query and the retrieved QA, decide whether the retrieved question exactly match with the user intent. Note that there is high compliance regulation, be caution with the decision.

 + Starting iteration: 1 
 current_base_instruction: Carefully judge the semantic and intention of both user query and the retrieved QA, decide whether the retrieved question exactly match with the user intent. Note that there is high compliance regulation, be caution with the decision.

 + Starting iteration: 1 
 current_base_instruction: Carefully judge the 

dataset_subset [{'question': 'User Query: I am trying to determine the normal yearly wage scale for a Concierge position within Diamond Resorts International?\nRetrieved Question: salary for a concierge with diamond international\nRetrieved Answer: The average Diamond Resorts International salary ranges from approximately $20,000 per year for Concierge. However, the average hourly pay for a Concierge at Diamond Resorts International ranges from approximately $9.00 per hour to $40.00 per hour.', 'query_id': '8628_p5', 'question_id': 8628, 'user_query': 'I am trying to determine the normal yearly wage scale for a Concierge position within Diamond Resorts International?', 'retrieved_id': 8628, 'candidate_Q': 'salary for a concierge with diamond international', 'candidate_A': 'The average Diamond Resorts International salary ranges from approximately $20,000 per year for Concierge. However, the average hourly pay for a Concierge at Diamond Resorts International ranges from approximately $9

prompt_score_list [["You are a strict service chatbot judge. Your task is to strictly determine whether a retrieved QA is the true response for the given user query. Answer only 'Yes' or 'No' without explanation.\nCarefully judge the semantic and intention of both user query and the retrieved QA, decide whether the retrieved question exactly match with the user intent. Note that there is high compliance regulation, be caution with the decision.", 1.0, [{'question': "User Query: What is the founding date of Ubisoft's branch in Quebec?\nRetrieved Question: When was Ubisoft Quebec founded?\nRetrieved Answer: Ubisoft Quebec was founded in 2005 in Quebec City, Quebec.", 'query_id': '5033_p0', 'question_id': 5033, 'user_query': "What is the founding date of Ubisoft's branch in Quebec?", 'retrieved_id': 5033, 'candidate_Q': 'When was Ubisoft Quebec founded?', 'candidate_A': 'Ubisoft Quebec was founded in 2005 in Quebec City, Quebec.', 'final_answer': 'true'}]], ['Carefully analyze the user qu

dataset_subset [{'question': 'User Query: Would focusing on preventing illness rather than responding to it enhance our health outcomes?\nRetrieved Question: Would we be healthier if we switched from a reactive healthcare system to preventative?\nRetrieved Answer: Switching from a reactive healthcare system to a preventative one could potentially lead to improved overall health and a reduction in healthcare costs. Prevention focuses on maintaining good health, preventing diseases and enhancing wellbeing by addressing underlying risk factors and promoting healthy behaviors . By investing in preventative care, individuals may experience fewer chronic diseases, reduced hospitalizations, and less need for advanced medical treatments . Preventative healthcare also reduces healthcare costs by preventing the onset or progression of diseases, which are typically more expensive to treat than to prevent . However, it is important to recognize that a balance between reactive and preventative heal

Prompt to get critique:
 I'm trying to write a prompt for zero-shot instruction task that will help the most capable and suitable agent to solve the task.
My current prompt is:
[CURRENT PROMPT] "You are a strict service chatbot judge. Your task is to strictly determine whether a retrieved QA is the true response for the given user query. Answer only 'Yes' or 'No' without explanation.
Carefully judge the semantic and intention of both user query and the retrieved QA, decide whether the retrieved question exactly match with the user intent. Note that there is high compliance regulation, be caution with the decision."
Now this prompt got the following examples correct:
[CORRECT EXAMPLES] 
[Question] User Query: What is the founding date of Ubisoft's branch in Quebec?
Retrieved Question: When was Ubisoft Quebec founded?
Retrieved Answer: Ubisoft Quebec was founded in 2005 in Quebec City, Quebec.
[Answer] true

Since you cant use these examples, analyse and understand characteristics/comple

dataset_subset [{'question': 'User Query: Whom does the legendary cartoon industry figure employing the artistic skills of Fred Carter represent?\nRetrieved Question: Who was an American cartoonist and publisher who had Fred Carter working for him?\nRetrieved Answer: Jack Thomas Chick was an American cartoonist and publisher who had Fred Carter working for him.', 'query_id': '6053_p4', 'question_id': 6053, 'user_query': 'Whom does the legendary cartoon industry figure employing the artistic skills of Fred Carter represent?', 'retrieved_id': 6053, 'candidate_Q': 'Who was an American cartoonist and publisher who had Fred Carter working for him?', 'candidate_A': 'Jack Thomas Chick was an American cartoonist and publisher who had Fred Carter working for him.', 'final_answer': 'true'}]
critique_example_set, correct_count
[] 1
dataset_subset [{'question': 'User Query: How should painted turtles be fed while living in captivity?\nRetrieved Question: what do painted turtles eat in captivity\nR

prompt_score_list [["  \nYou are a strict service chatbot judge. Your task is to determine whether the retrieved QA pair fully addresses the user's query with semantic equivalence. Follow these guidelines carefully:\n\n1. **Match the user’s intent**: Compare the **user query**, the **retrieved question**, and the **retrieved answer**. Ensure that the meaning is preserved, even if the wording differs slightly.\n2. **Full satisfaction**: The retrieved answer must **completely satisfy** the user’s request. If the answer partially addresses the query or includes irrelevant information, answer 'No'.\n3. **Be cautious with your decision**: If unsure, answer 'No' to ensure safety and compliance with high regulation standards.\n4. **Answer format**: Only reply with 'Yes' or 'No'. Do **not provide explanations**.\n\nYour decision should be based purely on whether the retrieved QA pair matches the user's intent.  \n", 1.0, [{'question': 'User Query: How should painted turtles be fed while living

dataset_subset [{'question': 'User Query: At what amount does a recent nursing graduate get paid as an RN?\nRetrieved Question: what wage would a rn nurse start at\nRetrieved Answer: The starting salary for a registered nurse (RN) can range from $28,000 to $50,000 annually, or from $16.50 to $26.00 per hour. However, the starting salary can vary based on location, experience, and other factors.', 'query_id': '9734_p1', 'question_id': 9734, 'user_query': 'At what amount does a recent nursing graduate get paid as an RN?', 'retrieved_id': 9734, 'candidate_Q': 'what wage would a rn nurse start at', 'candidate_A': 'The starting salary for a registered nurse (RN) can range from $28,000 to $50,000 annually, or from $16.50 to $26.00 per hour. However, the starting salary can vary based on location, experience, and other factors.', 'final_answer': 'true'}]
critique_example_set, correct_count
[] 3
Loop completed


mutation_round=0 mutated_sample_prompt=You are given a task description and a prompt instruction and different styles known as meta prompts:
[Task Description]: You are a strict service chatbot judge. Your task is to strictly determine whether a retrieved QA is the true response for the given user query. Answer only 'Yes' or 'No' without explanation.
[Meta Prompt]: How could I devise an experiment to help solve that problem?
Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made.
How could I measure progress on this problem?
How can I simplify the problem so that it is easier to solve?
What are the key assumptions underlying this problem?
Now you need to generate 5 variations of following Instruction adaptively mixing meta prompt while keeping similar semantic meaning.
Make sure to wrap each generated prompt with <START> and <END>
[Prompt Instruction]:   
You are a strict service chatbot judge. Your task is to determin

dataset_subset [{'question': 'User Query: Who is the star of Good Boy! who grew up in Atlanta?\nRetrieved Question: What was the breakthrough role of the actor starring in Good Boy! and was a native of Atlanta?\nRetrieved Answer: The breakthrough role of the actor starring in Good Boy! and a native of Atlanta was Brittany Murphy. Her breakthrough role was as Tai Frasier in "Clueless" (1995).', 'query_id': '4531_p0', 'question_id': 4531, 'user_query': 'Who is the star of Good Boy! who grew up in Atlanta?', 'retrieved_id': 4531, 'candidate_Q': 'What was the breakthrough role of the actor starring in Good Boy! and was a native of Atlanta?', 'candidate_A': 'The breakthrough role of the actor starring in Good Boy! and a native of Atlanta was Brittany Murphy. Her breakthrough role was as Tai Frasier in "Clueless" (1995).', 'final_answer': 'false'}]
critique_example_set, correct_count
[{'question': 'User Query: Who is the star of Good Boy! who grew up in Atlanta?\nRetrieved Question: What was

prompt_score_list [["You are a strict service chatbot judge. Your task is to strictly determine whether a retrieved QA is the true response for the given user query. Answer only 'Yes' or 'No' without explanation.\n  \nYou are a strict service chatbot judge. Your task is to determine whether the retrieved QA pair fully addresses the user's query with semantic equivalence. Follow these guidelines carefully:\n\n1. **Match the user’s intent**: Compare the **user query**, the **retrieved question**, and the **retrieved answer**. Ensure that the meaning is preserved, even if the wording differs slightly.\n2. **Full satisfaction**: The retrieved answer must **completely satisfy** the user’s request. If the answer partially addresses the query or includes irrelevant information, answer 'No'.\n3. **Be cautious with your decision**: If unsure, answer 'No' to ensure safety and compliance with high regulation standards.\n4. **Answer format**: Only reply with 'Yes' or 'No'. Do **not provide explana

dataset_subset [{'question': 'User Query: What steps are needed to precipitate struvite from a solution containing high phosphate concentrations?\nRetrieved Question: How to obtain struvite in a solution with high concentration of phosphates?\nRetrieved Answer: To obtain struvite in a solution with a high concentration of phosphates, you need to create the optimum conditions for struvite crystallization. These conditions involve maintaining an alkaline pH between 8 and 10 and temperatures between 20°C and 25°C . Additionally, a delicate balance in the quantities of key ions, such as magnesium, phosphate, and ammonium, is necessary for the success of the method .\n\nThe reaction rate of struvite crystallization can be controlled by adjusting the pH according to the change in phosphate concentration . As the phosphate concentration increases and the solution pH rises (e.g., from 8.6 to 9.08), the reaction rate also increases rapidly . When the phosphate concentration increases from 20 to

Prompt to get critique:
 I'm trying to write a prompt for zero-shot instruction task that will help the most capable and suitable agent to solve the task.
My current prompt is:
[CURRENT PROMPT] "  
You are a strict service chatbot judge. Your task is to evaluate whether the retrieved QA pair accurately addresses the user’s query with semantic equivalence. Use the following criteria to guide your decision:  

1. **Match the user’s intent**: Carefully assess the **user query**, the **retrieved question**, and the **retrieved answer**. Even if phrasing differs, ensure the meaning aligns.  
2. **Complete satisfaction**: The retrieved answer should fully fulfill the user's query. If it misses key points or includes irrelevant details, answer 'No'.  
3. **Proceed with caution**: If uncertain, default to 'No' to maintain high standards of compliance.  
4. **Response format**: Respond only with 'Yes' or 'No'. Do not provide any explanation.

Assess whether the retrieved QA pair matches the use

dataset_subset [{'question': "User Query: What is the founding date of Ubisoft's branch in Quebec?\nRetrieved Question: When was Ubisoft Quebec founded?\nRetrieved Answer: Ubisoft Quebec was founded in 2005 in Quebec City, Quebec.", 'query_id': '5033_p0', 'question_id': 5033, 'user_query': "What is the founding date of Ubisoft's branch in Quebec?", 'retrieved_id': 5033, 'candidate_Q': 'When was Ubisoft Quebec founded?', 'candidate_A': 'Ubisoft Quebec was founded in 2005 in Quebec City, Quebec.', 'final_answer': 'true'}]
critique_example_set, correct_count
[] 1
dataset_subset [{'question': "User Query: How can we account for the inclusion of the Mycenaean civilization within the definition of the Sea Peoples?\nRetrieved Question: Why is the Mycenaean civilization considered part of the so-called sea peoples?\nRetrieved Answer: The Mycenaean civilization is considered part of the so-called Sea Peoples due to their extensive maritime activity, powerful navy, and their potential role in th

prompt_score_list [["\nYou are a meticulous QA evaluation agent. Your task is to determine whether a retrieved question-answer pair fully and accurately satisfies the user’s query based on **semantic meaning**, not just word similarity. Follow these rules precisely:\n\n1. **Semantic Equivalence**: The retrieved question and answer together must convey the same meaning as the user’s query. Paraphrasing is acceptable; superficial keyword overlap is insufficient.  \n2. **Complete Fulfillment**: The retrieved answer must address **all aspects** of the user query. Missing, partially incorrect, or irrelevant information results in a 'No'.  \n3. **Safety-First Judgment**: If the answer is ambiguous, misleading, or you are unsure, respond 'No'. Prioritize correctness over leniency.  \n4. **Strict Format**: Respond only with 'Yes' or 'No'. Do not add explanations, comments, or any other text.\n\nAssess each QA pair carefully. Only respond 'Yes' if it fully and accurately satisfies the user’s qu

dataset_subset [{'question': 'User Query: How should painted turtles be fed while living in captivity?\nRetrieved Question: what do painted turtles eat in captivity\nRetrieved Answer: In captivity, painted turtles can eat a variety of foods, including meats such as crickets, worms, or fish, and vegetables like mustard greens, spinach, and carrots. They also eat turtle pellets, insects, and fruits. As juveniles, they tend to be more carnivorous, but as they mature, they add plants to their diet. It is important to provide a balanced diet and remove excess food after 30 to 45 minutes, as painted turtles do not know when to stop eating.', 'query_id': '8921_p2', 'question_id': 8921, 'user_query': 'How should painted turtles be fed while living in captivity?', 'retrieved_id': 8921, 'candidate_Q': 'what do painted turtles eat in captivity', 'candidate_A': 'In captivity, painted turtles can eat a variety of foods, including meats such as crickets, worms, or fish, and vegetables like mustard g

mutation_round=0 mutated_sample_prompt=You are given a task description and a prompt instruction and different styles known as meta prompts:
[Task Description]: You are a strict service chatbot judge. Your task is to strictly determine whether a retrieved QA is the true response for the given user query. Answer only 'Yes' or 'No' without explanation.
[Meta Prompt]: How could I devise an experiment to help solve that problem?
Make a list of ideas for solving this problem, and apply them one by one to the problem to see if any progress can be made.
How could I measure progress on this problem?
How can I simplify the problem so that it is easier to solve?
What are the key assumptions underlying this problem?
Now you need to generate 5 variations of following Instruction adaptively mixing meta prompt while keeping similar semantic meaning.
Make sure to wrap each generated prompt with <START> and <END>
[Prompt Instruction]: 
You are a meticulous QA evaluation agent. Your task is to determin

dataset_subset [{'question': 'User Query: Would focusing on preventing illness rather than responding to it enhance our health outcomes?\nRetrieved Question: Would we be healthier if we switched from a reactive healthcare system to preventative?\nRetrieved Answer: Switching from a reactive healthcare system to a preventative one could potentially lead to improved overall health and a reduction in healthcare costs. Prevention focuses on maintaining good health, preventing diseases and enhancing wellbeing by addressing underlying risk factors and promoting healthy behaviors . By investing in preventative care, individuals may experience fewer chronic diseases, reduced hospitalizations, and less need for advanced medical treatments . Preventative healthcare also reduces healthcare costs by preventing the onset or progression of diseases, which are typically more expensive to treat than to prevent . However, it is important to recognize that a balance between reactive and preventative heal

prompt_score_list [["You are a strict service chatbot judge. Your task is to strictly determine whether a retrieved QA is the true response for the given user query. Answer only 'Yes' or 'No' without explanation.\n\nYou are a meticulous QA evaluation agent. Your task is to determine whether a retrieved question-answer pair fully and accurately satisfies the user’s query based on **semantic meaning**, not just word similarity. Follow these rules precisely:\n\n1. **Semantic Equivalence**: The retrieved question and answer together must convey the same meaning as the user’s query. Paraphrasing is acceptable; superficial keyword overlap is insufficient.  \n2. **Complete Fulfillment**: The retrieved answer must address **all aspects** of the user query. Missing, partially incorrect, or irrelevant information results in a 'No'.  \n3. **Safety-First Judgment**: If the answer is ambiguous, misleading, or you are unsure, respond 'No'. Prioritize correctness over leniency.  \n4. **Strict Format*

dataset_subset [{'question': 'User Query: What year did Dante pass away?\nRetrieved Question: When did Dante die?\nRetrieved Answer: Based on the given context, Dante Alighieri died in 1321.', 'query_id': '4795_p2', 'question_id': 4795, 'user_query': 'What year did Dante pass away?', 'retrieved_id': 4795, 'candidate_Q': 'When did Dante die?', 'candidate_A': 'Based on the given context, Dante Alighieri died in 1321.', 'final_answer': 'true'}]
critique_example_set, correct_count
[] 3
Loop completed


Prompt to get critique:
 I'm trying to write a prompt for zero-shot instruction task that will help the most capable and suitable agent to solve the task.
My current prompt is:
[CURRENT PROMPT] "  
You are a meticulous QA evaluation agent. Your task is to determine if a retrieved question-answer pair fully resolves the user’s query. Consider whether the QA pair addresses all parts of the query and maintains semantic equivalence. Think about how you could measure whether the response makes progress toward fully answering the user’s question. Respond only with 'Yes' or 'No'.  
"
Now this prompt got the following examples correct:
[CORRECT EXAMPLES] 
[Question] User Query: In which year did the first version of the game Civilization come out?
Retrieved Question: When did the computer game Civilization first come out?
Retrieved Answer: The computer game Civilization was first released in 1991 .
[Answer] true

Since you cant use these examples, analyse and understand characteristics/complex

dataset_subset [{'question': 'User Query: I am trying to determine the normal yearly wage scale for a Concierge position within Diamond Resorts International?\nRetrieved Question: salary for a concierge with diamond international\nRetrieved Answer: The average Diamond Resorts International salary ranges from approximately $20,000 per year for Concierge. However, the average hourly pay for a Concierge at Diamond Resorts International ranges from approximately $9.00 per hour to $40.00 per hour.', 'query_id': '8628_p5', 'question_id': 8628, 'user_query': 'I am trying to determine the normal yearly wage scale for a Concierge position within Diamond Resorts International?', 'retrieved_id': 8628, 'candidate_Q': 'salary for a concierge with diamond international', 'candidate_A': 'The average Diamond Resorts International salary ranges from approximately $20,000 per year for Concierge. However, the average hourly pay for a Concierge at Diamond Resorts International ranges from approximately $9




dataset_subset [{'question': 'User Query: How should painted turtles be fed while living in captivity?\nRetrieved Question: what do painted turtles eat in captivity\nRetrieved Answer: In captivity, painted turtles can eat a variety of foods, including meats such as crickets, worms, or fish, and vegetables like mustard greens, spinach, and carrots. They also eat turtle pellets, insects, and fruits. As juveniles, they tend to be more carnivorous, but as they mature, they add plants to their diet. It is important to provide a balanced diet and remove excess food after 30 to 45 minutes, as painted turtles do not know when to stop eating.', 'query_id': '8921_p2', 'question_id': 8921, 'user_query': 'How should painted turtles be fed while living in captivity?', 'retrieved_id': 8921, 'candidate_Q': 'what do painted turtles eat in captivity', 'candidate_A': 'In captivity, painted turtles can eat a variety of foods, including meats such as crickets, worms, or fish, and vegetables like mustard g

100%|██████████| 3/3 [01:23<00:00, 27.71s/it]



Generating CoT Reasoning for In-Context Examples....


100%|██████████| 3/3 [00:22<00:00,  7.58s/it]



Generating Expert Identity....


Expert Identity: [Agent Description]: You are a meticulous and impartial evaluator with expertise in assessing the accuracy and relevance of question-and-answer pairs. You have a sharp eye for detail and a strong understanding of how to match user queries with the correct responses. You are highly disciplined in following strict criteria and do not allow personal opinions or assumptions to influence your judgment. Your role is to make definitive determinations, answering only 'Yes' or 'No,' based solely on whether the retrieved answer truly satisfies the user’s query. You excel at maintaining consistency, fairness, and precision in all evaluations.
Expert Identity: [Agent Description]: You are a meticulous and impartial evaluator with expertise in assessing the accuracy and relevance of question-and-answer pairs. You have a sharp eye for detail and a strong understanding of how to match user queries with the correct responses. You are highly disciplined in following strict criteria and


Generating Intent Keywords....


Final best prompt:   
You are a meticulous QA evaluation agent. Your task is to determine if a retrieved question-answer pair fully resolves the user’s query. Consider whether the QA pair addresses all parts of the query and maintains semantic equivalence. Think about how you could measure whether the response makes progress toward fully answering the user’s question. Respond only with 'Yes' or 'No'.  


[Question] User Query: What is the average yearly salary for a concierge at Diamond Resorts International, and how does it compare to the industry average?
Retrieved Question: What is the average yearly salary for a concierge at Diamond Resorts International, and how does it compare to the industry average salary for concierges?
Retrieved Answer: The average yearly salary for a Concierge at Diamond Resorts International is approximately $20,000 per year, which is below the industry average of $25,000 for concierges in similar resorts.
[Answer] [Improved Reasoning Chain]:  

1. **Identi

### Save the optimized prompt and expert profile

In [52]:
import pickle 

# Create the output directory without shelling out: this works on every
# platform and is a no-op when the directory already exists.
os.makedirs("results", exist_ok=True)

# Persist both optimization results as pickle files.
with open("results/best_prompt.pkl", 'wb') as f:
    pickle.dump(best_prompt, f)
with open("results/expert_profile.pkl", 'wb') as f:
    pickle.dump(expert_profile, f)

print(f"Best prompt: {best_prompt} \nExpert profile: {expert_profile}")

Best prompt:   
You are a meticulous QA evaluation agent. Your task is to determine if a retrieved question-answer pair fully resolves the user’s query. Consider whether the QA pair addresses all parts of the query and maintains semantic equivalence. Think about how you could measure whether the response makes progress toward fully answering the user’s question. Respond only with 'Yes' or 'No'.  


[Question] User Query: What is the average yearly salary for a concierge at Diamond Resorts International, and how does it compare to the industry average?
Retrieved Question: What is the average yearly salary for a concierge at Diamond Resorts International, and how does it compare to the industry average salary for concierges?
Retrieved Answer: The average yearly salary for a Concierge at Diamond Resorts International is approximately $20,000 per year, which is below the industry average of $25,000 for concierges in similar resorts.
[Answer] [Improved Reasoning Chain]:  

1. **Identify the

In [53]:
import os
import pickle

# Make sure the output folder exists before anything is written into it.
os.makedirs("results", exist_ok=True)

# Binary copies: one pickle per artifact, written in the same order as before
# (best prompt first, then expert profile).
for pkl_path, artifact in (
    ("results/best_prompt.pkl", best_prompt),
    ("results/expert_profile.pkl", expert_profile),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(artifact, f)

# Human-readable copies: the same two artifacts as UTF-8 text files.
for txt_path, artifact in (
    ("results/best_prompt.txt", best_prompt),
    ("results/expert_profile.txt", expert_profile),
):
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(artifact)

# Report what was saved and show both artifacts in the cell output.
print("Saved best_prompt and expert_profile as both .pkl and .txt in ./results/")
print("\n=== BEST PROMPT ===\n")
print(best_prompt)
print("\n=== EXPERT PROFILE ===\n")
print(expert_profile)

Saved best_prompt and expert_profile as both .pkl and .txt in ./results/

=== BEST PROMPT ===

  
You are a meticulous QA evaluation agent. Your task is to determine if a retrieved question-answer pair fully resolves the user’s query. Consider whether the QA pair addresses all parts of the query and maintains semantic equivalence. Think about how you could measure whether the response makes progress toward fully answering the user’s question. Respond only with 'Yes' or 'No'.  


[Question] User Query: What is the average yearly salary for a concierge at Diamond Resorts International, and how does it compare to the industry average?
Retrieved Question: What is the average yearly salary for a concierge at Diamond Resorts International, and how does it compare to the industry average salary for concierges?
Retrieved Answer: The average yearly salary for a Concierge at Diamond Resorts International is approximately $20,000 per year, which is below the industry average of $25,000 for concie

In [24]:
# Echo the current value of `expert_profile` as this cell's output.
# NOTE(review): the saved output under this cell describes a clustering task,
# not QA-pair evaluation, and the cell's execution count (24) is lower than the
# save cells above it; it looks like a leftover from an earlier run. Re-run
# after optimization to confirm.
expert_profile

'You are a data scientist with a strong background in machine learning, particularly in natural language processing and unsupervised learning techniques. You have expertise in text analysis and clustering algorithms, enabling you to effectively group sentences based on their semantic similarities. Your skills include applying methods such as k-means clustering, hierarchical clustering, and advanced techniques like topic modeling or sentence embeddings to create meaningful clusters. You possess a deep understanding of the nuances of language and contextual variations, ensuring that your clustering takes into account both syntactical and semantic content. By leveraging your expertise, you can output exactly {n} cluster labels, ensuring each sentence is accurately classified into one of the {k} clusters. Your analytical prowess ensures that the clustering solution is both interpretable and robust, aligning with the underlying patterns of the data.'

### Evaluate the optimized prompt

In [22]:
# Attach the optimized expert profile and prompt to `gp` before evaluating.
# NOTE(review): `gp` is presumably the GluePromptOpt instance created in an
# earlier cell; confirm.
gp.EXPERT_PROFILE = expert_profile
gp.BEST_PROMPT = best_prompt

# Evaluate against the file named by `test_file_name` (defined in an earlier
# cell) and keep whatever score `gp.evaluate` returns.
accuracy = gp.evaluate(test_file_name)

# NOTE(review): the captured output below reports 0.0 accuracy with integer
# id lists as predicted/actual values, which does not look like the Yes/No
# QA-judgment task, and this cell's execution count (22) precedes the save
# cells. The output appears stale; re-run in order to confirm.
print(f"Final Accuracy: {accuracy}")

Evaluation started 

Evaluation started 





#################### Running 




{'accuracy': '0/1 : 0.0%', 'predicted': '[7, 19, 14, 5, 12, 9, 8, 13, 18, 15, 1, 6, 11, 8, 16, 17, 4, 10]', 'actual': '[37, 133, 80, 142, 100, 20, 125, 28, 147, 94, 82, 17, 101, 43, 92, 42, 117, 36]'}
{'accuracy': '0/1 : 0.0%', 'predicted': '[7, 19, 14, 5, 12, 9, 8, 13, 18, 15, 1, 6, 11, 8, 16, 17, 4, 10]', 'actual': '[37, 133, 80, 142, 100, 20, 125, 28, 147, 94, 82, 17, 101, 43, 92, 42, 117, 36]'}




#################### Running 




{'accuracy': '0/2 : 0.0%', 'predicted': '[5, 10, 13, 7, 16, 3, 12, 9, 5, 15, 8, 14, 4, 9, 11, 6, 2, 1]', 'actual': '[42, 109, 42, 42, 105, 145, 34, 110, 42, 53, 136, 111, 125, 142, 32, 10, 42, 95]'}
{'accuracy': '0/2 : 0.0%', 'predicted': '[5, 10, 13, 7, 16, 3, 12, 9, 5, 15, 8, 14, 4, 9, 11, 6, 2, 1]', 'actual': '[42, 109, 42, 42, 105, 145, 34, 110, 42, 53, 136, 111, 125, 142, 32, 10, 42, 95]'}




#################### Running 




{'accuracy': '0/3 : 0.0%', 'predicted': '[3, 1, 5, 4, 4, 2, 5, 4, 6, 5, 5, 5, 5, 1, 6, 5, 6, 1, 3, 2]', 'actual': '[40, 78, 42, 61, 112, 145, 27, 61, 42, 42, 87, 87, 108, 98, 42, 143, 36, 32, 34, 9]'}
{'accuracy': '0/3 : 0.0%', 'predicted': '[3, 1, 5, 4, 4, 2, 5, 4, 6, 5, 5, 5, 5, 1, 6, 5, 6, 1, 3, 2]', 'actual': '[40, 78, 42, 61, 112, 145, 27, 61, 42, 42, 87, 87, 108, 98, 42, 143, 36, 32, 34, 9]'}




#################### Running 




{'accuracy': '0/4 : 0.0%', 'predicted': '[4, 2, 1, 1, 5, 6, 1, 1, 3, 1, 6, 5, 6, 1, 6, 6, 1, 1, 1, 1, 6, 1, 6, 1, 6, 6, 2, 6]', 'actual': '[42, 17, 127, 42, 138, 42, 56, 68, 4, 7, 140, 31, 148, 143, 86, 144, 2, 109, 137, 108, 42, 20, 11, 71, 42, 77, 98, 65]'}
{'accuracy': '0/4 : 0.0%', 'predicted': '[4, 2, 1, 1, 5, 6, 1, 1, 3, 1, 6, 5, 6, 1, 6, 6, 1, 1, 1, 1, 6, 1, 6, 1, 6, 6, 2, 6]', 'actual': '[42, 17, 127, 42, 138, 42, 56, 68, 4, 7, 140, 31, 148, 143, 86, 144, 2, 109, 137, 108, 42, 20, 11, 71, 42, 77, 98, 65]'}




#################### Running 




{'accuracy': '0/5 : 0.0%', 'predicted': '[1, 2, 3, 4, 3, 5, 6, 7, 5, 3, 8, 9, 6, 10, 11, 12, 13, 14, 15, 16, 17]', 'actual': '[142, 96, 101, 90, 140, 42, 91, 24, 130, 126, 26, 42, 45, 42, 42, 74, 16, 92, 42, 149, 149]'}
{'accuracy': '0/5 : 0.0%', 'predicted': '[1, 2, 3, 4, 3, 5, 6, 7, 5, 3, 8, 9, 6, 10, 11, 12, 13, 14, 15, 16, 17]', 'actual': '[142, 96, 101, 90, 140, 42, 91, 24, 130, 126, 26, 42, 45, 42, 42, 74, 16, 92, 42, 149, 149]'}




#################### Running 




{'accuracy': '0/6 : 0.0%', 'predicted': '[5, 13, 8, 15, 10, 12, 10, 7, 4, 9, 14, 3, 6, 11, 1, 2]', 'actual': '[34, 149, 86, 50, 52, 42, 52, 39, 120, 42, 27, 33, 31, 74, 42, 42]'}
{'accuracy': '0/6 : 0.0%', 'predicted': '[5, 13, 8, 15, 10, 12, 10, 7, 4, 9, 14, 3, 6, 11, 1, 2]', 'actual': '[34, 149, 86, 50, 52, 42, 52, 39, 120, 42, 27, 33, 31, 74, 42, 42]'}




#################### Running 




{'accuracy': '0/7 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 1, 6, 3, 7, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 18, 1, 19]', 'actual': '[103, 39, 99, 42, 118, 49, 128, 138, 148, 42, 37, 107, 53, 42, 71, 42, 42, 73, 142, 42, 20, 96, 2, 62]'}
{'accuracy': '0/7 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 1, 6, 3, 7, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 18, 1, 19]', 'actual': '[103, 39, 99, 42, 118, 49, 128, 138, 148, 42, 37, 107, 53, 42, 71, 42, 42, 73, 142, 42, 20, 96, 2, 62]'}




#################### Running 




{'accuracy': '0/8 : 0.0%', 'predicted': '[14, 21, 18, 6, 9, 17, 25, 2, 5, 13, 13, 3, 4, 19, 16, 15, 11, 11, 20, 7, 6, 8, 9, 12, 10, 22, 10, 23, 2, 24]', 'actual': '[42, 45, 42, 98, 42, 42, 99, 61, 144, 90, 90, 106, 33, 30, 127, 42, 133, 89, 136, 42, 47, 25, 93, 91, 85, 100, 85, 34, 73, 16]'}
{'accuracy': '0/8 : 0.0%', 'predicted': '[14, 21, 18, 6, 9, 17, 25, 2, 5, 13, 13, 3, 4, 19, 16, 15, 11, 11, 20, 7, 6, 8, 9, 12, 10, 22, 10, 23, 2, 24]', 'actual': '[42, 45, 42, 98, 42, 42, 99, 61, 144, 90, 90, 106, 33, 30, 127, 42, 133, 89, 136, 42, 47, 25, 93, 91, 85, 100, 85, 34, 73, 16]'}




#################### Running 




{'accuracy': '0/9 : 0.0%', 'predicted': '[5, 3, 7, 5, 6, 3, 6, 2, 5, 6]', 'actual': '[94, 66, 86, 126, 41, 42, 42, 53, 122, 103]'}
{'accuracy': '0/9 : 0.0%', 'predicted': '[5, 3, 7, 5, 6, 3, 6, 2, 5, 6]', 'actual': '[94, 66, 86, 126, 41, 42, 42, 53, 122, 103]'}




#################### Running 




{'accuracy': '0/10 : 0.0%', 'predicted': '[57, 105, 82, 95, 78, 43, 122, 119, 51, 94, 46, 38, 67, 77, 110, 83, 103]', 'actual': '[139, 123, 42, 108, 42, 12, 103, 42, 66, 19, 26, 8, 145, 14, 149, 77, 85]'}
{'accuracy': '0/10 : 0.0%', 'predicted': '[57, 105, 82, 95, 78, 43, 122, 119, 51, 94, 46, 38, 67, 77, 110, 83, 103]', 'actual': '[139, 123, 42, 108, 42, 12, 103, 42, 66, 19, 26, 8, 145, 14, 149, 77, 85]'}




#################### Running 




{'accuracy': '0/11 : 0.0%', 'predicted': '[1, 2, 3, 3, 4, 2, 5, 6, 7, 7, 8]', 'actual': '[78, 42, 124, 114, 47, 28, 42, 146, 0, 23, 43]'}
{'accuracy': '0/11 : 0.0%', 'predicted': '[1, 2, 3, 3, 4, 2, 5, 6, 7, 7, 8]', 'actual': '[78, 42, 124, 114, 47, 28, 42, 146, 0, 23, 43]'}




#################### Running 




{'accuracy': '0/12 : 0.0%', 'predicted': '[5, 11, 8, 3, 11, 6, 14, 10, 12, 13, 7, 4, 1]', 'actual': '[72, 79, 40, 67, 79, 138, 42, 2, 42, 42, 125, 42, 25]'}
{'accuracy': '0/12 : 0.0%', 'predicted': '[5, 11, 8, 3, 11, 6, 14, 10, 12, 13, 7, 4, 1]', 'actual': '[72, 79, 40, 67, 79, 138, 42, 2, 42, 42, 125, 42, 25]'}




#################### Running 




{'accuracy': '0/13 : 0.0%', 'predicted': '[78, 94, 12, 43, 67, 38, 12, 57, 12, 45, 33, 96, 89, 71, 92, 80, 67, 58, 79, 67, 62]', 'actual': '[134, 15, 9, 143, 8, 54, 85, 129, 9, 42, 119, 55, 135, 148, 88, 42, 53, 52, 80, 131, 42]'}
{'accuracy': '0/13 : 0.0%', 'predicted': '[78, 94, 12, 43, 67, 38, 12, 57, 12, 45, 33, 96, 89, 71, 92, 80, 67, 58, 79, 67, 62]', 'actual': '[134, 15, 9, 143, 8, 54, 85, 129, 9, 42, 119, 55, 135, 148, 88, 42, 53, 52, 80, 131, 42]'}




#################### Running 




{'accuracy': '0/14 : 0.0%', 'predicted': '[11, 23, 14, 21, 8, 12, 17, 19, 10, 22, 18, 5, 20, 7, 13, 6, 16, 24, 4, 9, 3, 15, 25, 26, 27, 2, 28, 1, 29]', 'actual': '[141, 20, 84, 76, 67, 85, 10, 118, 7, 70, 42, 93, 42, 102, 103, 29, 59, 49, 42, 139, 42, 16, 73, 8, 87, 64, 52, 21, 78]'}
{'accuracy': '0/14 : 0.0%', 'predicted': '[11, 23, 14, 21, 8, 12, 17, 19, 10, 22, 18, 5, 20, 7, 13, 6, 16, 24, 4, 9, 3, 15, 25, 26, 27, 2, 28, 1, 29]', 'actual': '[141, 20, 84, 76, 67, 85, 10, 118, 7, 70, 42, 93, 42, 102, 103, 29, 59, 49, 42, 139, 42, 16, 73, 8, 87, 64, 52, 21, 78]'}




#################### Running 




{'accuracy': '0/15 : 0.0%', 'predicted': '[10, 23, 23, 7, 15, 18, 11, 19, 17, 14, 8, 6, 17, 5, 11, 20, 15, 20, 13, 20, 12, 15, 13]', 'actual': '[1, 98, 142, 42, 112, 79, 42, 21, 143, 80, 51, 18, 111, 10, 3, 68, 43, 89, 57, 91, 121, 48, 51]'}
{'accuracy': '0/15 : 0.0%', 'predicted': '[10, 23, 23, 7, 15, 18, 11, 19, 17, 14, 8, 6, 17, 5, 11, 20, 15, 20, 13, 20, 12, 15, 13]', 'actual': '[1, 98, 142, 42, 112, 79, 42, 21, 143, 80, 51, 18, 111, 10, 3, 68, 43, 89, 57, 91, 121, 48, 51]'}




#################### Running 




{'accuracy': '0/16 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 8, 11, 12, 5, 13, 14, 15, 16, 9, 17, 18, 19, 20, 21, 22, 23, 5, 13]', 'actual': '[15, 61, 122, 114, 41, 124, 36, 81, 145, 79, 17, 138, 25, 35, 62, 80, 137, 105, 42, 42, 66, 18, 42, 142, 67, 42, 71, 62]'}
{'accuracy': '0/16 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 8, 11, 12, 5, 13, 14, 15, 16, 9, 17, 18, 19, 20, 21, 22, 23, 5, 13]', 'actual': '[15, 61, 122, 114, 41, 124, 36, 81, 145, 79, 17, 138, 25, 35, 62, 80, 137, 105, 42, 42, 66, 18, 42, 142, 67, 42, 71, 62]'}




#################### Running 




{'accuracy': '0/17 : 0.0%', 'predicted': '[1, 2, 3, 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, 10, 7, 11, 12, 13, 14, 15, 16, 17, 10, 18, 19, 20, 4, 21, 12, 13, 3]', 'actual': '[24, 60, 40, 13, 121, 124, 42, 45, 10, 5, 28, 42, 42, 11, 50, 51, 20, 137, 37, 9, 47, 42, 144, 110, 42, 42, 147, 97, 89, 42]'}
{'accuracy': '0/17 : 0.0%', 'predicted': '[1, 2, 3, 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, 10, 7, 11, 12, 13, 14, 15, 16, 17, 10, 18, 19, 20, 4, 21, 12, 13, 3]', 'actual': '[24, 60, 40, 13, 121, 124, 42, 45, 10, 5, 28, 42, 42, 11, 50, 51, 20, 137, 37, 9, 47, 42, 144, 110, 42, 42, 147, 97, 89, 42]'}




#################### Running 




{'accuracy': '0/18 : 0.0%', 'predicted': '[34, 22, 45, 67, 89, 71, 13, 56, 98, 40, 63, 13, 76]', 'actual': '[42, 118, 1, 148, 132, 68, 39, 23, 117, 42, 108, 39, 131]'}
{'accuracy': '0/18 : 0.0%', 'predicted': '[34, 22, 45, 67, 89, 71, 13, 56, 98, 40, 63, 13, 76]', 'actual': '[42, 118, 1, 148, 132, 68, 39, 23, 117, 42, 108, 39, 131]'}




#################### Running 




{'accuracy': '0/19 : 0.0%', 'predicted': '[12, 8, 5, 14, 3, 7, 19, 15, 3, 16, 7, 18, 10, 3, 17, 18, 6, 13, 11]', 'actual': '[67, 147, 72, 34, 144, 140, 42, 42, 11, 64, 140, 44, 42, 51, 42, 44, 7, 86, 122]'}
{'accuracy': '0/19 : 0.0%', 'predicted': '[12, 8, 5, 14, 3, 7, 19, 15, 3, 16, 7, 18, 10, 3, 17, 18, 6, 13, 11]', 'actual': '[67, 147, 72, 34, 144, 140, 42, 42, 11, 64, 140, 44, 42, 51, 42, 44, 7, 86, 122]'}




#################### Running 




{'accuracy': '0/20 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]', 'actual': '[72, 109, 130, 106, 94, 25, 15, 59, 129, 42, 99, 70, 66, 132, 10, 66, 23, 103, 27, 26, 51, 66, 64, 11, 13, 107]'}
{'accuracy': '0/20 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]', 'actual': '[72, 109, 130, 106, 94, 25, 15, 59, 129, 42, 99, 70, 66, 132, 10, 66, 23, 103, 27, 26, 51, 66, 64, 11, 13, 107]'}




#################### Running 




{'accuracy': '0/21 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]', 'actual': '[45, 57, 71, 31, 36, 150, 94, 69, 149, 12, 42, 41, 0]'}
{'accuracy': '0/21 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]', 'actual': '[45, 57, 71, 31, 36, 150, 94, 69, 149, 12, 42, 41, 0]'}




#################### Running 




{'accuracy': '0/22 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 7, 11, 12, 2, 13, 14, 15, 16, 17, 18, 19, 7, 20, 21, 22, 12, 15, 23, 24, 25]', 'actual': '[42, 42, 101, 42, 42, 42, 132, 42, 67, 42, 79, 47, 72, 108, 121, 42, 122, 64, 17, 136, 139, 44, 55, 138, 141, 40, 122, 24, 117, 66]'}
{'accuracy': '0/22 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 7, 11, 12, 2, 13, 14, 15, 16, 17, 18, 19, 7, 20, 21, 22, 12, 15, 23, 24, 25]', 'actual': '[42, 42, 101, 42, 42, 42, 132, 42, 67, 42, 79, 47, 72, 108, 121, 42, 122, 64, 17, 136, 139, 44, 55, 138, 141, 40, 122, 24, 117, 66]'}




#################### Running 




{'accuracy': '0/23 : 0.0%', 'predicted': '[101, 202, 303, 304, 205, 306, 207, 208, 209, 210, 311, 312, 213, 214, 315, 216, 317, 318, 319, 320, 321, 322, 223, 324, 225, 226, 327]', 'actual': '[42, 57, 91, 132, 42, 42, 77, 75, 72, 47, 145, 38, 42, 119, 42, 42, 120, 60, 68, 136, 46, 79, 42, 124, 16, 18, 6]'}
{'accuracy': '0/23 : 0.0%', 'predicted': '[101, 202, 303, 304, 205, 306, 207, 208, 209, 210, 311, 312, 213, 214, 315, 216, 317, 318, 319, 320, 321, 322, 223, 324, 225, 226, 327]', 'actual': '[42, 57, 91, 132, 42, 42, 77, 75, 72, 47, 145, 38, 42, 119, 42, 42, 120, 60, 68, 136, 46, 79, 42, 124, 16, 18, 6]'}




#################### Running 




{'accuracy': '0/24 : 0.0%', 'predicted': '[89, 76, 112, 35, 47, 25, 58, 90, 113, 81, 102, 107, 54, 83, 99]', 'actual': '[42, 42, 122, 15, 61, 1, 104, 42, 71, 125, 89, 109, 42, 43, 27]'}
{'accuracy': '0/24 : 0.0%', 'predicted': '[89, 76, 112, 35, 47, 25, 58, 90, 113, 81, 102, 107, 54, 83, 99]', 'actual': '[42, 42, 122, 15, 61, 1, 104, 42, 71, 125, 89, 109, 42, 43, 27]'}




#################### Running 




{'accuracy': '0/25 : 0.0%', 'predicted': '[3, 7, 5, 5, 4, 6, 2, 8, 7, 9, 3, 12, 10, 11, 8, 5, 12, 11, 3]', 'actual': '[69, 122, 30, 121, 146, 119, 111, 42, 122, 42, 109, 22, 42, 3, 23, 107, 48, 120, 44]'}
{'accuracy': '0/25 : 0.0%', 'predicted': '[3, 7, 5, 5, 4, 6, 2, 8, 7, 9, 3, 12, 10, 11, 8, 5, 12, 11, 3]', 'actual': '[69, 122, 30, 121, 146, 119, 111, 42, 122, 42, 109, 22, 42, 3, 23, 107, 48, 120, 44]'}




#################### Running 




{'accuracy': '0/26 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 4, 21]', 'actual': '[114, 43, 123, 70, 97, 67, 5, 15, 125, 31, 42, 58, 92, 12, 134, 26, 42, 141, 82, 42, 112, 68, 7]'}
{'accuracy': '0/26 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 2, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 4, 21]', 'actual': '[114, 43, 123, 70, 97, 67, 5, 15, 125, 31, 42, 58, 92, 12, 134, 26, 42, 141, 82, 42, 112, 68, 7]'}




#################### Running 




{'accuracy': '0/27 : 0.0%', 'predicted': '[51, 32, 74, 12, 41, 58, 92, 67, 83, 29]', 'actual': '[131, 35, 68, 84, 71, 127, 124, 6, 103, 72]'}
{'accuracy': '0/27 : 0.0%', 'predicted': '[51, 32, 74, 12, 41, 58, 92, 67, 83, 29]', 'actual': '[131, 35, 68, 84, 71, 127, 124, 6, 103, 72]'}




#################### Running 




{'accuracy': '0/28 : 0.0%', 'predicted': '[3, 5, 8, 10, 5, 10, 8, 3, 1, 9, 1, 5, 7, 2, 6, 9]', 'actual': '[135, 109, 139, 122, 41, 42, 144, 42, 17, 149, 37, 6, 42, 150, 106, 35]'}
{'accuracy': '0/28 : 0.0%', 'predicted': '[3, 5, 8, 10, 5, 10, 8, 3, 1, 9, 1, 5, 7, 2, 6, 9]', 'actual': '[135, 109, 139, 122, 41, 42, 144, 42, 17, 149, 37, 6, 42, 150, 106, 35]'}




#################### Running 




{'accuracy': '0/29 : 0.0%', 'predicted': '[89, 73, 44, 91, 32, 27, 112, 56, 85, 67, 48, 102, 108, 10, 93, 130, 58, 99, 145, 120, 75, 66, 134]', 'actual': '[37, 99, 107, 26, 47, 77, 97, 106, 64, 105, 143, 35, 60, 82, 95, 42, 96, 103, 38, 59, 42, 3, 120]'}
{'accuracy': '0/29 : 0.0%', 'predicted': '[89, 73, 44, 91, 32, 27, 112, 56, 85, 67, 48, 102, 108, 10, 93, 130, 58, 99, 145, 120, 75, 66, 134]', 'actual': '[37, 99, 107, 26, 47, 77, 97, 106, 64, 105, 143, 35, 60, 82, 95, 42, 96, 103, 38, 59, 42, 3, 120]'}




#################### Running 




{'accuracy': '0/30 : 0.0%', 'predicted': '[2, 7, 9, 5, 1, 10, 3, 8, 4, 6, 11, 12, 2, 13, 14, 5, 7, 15, 9, 16, 6, 17, 8]', 'actual': '[123, 141, 144, 79, 28, 42, 42, 127, 31, 101, 42, 9, 3, 42, 42, 116, 141, 42, 60, 55, 29, 143, 63]'}
{'accuracy': '0/30 : 0.0%', 'predicted': '[2, 7, 9, 5, 1, 10, 3, 8, 4, 6, 11, 12, 2, 13, 14, 5, 7, 15, 9, 16, 6, 17, 8]', 'actual': '[123, 141, 144, 79, 28, 42, 42, 127, 31, 101, 42, 9, 3, 42, 42, 116, 141, 42, 60, 55, 29, 143, 63]'}




#################### Running 




{'accuracy': '0/31 : 0.0%', 'predicted': '[19, 134, 47, 68, 92, 103, 85, 116, 72, 108, 60, 94, 125, 99, 22, 77, 56, 38, 81, 45, 11]', 'actual': '[42, 42, 150, 33, 105, 139, 42, 19, 42, 146, 42, 40, 69, 22, 42, 42, 45, 136, 37, 112, 83]'}
{'accuracy': '0/31 : 0.0%', 'predicted': '[19, 134, 47, 68, 92, 103, 85, 116, 72, 108, 60, 94, 125, 99, 22, 77, 56, 38, 81, 45, 11]', 'actual': '[42, 42, 150, 33, 105, 139, 42, 19, 42, 146, 42, 40, 69, 22, 42, 42, 45, 136, 37, 112, 83]'}




#################### Running 




{'accuracy': '0/32 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 5, 13, 14, 15, 16, 5, 17, 18, 9, 2, 19, 20, 21, 22, 23]', 'actual': '[87, 18, 42, 93, 126, 22, 42, 129, 107, 59, 127, 16, 126, 3, 67, 102, 42, 97, 44, 31, 124, 18, 125, 20, 65, 56, 84]'}
{'accuracy': '0/32 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 5, 13, 14, 15, 16, 5, 17, 18, 9, 2, 19, 20, 21, 22, 23]', 'actual': '[87, 18, 42, 93, 126, 22, 42, 129, 107, 59, 127, 16, 126, 3, 67, 102, 42, 97, 44, 31, 124, 18, 125, 20, 65, 56, 84]'}




#################### Running 




{'accuracy': '0/33 : 0.0%', 'predicted': '[12, 31, 29, 44, 11, 9, 75, 8, 31, 54, 10, 28, 35, 55, 20, 72, 41, 90, 10, 50, 63, 21, 19, 20, 22, 47]', 'actual': '[28, 20, 147, 74, 42, 149, 114, 131, 89, 83, 9, 19, 42, 73, 47, 42, 110, 46, 23, 95, 42, 123, 140, 47, 38, 81]'}
{'accuracy': '0/33 : 0.0%', 'predicted': '[12, 31, 29, 44, 11, 9, 75, 8, 31, 54, 10, 28, 35, 55, 20, 72, 41, 90, 10, 50, 63, 21, 19, 20, 22, 47]', 'actual': '[28, 20, 147, 74, 42, 149, 114, 131, 89, 83, 9, 19, 42, 73, 47, 42, 110, 46, 23, 95, 42, 123, 140, 47, 38, 81]'}




#################### Running 




{'accuracy': '0/34 : 0.0%', 'predicted': '[23, 56, 89, 34, 12, 18, 71, 45, 51, 67, 38, 49, 29, 88, 53]', 'actual': '[4, 12, 42, 69, 30, 107, 138, 26, 92, 52, 1, 42, 20, 138, 68]'}
{'accuracy': '0/34 : 0.0%', 'predicted': '[23, 56, 89, 34, 12, 18, 71, 45, 51, 67, 38, 49, 29, 88, 53]', 'actual': '[4, 12, 42, 69, 30, 107, 138, 26, 92, 52, 1, 42, 20, 138, 68]'}




#################### Running 




{'accuracy': '0/35 : 0.0%', 'predicted': '[35, 35, 58, 79, 58, 92, 58, 14, 26, 14, 45]', 'actual': '[28, 69, 12, 104, 87, 42, 108, 66, 139, 78, 76]'}
{'accuracy': '0/35 : 0.0%', 'predicted': '[35, 35, 58, 79, 58, 92, 58, 14, 26, 14, 45]', 'actual': '[28, 69, 12, 104, 87, 42, 108, 66, 139, 78, 76]'}




#################### Running 




{'accuracy': '0/36 : 0.0%', 'predicted': '[14, 22, 31, 6, 9, 19, 24, 15, 14, 5, 12, 8, 7, 23, 17, 10, 14, 6, 11, 10, 13, 4, 15, 20, 9, 5, 16, 3, 18, 2]', 'actual': '[78, 38, 42, 75, 19, 117, 42, 42, 78, 71, 41, 30, 119, 120, 91, 142, 42, 121, 34, 3, 42, 23, 135, 74, 60, 76, 40, 111, 149, 146]'}
{'accuracy': '0/36 : 0.0%', 'predicted': '[14, 22, 31, 6, 9, 19, 24, 15, 14, 5, 12, 8, 7, 23, 17, 10, 14, 6, 11, 10, 13, 4, 15, 20, 9, 5, 16, 3, 18, 2]', 'actual': '[78, 38, 42, 75, 19, 117, 42, 42, 78, 71, 41, 30, 119, 120, 91, 142, 42, 121, 34, 3, 42, 23, 135, 74, 60, 76, 40, 111, 149, 146]'}




#################### Running 




{'accuracy': '0/37 : 0.0%', 'predicted': '[5, 4, 6, 4, 6, 4, 5, 9, 10, 7, 4, 1, 3, 2, 10, 8, 2, 2, 7, 9, 8, 4, 9, 9]', 'actual': '[16, 133, 42, 27, 17, 68, 43, 144, 42, 42, 68, 149, 31, 60, 121, 42, 150, 42, 66, 122, 42, 132, 138, 89]'}
{'accuracy': '0/37 : 0.0%', 'predicted': '[5, 4, 6, 4, 6, 4, 5, 9, 10, 7, 4, 1, 3, 2, 10, 8, 2, 2, 7, 9, 8, 4, 9, 9]', 'actual': '[16, 133, 42, 27, 17, 68, 43, 144, 42, 42, 68, 149, 31, 60, 121, 42, 150, 42, 66, 122, 42, 132, 138, 89]'}




#################### Running 




{'accuracy': '0/38 : 0.0%', 'predicted': '[56, 78, 102, 64, 89, 77, 93, 115, 54, 134, 107, 68, 145, 110, 95, 117, 129, 142, 88]', 'actual': '[42, 84, 1, 132, 120, 88, 86, 116, 147, 98, 57, 6, 148, 136, 42, 42, 36, 42, 51]'}
{'accuracy': '0/38 : 0.0%', 'predicted': '[56, 78, 102, 64, 89, 77, 93, 115, 54, 134, 107, 68, 145, 110, 95, 117, 129, 142, 88]', 'actual': '[42, 84, 1, 132, 120, 88, 86, 116, 147, 98, 57, 6, 148, 136, 42, 42, 36, 42, 51]'}




#################### Running 




{'accuracy': '0/39 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 1, 6, 4, 1, 7, 11, 12, 2, 14, 15, 16, 15, 1, 19, 20]', 'actual': '[11, 42, 110, 42, 137, 34, 46, 42, 93, 42, 135, 72, 21, 26, 104, 130, 104, 34, 82, 147]'}
{'accuracy': '0/39 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 1, 6, 4, 1, 7, 11, 12, 2, 14, 15, 16, 15, 1, 19, 20]', 'actual': '[11, 42, 110, 42, 137, 34, 46, 42, 93, 42, 135, 72, 21, 26, 104, 130, 104, 34, 82, 147]'}




#################### Running 




{'accuracy': '0/40 : 0.0%', 'predicted': '[2, 5, 3, 4, 5, 1, 2, 2, 3, 5, 2, 4, 4, 2, 2, 5, 5, 2, 4, 2, 4, 2, 2, 5, 3, 5]', 'actual': '[103, 19, 135, 71, 23, 139, 45, 79, 38, 18, 80, 110, 8, 122, 64, 95, 85, 128, 42, 59, 41, 6, 56, 15, 1, 73]'}
{'accuracy': '0/40 : 0.0%', 'predicted': '[2, 5, 3, 4, 5, 1, 2, 2, 3, 5, 2, 4, 4, 2, 2, 5, 5, 2, 4, 2, 4, 2, 2, 5, 3, 5]', 'actual': '[103, 19, 135, 71, 23, 139, 45, 79, 38, 18, 80, 110, 8, 122, 64, 95, 85, 128, 42, 59, 41, 6, 56, 15, 1, 73]'}




#################### Running 




{'accuracy': '0/41 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]', 'actual': '[9, 63, 42, 143, 42, 42, 19, 42, 42, 8, 83, 106, 24, 77, 146, 79, 86, 42, 143, 22, 100, 129, 125, 80, 72, 16, 69, 74, 42]'}
{'accuracy': '0/41 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]', 'actual': '[9, 63, 42, 143, 42, 42, 19, 42, 42, 8, 83, 106, 24, 77, 146, 79, 86, 42, 143, 22, 100, 129, 125, 80, 72, 16, 69, 74, 42]'}




#################### Running 




{'accuracy': '0/42 : 0.0%', 'predicted': '[101, 76, 57, 88, 76, 34, 78, 78, 29, 92, 41, 33, 19, 65]', 'actual': '[48, 36, 42, 87, 36, 98, 124, 124, 42, 145, 42, 143, 61, 32]'}
{'accuracy': '0/42 : 0.0%', 'predicted': '[101, 76, 57, 88, 76, 34, 78, 78, 29, 92, 41, 33, 19, 65]', 'actual': '[48, 36, 42, 87, 36, 98, 124, 124, 42, 145, 42, 143, 61, 32]'}




#################### Running 




{'accuracy': '0/43 : 0.0%', 'predicted': '[101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129]', 'actual': '[31, 42, 44, 81, 42, 63, 61, 93, 138, 46, 25, 96, 35, 53, 71, 42, 42, 11, 128, 125, 52, 97, 109, 129, 61, 62, 42, 144, 15]'}
{'accuracy': '0/43 : 0.0%', 'predicted': '[101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129]', 'actual': '[31, 42, 44, 81, 42, 63, 61, 93, 138, 46, 25, 96, 35, 53, 71, 42, 42, 11, 128, 125, 52, 97, 109, 129, 61, 62, 42, 144, 15]'}




#################### Running 




{'accuracy': '0/44 : 0.0%', 'predicted': '[1, 2, 3, 4, 1, 2, 5, 6, 7, 5, 8, 9, 10, 11, 12, 13, 9, 14, 15, 16, 9, 17, 18, 19, 20, 21, 9, 22, 23]', 'actual': '[128, 42, 42, 42, 128, 84, 21, 42, 42, 21, 144, 81, 39, 42, 78, 6, 13, 42, 42, 1, 65, 23, 5, 148, 77, 62, 13, 56, 18]'}
{'accuracy': '0/44 : 0.0%', 'predicted': '[1, 2, 3, 4, 1, 2, 5, 6, 7, 5, 8, 9, 10, 11, 12, 13, 9, 14, 15, 16, 9, 17, 18, 19, 20, 21, 9, 22, 23]', 'actual': '[128, 42, 42, 42, 128, 84, 21, 42, 42, 21, 144, 81, 39, 42, 78, 6, 13, 42, 42, 1, 65, 23, 5, 148, 77, 62, 13, 56, 18]'}




#################### Running 




{'accuracy': '0/45 : 0.0%', 'predicted': '[3, 21, 14, 5, 14, 17, 17, 17, 12, 15, 14, 17, 1, 13, 16, 16, 15, 18, 2, 2, 6, 19, 11, 2, 20, 17, 7, 10, 9]', 'actual': '[145, 61, 60, 27, 86, 44, 132, 132, 125, 66, 86, 70, 55, 46, 126, 140, 5, 22, 124, 121, 42, 42, 39, 13, 17, 41, 146, 42, 42]'}
{'accuracy': '0/45 : 0.0%', 'predicted': '[3, 21, 14, 5, 14, 17, 17, 17, 12, 15, 14, 17, 1, 13, 16, 16, 15, 18, 2, 2, 6, 19, 11, 2, 20, 17, 7, 10, 9]', 'actual': '[145, 61, 60, 27, 86, 44, 132, 132, 125, 66, 86, 70, 55, 46, 126, 140, 5, 22, 124, 121, 42, 42, 39, 13, 17, 41, 146, 42, 42]'}




#################### Running 




{'accuracy': '0/46 : 0.0%', 'predicted': '[12, 23, 15, 34, 22, 18, 9, 27, 6, 14, 7, 8, 13, 19, 11, 5]', 'actual': '[53, 9, 88, 137, 18, 12, 44, 40, 42, 3, 42, 95, 42, 67, 25, 138]'}
{'accuracy': '0/46 : 0.0%', 'predicted': '[12, 23, 15, 34, 22, 18, 9, 27, 6, 14, 7, 8, 13, 19, 11, 5]', 'actual': '[53, 9, 88, 137, 18, 12, 44, 40, 42, 3, 42, 95, 42, 67, 25, 138]'}




#################### Running 




{'accuracy': '0/47 : 0.0%', 'predicted': '[81, 102, 94, 53, 21, 77, 77, 57, 12, 48, 105, 57, 108, 85, 124, 62, 26]', 'actual': '[119, 36, 42, 4, 42, 110, 142, 53, 114, 42, 42, 83, 42, 42, 42, 92, 79]'}
{'accuracy': '0/47 : 0.0%', 'predicted': '[81, 102, 94, 53, 21, 77, 77, 57, 12, 48, 105, 57, 108, 85, 124, 62, 26]', 'actual': '[119, 36, 42, 4, 42, 110, 142, 53, 114, 42, 42, 83, 42, 42, 42, 92, 79]'}




#################### Running 




{'accuracy': '0/48 : 0.0%', 'predicted': '[32, 57, 45, 12, 85, 32, 74, 57, 21, 68]', 'actual': '[37, 103, 62, 67, 141, 3, 30, 64, 24, 76]'}
{'accuracy': '0/48 : 0.0%', 'predicted': '[32, 57, 45, 12, 85, 32, 74, 57, 21, 68]', 'actual': '[37, 103, 62, 67, 141, 3, 30, 64, 24, 76]'}




#################### Running 




{'accuracy': '0/49 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 8, 15, 16, 17, 18, 19, 20, 6]', 'actual': '[42, 145, 21, 81, 17, 58, 141, 123, 34, 85, 112, 94, 126, 123, 28, 54, 102, 35, 32, 11, 58]'}
{'accuracy': '0/49 : 0.0%', 'predicted': '[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 8, 15, 16, 17, 18, 19, 20, 6]', 'actual': '[42, 145, 21, 81, 17, 58, 141, 123, 34, 85, 112, 94, 126, 123, 28, 54, 102, 35, 32, 11, 58]'}




#################### Running 




{'accuracy': '0/50 : 0.0%', 'predicted': '[38, 72, 15, 57, 84, 33, 29, 41, 95, 67]', 'actual': '[126, 0, 27, 124, 42, 144, 14, 76, 42, 85]'}
{'accuracy': '0/50 : 0.0%', 'predicted': '[38, 72, 15, 57, 84, 33, 29, 41, 95, 67]', 'actual': '[126, 0, 27, 124, 42, 144, 14, 76, 42, 85]'}
Time taken for evaluation: 161.3048689365387 sec
Time taken for evaluation: 161.3048689365387 sec


Final Accuracy: 0.0
