# Evolutionary Prompt Selection
## For Planner/Worker/Solver Framework

### Install Dependencies

In [1]:
!printf 'accelerate\nbitsandbytes\ndatasets\npinecone-client[grpc]\nsentencepiece\nsentence-transformers\ntorch\ntransformers\nwikipedia ' > requirements.txt  
!pip install -r requirements.txt


Collecting bitsandbytes (from -r requirements.txt (line 2))
  Downloading bitsandbytes-0.41.0-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting pinecone-client[grpc] (from -r requirements.txt (line 4))
  Downloading pinecone_client-2.2.2-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.1/179.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers (from -r requirements.txt (line 6))
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | done
Collecting wikipedia (from -r requirements.txt (line 9))
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l- \ done
Collecting loguru>=0.5.0 

### Import Statements

In [2]:
import json
import re
import string
import time

import pinecone
import torch
import wikipedia

from datasets import load_dataset
from numpy.random import choice
from sentence_transformers import SentenceTransformer
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
from transformers import StoppingCriteria, StoppingCriteriaList
from tqdm.auto import tqdm

from kaggle_secrets import UserSecretsClient


  from tqdm.autonotebook import tqdm
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


### Define Prompt Prefixes/Suffixes

In [3]:
# Planner prompt parts: `prefix` opens the prompt (right after the system tag) and
# `suffix` is appended after the list of tool descriptions (see Planner.generate_prompt).
PLANNER_PROMPT = {"prefix": "You are an advanced AI capable of making plans to solve complex problems. For the following tasks, make plans that can solve the problem step-by-step. For each plan, indicate which external tool together with tool input to retrieve evidence. You can store the evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...)","suffix": "Describe your plans with rich details. Each Plan should be followed by only one #E. Answer each question directly with plans."}
# Solver prompt parts: `prefix` opens the prompt; `suffix` is concatenated with the
# task text at the very end of the prompt (see Solver.run).
SOLVER_PROMPT = {"prefix": "You are an advanced AI capable of solving tasks based on evidence. Solve the following task or problem based on the provided plans and corresponding evidences. Keep your responses direct and concise.", "suffix": "Based on the provided evidence answer the following question directly and concisely: "}
# Tool name -> description shown to the planner as "<tool>[input]: <description>".
# The keys are also the tool names that Worker.run dispatches on.
TOOLS_PROMPT = {"Wikipedia": "Worker that search for similar page contents from Wikipedia. Useful when you need to get holistic knowledge about people, places, companies, historical events, or other subjects. The response are long and might contain some irrelevant information. Input should be a search query.", "LLM": "A pretrained LLM like yourself. Useful when you need to act with general world knowledge and common sense. Prioritize it when you are confident in solving the problem yourself. Input can be any instruction."}
# Extractor prompt: only a `prefix`, used as the system instruction in Extractor.__call__.
EXTRACTOR_PROMPT = {"prefix": "Based on the given statement, concisely extract the answer to the following question. Respond directly with the concise answer to the question."}


### Define classes and functions

#### Nodes

In [4]:
class Node:
    """Abstract base class for all nodes

    Both the constructor and ``run`` raise ``NotImplementedError``; concrete
    nodes are expected to override them.
    """

    def __init__(self):
        raise NotImplementedError()

    def run(self, inputs):
        """Process the given inputs; must be overridden by subclasses"""
        raise NotImplementedError()


class LLMNode(Node):
    """Node whose work is delegated to a language model wrapper"""

    def __init__(self, model):
        self.model = model
        # Copy the chat-format tags from the wrapper so subclasses can build prompts
        self.system_tag, self.user_tag, self.ai_tag = (
            model.system_tag, model.user_tag, model.ai_tag)
        # Default stop sequences: end generation at the first sentence or line break
        self.stops = ['.', '\n']

    def call_llm(self, prompt):
        """Send a prompt to the wrapped model using this node's stop sequences
        Parameters:
        ------------
        prompt: str
            Fully assembled prompt text

        Returns:
        ------------
        response: str
            Whatever ``model.generate`` returns for the prompt
        """
        return self.model.generate(prompt, self.stops)


class Planner(LLMNode):
    """Planner node for making plans within the PWS framework"""

    # An evidence variable is "#E" followed by one or more digits (#E1, #E12, ...),
    # the same pattern the Worker uses when it substitutes earlier evidences.
    _EVIDENCE_VAR = re.compile(r"#E\d+")

    def __init__(self, model):
        super().__init__(model)
        # A blank line ends the plan
        self.stops = ['\n\n']
        self.prefix = PLANNER_PROMPT['prefix']
        self.suffix = PLANNER_PROMPT['suffix']
        self.tools = TOOLS_PROMPT

    def run(self, task, examples):
        """Generate plans for the given task, examples and tools
        Parameters:
        ------------
        task: str
            Task for which the plan is to be generated
        examples: list(dict)
            Examples related to the task for the fewshot prompt

        Returns:
        ------------
        planner_response: dict(str:obj)
            'plans' (list of plan lines), 'tool_calls' (evidence variable ->
            tool call) and 'text' (the raw LLM response)
        """
        prompt = self.generate_prompt(task, examples)
        response = self.call_llm(prompt)
        plans, tool_calls = self.parse_response(response)
        planner_response = {'plans': plans, 'tool_calls': tool_calls,
                            'text': response}
        return planner_response

    def generate_prompt(self, task, examples):
        """Generates a planner prompt for the given task, examples and tools
        Parameters:
        ------------
        task: str
            Task for which the plan is to be generated
        examples: list(dict)
            Examples related to the task for the fewshot prompt; each needs
            the keys 'question', 'plan' and 'tools'

        Returns:
        ------------
        prompt: str
            planner prompt
        """
        # Only describe the tools that the examples actually use. Tool names
        # without a known description are skipped instead of raising KeyError.
        tools = {}
        for example in examples:
            for tool in example['tools']:
                if tool in self.tools:
                    tools[tool] = self.tools[tool]

        prompt = f"{self.system_tag}{self.prefix}\n"
        prompt += "Tools can be one of the following:\n"
        for tool, description in tools.items():
            prompt += f"{tool}[input]: {description}\n"
        prompt += f"{self.suffix}\n\n"
        for example in examples:
            prompt += f"{self.user_tag}{example['question'].strip()}\n\n"
            prompt += f"{self.ai_tag}{example['plan'].strip()}\n\n"
        prompt += f"{self.user_tag}{task.strip()}\n\n"
        prompt += self.ai_tag
        return prompt

    def parse_response(self, response):
        """Parse the planner response into plans and tool calls
        Parameters:
        ------------
        response: str
            Planner response

        Returns:
        ------------
        plans: list(str)
            Lines of the response that start with "Plan:"
        tool_calls: dict(str:str)
            Maps each evidence variable (e.g. "#E1") to its tool call. A line
            whose left-hand side is not a single evidence variable is stored
            with the value "No evidence found".
        """
        plans = []
        tool_calls = {}
        for line in response.splitlines():
            if line.startswith("Plan:"):
                plans.append(line)
                continue
            if not self._EVIDENCE_VAR.match(line):
                continue
            variable, separator, tool_call = line.partition("=")
            if not separator:
                # No assignment on this line: nothing to call. The Solver
                # already falls back to "No evidence found." for variables
                # that are missing from the evidences.
                continue
            variable, tool_call = variable.strip(), tool_call.strip()
            if self._EVIDENCE_VAR.fullmatch(variable):
                tool_calls[variable] = tool_call
            else:
                tool_calls[variable] = "No evidence found"
        return plans, tool_calls


class WikipediaWorker(Node):
    """Worker that searches Wikipedia"""
    def __init__(self):
        pass

    def run(self, inputs):
        """Searches Wikipedia for the given inputs and returns the content of
        the first page in the search results
        Parameters:
        ------------
        inputs: str
            String input for Wikipedia search

        Returns:
        ------------
        evidence: str
            Full text content of the top search result, or
            "No evidence found." when the query is blank, nothing matches or
            the page cannot be loaded
        """
        evidence = "No evidence found."
        # A blank query cannot match anything; skip the API round trip
        if not inputs.strip():
            return evidence

        pages = wikipedia.search(inputs, results=1)
        if pages:
            try:
                evidence = wikipedia.page(pages[0], auto_suggest=False).content
            except Exception:
                # Best effort: page errors fall back to the default evidence.
                # Catching Exception (not a bare except) keeps
                # KeyboardInterrupt/SystemExit able to stop a long run.
                pass

        return evidence


class LLMWorker(LLMNode):
    """Worker that answers a tool call by querying the LLM directly"""
    def run(self, inputs):
        """Answer a tool call with the underlying LLM
        Parameters:
        ------------
        inputs: str
            Instruction or question taken from the tool call

        Returns:
        ------------
        evidence: str
            LLM response with surrounding whitespace removed
        """
        prompt = "".join([
            f"{self.system_tag}Directly answer the following question with no extra words.\n\n",
            f"{self.user_tag}{inputs.strip()}\n\n{self.ai_tag}",
        ])
        return self.call_llm(prompt).strip()


class Worker(Node):
    """Worker node that calls appropriate workers for each tool call"""
    def __init__(self, model):
        self.wiki_worker = WikipediaWorker()
        self.llm_worker = LLMWorker(model)

    def run(self, inputs):
        """Faciliates all tool calls and returns evidences
        Parameters:
        ------------
        inputs: dict(str:str)
            A dictionary of evidence variables and associated tool calls

        Returns:
        ------------
        evidences: dict(str:str)
            A dictinary of evidence variables and the outputs of the associated
            tool calls
        """
        evidences = {}
        for e, tool_call in inputs.items():
            # Do not process tools without input
            if "[" not in tool_call:
                evidences[e] = tool_call
                continue

            # Seperate tool and tool input
            tool, tool_input = tool_call.split("[", 1)
            tool_input = tool_input[:-1]

            # Find variables in input and replace with previous evidences
            for var in re.findall(r"#E\d+", tool_input):
                if var in evidences:
                    try:
                        evidence = evidences[var]
                    except KeyError:
                        evidence = "No evidence found."
                    evidence_words = evidence.split()
                    if len(evidence_words) > 512:
                        evidence = ' '.join(evidence_words[:512])
                        evidence += '...'
                    tool_input = tool_input.replace(var, f"[{evidence}]")

            match tool:
                case "Wikipedia":
                    evidences[e] = self.wiki_worker.run(tool_input)
                case "LLM":
                    evidences[e] = self.llm_worker.run(tool_input)
                case _:
                    evidences[e] = "No evidence found."

        return evidences


class Solver(LLMNode):
    """Solver node that answers a task from its plans and gathered evidences"""
    def __init__(self, model):
        super().__init__(model)
        self.prefix = SOLVER_PROMPT['prefix']
        self.suffix = SOLVER_PROMPT['suffix']

    def run(self, task, plans, evidences):
        """Answer the task using the plans and their evidences
        Parameters:
        ------------
        task: str
            Task to be solved
        plans: list(str)
            Plans from the Planner; the i-th plan is paired with "#E<i+1>"
        evidences: dict(str:str)
            Evidence variable -> evidence text produced by the Worker

        Returns:
        ------------
        output: str
            LLM answer for the assembled prompt
        """
        question = task.strip()
        sections = [f"{self.system_tag}{self.prefix}\n\n",
                    f"{self.user_tag}{question}\n"]
        for number, plan in enumerate(plans, start=1):
            # Plans without a matching evidence get a placeholder
            evidence = evidences.get(f"#E{number}", "No evidence found.")
            words = evidence.split()
            # Keep at most 128 words of each evidence in the prompt
            if len(words) > 128:
                evidence = ' '.join(words[:128]) + '...'
            sections.append(f"{plan}\nEvidence: {evidence}\n")
        sections.append(f"{self.suffix + question}\n\n{self.ai_tag}")
        return self.call_llm(''.join(sections))

class Extractor(LLMNode):
    """LLM node that extracts a concise answer to a question from a statement"""
    def __init__(self, model):
        super().__init__(model)
        self.prefix = EXTRACTOR_PROMPT['prefix']

    def __call__(self, statement, question):
        """Ask the LLM for the answer to the question contained in the statement
        Parameters:
        ------------
        statement: str
            Text that is expected to contain the answer
        question: str
            Question to be answered from the statement

        Returns:
        ------------
        output: str
            LLM response for the extraction prompt
        """
        prompt = ''.join((
            f"{self.system_tag}{self.prefix}\n",
            f"{self.user_tag}Statement: {statement}\n",
            f"Question: {question}\n{self.ai_tag}",
        ))
        return self.call_llm(prompt)
    


#### Utils

In [5]:
class MultiTokenEOSCriteria(StoppingCriteria):
    """Stopping criterion that fires once a given text sequence has been generated.

    The sequence may span several tokens. Only the first row of ``input_ids``
    is inspected.
    """

    def __init__(self, sequence, tokenizer, initial_decoder_input_length):
        self.tokenizer = tokenizer
        self.sequence = sequence
        # Number of prompt tokens; everything after them is generated text
        self.initial_decoder_input_length = initial_decoder_input_length
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
        self.sequence_id_len = len(self.sequence_ids)

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # Ignore the prompt, then decode only as many trailing tokens as the
        # stop sequence has, instead of the whole generation
        generated_ids = input_ids[0][self.initial_decoder_input_length:]
        tail_ids = generated_ids[-self.sequence_id_len:]
        tail_text = self.tokenizer.decode(tail_ids)
        return self.sequence in tail_text


class LanguageModel:
    """Language model wrapper to be used in nodes

    Bundles a Llama tokenizer/model pair with a generation config and the
    chat tags (system/user/AI) that the nodes use to build prompts.
    """
    def __init__(self, model_path, generation_config, load_in_8bit=False, access_token=None,
                 system_tag='\n', user_tag='\n', ai_tag='\n'):
        self.tokenizer = LlamaTokenizer.from_pretrained(model_path, use_auth_token=access_token)
        self.model = LlamaForCausalLM.from_pretrained(
            model_path, torch_dtype=torch.float16, device_map='auto',
            load_in_8bit=load_in_8bit, use_auth_token=access_token)
        self.generation_config = generation_config
        self.system_tag = system_tag
        self.user_tag = user_tag
        self.ai_tag = ai_tag

    def stop_sequences_criteria(self, stop_sequences, initial_decoder_input_length):
        """Build one MultiTokenEOSCriteria per stop sequence
        Parameters:
        ------------
        stop_sequences: list(str)
            Text sequences that should end the generation
        initial_decoder_input_length: int
            Number of prompt tokens that precede the generated tokens

        Returns:
        ------------
        criteria: StoppingCriteriaList
            Stopping criteria to be passed to ``generate``
        """
        return StoppingCriteriaList(
            [
                MultiTokenEOSCriteria(sequence, self.tokenizer, initial_decoder_input_length)
                for sequence in stop_sequences
            ]
        )

    def generate(self, prompt, stops):
        """Generate text based on given prompt
        Parameters:
        ------------
        prompt: str
            Prompt for the LLM
        stops: list(str)
            Text sequences that end the generation when they are produced

        Returns:
        ------------
        output_text: str
            LLM generated response (the prompt itself is not included)
        """
        # Send the inputs to wherever this instance's model lives instead of
        # assuming a CUDA device
        input_tokens = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        input_length = input_tokens['input_ids'].shape[1]
        stopping_criteria = self.stop_sequences_criteria(stops, input_length)
        with torch.no_grad():
            # Use this instance's model and tokenizer, not a notebook global
            # that merely happens to be called `model`
            output_tokens = self.model.generate(
                **input_tokens,
                generation_config=self.generation_config,
                stopping_criteria=stopping_criteria
            )

        # Decode only the newly generated tokens
        output_text = self.tokenizer.decode(output_tokens[0][input_length:],
                                            skip_special_tokens=True)

        return output_text


class PWS:
    """Planner Worker Solver pipeline"""
    def __init__(self, model):
        self.planner = Planner(model=model)
        self.worker = Worker(model=model)
        self.solver = Solver(model=model)

    def run(self, task, examples, verbose=False):
        """Plan, gather evidence and solve a single task
        Parameters:
        ------------
        task: str
            Task to be solved
        examples: list(dict)
            Few-shot examples handed to the planner
        verbose: bool
            When True the planner and worker responses are returned as well

        Returns:
        ------------
        pws_response: dict(str:obj)
            'output' (solver answer) and 'wall_time' (seconds elapsed); with
            verbose also 'planner_response' and 'worker_response'
        """
        started = time.time()

        # Plan -> Work -> Solve
        planner_response = self.planner.run(task, examples)
        plans, tool_calls = planner_response["plans"], planner_response["tool_calls"]
        evidences = self.worker.run(tool_calls)
        output = self.solver.run(task, plans, evidences)

        elapsed = time.time() - started

        pws_response = dict(output=output, wall_time=elapsed)
        if verbose:
            pws_response.update(planner_response=planner_response,
                                worker_response=evidences)
        return pws_response


class EPS:
    """Evolutionary Prompt Selection

    Selects few-shot examples from a vector index. Two candidate pools are
    combined: the entries most similar to the task and the entries with the
    highest metadata score ("most instructive"). Examples are sampled from
    the union, weighted by similarity and score.
    """
    def __init__(self, index, embedding_model, similar_pool_size=5, instructive_pool_size=5):
        self.index = index
        index_stats = self.index.describe_index_stats()
        self.index_size = index_stats['total_vector_count']
        self.dimension = index_stats['dimension']
        self.embedding_model = embedding_model
        self.similar_pool_size = similar_pool_size
        self.instructive_pool_size = instructive_pool_size
        self.most_instructive = []
        self.set_most_instructive()

    @staticmethod
    def _score(entry):
        """Return the metadata score of an index entry"""
        return entry['metadata']['score']

    def set_most_instructive(self):
        """Rebuild the pool of highest scored entries by scanning the index

        The index is read in batches selected through the numeric metadata
        field 'id' (0 .. index_size - 1). The pool is rebuilt from scratch, so
        calling this again does not duplicate entries.
        """
        batch_size = 1000
        best = []
        for start in range(0, self.index_size, batch_size):
            # find end of batch
            stop = min(start + batch_size, self.index_size)
            # create IDs batch
            ids = list(range(start, stop))
            batch = self.index.query(self.dimension * [0],
                                     top_k=batch_size,
                                     filter={'id': {"$in": ids}},
                                     include_metadata=True)['matches']
            best = sorted(list(batch) + best, key=self._score, reverse=True)
            best = best[:self.instructive_pool_size]
        self.most_instructive = best

    def select_examples(self, task, num_examples=3):
        """Sample few-shot examples for a task
        Parameters:
        ------------
        task: str
            Task text that is embedded to query the index
        num_examples: int
            Number of examples wanted; fewer are returned when the combined
            pool is smaller

        Returns:
        ------------
        examples: list
            Distinct index matches sampled without replacement with weight
            (similarity + 1) * metadata score
        """
        task_embedding = self.embedding_model.encode(task, show_progress_bar=False).tolist()
        most_similar = self.index.query(task_embedding,
                                        top_k=self.similar_pool_size,
                                        include_metadata=True)['matches']
        instructive_ids = [entry['metadata']['id'] for entry in self.most_instructive]
        most_instructive = []
        if instructive_ids:
            # Re-query the instructive entries to get their similarity to this task
            most_instructive = self.index.query(task_embedding,
                                                top_k=self.instructive_pool_size,
                                                filter={'id': {"$in": instructive_ids}},
                                                include_metadata=True)['matches']

        # An entry can be in both pools. Merge by id so the same example is
        # never returned twice, and add up the weights so such an entry keeps
        # the selection weight it had as two separate candidates.
        pool, weights = {}, {}
        for entry in list(most_similar) + list(most_instructive):
            key = entry['id']
            pool.setdefault(key, entry)
            weights[key] = weights.get(key, 0.0) + (entry['score'] + 1.0) * self._score(entry)

        keys = list(pool)
        if not keys:
            return []
        total = sum(weights.values())
        # Fall back to uniform sampling when the weights cannot be normalized
        probabilities = [weights[key] / total for key in keys] if total > 0 else None
        sample_size = min(num_examples, len(keys))
        sample_ids = choice(len(keys), sample_size, replace=False, p=probabilities)
        examples = [pool[keys[i]] for i in sample_ids]
        return examples

    def update_score(self, entry):
        """Write the entry's score to the index and refresh the instructive pool
        Parameters:
        ------------
        entry: dict-like
            Index match with 'id' and an already updated metadata 'score'
        """
        self.index.update(id=entry['id'], set_metadata={"score": self._score(entry)})
        # Replace a stale copy of the entry, then keep only the best
        # instructive_pool_size entries. The sort is stable, so on a tie the
        # entries that were already in the pool stay ahead of the new one.
        pool = [known for known in self.most_instructive if known['id'] != entry['id']]
        pool.append(entry)
        pool.sort(key=self._score, reverse=True)
        self.most_instructive = pool[:self.instructive_pool_size]

    def upsert_entry(self, metadata):
        """Add a new example to the index with an initial score of 1
        Parameters:
        ------------
        metadata: dict
            Example metadata; must contain 'question'. The keys 'id' and
            'score' are set on the given dict.
        """
        entry_id = self.index_size
        embedding = self.embedding_model.encode(metadata['question'],
                                                show_progress_bar=False).tolist()
        metadata['id'] = entry_id
        metadata['score'] = 1
        # Write through this instance's index rather than a notebook global
        self.index.upsert([(str(entry_id), embedding, metadata)])
        self.index_size += 1


### Test the system

#### Define variables

In [6]:
# Credentials are read from Kaggle secrets so that no key is stored in the notebook
user_secrets = UserSecretsClient()
PINECONE_API_KEY = user_secrets.get_secret('PINECONE_API_KEY')
PINECONE_ENV = user_secrets.get_secret('PINECONE_ENVIRONMENT')
# Name of the Pinecone index opened in the initialization cell
INDEX_NAME = 'plans'

# Sentence-transformers model used to embed questions
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

# Alternative model configuration; it is disabled by being wrapped in a string literal
"""
MODEL_PATH = "NousResearch/Nous-Hermes-Llama2-13b"
SYSTEM_TAG = "### Instruction:\n"
USER_TAG = "### Input:\n"
AI_TAG = "### Response:\n"
"""

# Active model and the chat tags that are prepended to each prompt section
MODEL_PATH = "stabilityai/StableBeluga-13B"
SYSTEM_TAG = "### System:\n"
USER_TAG = "### User:\n"
AI_TAG = "### Assistant:\n"

LOAD_IN_8BIT = True
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

# Sampling parameters passed to GenerationConfig
TEMPERATURE = 0.5
TOP_K = 50
TOP_P = 0.9
REPETITION_PENALTY= 1.0
MAX_NEW_TOKENS = 256

# Hugging Face dataset that the questions are taken from
DATASET_NAME = "trivia_qa"

# EPS pool sizes and the number of few-shot examples sampled per question
SIMILAR_POOL_SIZE = 5
INSTRUCTIVE_POOL_SIZE = 5
NUM_EXAMPLES = 3


#### Initialize models and prepare the data

In [7]:
# Connect to Pinecone and open the index that stores the example plans
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENV
)
index = pinecone.GRPCIndex(INDEX_NAME)

# Embedding model handed to EPS for encoding questions
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

# Default sampling configuration built from the constants defined above
generation_config = GenerationConfig(
    do_sample=True,
    temperature=TEMPERATURE,
    top_k=TOP_K,
    top_p=TOP_P,
    repetition_penalty=REPETITION_PENALTY,
    max_new_tokens=MAX_NEW_TOKENS
)

# Loads the tokenizer and the model weights (see LanguageModel.__init__)
model = LanguageModel(MODEL_PATH, generation_config=generation_config,
                      load_in_8bit=LOAD_IN_8BIT, access_token=HF_TOKEN,
                      system_tag=SYSTEM_TAG, user_tag=USER_TAG, ai_tag=AI_TAG)

# 'rc.nocontext' is the dataset configuration name passed to load_dataset
dataset = load_dataset(DATASET_NAME, 'rc.nocontext')

# Example selector backed by the Pinecone index; its constructor queries the index
prompter = EPS(index, embedding_model, SIMILAR_POOL_SIZE, INSTRUCTIVE_POOL_SIZE)

# Planner/Worker/Solver pipeline; all LLM nodes share the single model above
agent = PWS(model)


Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

Downloading (…)of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading and preparing dataset trivia_qa/rc.nocontext (download: 2.48 GiB, generated: 118.84 MiB, post-processed: Unknown size, total: 2.60 GiB) to /root/.cache/huggingface/datasets/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Dataset trivia_qa downloaded and prepared to /root/.cache/huggingface/datasets/trivia_qa/rc.nocontext/1.2.0/e73c5e47a8704744fa9ded33504b35a6c098661813d1c2a09892eb9b9e9d59ae. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

#### Hyperparameter Optimization

In [8]:
# Number of training questions scored per (temperature, repetition penalty) pair
N_EVAL_INSTANCES = 10


def sanitize(text):
    """Normalize text for exact-match comparison: trim, lowercase, drop punctuation"""
    return text.strip().lower().translate(str.maketrans('', '', string.punctuation))


def is_exact_match(output, question, candidates):
    """Check an agent output against the accepted answers

    The raw output is compared first; only when it does not match is the
    Extractor LLM asked to pull a concise answer out of it.
    """
    if sanitize(output) in candidates:
        return True
    return sanitize(extractor(output, question)) in candidates


extractor = Extractor(model)
temp_values = [0.01, 0.25, 0.5, 0.75, 1.0]
rep_values = [1.0, 1.1, 1.2, 1.3]
results = []

# The search swaps model.generation_config for every grid point. Remember the
# configured one and restore it afterwards (also on interrupt or error), so
# that later cells do not silently run with the last grid point.
baseline_generation_config = model.generation_config
try:
    for temp in temp_values:
        for rep in rep_values:
            model.generation_config = GenerationConfig(
                do_sample=True,
                temperature=temp,
                top_k=TOP_K,
                top_p=TOP_P,
                repetition_penalty=rep,
                max_new_tokens=MAX_NEW_TOKENS
            )
            em = []
            # zip with a range stops after N_EVAL_INSTANCES questions, and the
            # progress bar then reflects the real amount of work
            eval_instances = zip(range(N_EVAL_INSTANCES), dataset['train'])
            for _, instance in tqdm(eval_instances, total=N_EVAL_INSTANCES):
                question = instance['question']
                list_of_candidates = [sanitize(alias) for alias in instance["answer"]["aliases"]]

                selection = prompter.select_examples(question, NUM_EXAMPLES)
                examples = [entry['metadata'] for entry in selection]
                response = agent.run(question, examples)

                em.append(is_exact_match(response['output'], question, list_of_candidates))

            print(f"Temperature: {temp}\nRepetition Penalty: {rep}\nScore: {sum(em)}\n")
            results.append({'temp':temp, 'rep':rep, 'score':sum(em)})
finally:
    model.generation_config = baseline_generation_config
    

  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.01
Repetition Penalty: 1.0
Score: 5



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.01
Repetition Penalty: 1.1
Score: 5



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.01
Repetition Penalty: 1.2
Score: 5



  0%|          | 0/138384 [00:00<?, ?it/s]



  lis = BeautifulSoup(html).find_all('li')


Temperature: 0.01
Repetition Penalty: 1.3
Score: 3



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.25
Repetition Penalty: 1.0
Score: 5



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.25
Repetition Penalty: 1.1
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.25
Repetition Penalty: 1.2
Score: 3



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.25
Repetition Penalty: 1.3
Score: 1



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.5
Repetition Penalty: 1.0
Score: 3



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.5
Repetition Penalty: 1.1
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.5
Repetition Penalty: 1.2
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.5
Repetition Penalty: 1.3
Score: 3



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.75
Repetition Penalty: 1.0
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.75
Repetition Penalty: 1.1
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.75
Repetition Penalty: 1.2
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 0.75
Repetition Penalty: 1.3
Score: 2



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 1.0
Repetition Penalty: 1.0
Score: 4



  0%|          | 0/138384 [00:00<?, ?it/s]

Temperature: 1.0
Repetition Penalty: 1.1
Score: 3



  0%|          | 0/138384 [00:00<?, ?it/s]

In [None]:
# Persist the grid-search scores as JSON
with open("results.json", "w") as results_file:
    json.dump(results, results_file)

# Echo every configuration with its score, separated by a blank line
for record in results:
    print(f"Temperature: {record['temp']}\n"
          f"Repetition Penalty: {record['rep']}\n"
          f"Score: {record['score']}\n")

### Run experiments

In [None]:
# Main experiment loop, currently disabled: the whole cell body is a string
# literal, so none of the code below is executed.
# As written it would call prompter.update_score and prompter.upsert_entry for
# every correctly answered question, and dump results every 100 instances.
# NOTE(review): `instance_counter / 100` is float division, so the batch files
# would be named like results_batch_1.0.json - confirm before re-enabling.
"""
sanitize = lambda text: text.strip().lower().translate(str.maketrans('', '', string.punctuation))
extractor = Extractor(model)
em = []
prompt_data = []
instance_counter = 0
for instance in tqdm(dataset['train']):
    if instance_counter and not instance_counter % 100:
        total_acc = sum(em) / len(em)
        last_100_acc = sum(em[-100:]) / 100
        print(f"Processed instances: {instance_counter}")
        print(f"\t{total_acc=}\t{last_100_acc=}")
        results = {'em': em[-100:], 'prompt_data': prompt_data[-100:]}
        batch_number = instance_counter / 100
        with open(f"results_batch_{batch_number}.json", "w") as f:
            json.dump(results, f)
        
    instance_counter += 1
    question = instance['question']
    list_of_candidates = [sanitize(alias) for alias in instance["answer"]["aliases"]]
    selection = prompter.select_examples(question, NUM_EXAMPLES)
    prompt_data.append([(entry['id'], entry['score']) for entry in selection])
    examples = [entry['metadata'] for entry in selection]
    response = agent.run(question, examples, verbose=True)
    answer = sanitize(response['output'])
    
    if answer not in list_of_candidates:
        extracted = sanitize(extractor(response['output'], question))
        if extracted not in list_of_candidates:
            em.append(False)
            continue  
    em.append(True)
    for entry in selection:
        entry['metadata']['score'] += 1
        prompter.update_score(entry)
    tools = set()
    for calls in response['planner_response']['tool_calls'].values():
        tool = calls.split('[', 1)[0]
        tools.add(tool)
    tools = list(tools)
    new_entry_metadata = {'question': question,
                          'plan': response['planner_response']['text'],
                          'tools': tools,
                          'dataset_name': DATASET_NAME,  
    }
    prompter.upsert_entry(new_entry_metadata)
"""
