diff --git a/neurons/validator.py b/neurons/validator.py
index 846683d29..6f8370d2d 100644
--- a/neurons/validator.py
+++ b/neurons/validator.py
@@ -40,7 +40,7 @@ def __init__(self, config=None):
             mock=self.config.mock,
         )

-        if sum(self.config.neuron.task_p) != 1:
+        if abs(1-sum(self.config.neuron.task_p)) > 0.001:
             raise ValueError("Task probabilities do not sum to 1.")

         # Filter out tasks with 0 probability
diff --git a/prompting/task_registry.py b/prompting/task_registry.py
index 66236c900..22c1d6e4c 100644
--- a/prompting/task_registry.py
+++ b/prompting/task_registry.py
@@ -1,19 +1,21 @@
-from .tasks import Task, MockTask, SummarizationTask, QuestionAnsweringTask, DebuggingTask, MathTask, DateQuestionAnsweringTask
-from .tools import MockDataset, WikiDataset, HFCodingDataset, StackOverflowDataset, MathDataset, WikiDateDataset
+from .tasks import Task, MockTask, SummarizationTask, QuestionAnsweringTask, DebuggingTask, MathTask, DateQuestionAnsweringTask, GenericInstructionTask
+from .tools import MockDataset, WikiDataset, HFCodingDataset, StackOverflowDataset, MathDataset, WikiDateDataset, GenericInstructionDataset

 # TODO: Expand this to include extra information beyond just the task and dataset names
 mock_task, mock_dataset = MockTask.name, [MockDataset.name]
 summarization_task, summarization_dataset = SummarizationTask.name, [WikiDataset.name]
 qa_task, qa_dataset = QuestionAnsweringTask.name, [WikiDataset.name]
-debugging_task, debugging_dataset = DebuggingTask.name, [HFCodingDataset.name]
+#debugging_task, debugging_dataset = DebuggingTask.name, [HFCodingDataset.name]
 math_task, math_dataset = MathTask.name, [MathDataset.name]
 date_qa_task, date_qa_dataset = DateQuestionAnsweringTask.name, [WikiDateDataset.name]
+generic_instruction_task, generic_instruction_dataset = GenericInstructionTask.name, [GenericInstructionDataset.name]

 TASK_REGISTRY = {
     mock_task: mock_dataset,
     summarization_task: summarization_dataset,
     qa_task: qa_dataset,
-    debugging_task: debugging_dataset,
+    #debugging_task: debugging_dataset,
     math_task: math_dataset,
     date_qa_task: date_qa_dataset,
+    generic_instruction_task: generic_instruction_dataset
 }
\ No newline at end of file
diff --git a/prompting/tasks/__init__.py b/prompting/tasks/__init__.py
index fa8115a35..bc08c7fe3 100644
--- a/prompting/tasks/__init__.py
+++ b/prompting/tasks/__init__.py
@@ -13,7 +13,7 @@
     QuestionAnsweringTask.name: QuestionAnsweringTask,
     DateQuestionAnsweringTask.name: DateQuestionAnsweringTask,
     SummarizationTask.name: SummarizationTask,
-    DebuggingTask.name: DebuggingTask,
-    #GenericInstructionTask.name: GenericInstructionTask,
+    #DebuggingTask.name: DebuggingTask,
+    GenericInstructionTask.name: GenericInstructionTask,
     MathTask.name: MathTask,
 }
diff --git a/prompting/tasks/date_qa.py b/prompting/tasks/date_qa.py
index 0f13015ec..bee5776b2 100644
--- a/prompting/tasks/date_qa.py
+++ b/prompting/tasks/date_qa.py
@@ -22,7 +22,7 @@ class DateQuestionAnsweringTask(Task):
     static_reference = True
     static_query = True

-    def __init__(self, llm_pipeline, context, create_reference=True):
+    def __init__(self, llm_pipeline, context, create_reference =True):
         self.context = context

         self.query = (
diff --git a/prompting/tasks/generic_instruction.py b/prompting/tasks/generic_instruction.py
index b9aec5d20..e4274a768 100644
--- a/prompting/tasks/generic_instruction.py
+++ b/prompting/tasks/generic_instruction.py
@@ -1,141 +1,46 @@
-import re
-import bittensor as bt
-from dataclasses import dataclass
-from tenacity import retry, stop_after_attempt
 from prompting.tasks import Task
-from typing import Tuple

-CRITERIA_GENERATION_PROMPT = """\
-We are brainstorming criteria with which to grade a language model on its responses in
-diverse situations.
-A ‘criteria‘ is some useful, real-world objective, and associated rubric for scores 1-5, that
-tests a capability.
-
-Please brainstorm a new criteria and scoring rubrics.
-Be creative and create new but useful criteria that people in different settings or industries
-might find practical.
-Please format the output as same as the above examples with no extra or surrounding text.
-Write [END] after you are done.
-New Criteria:
+QUERY_PROMPT_TEMPLATE = """\
+You are a question-generating expert, focusing on delivering comprehensive and accurate questions with depth and clarity. Your response contains only the question, nothing more, nothing less. You will adhere to a word limit of 100 words.
+{context}
 """
+REFERENCE_PROMPT_TEMPLATE = """\
+Answer the following question.

-INSTRUCTION_GENERATION_PROMPT = """\
-Your job is to generate a new novel problem and a response that is related to the given score
-rubric.
-The score rubric:
-{CRITERIA}
-* Problem
-- The problem should inherently be related to the score criteria and score rubric given above.
-Specifically, the score criteria should be the core attributes required to solve the problem.
-- The problem itself should not be too generic or easy to solve.
-- If the score rubric is related to logical abilities, generate problems that require math or
-coding abilities.
-- Try to make the person who might solve the problem not notice the existence of the score
-rubric by not explicitly mentioning it, and also provide additional inputs and options if
-needed.
-- Assume a situation where a user is interacting with an AI model. The user would try to
-ask in a first-person point of view, but not using terms like ”I”, ”A User” or ”You” in the
-first sentence.
-- Do not give a role to the AI, assume that the user is asking a question from his point of
-view.
-- Do not include any phrase related to AI model in the problem.
-* Response
-- The response should be a response that would get a score of 5 from the score rubric.
-- The response should be as detailed as possible unless the score rubric is related to
-conciseness or brevity. It should consist of multiple paragraphs, a list of items, or a
-step-by-step reasoning process.
-- The response should look like how a well-prompted GPT-4 would normally answer your
-problem.
-* Format
-- DO NOT WRITE ANY GREETING MESSAGES, just write the problem and response
-only.
-- In front of the problem, append the phrase ”Problem:” and in front of the response, append
-the phrase ”Response:”.
-- Write in the order of ”Problem” - ”Response”, where the two items are separated by the
-phrase ”[NEXT]”.
-Write [END] after you are done.
-Data Generation:
-"""
+# Question:
+{query}"""

-@dataclass
 class GenericInstructionTask(Task):
+    challenge_type = 'query'
+    name = "generic"
+    desc = "get help on answering a general instruction"
+    goal = "to get the answer to the following instruction"
+    reward_definition = [
         dict(name="rouge", ngram="rouge-1", metric="f", weight=1.0),
-        dict(name="relevance", threshold=None, weight=1.0),
+    ]
+    penalty_definition = [
+        dict(name="rouge", ngram="rouge-1", metric="f", weight=0.5),
     ]

-    def __init__(self, llm_pipeline):
-        super().__init__(
-            name="generic_instruction",
-            goal="to get the answer to a instruction",
-            delimiter="```",
-            reward_types=[
-                "CRITERIA_REWARD",
-            ],
-            reward_threshold=0.5,
-            use_challenge_as_prompt=True,
-            desc="",
-            topics={},
-            topic="",
-            subtopic="",
-            challenge="",
-            reference="",
-            criteria="",
-        )
-
-        self.criteria = self.create_criteria(llm_pipeline)
-        instruction, reference = self.create_instruction_and_reference(llm_pipeline)
-        self.challenge = instruction
-        self.reference = reference
-
-    def extract_instruction_and_reference_from_text(self, text: str) -> Tuple[str, str]:
-        # Split the text into problem and response using regular expression
-        split_text = re.split(r"\nResponse:\n", text)
-
-        # Extract problem and response
-        problem = split_text[0].strip()
-        response = split_text[1].strip()
-
-        return problem, response
-
-    def create_criteria(self, llm) -> str:
-        bt.logging.debug("🎲 Creating a generic criteria-scoring rubric ...")
-
-        # Generate a score rubric with defined criterias
-        criteria_generation_response = llm(CRITERIA_GENERATION_PROMPT)
-        return criteria_generation_response
-
-    @retry(stop=stop_after_attempt(5))
-    def create_instruction_and_reference(self, llm) -> Tuple[str, str]:
-        try:
-            bt.logging.debug("📋 🎯 Creating instruction and referece text...")
+    cleaning_pipeline = [
+        dict(name="remove_quotes"),
+        dict(name="prune_ending"),
+        dict(name="remove_roles"),
+    ]

-            if not self.criteria:
-                raise ValueError(
-                    "Criteria must be defined before creating a generic instruction."
-                )
+    def __init__(self, llm_pipeline, context, create_reference=True):
+        self.context = context

-            # Create generic instruction with the score rubric
-            instruction_generation_prompt_with_criteria = (
-                INSTRUCTION_GENERATION_PROMPT.format(CRITERIA=self.criteria)
-            )
-            instruction_generation_response = llm(
-                instruction_generation_prompt_with_criteria
-            )
+        self.query_prompt = QUERY_PROMPT_TEMPLATE.format(context=context.content)
+        self.query = self.generate_query(llm_pipeline)

-            # Extract generic instruction and reference response from the generated text
-            (
-                instruction,
-                reference,
-            ) = self.extract_instruction_and_reference_from_text(
-                instruction_generation_response
-            )
+        self.reference_prompt = REFERENCE_PROMPT_TEMPLATE.format(query = self.query)
+        if create_reference:
+            self.reference = self.generate_reference(llm_pipeline)

-            return instruction, reference
-        except Exception as e:
-            bt.logging.error(
-                f"Failed to create instruction and reference text: {e}, retrying..."
-            )
-            raise e
+        self.topic = context.title
+        self.subtopic = context.topic
+        self.tags = context.tags
\ No newline at end of file
diff --git a/prompting/tasks/summarization.py b/prompting/tasks/summarization.py
index 530a69942..8e413f826 100644
--- a/prompting/tasks/summarization.py
+++ b/prompting/tasks/summarization.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass
 from prompting.tasks import Task
-from transformers import Pipeline
+


 # TODO: introduce criteria for the query and reference answer (length, layout, etc.) and make these arguments
@@ -39,7 +39,7 @@ class SummarizationTask(Task):

     static_query = True

-    def __init__(self, llm_pipeline: Pipeline, context: str, create_reference=True):
+    def __init__(self, llm_pipeline, context, create_reference=True):
         self.context = context

         # Query is just the article title and section name
diff --git a/prompting/tools/__init__.py b/prompting/tools/__init__.py
index 6d7e5b417..e9ef44df4 100644
--- a/prompting/tools/__init__.py
+++ b/prompting/tools/__init__.py
@@ -7,16 +7,18 @@
     StackOverflowDataset,
     WikiDateDataset,
     MathDataset,
+    GenericInstructionDataset,
 )
 from .selector import Selector

 DATASETS = {
-    "mock": MockDataset,
-    "hf_coding": HFCodingDataset,
-    "wiki": WikiDataset,
-    #"stack_overflow": StackOverflowDataset,
-    "wiki_date": WikiDateDataset,
-    "math": MathDataset,
+    MockDataset.name: MockDataset,
+    #HFCodingDataset.name: HFCodingDataset,
+    WikiDataset.name: WikiDataset,
+    #StackOverflowDataset.name: StackOverflowDataset,
+    MathDataset.name: MathDataset,
+    WikiDateDataset.name: WikiDateDataset,
+    GenericInstructionDataset.name: GenericInstructionDataset,
 }
diff --git a/prompting/tools/datasets/__init__.py b/prompting/tools/datasets/__init__.py
index 66c9e4de9..948fead7d 100644
--- a/prompting/tools/datasets/__init__.py
+++ b/prompting/tools/datasets/__init__.py
@@ -4,3 +4,4 @@
 from .math import MathDataset
 from .mock import MockDataset
 from .wiki import WikiDataset, WikiDateDataset
+from .generic_instruction import GenericInstructionDataset
\ No newline at end of file
diff --git a/prompting/tools/datasets/base.py b/prompting/tools/datasets/base.py
index 55cce07fe..a2130e929 100644
--- a/prompting/tools/datasets/base.py
+++ b/prompting/tools/datasets/base.py
@@ -17,6 +17,8 @@
 # DEALINGS IN THE SOFTWARE.

 import time
+import random
+import functools
 from abc import ABC, abstractmethod
 from typing import Dict
 import bittensor as bt
@@ -28,7 +30,7 @@ class Dataset(ABC):
     """Base class for datasets."""

-    name = "dataset"
+    max_tries: int = 10

     @abstractmethod
@@ -74,11 +76,51 @@ def next(
                 f"Could not find any samples which meet {self.__class__.__name__} requirements after {tries} tries."
) + info["source"] = self.__class__.__name__ info["stats"] = { - "creator": self.__class__.__name__, "fetch_time": time.time() - t0, "num_tries": tries, "fetch_method": method, "next_kwargs": kwargs, } return Context(**info) + + +class TemplateDataset(Dataset): + """Base class for datasets based on a template.""" + + @property + def size(self): + return functools.reduce( + lambda x, y: x * y, [len(v) for v in self.params.values()], 1 + ) + + def __repr__(self): + return f"{self.__class__.__name__} with template: {self.query_template!r} and {self.size} possible phrases" + + def get(self, params: dict): + content = self.query_template.format(**params) + keys, values = list(zip(*params.items())) + + return { + "title": params.get( + "title", keys[0] + ), # Use the first key as the title if no field called title is present + "topic": params.get("topic", keys[min(1, len(keys) - 1)]), # Same for topic + "subtopic": params.get( + "subtopic", keys[min(2, len(keys) - 2)] + ), # Same for subtopic + "content": content, # content + "internal_links": values, # internal links + "external_links": values, # external links + "tags": values, # tags + "extra": {}, + } + + def random(self, selector: Selector = None): + selected = {k: selector(v) for k, v in self.params.items()} + return self.get(selected) + + def search(self, params: dict, selector: Selector = None): + selected = {k: params.get(k, selector(v)) for k, v in self.params.items()} + return self.get(selected) \ No newline at end of file diff --git a/prompting/tools/datasets/generic_instruction.py b/prompting/tools/datasets/generic_instruction.py new file mode 100644 index 000000000..b4d1e1d21 --- /dev/null +++ b/prompting/tools/datasets/generic_instruction.py @@ -0,0 +1,52 @@ +from .base import TemplateDataset + + +class GenericInstructionDataset(TemplateDataset): + "Generic question dataset, which creates LLM prompts for asking questions." 
+ name = "generic_instruction" + query_template = ( + "Ask a {style} question about a {theme} {subtopic} related to {topic}" + ) + params = dict( + style=[ + "casual", + "basic", + "silly", + "random", + "thoughtful", + "detailed", + "deep", + "fun", + ], + theme=[ + "surprising", + "controvesial", + "historic", + "modern", + "famous", + "imfamous", + "popular", + "unpopular", + ], + subtopic=[ + "person", + "figure", + "opinion", + "event", + "leader", + "spokesperson", + "expert", + "topic", + ], + topic=[ + "science", + "politics", + "parenting", + "travel", + "cuisine", + "sports", + "pop culture", + "tech", + "history", + ], + ) \ No newline at end of file diff --git a/prompting/utils/config.py b/prompting/utils/config.py index f21b89a90..d52208193 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -286,7 +286,7 @@ def add_validator_args(cls, parser): type=float, nargs="+", help="The probability of sampling each task.", - default=[.25, .25, .25, 0, .25], + default=[1.0 / (len(TASKS)-1)] * (len(TASKS)-1), ) parser.add_argument( diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 907607f3e..ae352debe 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -61,7 +61,7 @@ def test_context_field_is_not_null(dataset: Dataset, field: str): @pytest.mark.parametrize("dataset", DATASETS) @pytest.mark.parametrize( - "field", ("creator", "fetch_time", "num_tries", "fetch_method", "next_kwargs") + "field", ("fetch_time", "num_tries", "fetch_method", "next_kwargs") ) def test_context_stats_field_contains_expected_keys(dataset: Dataset, field: str): assert field in CONTEXTS[dataset].stats