10,959 changes: 10,959 additions & 0 deletions past_websites.csv

Large diffs are not rendered by default.

322 changes: 320 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion prompting/llms/apis/llm_wrapper.py
@@ -20,7 +20,11 @@ def chat_complete(
logprobs=True,
) -> str:
response: str | None = None
if "gpt" not in model.lower() and shared_settings.SN19_API_KEY and shared_settings.SN19_API_URL:
if (
shared_settings.SN19_API_KEY
and shared_settings.SN19_API_URL
and (model is None or "gpt" not in model.lower())
):
try:
response = chat_complete(
messages=messages,
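The reordered guard checks the SN19 credentials first and only calls `model.lower()` when `model` is not None, so a missing model no longer raises AttributeError the way the old one-line condition could. A minimal sketch of the guard in isolation (illustrative helper and names, not the module's real settings object):

# Minimal, self-contained sketch of the None-safe routing guard (illustrative names).
def should_route_to_sn19(model: str | None, api_key: str | None, api_url: str | None) -> bool:
    # Credentials first; `model.lower()` is only reached when model is not None.
    return bool(api_key and api_url and (model is None or "gpt" not in model.lower()))

assert should_route_to_sn19(None, "key", "url") is True       # the old one-liner raised AttributeError here
assert should_route_to_sn19("gpt-4o", "key", "url") is False  # GPT models skip the SN19 route
assert should_route_to_sn19("llama-3", "key", "url") is True
assert should_route_to_sn19("llama-3", None, "url") is False  # missing credentials disable the route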
8 changes: 7 additions & 1 deletion prompting/rewards/scoring.py
@@ -70,7 +70,7 @@ async def run_step(self) -> RewardLoggingEvent:
scoring_config: ScoringConfig = scorable.pop(0)

# here we generate the actual reference
scoring_config.task.make_reference(
await scoring_config.task.make_reference(
dataset_entry=scoring_config.dataset_entry,
)

@@ -84,6 +84,12 @@ async def run_step(self) -> RewardLoggingEvent:
task=scoring_config.task,
)
self.reward_events.append(reward_events)

# TODO: Remove this once we have a better way to handle organic tasks
if scoring_config.task.organic:
self.reward_events.append(
reward_events
) # Add the organic reward event a second time, doubling the weight of organic tasks
logger.debug(
f"Scored {scoring_config.task.__class__.__name__} {scoring_config.task.task_id} with model "
f"{scoring_config.task.llm_model_id}"
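Appending the same reward event a second time doubles an organic task's contribution under any sum- or mean-over-events aggregation downstream. A toy illustration of that effect, assuming a simple sum-based aggregator (an assumption for illustration, not the repo's actual reward pipeline):

# Toy sketch: duplicating an event doubles its weight under sum aggregation.
from dataclasses import dataclass

@dataclass
class RewardEvent:
    task_id: str
    reward: float
    organic: bool

def aggregate(events: list[RewardEvent]) -> dict[str, float]:
    totals: dict[str, float] = {}
    for event in events:
        totals[event.task_id] = totals.get(event.task_id, 0.0) + event.reward
    return totals

events = [RewardEvent("synthetic-1", 0.5, organic=False), RewardEvent("organic-1", 0.5, organic=True)]
for event in list(events):
    if event.organic:
        events.append(event)  # mirrors the PR's double-append

print(aggregate(events))  # {'synthetic-1': 0.5, 'organic-1': 1.0}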
20 changes: 17 additions & 3 deletions prompting/rewards/web_retrieval.py
@@ -27,6 +27,12 @@

# Define blacklisted terms
BLACKLISTED_TERMS = {
"howtogeek",
"docs.google.com",
"?q=",
"/search",
"sheets.google.com",
"drive.google.com",
"pastebin",
"paste",
"gist",
@@ -102,22 +108,30 @@ def score_website_result(
if netloc.startswith("www."):
netloc = netloc[4:]

# Penalise a completion where the relevant section is contained in the URL (e.g. miners
# trying to use a search box to enter exactly the relevant section they need)
discount_factor = 1 - fuzz.token_sort_ratio(response_url, response_relevant) / 100
# Check if URL is IP-based or has port
if not response_url or len(response_url) > 500:
logger.debug(f"URL {response_url} is too long, setting discount factor to 0")
return 0
if not netloc or any(c.isdigit() for c in netloc.split(".")) or ":" in netloc:
discount_factor = 0
logger.debug(f"URL {response_url} appears to be IP-based or on specific port, setting discount factor to 0")
return 0
else:
domain = netloc

domain_count = np.sum(np.array([domain == d for d in past_websites[uid]])) + 1

# If domain is in top 100k, don't apply penalty
if domain in TOP_DOMAINS:
discount_factor = 1.0
# if the domain is in the top 100k, we allow 10 occurrences in the last 200 URLs before penalising
discount_factor *= 1.0 / (max(1, domain_count - 10))
logger.debug(f"Domain {domain} is in top 100k domains, not applying penalty")
else:
# Count how many times this domain has been used by this miner
domain_count = np.sum(np.array([domain == d for d in past_websites[uid]])) + 1
discount_factor = 1.0 / domain_count
discount_factor *= 1.0 / max(1, domain_count)
if domain in past_websites[uid]:
logger.debug(
f"Already used domain {domain} for this UID, applying ( discount ) factor {discount_factor}"
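Taken together, the web-retrieval changes reject empty, over-long, or IP/port-based URLs outright and soften the old top-100k exemption: a top domain now gets 10 free repeats in the recent-history window before the 1/count penalty applies, while other domains are penalised from the second occurrence. A condensed sketch of just the repeat-domain discount (simplified from the diff; the real score also multiplies in the fuzz-based URL/relevance factor):

# Condensed sketch of the repeat-domain discount from the diff (simplified).
def domain_discount(domain: str, past_domains: list[str], top_domains: set[str]) -> float:
    count = sum(d == domain for d in past_domains) + 1  # include the current hit
    if domain in top_domains:
        # top-100k domains: 10 free occurrences before penalising
        return 1.0 / max(1, count - 10)
    return 1.0 / max(1, count)

assert domain_discount("wikipedia.org", ["wikipedia.org"] * 5, {"wikipedia.org"}) == 1.0  # within allowance
assert domain_discount("tinyblog.net", ["tinyblog.net"] * 3, {"wikipedia.org"}) == 0.25   # 1/4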
4 changes: 2 additions & 2 deletions prompting/tasks/base_task.py
@@ -41,7 +41,7 @@ def make_query(self, **kwargs):
raise NotImplementedError("Method make_query must be implemented")

@abstractmethod
def make_reference(self, **kwargs):
async def make_reference(self, **kwargs):
raise NotImplementedError("Method make_reference must be implemented")


@@ -75,7 +75,7 @@ def get_model_id_and_seed(self) -> "BaseTextTask":
def make_query(self, dataset_entry: DatasetEntry, **kwargs) -> str:
return self.query

def make_reference(self, dataset_entry: DatasetEntry) -> str:
async def make_reference(self, dataset_entry: DatasetEntry) -> str:
return self.reference

def generate_reference(self, messages: list[str]) -> str:
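With `make_reference` now declared `async` on the base class, every subclass override and every call site changes together: overrides become coroutines even when their bodies stay synchronous, and callers must `await` them (as `scoring.py` now does). A minimal sketch of the migration pattern, using an invented toy task rather than the repo's classes:

# Minimal sketch of the sync -> async make_reference migration (toy task, not from the repo).
import asyncio
from abc import ABC, abstractmethod

class BaseTask(ABC):
    @abstractmethod
    async def make_reference(self, **kwargs):
        raise NotImplementedError("Method make_reference must be implemented")

class EchoTask(BaseTask):
    async def make_reference(self, dataset_entry: str) -> str:
        # A synchronous body is fine; the coroutine signature keeps call sites uniform.
        return dataset_entry

async def main():
    reference = await EchoTask().make_reference(dataset_entry="expected answer")
    print(reference)

asyncio.run(main())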
2 changes: 1 addition & 1 deletion prompting/tasks/inference.py
@@ -75,7 +75,7 @@ def make_query(self, dataset_entry: ChatEntry) -> str:

return self.query

def make_reference(self, dataset_entry: ChatEntry) -> str:
async def make_reference(self, dataset_entry: ChatEntry) -> str:
self.reference = model_manager.generate(
messages=self.messages,
model=self.llm_model,
2 changes: 1 addition & 1 deletion prompting/tasks/multi_choice.py
@@ -126,7 +126,7 @@ def post_process_qa(self, query: str) -> str:
new_query = "?".join(query.split("?")[:2]) + "?\n" + options_string
return new_query

def make_reference(self, dataset_entry: Context) -> str:
async def make_reference(self, dataset_entry: Context) -> str:
return self.reference

def extract_query_and_reference(self, query_with_choices: str) -> tuple[str, str]:
202 changes: 40 additions & 162 deletions prompting/tasks/multi_step_reasoning.py
@@ -1,168 +1,19 @@
import json
import re
import time
import random
from typing import ClassVar

from loguru import logger

from prompting.llms.apis.gpt_wrapper import LLMMessage, LLMMessages
from prompting.llms.apis.llm_wrapper import LLMWrapper
from prompting.datasets.random_website import DDGDatasetEntry
from prompting.rewards.relevance import RelevanceRewardModel
from prompting.rewards.reward import BaseRewardConfig, BaseRewardModel
from prompting.tasks.qa import WikiQuestionAnsweringTask
from shared.base import Context
from shared.timer import Timer
from validator_api.test_time_inference import generate_response

MAX_THINKING_STEPS = 10


def parse_multiple_json(api_response):
"""
Parses a string containing multiple JSON objects and returns a list of dictionaries.

Args:
api_response (str): The string returned by the API containing JSON objects.

Returns:
list: A list of dictionaries parsed from the JSON objects.
"""
# Regular expression pattern to match individual JSON objects
json_pattern = re.compile(r"\{.*?\}", re.DOTALL)

# Find all JSON object strings in the response
json_strings = json_pattern.findall(api_response)

parsed_objects = []
for json_str in json_strings:
try:
# Replace escaped single quotes with actual single quotes
json_str_clean = json_str.replace("\\'", "'")

# Parse the JSON string into a dictionary
obj = json.loads(json_str_clean)
parsed_objects.append(obj)
except json.JSONDecodeError as e:
print(f"Failed to parse JSON object: {e}")
continue

return parsed_objects


def make_api_call(messages, max_tokens, is_final_answer=False):
# TOOD: Make this use local model to prevent relay mining
for attempt in range(3):
try:
response = LLMWrapper.chat_complete(messages=LLMMessages(*messages))
response_dict = parse_multiple_json(response)[0]
return response_dict
except Exception as e:
if attempt == 2:
logger.debug(f"ERROR GENERATING ANSWER. RESPONSE DICT: {response_dict}")
if is_final_answer:
return {
"title": "Error",
"content": f"Failed to generate final answer after 3 attempts. Error: {str(e)}",
}
else:
return {
"title": "Error",
"content": f"Failed to generate step after 3 attempts. Error: {str(e)}",
"next_action": "final_answer",
}
time.sleep(1) # Wait for 1 second before retrying


def generate_response(prompt):
messages = [
LLMMessage(
role="system",
content="""You are an expert AI assistant with advanced reasoning capabilities. Your task is to provide detailed, step-by-step explanations of your thought process. For each step:

1. Provide a clear, concise title describing the current reasoning phase.
2. Elaborate on your thought process in the content section.
3. Decide whether to continue reasoning or provide a final answer.

Response Format:
Use JSON with keys: 'title', 'content', 'next_action' (values: 'continue' or 'final_answer')

Key Instructions:
- Employ at least 5 distinct reasoning steps.
- Acknowledge your limitations as an AI and explicitly state what you can and cannot do.
- Actively explore and evaluate alternative answers or approaches.
- Critically assess your own reasoning; identify potential flaws or biases.
- When re-examining, employ a fundamentally different approach or perspective.
- Utilize at least 3 diverse methods to derive or verify your answer.
- Incorporate relevant domain knowledge and best practices in your reasoning.
- Quantify certainty levels for each step and the final conclusion when applicable.
- Consider potential edge cases or exceptions to your reasoning.
- Provide clear justifications for eliminating alternative hypotheses.
- Output only one step at a time to ensure a detailed and coherent explanation.


Example of a valid JSON response:
```json
{
"title": "Initial Problem Analysis",
"content": "To approach this problem effectively, I'll first break down the given information into key components. This involves identifying...[detailed explanation]... By structuring the problem this way, we can systematically address each aspect.",
"next_action": "continue"
}```
""",
)
]
messages += [LLMMessage(role="user", content=prompt)]
messages += [
LLMMessage(
role="assistant",
content="Thank you! I will now think step by step following my instructions, starting at the beginning after decomposing the problem.",
)
]

steps = []
step_count = 1
total_thinking_time = 0

for _ in range(MAX_THINKING_STEPS):
with Timer() as timer:
step_data = make_api_call(messages, 300)
thinking_time = timer.final_time
total_thinking_time += thinking_time

steps.append((f"Step {step_count}: {step_data['title']}", step_data["content"], thinking_time))

messages.append(LLMMessage(role="assistant", content=json.dumps(step_data)))

if step_data["next_action"] == "final_answer" or not step_data.get("next_action"):
break

step_count += 1

# Yield after each step
yield steps, None

# Generate final answer
messages.append(
LLMMessage(
role="user",
content="Please provide the final answer based on your reasoning above. You must return your answer in a valid json.",
)
)

start_time = time.time()
final_data = make_api_call(messages, 200, is_final_answer=True)
end_time = time.time()
thinking_time = end_time - start_time
total_thinking_time += thinking_time

if final_data["title"] == "Error":
steps.append(("Error", final_data["content"], thinking_time))
raise ValueError("Failed to generate final answer: {final_data['content']}")

steps.append(("Final Answer", final_data["content"], thinking_time))

yield steps, total_thinking_time


def execute_multi_step_reasoning(user_query):
def execute_multi_step_reasoning(user_query: str):
for steps, total_thinking_time in generate_response(user_query):
if total_thinking_time is not None:
logger.info(f"**Total thinking time: {total_thinking_time:.2f} seconds**")
@@ -201,7 +52,7 @@ class MultiStepReasoningRewardConfig(BaseRewardConfig):
- Obvious or straightforward calculations
- Questions that don't require analysis

Remember: The goal is to create questions where the context and parameters are revealed progressively, requiring the reader to integrate information across multiple sentences to fully understand and solve the problem.
Remember: The goal is to create questions where the context and parameters are revealed progressively, requiring the reader to integrate information across multiple sentences to fully understand and solve the problem. Make sure that the question is spread over at least 3 sentences.
"""

QUERY_PROMPT_TEMPLATE = """\
@@ -210,9 +61,15 @@ class MultiStepReasoningRewardConfig(BaseRewardConfig):
#Context:
{context}

You must ask a question that can be answered by the context.
Remember the question must encourage logical thinking and reasoning and must be spread over at least 3 sentences.
"""

SAMPLE_SYSTEM_PROMPTS = [
"""You are an LLM specialising in reasoning and solving complex questions. You will be given a chat interaction with a user and must answer appropriately.""",
"""You are a step-by-step problem solver. When given a complex question, you break it down into clear logical steps, showing your work and explaining your reasoning at each stage. You maintain a methodical approach to ensure accuracy.""",
"""You are an expert at mathematical and analytical reasoning. You excel at carefully parsing multi-part problems, identifying key information, and systematically working through solutions while clearly documenting your thought process.""",
]


class MultiStepReasoningTask(WikiQuestionAnsweringTask):
"""QuestionAnsweringTasks must be initialised with an LLM pipeline to generate query and reference plus
@@ -221,16 +78,37 @@ class MultiStepReasoningTask(WikiQuestionAnsweringTask):
name: ClassVar[str] = "multi_step_reasoning"
augmentation_system_prompt: ClassVar[str] = ""
query: str | None = None
query_system_prompt: str = QUERY_SYSTEM_PROMPT
reference: str | None = None

def make_query(self, dataset_entry: Context):
query_prompt = QUERY_PROMPT_TEMPLATE.format(context=dataset_entry.content)
question = self.generate_query(messages=[QUERY_SYSTEM_PROMPT, query_prompt])
def make_query(self, dataset_entry: DDGDatasetEntry):
query_prompt = QUERY_PROMPT_TEMPLATE.format(context=dataset_entry.website_content)
question = self.generate_query(messages=[query_prompt])
msgs = [p + ". " if i < len(question.split(". ")) - 1 else p for i, p in enumerate(question.split(". ")) if p]
self.messages = [{"role": "user", "content": msg} for msg in msgs]
self.messages = [{"role": "system", "content": random.choice(SAMPLE_SYSTEM_PROMPTS)}] + [
{"role": random.choice(["user", "assistant"]), "content": msg} for msg in msgs
]
return self.query

def make_reference(self, dataset_entry: Context):
steps, total_thinking_time = execute_multi_step_reasoning(user_query=self.query)
self.reference = steps[-1][1]
async def _async_generate_reference(self):
async for steps, total_thinking_time in generate_response(
self.messages, model=self.llm_model_id, use_miners=False
):
logger.debug(f"Step generated in reference of MSR: {steps}")
if total_thinking_time is not None:
logger.debug(f"**Total thinking time: {total_thinking_time:.2f} seconds**")
return steps[-1][1]

async def make_reference(self, dataset_entry: Context):
try:
logger.debug(f"Generating reference for MSR: {self.messages}")
# Await the async reference generation
self.reference = await self._async_generate_reference()
logger.debug(f"Generated reference for MSR: {self.reference}")
except Exception as e:
logger.error(f"Error getting final answer for MSR: {e}")
self.reference = None
if self.reference is None:
logger.error("No reference found for MSR")
return None
return self.reference
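The reworked `make_query` turns the generated question into a short multi-turn conversation: it splits the question on sentence boundaries, restores the ". " separator to all but the last piece, prepends a randomly chosen system prompt, and assigns each sentence a random user/assistant role. A sketch of just that splitting step (the sample question and system prompt are made up):

# Isolated sketch of the sentence-splitting used to build the multi-turn messages.
import random

question = "A train leaves the station at 9am. It travels at 60 km/h for 150 km. When does it arrive?"
parts = question.split(". ")
msgs = [p + ". " if i < len(parts) - 1 else p for i, p in enumerate(parts) if p]

messages = [{"role": "system", "content": "You are a step-by-step problem solver."}] + [
    {"role": random.choice(["user", "assistant"]), "content": msg} for msg in msgs
]
print(messages)  # a system prompt followed by three sentence-sized turns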
4 changes: 2 additions & 2 deletions prompting/tasks/qa.py
@@ -63,7 +63,7 @@ def make_query(self, dataset_entry: Context):
self.query = self.generate_query(messages=[query_prompt])
return self.query

def make_reference(self, dataset_entry: Context):
async def make_reference(self, dataset_entry: Context):
reference_prompt = REFERENCE_PROMPT_TEMPLATE.format(context=dataset_entry.content, question=self.query)
self.reference = self.generate_reference(messages=[{"role": "user", "content": reference_prompt}])
return self.reference
@@ -85,7 +85,7 @@ def make_query(self, dataset_entry: Context):
self.query = self.generate_query(messages=[query_prompt])
return self.query

def make_reference(self, dataset_entry: Context):
async def make_reference(self, dataset_entry: Context):
reference_prompt = REFERENCE_PROMPT_TEMPLATE.format(context=dataset_entry.website_content, question=self.query)
self.reference = self.generate_reference(messages=[{"role": "user", "content": reference_prompt}])
return self.reference
1 change: 1 addition & 0 deletions prompting/tasks/task_creation.py
@@ -62,6 +62,7 @@ async def run_step(self):
if not task.query:
logger.debug(f"Generating query for task: {task.__class__.__name__}.")
task.make_query(dataset_entry=dataset_entry)
logger.debug(f"Generated Messages: {task.task_messages}")

logger.debug(f"Appending task: {task.__class__.__name__} to task queue.")
self.task_queue.append(task)