macrocosm-os · bkb2135 · Jan 14, 2025 · Jan 2, 2025 · Jan 2, 2025 · Jan 2, 2025
diff --git a/.env.validator.example b/.env.validator.example
@@ -24,7 +24,9 @@ SN19_API_URL = "e.g. http://24.199.112.174:4051/"
 OPENAI_API_KEY = "your_openai_api_key_here"
 HF_TOKEN = "your_huggingface_token_here"
 
-# Scoring API.
+# Scoring API (optional).
 DEPLOY_SCORING_API = true
 SCORING_ADMIN_KEY = "123456"
 SCORING_API_PORT = 8094
+# Scoring key must match the scoring key in the .env.api.
+# SCORING_KEY="..."
diff --git a/README.md b/README.md
@@ -49,24 +49,21 @@ Subnet one utilizes the concept of "Tasks" to control the behavior of miners. Va
 ### 1. **QA (Question Answering)**
 The miner receives a question about a specific section from a Wikipedia page. The miner must then find the original context in the specified section and use it to return an accurate answer. References are generated using the validators privileged knowledge of the context, and miner complestions are scored based on similarity metrics.
 
-### 2. **Summarization**
-Similar to QA, but the miner uses the entire Wikipedia page instead of a specific section. The miner reads the whole page, summarizes it, and provides a concise answer.
-
-### 3. **DateQA**
-The miner receives a question about an event from Wikipedia. The miner must search through Wikipedia for the relevant event and return the correct answer based on the findings. References are again generated with validator's knowledge of the context, and similarity metrics are used to score miner completions.
-
-### 4. **Inference**
+### 2. **Inference**
 A question is given with some pre-seeded information and a random seed. The miner must perform an inference based on this information to provide the correct answer. Completions are scored based on similarity metrics.
 
-### 5. **MultiChoice**
+### 3. **MultiChoice**
 The miner is presented with a question from Wikipedia along with four possible answers (A, B, C, or D). The miner must search Wikipedia and return the correct answer by selecting one of the given options. Miner completions are scored by Regex matching.
 
-### 6. **Programming**
+### 4. **Programming**
 The miner receives a code snippet that is incomplete. The task is to complete the code snippet to perform its intended function. The validator generates a reference using it's internal LLM, and the miner is scored based on its similarity to this reference.
 
-### 7. **Web Retrieval**
+### 5. **Web Retrieval**
 The miner is given a question based on a random web page and must return a scraped website that contains the answer. This requires searching the web to locate the most accurate and reliable source to provide the answer. The miner is scored based on the embedding similarity between the answer it returns and the original website that the validator generated the reference from.
 
+### 6. **Multistep Reasoning**
+The miner is given a complex problem that requires multiple steps to solve. Each step builds upon the previous one, and the miner must provide intermediate results before arriving at the final answer. The validator generates a reference solution using its internal LLM, and the miner is scored based on the accuracy and coherence of the intermediate and final results.
+
 # API Documentation
 
 For detailed information on the available API endpoints, request/response formats, and usage examples, please refer to the [API Documentation](./validator_api/API_docs.md).

diff --git a/docs/SN1_validation.md b/docs/SN1_validation.md
@@ -31,7 +31,6 @@ More tooling will be included in future releases.
 # Tasks
 The validation process supports an ever-growing number of tasks. Tasks drive agent behaviour based on specific goals, such as;
 - Question answering
-- Summarization
 - Code debugging
 - Mathematics
  and more.

diff --git a/neurons/validator.py b/neurons/validator.py
@@ -22,7 +22,7 @@
 
 torch.multiprocessing.set_start_method("spawn", force=True)
 
-NEURON_SAMPLE_SIZE = 100
+NEURON_SAMPLE_SIZE = 100  # TODO: Should add this to constants.py
 
 
 def create_loop_process(task_queue, scoring_queue, reward_events):

diff --git a/poetry.lock b/poetry.lock
diff --git a/prompting/api/scoring/api.py b/prompting/api/scoring/api.py
@@ -27,29 +27,34 @@ async def score_response(request: Request, api_key_data: dict = Depends(validate
     model = None
     payload: dict[str, Any] = await request.json()
     body = payload.get("body")
-
-    try:
-        if body.get("model") is not None:
-            model = ModelZoo.get_model_by_id(body.get("model"))
-    except Exception:
-        logger.warning(
-            f"Organic request with model {body.get('model')} made but the model cannot be found in model zoo. Skipping scoring."
-        )
-        return
     uid = int(payload.get("uid"))
     chunks = payload.get("chunks")
-    llm_model = ModelZoo.get_model_by_id(model) if (model := body.get("model")) else None
+    model = body.get("model")
+    if model:
+        try:
+            llm_model = ModelZoo.get_model_by_id(model)
+        except Exception:
+            logger.warning(
+                f"Organic request with model {body.get('model')} made but the model cannot be found in model zoo. Skipping scoring."
+            )
+            return  # bail out only when the lookup failed; a resolved model must continue on to scoring
+    else:
+        llm_model = None
     task = body.get("task")
     if task == "InferenceTask":
         logger.info(f"Received Organic InferenceTask with body: {body}")
+        logger.info(f"With model of type {type(body.get('model'))}")
+        organic_task = InferenceTask(
+            messages=body.get("messages"),
+            llm_model=llm_model,
+            llm_model_id=body.get("model"),
+            seed=int(body.get("seed", 0)),
+            sampling_params=body.get("sampling_parameters", shared_settings.SAMPLING_PARAMS),
+            query=body.get("messages")[0]["content"],
+        )
+        logger.info(f"Task created: {organic_task}")
         task_scorer.add_to_queue(
-            task=InferenceTask(
-                messages=[msg["content"] for msg in body.get("messages")],
-                llm_model=llm_model,
-                llm_model_id=body.get("model"),
-                seed=int(body.get("seed", 0)),
-                sampling_params=body.get("sampling_params", {}),
-            ),
+            task=organic_task,
             response=DendriteResponseEvent(
                 uids=[uid],
                 stream_results=[

diff --git a/prompting/base/duckduckgo_patch.py b/prompting/base/duckduckgo_patch.py
@@ -1,3 +1,4 @@
+from threading import Event
 from typing import cast
 
 import httpx
@@ -13,6 +14,7 @@ def __init__(self, *args, **kwargs):
             timeout=kwargs.get("timeout", 10),
             verify=kwargs.get("verify", True),
         )
+        self._exception_event = Event()
 
     def _get_url(
         self: DDGS,

diff --git a/prompting/datasets/huggingface_github.py b/prompting/datasets/huggingface_github.py
@@ -20,6 +20,7 @@ class HuggingFaceGithubDatasetEntry(DatasetEntry):
     github_url: str
     file_path: str
     file_content: str
+    source: str | None = None
 
 
 class HuggingFaceGithubDataset(BaseDataset):
@@ -46,8 +47,9 @@ def _filter_function(self, example):
 
     def _process_entry(self, entry: dict) -> HuggingFaceGithubDatasetEntry:
         file_content = "\n".join(entry["content"].split("\n")[:MAX_LINES])
+        url = f"https://github.com/{entry['repo_name']}"
         return HuggingFaceGithubDatasetEntry(
-            github_url=f"https://github.com/{entry['repo_name']}", file_path=entry["path"], file_content=file_content
+            github_url=url, file_path=entry["path"], file_content=file_content, source=url
         )
 
     def get(self) -> HuggingFaceGithubDatasetEntry:

diff --git a/prompting/datasets/random_website.py b/prompting/datasets/random_website.py
@@ -17,21 +17,24 @@ class DDGDatasetEntry(DatasetEntry):
     search_term: str
     website_url: str = None
     website_content: str = None
+    query: str | None = None
+    source: str | None = None
 
 
 class DDGDataset(BaseDataset):
     english_words: list[str] = None
 
     def search_random_term(self, retries: int = 3) -> tuple[Optional[str], Optional[list[dict[str, str]]]]:
-        try:
-            ddg = PatchedDDGS(proxy=shared_settings.PROXY_URL, verify=False)
-            for _ in range(retries):
-                random_words = " ".join(random.sample(ENGLISH_WORDS, 5))
+        ddg = PatchedDDGS(proxy=shared_settings.PROXY_URL, verify=False)
+        for _ in range(retries):
+            random_words = " ".join(random.sample(ENGLISH_WORDS, 3))
+            try:
                 results = list(ddg.text(random_words))
                 if results:
                     return random_words, results
-        except Exception as ex:
-            logger.error(f"Failed to get search results from DuckDuckGo: {ex}")
+            except Exception as ex:
+                logger.debug(f"Failed to get search results from DuckDuckGo: {ex}")
+        logger.warning(f"Failed to get search results from DuckDuckGo after {retries} tries")
         return None, None
 
     @staticmethod
@@ -41,19 +44,21 @@ def extract_website_content(url: str) -> Optional[str]:
             extracted = trafilatura.extract(website)
             return extracted[:MAX_CHARS] if extracted else None
         except Exception as ex:
-            logger.error(f"Failed to extract content from website {url}: {ex}")
+            logger.debug(f"Failed to extract content from website {url}: {ex}")
 
     def next(self) -> Optional[DDGDatasetEntry]:
-        search_term, results = self.search_random_term(retries=3)
+        search_term, results = self.search_random_term(retries=5)
         if not results:
             return None
         website_url = results[0]["href"]
         website_content = self.extract_website_content(website_url)
         if not website_content or len(website_content) == 0:
-            logger.error(f"Failed to extract content from website {website_url}")
+            logger.debug(f"Failed to extract content from website {website_url}")
             return None
 
-        return DDGDatasetEntry(search_term=search_term, website_url=website_url, website_content=website_content)
+        return DDGDatasetEntry(
+            search_term=search_term, website_url=website_url, website_content=website_content, source=website_url
+        )
 
     def get(self) -> Optional[DDGDatasetEntry]:
         return self.next()

diff --git a/prompting/datasets/sn13.py b/prompting/datasets/sn13.py
@@ -2,14 +2,10 @@
 from typing import ClassVar
 
 import datasets
-import nltk
-from nltk.corpus import wordnet
 from pydantic import model_validator
 
 from shared.base import BaseDataset, ChatEntry
 
-nltk.download("wordnet")
-
 
 class SN13Dataset(BaseDataset):
     _url: ClassVar[str] = "arrmlet/x_dataset_218"
@@ -41,51 +37,10 @@ def sample(self) -> ChatEntry:
         if self.exception is not None:
             raise self.exception
         # Randomly select a sample from the dataset.
-        sample_idx = random.randint(0, len(self.dataset) - 1)
-        message = self.dataset[sample_idx]["text"]
-        role = ["user"]
-
-        # Augment the messages by modifying words and introducing errors.
-        messages = [self._augment_message(role, message)]
-
-        return ChatEntry(roles=role, messages=messages, organic=False, source=self._url)
-
-    def _augment_message(self, role: str, message: str) -> str:
-        if role == "assistant":
-            return message
-
-        words = message.split()
-        num_words_to_modify = random.randint(1, max(1, int(len(words) * self._chance_word_synonym)))
-        words_to_modify = random.sample(range(len(words)), num_words_to_modify)
-
-        for idx in words_to_modify:
-            synonym = self._get_synonym(words[idx])
-            if synonym:
-                words[idx] = synonym
-
-        message = " ".join(words)
-        message = self._introduce_typos(message)
-        return message
-
-    def _get_synonym(self, word: str) -> str:
-        synonyms = wordnet.synsets(word)
-        if synonyms:
-            # Choose a synonym that is not the word itself.
-            synonym_words = [lemma.name() for lemma in synonyms[0].lemmas() if lemma.name() != word]
-            if synonym_words:
-                return random.choice(synonym_words)
-        return word
-
-    def _introduce_typos(self, message: str) -> str:
-        message = list(message)
-        num_errors = random.randint(0, max(1, int(len(message) * self._chance_char_typo)))
-        for _ in range(num_errors):
-            error_type = random.choice(["remove", "add_space"])
-            error_position = random.randint(0, len(message) - 1)
-
-            if error_type == "remove":
-                message.pop(error_position)
-            elif error_type == "add_space":
-                message.insert(error_position, " ")
+        messages = []
+        for _ in range(4):
+            sample_idx = random.randint(0, len(self.dataset) - 1)
+            if message := self.dataset[sample_idx]["text"]:
+                messages.append({"role": random.choice(["user", "assistant"]), "content": message})
 
-        return "".join(message)
+        return ChatEntry(messages=messages, organic=False, source=self._url)