macrocosm-os · bkb2135 · Feb 4, 2025 · Jan 15, 2025 · Jan 15, 2025 · Jan 16, 2025
diff --git a/neurons/miners/epistula_miner/miner.py b/neurons/miners/epistula_miner/miner.py
@@ -18,6 +18,7 @@
 from loguru import logger
 from starlette.background import BackgroundTask
 from starlette.responses import StreamingResponse
+from web_retrieval import get_websites_with_similarity
 
 from prompting.llms.hf_llm import ReproducibleHF
 from shared.epistula import verify_signature
@@ -60,9 +61,28 @@ async def format_openai_query(self, request: Request):
 
         return openai_request
 
+    async def stream_web_retrieval(self, body, headers):
+        async def word_stream(body, headers):
+            websites = await get_websites_with_similarity(body["messages"][0]["content"], 10, headers["target_results"])
+
+            # Simulate the OpenAI streaming response format
+            data = {"choices": [{"delta": {"content": json.dumps(websites)}, "index": 0, "finish_reason": None}]}
+            yield f"data: {json.dumps(data)}\n\n"
+            await asyncio.sleep(0.1)
+            # Indicate the end of the stream
+            data = {"choices": [{"delta": {}, "index": 0, "finish_reason": "stop"}]}
+            yield f"data: {json.dumps(data)}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(word_stream(body, headers), media_type="text/event-stream")
+
     async def create_chat_completion(self, request: Request):
+        data = await request.json()
+        headers = request.headers
         if self.llm and request.headers.get("task", None) == "inference":
             return await self.create_inference_completion(request)
+        if request.headers.get("task", None) == "WebRetrievalTask":
+            return await self.stream_web_retrieval(data, headers)
         req = self.client.build_request("POST", "chat/completions", json=await self.format_openai_query(request))
         r = await self.client.send(req, stream=True)
         return StreamingResponse(r.aiter_raw(), background=BackgroundTask(r.aclose), headers=r.headers)
@@ -71,6 +91,7 @@ async def create_inference_completion(self, request: Request):
         async def word_stream():
             inference = await self.run_inference(request)
             words = inference.split()
+            print(words)
             for word in words:
                 # Simulate the OpenAI streaming response format
                 data = {"choices": [{"delta": {"content": word + " "}, "index": 0, "finish_reason": None}]}

diff --git a/neurons/miners/epistula_miner/web_retrieval.py b/neurons/miners/epistula_miner/web_retrieval.py
@@ -0,0 +1,113 @@
+import asyncio
+from typing import Dict, List
+
+import numpy as np
+import trafilatura
+from loguru import logger
+from openai import OpenAI
+
+from prompting.base.duckduckgo_patch import PatchedDDGS
+from shared.settings import shared_settings
+
+# Import the patched DDGS and use that
+
+
+# Import the patched DDGS and use that
+
+
+async def fetch_url(url: str) -> str:
+    return trafilatura.fetch_url(url)
+
+
+async def extract_content(content: str) -> str:
+    return trafilatura.extract(content)
+
+
+def create_chunks(text: str, chunk_size: int = 500, min_length: int = 301) -> List[str]:
+    """Split text into chunks of approximately chunk_size characters."""
+    chunks = []
+    current_chunk = ""
+
+    for sentence in text.split(". "):
+        if len(current_chunk) + len(sentence) <= chunk_size:
+            current_chunk += sentence + ". "
+        else:
+            chunks.append(current_chunk.strip())
+            current_chunk = sentence + ". "
+
+    if current_chunk and current_chunk.strip():
+        chunks.append(current_chunk.strip())
+
+    return [chunk for chunk in chunks if chunk and len(chunk) > min_length]
+
+
+async def get_websites_with_similarity(
+    query: str = "What are the 5 best phones I can buy this year?", n_results: int = 5, k: int = 3
+) -> List[Dict[str, str]]:
+    """
+    Search for websites and return top K results based on embedding similarity.
+
+    Args:
+        query: Search query string
+        n_results: Number of initial results to process
+        k: Number of top similar results to return
+
+    Returns:
+        List of dictionaries containing website URLs and their best matching chunks
+    """
+    logger.debug("Getting results")
+    ddgs = PatchedDDGS(proxy=shared_settings.PROXY_URL, verify=False)
+    results = list(ddgs.text(query))
+    logger.debug(f"Got {len(results)} results")
+    urls = [r["href"] for r in results][:n_results]
+
+    # Fetch and extract content
+    content = await asyncio.gather(*[fetch_url(url) for url in urls])
+    extracted = await asyncio.gather(*[extract_content(c) for c in content])
+
+    # Create embeddings
+    client = OpenAI(api_key=shared_settings.OPENAI_API_KEY)
+    query_embedding = client.embeddings.create(model="text-embedding-ada-002", input=query).data[0].embedding
+    # Process each website
+    results_with_similarity = []
+    for url, html, text in zip(urls, content, extracted):
+        if not text:  # Skip if extraction failed
+            continue
+
+        # logger.debug(f"TEXTS: {text}")
+        chunks = create_chunks(text)
+        chunk_embeddings = client.embeddings.create(model="text-embedding-ada-002", input=chunks).data
+
+        # Find chunk with highest similarity
+        similarities = [np.dot(query_embedding, chunk.embedding) for chunk in chunk_embeddings]
+        best_chunk_idx = np.argmax(similarities)
+
+        results_with_similarity.append(
+            {
+                "website": url,
+                "best_chunk": chunks[best_chunk_idx],
+                "similarity_score": similarities[best_chunk_idx],
+                # "html": html,
+                "text": text,
+            }
+        )
+
+    # Sort by similarity score and return top K
+    top_k = sorted(results_with_similarity, key=lambda x: x["similarity_score"], reverse=True)[: int(k)]
+
+    return [
+        {
+            "url": result["website"],
+            "content": result["text"],
+            # "html": result["html"],
+            "relevant": result["best_chunk"],
+        }
+        for result in top_k
+    ]
+
+
+# await get_websites_with_similarity(
+#     "What are the 5 best phones I can buy this year?",
+#     n_results=5, # number of initial websites to get
+#     k=3 # number of top similar results to return
+# )
diff --git a/poetry.lock b/poetry.lock
diff --git a/prompting/api/scoring/api.py b/prompting/api/scoring/api.py
@@ -45,8 +45,8 @@ async def score_response(request: Request, api_key_data: dict = Depends(validate
         return
     else:
         llm_model = None
-    task = body.get("task")
-    if task == "InferenceTask":
+    task_name = body.get("task")
+    if task_name == "InferenceTask":
         logger.info(f"Received Organic InferenceTask with body: {body}")
         logger.info(f"With model of type {type(body.get('model'))}")
         organic_task = InferenceTask(
@@ -70,7 +70,7 @@ async def score_response(request: Request, api_key_data: dict = Depends(validate
             step=-1,
             task_id=str(uuid.uuid4()),
         )
-    elif task == "WebRetrievalTask":
+    elif task_name == "WebRetrievalTask":
         logger.info(f"Received Organic WebRetrievalTask with body: {body}")
         try:
             search_term = body.get("messages")[0].get("content")
@@ -90,7 +90,7 @@ async def score_response(request: Request, api_key_data: dict = Depends(validate
                 stream_results=[
                     SynapseStreamResult(accumulated_chunks=[chunk for chunk in chunks if chunk is not None])
                 ],
-                timeout=shared_settings.NEURON_TIMEOUT,  # TODO: Change this to read from the body
+                timeout=body.get("timeout", shared_settings.NEURON_TIMEOUT),
             ),
             dataset_entry=DDGDatasetEntry(search_term=search_term),
             block=shared_settings.METAGRAPH.block,

diff --git a/prompting/rewards/reward.py b/prompting/rewards/reward.py
@@ -83,7 +83,7 @@ def apply(
     ) -> WeightedRewardEvent:
         t0 = time.time()
         comparator = reference if reward_type == "reward" else challenge
-        batch_rewards_output: BatchRewardOutput = self.reward(comparator, response_event, **kwargs)
+        batch_rewards_output: BatchRewardOutput = self.reward(comparator, response_event, task=task, **kwargs)
         batch_rewards_time = time.time() - t0
 
         return WeightedRewardEvent(