10,959 changes: 10,959 additions & 0 deletions past_websites.csv

Large diffs are not rendered by default.

322 changes: 320 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion prompting/llms/apis/llm_wrapper.py
@@ -20,7 +20,11 @@ def chat_complete(
logprobs=True,
) -> str:
response: str | None = None
if "gpt" not in model.lower() and shared_settings.SN19_API_KEY and shared_settings.SN19_API_URL:
if (
shared_settings.SN19_API_KEY
and shared_settings.SN19_API_URL
and (model is None or "gpt" not in model.lower())
):
try:
response = chat_complete(
messages=messages,
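The reordered guard checks the SN19 credentials first and only calls `model.lower()` when `model` is not None, so a missing model no longer raises AttributeError the way the old one-line condition could. A minimal sketch of the guard in isolation (illustrative helper and names, not the module's real settings object):

# Minimal, self-contained sketch of the None-safe routing guard (illustrative names).
def should_route_to_sn19(model: str | None, api_key: str | None, api_url: str | None) -> bool:
    # Credentials first; `model.lower()` is only reached when model is not None.
    return bool(api_key and api_url and (model is None or "gpt" not in model.lower()))

assert should_route_to_sn19(None, "key", "url") is True       # the old one-liner raised AttributeError here
assert should_route_to_sn19("gpt-4o", "key", "url") is False  # GPT models skip the SN19 route
assert should_route_to_sn19("llama-3", "key", "url") is True
assert should_route_to_sn19("llama-3", None, "url") is False  # missing credentials disable the route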
8 changes: 7 additions & 1 deletion prompting/rewards/scoring.py
@@ -70,7 +70,7 @@ async def run_step(self) -> RewardLoggingEvent:
scoring_config: ScoringConfig = scorable.pop(0)

# here we generate the actual reference
scoring_config.task.make_reference(
await scoring_config.task.make_reference(
dataset_entry=scoring_config.dataset_entry,
)

@@ -84,6 +84,12 @@ async def run_step(self) -> RewardLoggingEvent:
task=scoring_config.task,
)
self.reward_events.append(reward_events)

# TODO: Remove this once we have a better way to handle organic tasks
if scoring_config.task.organic:
self.reward_events.append(
reward_events
) # Add the organic reward event a second time, doubling the weight of organic tasks
logger.debug(
f"Scored {scoring_config.task.__class__.__name__} {scoring_config.task.task_id} with model "
f"{scoring_config.task.llm_model_id}"
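Appending the same reward event a second time doubles an organic task's contribution under any sum- or mean-over-events aggregation downstream. A toy illustration of that effect, assuming a simple sum-based aggregator (an assumption for illustration, not the repo's actual reward pipeline):

# Toy sketch: duplicating an event doubles its weight under sum aggregation.
from dataclasses import dataclass

@dataclass
class RewardEvent:
    task_id: str
    reward: float
    organic: bool

def aggregate(events: list[RewardEvent]) -> dict[str, float]:
    totals: dict[str, float] = {}
    for event in events:
        totals[event.task_id] = totals.get(event.task_id, 0.0) + event.reward
    return totals

events = [RewardEvent("synthetic-1", 0.5, organic=False), RewardEvent("organic-1", 0.5, organic=True)]
for event in list(events):
    if event.organic:
        events.append(event)  # mirrors the PR's double-append

print(aggregate(events))  # {'synthetic-1': 0.5, 'organic-1': 1.0}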
20 changes: 17 additions & 3 deletions prompting/rewards/web_retrieval.py
@@ -27,6 +27,12 @@

# Define blacklisted terms
BLACKLISTED_TERMS = {
"howtogeek",
"docs.google.com",
"?q=",
"/search",
"sheets.google.com",
"drive.google.com",
"pastebin",
"paste",
"gist",
@@ -102,22 +108,30 @@ def score_website_result(
if netloc.startswith("www."):
netloc = netloc[4:]

# Penalise a completion where the relevant section is contained in the URL (e.g. miners
# trying to use a search box to enter exactly the relevant section they need)
discount_factor = 1 - fuzz.token_sort_ratio(response_url, response_relevant) / 100
# Check if URL is IP-based or has port
if not response_url or len(response_url) > 500:
logger.debug(f"URL {response_url} is too long, setting discount factor to 0")
return 0
if not netloc or any(c.isdigit() for c in netloc.split(".")) or ":" in netloc:
discount_factor = 0
logger.debug(f"URL {response_url} appears to be IP-based or on specific port, setting discount factor to 0")
return 0
else:
domain = netloc

domain_count = np.sum(np.array([domain == d for d in past_websites[uid]])) + 1

# If domain is in top 100k, don't apply penalty
if domain in TOP_DOMAINS:
discount_factor = 1.0
# if the domain is in the top 100k, we allow 10 occurrences in the last 200 URLs before penalising
discount_factor *= 1.0 / (max(1, domain_count - 10))
logger.debug(f"Domain {domain} is in top 100k domains, not applying penalty")
else:
# Count how many times this domain has been used by this miner
domain_count = np.sum(np.array([domain == d for d in past_websites[uid]])) + 1
discount_factor = 1.0 / domain_count
discount_factor *= 1.0 / max(1, domain_count)
if domain in past_websites[uid]:
logger.debug(
f"Already used domain {domain} for this UID, applying ( discount ) factor {discount_factor}"
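Taken together, the web-retrieval changes reject empty, over-long, or IP/port-based URLs outright and soften the old top-100k exemption: a top domain now gets 10 free repeats in the recent-history window before the 1/count penalty applies, while other domains are penalised from the second occurrence. A condensed sketch of just the repeat-domain discount (simplified from the diff; the real score also multiplies in the fuzz-based URL/relevance factor):

# Condensed sketch of the repeat-domain discount from the diff (simplified).
def domain_discount(domain: str, past_domains: list[str], top_domains: set[str]) -> float:
    count = sum(d == domain for d in past_domains) + 1  # include the current hit
    if domain in top_domains:
        # top-100k domains: 10 free occurrences before penalising
        return 1.0 / max(1, count - 10)
    return 1.0 / max(1, count)

assert domain_discount("wikipedia.org", ["wikipedia.org"] * 5, {"wikipedia.org"}) == 1.0  # within allowance
assert domain_discount("tinyblog.net", ["tinyblog.net"] * 3, {"wikipedia.org"}) == 0.25   # 1/4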
4 changes: 2 additions & 2 deletions prompting/tasks/base_task.py
@@ -41,7 +41,7 @@ def make_query(self, **kwargs):
raise NotImplementedError("Method make_query must be implemented")

@abstractmethod
def make_reference(self, **kwargs):
async def make_reference(self, **kwargs):
raise NotImplementedError("Method make_reference must be implemented")


@@ -75,7 +75,7 @@ def get_model_id_and_seed(self) -> "BaseTextTask":
def make_query(self, dataset_entry: DatasetEntry, **kwargs) -> str:
return self.query

def make_reference(self, dataset_entry: DatasetEntry) -> str:
async def make_reference(self, dataset_entry: DatasetEntry) -> str:
return self.reference

def generate_reference(self, messages: list[str]) -> str:
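With `make_reference` now declared `async` on the base class, every subclass override and every call site changes together: overrides become coroutines even when their bodies stay synchronous, and callers must `await` them (as `scoring.py` now does). A minimal sketch of the migration pattern, using an invented toy task rather than the repo's classes:

# Minimal sketch of the sync -> async make_reference migration (toy task, not from the repo).
import asyncio
from abc import ABC, abstractmethod

class BaseTask(ABC):
    @abstractmethod
    async def make_reference(self, **kwargs):
        raise NotImplementedError("Method make_reference must be implemented")

class EchoTask(BaseTask):
    async def make_reference(self, dataset_entry: str) -> str:
        # A synchronous body is fine; the coroutine signature keeps call sites uniform.
        return dataset_entry

async def main():
    reference = await EchoTask().make_reference(dataset_entry="expected answer")
    print(reference)

asyncio.run(main())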
2 changes: 1 addition & 1 deletion prompting/tasks/inference.py
@@ -75,7 +75,7 @@ def make_query(self, dataset_entry: ChatEntry) -> str:

return self.query

def make_reference(self, dataset_entry: ChatEntry) -> str:
async def make_reference(self, dataset_entry: ChatEntry) -> str:
self.reference = model_manager.generate(
messages=self.messages,
model=self.llm_model,
2 changes: 1 addition & 1 deletion prompting/tasks/multi_choice.py
@@ -126,7 +126,7 @@ def post_process_qa(self, query: str) -> str:
new_query = "?".join(query.split("?")[:2]) + "?\n" + options_string
return new_query

def make_reference(self, dataset_entry: Context) -> str:
async def make_reference(self, dataset_entry: Context) -> str:
return self.reference

def extract_query_and_reference(self, query_with_choices: str) -> tuple[str, str]:
202 changes: 40 additions & 162 deletions prompting/tasks/multi_step_reasoning.py
@@ -1,168 +1,19 @@
import json
import re
import time
import random
from typing import ClassVar

from loguru import logger

from prompting.llms.apis.gpt_wrapper import LLMMessage, LLMMessages
from prompting.llms.apis.llm_wrapper import LLMWrapper
from prompting.datasets.random_website import DDGDatasetEntry
from prompting.rewards.relevance import RelevanceRewardModel
from prompting.rewards.reward import BaseRewardConfig, BaseRewardModel
from prompting.tasks.qa import WikiQuestionAnsweringTask
from shared.base import Context
from shared.timer import Timer
from validator_api.test_time_inference import generate_response

MAX_THINKING_STEPS = 10


def parse_multiple_json(api_response):
"""
Parses a string containing multiple JSON objects and returns a list of dictionaries.

Args:
api_response (str): The string returned by the API containing JSON objects.

Returns:
list: A list of dictionaries parsed from the JSON objects.
"""
# Regular expression pattern to match individual JSON objects
json_pattern = re.compile(r"\{.*?\}", re.DOTALL)

# Find all JSON object strings in the response
json_strings = json_pattern.findall(api_response)

parsed_objects = []
for json_str in json_strings:
try:
# Replace escaped single quotes with actual single quotes
json_str_clean = json_str.replace("\\'", "'")

# Parse the JSON string into a dictionary
obj = json.loads(json_str_clean)
parsed_objects.append(obj)
except json.JSONDecodeError as e:
print(f"Failed to parse JSON object: {e}")
continue

return parsed_objects


def make_api_call(messages, max_tokens, is_final_answer=False):
# TOOD: Make this use local model to prevent relay mining
for attempt in range(3):
try:
response = LLMWrapper.chat_complete(messages=LLMMessages(*messages))
response_dict = parse_multiple_json(response)[0]
return response_dict
except Exception as e:
if attempt == 2:
logger.debug(f"ERROR GENERATING ANSWER. RESPONSE DICT: {response_dict}")
if is_final_answer:
return {
"title": "Error",
"content": f"Failed to generate final answer after 3 attempts. Error: {str(e)}",
}
else:
return {
"title": "Error",
"content": f"Failed to generate step after 3 attempts. Error: {str(e)}",
"next_action": "final_answer",
}
time.sleep(1) # Wait for 1 second before retrying


def generate_response(prompt):
messages = [
LLMMessage(
role="system",
content="""You are an expert AI assistant with advanced reasoning capabilities. Your task is to provide detailed, step-by-step explanations of your thought process. For each step:

1. Provide a clear, concise title describing the current reasoning phase.
2. Elaborate on your thought process in the content section.
3. Decide whether to continue reasoning or provide a final answer.

Response Format:
Use JSON with keys: 'title', 'content', 'next_action' (values: 'continue' or 'final_answer')

Key Instructions:
- Employ at least 5 distinct reasoning steps.
- Acknowledge your limitations as an AI and explicitly state what you can and cannot do.
- Actively explore and evaluate alternative answers or approaches.
- Critically assess your own reasoning; identify potential flaws or biases.
- When re-examining, employ a fundamentally different approach or perspective.
- Utilize at least 3 diverse methods to derive or verify your answer.
- Incorporate relevant domain knowledge and best practices in your reasoning.
- Quantify certainty levels for each step and the final conclusion when applicable.
- Consider potential edge cases or exceptions to your reasoning.
- Provide clear justifications for eliminating alternative hypotheses.
- Output only one step at a time to ensure a detailed and coherent explanation.


Example of a valid JSON response:
```json
{
"title": "Initial Problem Analysis",
"content": "To approach this problem effectively, I'll first break down the given information into key components. This involves identifying...[detailed explanation]... By structuring the problem this way, we can systematically address each aspect.",
"next_action": "continue"
}```
""",
)
]
messages += [LLMMessage(role="user", content=prompt)]
messages += [
LLMMessage(
role="assistant",
content="Thank you! I will now think step by step following my instructions, starting at the beginning after decomposing the problem.",
)
]

steps = []
step_count = 1
total_thinking_time = 0

for _ in range(MAX_THINKING_STEPS):
with Timer() as timer:
step_data = make_api_call(messages, 300)
thinking_time = timer.final_time
total_thinking_time += thinking_time

steps.append((f"Step {step_count}: {step_data['title']}", step_data["content"], thinking_time))

messages.append(LLMMessage(role="assistant", content=json.dumps(step_data)))

if step_data["next_action"] == "final_answer" or not step_data.get("next_action"):
break

step_count += 1

# Yield after each step
yield steps, None

# Generate final answer
messages.append(
LLMMessage(
role="user",
content="Please provide the final answer based on your reasoning above. You must return your answer in a valid json.",
)
)

start_time = time.time()
final_data = make_api_call(messages, 200, is_final_answer=True)
end_time = time.time()
thinking_time = end_time - start_time
total_thinking_time += thinking_time

if final_data["title"] == "Error":
steps.append(("Error", final_data["content"], thinking_time))
raise ValueError("Failed to generate final answer: {final_data['content']}")

steps.append(("Final Answer", final_data["content"], thinking_time))

yield steps, total_thinking_time


def execute_multi_step_reasoning(user_query):
def execute_multi_step_reasoning(user_query: str):
for steps, total_thinking_time in generate_response(user_query):
if total_thinking_time is not None:
logger.info(f"**Total thinking time: {total_thinking_time:.2f} seconds**")
@@ -201,7 +52,7 @@ class MultiStepReasoningRewardConfig(BaseRewardConfig):
- Obvious or straightforward calculations
- Questions that don't require analysis

Remember: The goal is to create questions where the context and parameters are revealed progressively, requiring the reader to integrate information across multiple sentences to fully understand and solve the problem.
Remember: The goal is to create questions where the context and parameters are revealed progressively, requiring the reader to integrate information across multiple sentences to fully understand and solve the problem. Make sure that the question is spread over at least 3 sentences.
"""

QUERY_PROMPT_TEMPLATE = """\
@@ -210,9 +61,15 @@ class MultiStepReasoningRewardConfig(BaseRewardConfig):
#Context:
{context}

You must ask a question that can be answered by the context.
Remember the question must encourage logical thinking and reasoning and must be spread over at least 3 sentences.
"""

SAMPLE_SYSTEM_PROMPTS = [
"""You are an LLM specialising in reasoning and solving complex questions. You will be given a chat interaction with a user and must answer appropriately.""",
"""You are a step-by-step problem solver. When given a complex question, you break it down into clear logical steps, showing your work and explaining your reasoning at each stage. You maintain a methodical approach to ensure accuracy.""",
"""You are an expert at mathematical and analytical reasoning. You excel at carefully parsing multi-part problems, identifying key information, and systematically working through solutions while clearly documenting your thought process.""",
]


class MultiStepReasoningTask(WikiQuestionAnsweringTask):
"""QuestionAnsweringTasks must be initialised with an LLM pipeline to generate query and reference plus
@@ -221,16 +78,37 @@ class MultiStepReasoningTask(WikiQuestionAnsweringTask):
name: ClassVar[str] = "multi_step_reasoning"
augmentation_system_prompt: ClassVar[str] = ""
query: str | None = None
query_system_prompt: str = QUERY_SYSTEM_PROMPT
reference: str | None = None

def make_query(self, dataset_entry: Context):
query_prompt = QUERY_PROMPT_TEMPLATE.format(context=dataset_entry.content)
question = self.generate_query(messages=[QUERY_SYSTEM_PROMPT, query_prompt])
def make_query(self, dataset_entry: DDGDatasetEntry):
query_prompt = QUERY_PROMPT_TEMPLATE.format(context=dataset_entry.website_content)
question = self.generate_query(messages=[query_prompt])
msgs = [p + ". " if i < len(question.split(". ")) - 1 else p for i, p in enumerate(question.split(". ")) if p]
self.messages = [{"role": "user", "content": msg} for msg in msgs]
self.messages = [{"role": "system", "content": random.choice(SAMPLE_SYSTEM_PROMPTS)}] + [
{"role": random.choice(["user", "assistant"]), "content": msg} for msg in msgs
]
return self.query

def make_reference(self, dataset_entry: Context):
steps, total_thinking_time = execute_multi_step_reasoning(user_query=self.query)
self.reference = steps[-1][1]
async def _async_generate_reference(self):
async for steps, total_thinking_time in generate_response(
self.messages, model=self.llm_model_id, use_miners=False
):
logger.debug(f"Step generated in reference of MSR: {steps}")
if total_thinking_time is not None:
logger.debug(f"**Total thinking time: {total_thinking_time:.2f} seconds**")
return steps[-1][1]

async def make_reference(self, dataset_entry: Context):
try:
logger.debug(f"Generating reference for MSR: {self.messages}")
# Await the async reference generation
self.reference = await self._async_generate_reference()
logger.debug(f"Generated reference for MSR: {self.reference}")
except Exception as e:
logger.error(f"Error getting final answer for MSR: {e}")
self.reference = None
if self.reference is None:
logger.error("No reference found for MSR")
return None
return self.reference
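The reworked `make_query` turns the generated question into a short multi-turn conversation: it splits the question on sentence boundaries, restores the ". " separator to all but the last piece, prepends a randomly chosen system prompt, and assigns each sentence a random user/assistant role. A sketch of just that splitting step (the sample question and system prompt are made up):

# Isolated sketch of the sentence-splitting used to build the multi-turn messages.
import random

question = "A train leaves the station at 9am. It travels at 60 km/h for 150 km. When does it arrive?"
parts = question.split(". ")
msgs = [p + ". " if i < len(parts) - 1 else p for i, p in enumerate(parts) if p]

messages = [{"role": "system", "content": "You are a step-by-step problem solver."}] + [
    {"role": random.choice(["user", "assistant"]), "content": msg} for msg in msgs
]
print(messages)  # a system prompt followed by three sentence-sized turns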
4 changes: 2 additions & 2 deletions prompting/tasks/qa.py
@@ -63,7 +63,7 @@ def make_query(self, dataset_entry: Context):
self.query = self.generate_query(messages=[query_prompt])
return self.query

def make_reference(self, dataset_entry: Context):
async def make_reference(self, dataset_entry: Context):
reference_prompt = REFERENCE_PROMPT_TEMPLATE.format(context=dataset_entry.content, question=self.query)
self.reference = self.generate_reference(messages=[{"role": "user", "content": reference_prompt}])
return self.reference
@@ -85,7 +85,7 @@ def make_query(self, dataset_entry: Context):
self.query = self.generate_query(messages=[query_prompt])
return self.query

def make_reference(self, dataset_entry: Context):
async def make_reference(self, dataset_entry: Context):
reference_prompt = REFERENCE_PROMPT_TEMPLATE.format(context=dataset_entry.website_content, question=self.query)
self.reference = self.generate_reference(messages=[{"role": "user", "content": reference_prompt}])
return self.reference
1 change: 1 addition & 0 deletions prompting/tasks/task_creation.py
@@ -62,6 +62,7 @@ async def run_step(self):
if not task.query:
logger.debug(f"Generating query for task: {task.__class__.__name__}.")
task.make_query(dataset_entry=dataset_entry)
logger.debug(f"Generated Messages: {task.task_messages}")

logger.debug(f"Appending task: {task.__class__.__name__} to task queue.")
self.task_queue.append(task)