From 9b334547645fb805465109339c26b977e3795c3d Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Thu, 16 May 2024 19:14:47 +0000 Subject: [PATCH 01/59] clean up legacy prompting synapse --- prompting/protocol.py | 111 +----------------------------------------- 1 file changed, 2 insertions(+), 109 deletions(-) diff --git a/prompting/protocol.py b/prompting/protocol.py index 1367a4f45..5b8ad01c7 100644 --- a/prompting/protocol.py +++ b/prompting/protocol.py @@ -17,116 +17,9 @@ import pydantic import bittensor as bt - from typing import List, AsyncIterator from starlette.responses import StreamingResponse -import pdb - - -class PromptingSynapse(bt.Synapse): - """ - The PromptingSynapse subclass of the Synapse class encapsulates the functionalities related to prompting scenarios. - - It specifies three fields - `roles`, `messages` and `completion` - that define the state of the PromptingSynapse object. - The `roles` and `messages` are read-only fields defined during object initialization, and `completion` is a mutable - field that can be updated as the prompting scenario progresses. - - The Config inner class specifies that assignment validation should occur on this class (validate_assignment = True), - meaning value assignments to the instance fields are checked against their defined types for correctness. - - Attributes: - roles (List[str]): A list of roles in the prompting scenario. This field is both mandatory and immutable. - messages (List[str]): A list of messages in the prompting scenario. This field is both mandatory and immutable. - completion (str): A string that captures completion of the prompt. This field is mutable. - required_hash_fields List[str]: A list of fields that are required for the hash. - - Methods: - deserialize() -> "PromptingSynapse": Returns the instance of the current object. - - - The `PromptingSynapse` class also overrides the `deserialize` method, returning the - instance itself when this method is invoked. Additionally, it provides a `Config` - inner class that enforces the validation of assignments (`validate_assignment = True`). - - Here is an example of how the `PromptingSynapse` class can be used: - - ```python - # Create a PromptingSynapse instance - prompt = PromptingSynapse(roles=["system", "user"], messages=["Hello", "Hi"]) - - # Print the roles and messages - print("Roles:", prompt.roles) - print("Messages:", prompt.messages) - - # Update the completion - model_prompt =... # Use prompt.roles and prompt.messages to generate a prompt - for your LLM as a single string. - prompt.completion = model(model_prompt) - - # Print the completion - print("Completion:", prompt.completion) - ``` - - This will output: - ``` - Roles: ['system', 'user'] - Messages: ['You are a helpful assistant.', 'Hi, what is the meaning of life?'] - Completion: "The meaning of life is 42. Deal with it, human." - ``` - - This example demonstrates how to create an instance of the `PromptingSynapse` class, access the - `roles` and `messages` fields, and update the `completion` field. - """ - - class Config: - """ - Pydantic model configuration class for PromptingSynapse. This class sets validation of attribute assignment as True. - validate_assignment set to True means the pydantic model will validate attribute assignments on the class. - """ - - validate_assignment = True - - def deserialize(self) -> "PromptingSynapse": - """ - Returns the instance of the current PromptingSynapse object. - - This method is intended to be potentially overridden by subclasses for custom deserialization logic. - In the context of the PromptingSynapse class, it simply returns the instance itself. However, for subclasses - inheriting from this class, it might give a custom implementation for deserialization if need be. - - Returns: - PromptingSynapse: The current instance of the PromptingSynapse class. - """ - return self - - roles: List[str] = pydantic.Field( - ..., - title="Roles", - description="A list of roles in the PromptingSynapse scenario. Immuatable.", - allow_mutation=False, - ) - - messages: List[str] = pydantic.Field( - ..., - title="Messages", - description="A list of messages in the PromptingSynapse scenario. Immutable.", - allow_mutation=False, - ) - - completion: str = pydantic.Field( - "", - title="Completion", - description="Completion status of the current PromptingSynapse object. This attribute is mutable and can be updated.", - ) - - required_hash_fields: List[str] = pydantic.Field( - ["messages"], - title="Required Hash Fields", - description="A list of required fields for the hash.", - allow_mutation=False, - ) - class StreamPromptingSynapse(bt.StreamingSynapse): """ @@ -212,9 +105,9 @@ async def process_streaming_response( self.completion = "" async for chunk in response.content.iter_any(): - tokens = chunk.decode("utf-8").split("\n") + tokens = chunk.decode("utf-8") - self.completion = self.completion + "".join([t for t in tokens if t]) + self.completion = self.completion + tokens yield tokens def deserialize(self) -> str: From 5d3e1b99dca28286500c92060f646166824a3795 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Thu, 16 May 2024 19:15:23 +0000 Subject: [PATCH 02/59] initial adjustments to protocol + chunks logging --- prompting/forward.py | 97 +++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/prompting/forward.py b/prompting/forward.py index 170f85254..4ab253d50 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -46,32 +46,44 @@ async def execute_dendrite_call(dendrite_call): responses = await dendrite_call return responses + @dataclass class StreamResult: - synapse: StreamPromptingSynapse = None exception: BaseException = None uid: int = None + accumulated_chunks: List[str] + accumulated_chunks_timings: List[float] + synapse: StreamPromptingSynapse + -async def process_response(uid: int, async_generator: Awaitable): + +async def process_stream(uid: int, async_iterator: Awaitable) -> StreamResult: """Process a single response asynchronously.""" - try: - chunk = None # Initialize chunk with a default value - async for chunk in async_generator: # most important loop, as this is where we acquire the final synapse. + synapse = None # Initialize chunk with a default value + exception = None + accumulated_chunks = [] + accumulated_chunks_timings = [] + start_time = time.time() + + try: + async for chunk in async_iterator: # most important loop, as this is where we acquire the final synapse. + accumulated_chunks.append(chunk) + accumulated_chunks_timings.append(time.time() - start_time) + bt.logging.debug(f"\nchunk for uid {uid}: {chunk}") - if chunk is not None: - synapse = chunk # last object yielded is the synapse itself with completion filled - - # Assuming chunk holds the last value yielded which should be a synapse - if isinstance(synapse, StreamPromptingSynapse): - return synapse - - bt.logging.debug( - f"Synapse is not StreamPromptingSynapse. Miner uid {uid} completion set to '' " - ) - except Exception as e: - # bt.logging.error(f"Error in generating reference or handling responses: {e}", exc_info=True) + # Assuming last chunk of async_iterator holds the last value yielded as a StreamingSynapse + synapse = chunk + if synapse is None or not isinstance(synapse, StreamPromptingSynapse): + raise ValueError( + f"Something went wrong with miner uid {uid}, Synapse is not StreamPromptingSynapse." + ) + + accumulated_chunks.append(synapse.completion) + accumulated_chunks_timings.append(time.time() - start_time) + except Exception as e: + exception = e traceback_details = traceback.format_exc() bt.logging.error( f"Error in generating reference or handling responses for uid {uid}: {e}\n{traceback_details}" @@ -81,11 +93,19 @@ async def process_response(uid: int, async_generator: Awaitable): roles=["user"], messages=["failure"], completion="" ) - return failed_synapse + synapse = failed_synapse + finally: + return StreamResult( + accumulated_chunks=accumulated_chunks, + accumulated_chunks_timings=accumulated_chunks_timings, + synapse=synapse, + uid=uid, + exception=exception + ) @async_log -async def handle_response(responses: Dict[int, Awaitable]) -> List[StreamResult]: +async def handle_response(stream_results: Dict[int, Awaitable]) -> List[StreamResult]: """The handle_response function is responsible for creating asyncio tasks around acquiring streamed miner chunks and processing them asynchronously. It then pairs the results with their original UIDs and returns a list of StreamResults. @@ -99,36 +119,14 @@ async def handle_response(responses: Dict[int, Awaitable]) -> List[StreamResult] List[StreamResult]: DataClass containing the synapse, exception, and uid """ tasks_with_uid = [ - (uid, responses[uid]) for uid, _ in responses.items() + (uid, stream_results[uid]) for uid, _ in stream_results.items() ] # Pair UIDs with their tasks # Start tasks, preserving order and their associated UIDs - tasks = [process_response(uid, resp) for uid, resp in tasks_with_uid] - - results = await asyncio.gather(*tasks, return_exceptions=True) - - mapped_results = [] - # Pair each result with its original uid - for (uid, _), result in zip(tasks_with_uid, results): - # If the result is a StreamPromptingSynapse, the response was successful and the stream result is added without exceptions - if isinstance(result, StreamPromptingSynapse): - mapped_results.append(StreamResult(synapse=result, uid=uid)) - - # If the result is an exception, the response was unsuccessful and the stream result is added with the exception and an empty synapse - elif isinstance(result, BaseException): - failed_synapse = StreamPromptingSynapse( - roles=["user"], messages=["failure"], completion="" - ) - mapped_results.append( - StreamResult(synapse=failed_synapse, exception=result, uid=uid) - ) - - # If the result is neither an error or a StreamSynapse, log the error and raise a ValueError - else: - bt.logging.error(f"Unexpected result type for UID {uid}: {result}") - raise ValueError(f"Unexpected result type for UID {uid}: {result}") - - return mapped_results + process_stream_tasks = [process_stream(uid, resp) for uid, resp in tasks_with_uid] + stream_results = await asyncio.gather(*process_stream_tasks, return_exceptions=True) + + return stream_results @async_log @@ -208,7 +206,7 @@ async def run_step( # Prepare the task for handling stream responses handle_stream_responses_task = asyncio.create_task( - handle_response(responses=dict(zip(uids_cpu, streams_responses))) + handle_response(stream_results=dict(zip(uids_cpu, streams_responses))) ) if not agent.task.static_reference: @@ -254,6 +252,9 @@ async def run_step( serialize_exception_to_string(stream_result.exception) for stream_result in stream_results ] + stream_results_all_chunks = [stream_result.accumulated_chunks for stream_result in stream_results] + stream_results_all_chunks_timings = [stream_result.accumulated_chunks_timings for stream_result in stream_results] + # Log the step event. event = { "best": best_response, @@ -262,6 +263,8 @@ async def run_step( "step_time": time.time() - start_time, "stream_results_uids": stream_results_uids, "stream_results_exceptions": stream_results_exceptions, + "stream_results_all_chunks": stream_results_all_chunks, + "stream_results_all_chunks_timings": stream_results_all_chunks_timings, **agent.__state_dict__(full=self.config.neuron.log_full), **reward_result.__state_dict__(full=self.config.neuron.log_full), **response_event.__state_dict__(), From b6b7e22fe6d3eb88fada290e0dc3db8aa71d7cdd Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Thu, 16 May 2024 20:11:28 +0000 Subject: [PATCH 03/59] fix chunk processing + mock --- prompting/forward.py | 17 +++++++++-------- prompting/mock.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/prompting/forward.py b/prompting/forward.py index 4ab253d50..34de0661d 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -51,9 +51,9 @@ async def execute_dendrite_call(dendrite_call): class StreamResult: exception: BaseException = None uid: int = None - accumulated_chunks: List[str] - accumulated_chunks_timings: List[float] - synapse: StreamPromptingSynapse + accumulated_chunks: List[str] = None + accumulated_chunks_timings: List[float] = None + synapse: StreamPromptingSynapse = None @@ -67,11 +67,12 @@ async def process_stream(uid: int, async_iterator: Awaitable) -> StreamResult: start_time = time.time() try: - async for chunk in async_iterator: # most important loop, as this is where we acquire the final synapse. - accumulated_chunks.append(chunk) - accumulated_chunks_timings.append(time.time() - start_time) - - bt.logging.debug(f"\nchunk for uid {uid}: {chunk}") + async for chunk in async_iterator: # most important loop, as this is where we acquire the final synapse. + if isinstance(chunk, str): + accumulated_chunks.append(chunk) + accumulated_chunks_timings.append(time.time() - start_time) + + bt.logging.debug(f"\nchunk for uid {uid}: {chunk}") # Assuming last chunk of async_iterator holds the last value yielded as a StreamingSynapse synapse = chunk diff --git a/prompting/mock.py b/prompting/mock.py index e6058ddff..e8fd32193 100644 --- a/prompting/mock.py +++ b/prompting/mock.py @@ -3,7 +3,7 @@ import asyncio import random import bittensor as bt -from prompting.protocol import StreamPromptingSynapse, PromptingSynapse +from prompting.protocol import StreamPromptingSynapse from functools import partial from typing import Dict, List, Union, AsyncGenerator, Any, Iterator From f3e5466e8214b05a4869bcb5ce288fb9a568d849 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Thu, 16 May 2024 20:38:36 +0000 Subject: [PATCH 04/59] adjust dendrite response --- prompting/dendrite.py | 39 ++++++++++++++++++++++++++++++++------- prompting/forward.py | 39 +++++++-------------------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/prompting/dendrite.py b/prompting/dendrite.py index a9df0aa8c..12cdac562 100644 --- a/prompting/dendrite.py +++ b/prompting/dendrite.py @@ -1,19 +1,32 @@ import torch -import bittensor as bt from typing import List +from dataclasses import dataclass +from prompting.protocol import StreamPromptingSynapse +from prompting.utils.misc import serialize_exception_to_string + + +@dataclass +class SynapseStreamResult: + exception: BaseException = None + uid: int = None + accumulated_chunks: List[str] = None + accumulated_chunks_timings: List[float] = None + synapse: StreamPromptingSynapse = None class DendriteResponseEvent: def __init__( - self, responses: List[bt.Synapse], uids: torch.LongTensor, timeout: float + self, stream_results: SynapseStreamResult, uids: torch.LongTensor, timeout: float ): self.uids = uids self.completions = [] self.status_messages = [] self.status_codes = [] self.timings = [] + + synapses = [stream_result.synapse for stream_result in stream_results] - for synapse in responses: + for synapse in synapses: self.completions.append(synapse.completion) self.status_messages.append(synapse.dendrite.status_message) @@ -32,14 +45,22 @@ def __init__( else: self.timings.append(0) # situation where miner is not alive - self.completions = [synapse.completion for synapse in responses] + self.completions = [synapse.completion for synapse in synapses] self.timings = [ - synapse.dendrite.process_time or timeout for synapse in responses + synapse.dendrite.process_time or timeout for synapse in synapses ] self.status_messages = [ - synapse.dendrite.status_message for synapse in responses + synapse.dendrite.status_message for synapse in synapses ] - self.status_codes = [synapse.dendrite.status_code for synapse in responses] + self.status_codes = [synapse.dendrite.status_code for synapse in synapses] + + self.stream_results_uids = [stream_result.uid for stream_result in stream_results] + self.stream_results_exceptions = [ + serialize_exception_to_string(stream_result.exception) + for stream_result in stream_results + ] + self.stream_results_all_chunks = [stream_result.accumulated_chunks for stream_result in stream_results] + self.stream_results_all_chunks_timings = [stream_result.accumulated_chunks_timings for stream_result in stream_results] def __state_dict__(self): return { @@ -48,6 +69,10 @@ def __state_dict__(self): "timings": self.timings, "status_messages": self.status_messages, "status_codes": self.status_codes, + "stream_results_uids": self.stream_results_uids, + "stream_results_exceptions": self.stream_results_exceptions, + "stream_results_all_chunks": self.stream_results_all_chunks, + "stream_results_all_chunks_timings": self.stream_results_all_chunks_timings, } def __repr__(self): diff --git a/prompting/forward.py b/prompting/forward.py index 34de0661d..40ec9465b 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -25,7 +25,7 @@ import bittensor as bt from typing import List, Dict, Awaitable from prompting.agent import HumanAgent -from prompting.dendrite import DendriteResponseEvent +from prompting.dendrite import DendriteResponseEvent, SynapseStreamResult from prompting.conversation import create_task from prompting.protocol import StreamPromptingSynapse from prompting.rewards import RewardResult @@ -46,19 +46,8 @@ async def execute_dendrite_call(dendrite_call): responses = await dendrite_call return responses - -@dataclass -class StreamResult: - exception: BaseException = None - uid: int = None - accumulated_chunks: List[str] = None - accumulated_chunks_timings: List[float] = None - synapse: StreamPromptingSynapse = None - - - -async def process_stream(uid: int, async_iterator: Awaitable) -> StreamResult: +async def process_stream(uid: int, async_iterator: Awaitable) -> SynapseStreamResult: """Process a single response asynchronously.""" synapse = None # Initialize chunk with a default value exception = None @@ -96,7 +85,7 @@ async def process_stream(uid: int, async_iterator: Awaitable) -> StreamResult: synapse = failed_synapse finally: - return StreamResult( + return SynapseStreamResult( accumulated_chunks=accumulated_chunks, accumulated_chunks_timings=accumulated_chunks_timings, synapse=synapse, @@ -106,7 +95,7 @@ async def process_stream(uid: int, async_iterator: Awaitable) -> StreamResult: @async_log -async def handle_response(stream_results: Dict[int, Awaitable]) -> List[StreamResult]: +async def handle_response(stream_results: Dict[int, Awaitable]) -> List[SynapseStreamResult]: """The handle_response function is responsible for creating asyncio tasks around acquiring streamed miner chunks and processing them asynchronously. It then pairs the results with their original UIDs and returns a list of StreamResults. @@ -139,7 +128,7 @@ async def generate_reference(agent: HumanAgent): return result -def log_stream_results(stream_results: List[StreamResult]): +def log_stream_results(stream_results: List[SynapseStreamResult]): failed_responses = [ response for response in stream_results if response.exception is not None ] @@ -220,11 +209,9 @@ async def run_step( log_stream_results(stream_results) - all_synapses_results = [stream_result.synapse for stream_result in stream_results] - # Encapsulate the responses in a response event (dataclass) response_event = DendriteResponseEvent( - responses=all_synapses_results, uids=uids, timeout=timeout + stream_results=stream_results, uids=uids, timeout=timeout ) bt.logging.info(f"Created DendriteResponseEvent:\n {response_event}") @@ -247,25 +234,13 @@ async def run_step( ) self.update_scores(reward_result.rewards, uids) - - stream_results_uids = [stream_result.uid for stream_result in stream_results] - stream_results_exceptions = [ - serialize_exception_to_string(stream_result.exception) - for stream_result in stream_results - ] - stream_results_all_chunks = [stream_result.accumulated_chunks for stream_result in stream_results] - stream_results_all_chunks_timings = [stream_result.accumulated_chunks_timings for stream_result in stream_results] # Log the step event. event = { "best": best_response, "block": self.block, "step": self.step, - "step_time": time.time() - start_time, - "stream_results_uids": stream_results_uids, - "stream_results_exceptions": stream_results_exceptions, - "stream_results_all_chunks": stream_results_all_chunks, - "stream_results_all_chunks_timings": stream_results_all_chunks_timings, + "step_time": time.time() - start_time, **agent.__state_dict__(full=self.config.neuron.log_full), **reward_result.__state_dict__(full=self.config.neuron.log_full), **response_event.__state_dict__(), From 36344f080980b2b283ed55931d9a748f58be59ad Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Thu, 16 May 2024 21:44:16 +0000 Subject: [PATCH 05/59] refactors base pipeline apply function --- prompting/rewards/__init__.py | 1 + prompting/rewards/code_diff.py | 5 +++-- prompting/rewards/date.py | 4 +++- prompting/rewards/float_diff.py | 5 +++-- prompting/rewards/ordinal.py | 6 ++++-- prompting/rewards/pipeline.py | 2 ++ prompting/rewards/relevance.py | 6 ++++-- prompting/rewards/reward.py | 8 ++++---- prompting/rewards/rouge.py | 5 +++-- prompting/rewards/streaming.py | 22 ++++++++++++++++++++++ 10 files changed, 49 insertions(+), 15 deletions(-) create mode 100644 prompting/rewards/streaming.py diff --git a/prompting/rewards/__init__.py b/prompting/rewards/__init__.py index 51cab779a..a44b50bfd 100644 --- a/prompting/rewards/__init__.py +++ b/prompting/rewards/__init__.py @@ -11,4 +11,5 @@ from .float_diff import FloatDiffModel from .date import DateRewardModel from .ordinal import OrdinalRewardModel +from .streaming import StreamingRewardModel from .pipeline import RewardPipeline, REWARD_MODELS diff --git a/prompting/rewards/code_diff.py b/prompting/rewards/code_diff.py index 356617902..df9e55cf4 100644 --- a/prompting/rewards/code_diff.py +++ b/prompting/rewards/code_diff.py @@ -6,6 +6,7 @@ BatchRewardOutput, RewardModelTypeEnum, ) +from prompting.dendrite import DendriteResponseEvent import time @@ -27,13 +28,13 @@ def unified_diff(self, reference, completion): def seq_match(self, reference, completion): return difflib.SequenceMatcher(None, reference, completion).ratio() - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Get the score between two strings. lines: If True, return a unified diff. If False, return a ratio. """ - rewards = [] timings = [] + completions: List[str] = response_event.completions if self.lines: for completion in completions: diff --git a/prompting/rewards/date.py b/prompting/rewards/date.py index 463eaf880..cf95537c7 100644 --- a/prompting/rewards/date.py +++ b/prompting/rewards/date.py @@ -5,6 +5,7 @@ import numpy as np from typing import List from prompting.rewards import BaseRewardModel, BatchRewardOutput, RewardModelTypeEnum +from prompting.dendrite import DendriteResponseEvent import bittensor as bt @@ -85,7 +86,7 @@ def date_score(self, reference: str, completion: str) -> float: score = 0 return score - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Compute difference scores given a completion and reference pair. Args: @@ -95,6 +96,7 @@ def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: Returns: BatchRewardOutput: A BatchRewardOutput object containing the rewards and timings. """ + completions: List[str] = response_event.completions rewards = [] timings = [] diff --git a/prompting/rewards/float_diff.py b/prompting/rewards/float_diff.py index 7d14c65cd..387c2a8a8 100644 --- a/prompting/rewards/float_diff.py +++ b/prompting/rewards/float_diff.py @@ -3,7 +3,7 @@ from typing import List from sympy.parsing.sympy_parser import parse_expr from prompting.rewards import BaseRewardModel, BatchRewardOutput, RewardModelTypeEnum - +from prompting.dendrite import DendriteResponseEvent class FloatDiffModel(BaseRewardModel): @property @@ -52,10 +52,11 @@ def math_score(reference: str, completion: str) -> float: except Exception: return 0.0 - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Compute difference scores given a completion and reference pair.""" rewards = [] timings = [] + completions: List[str] = response_event.completions for completion in completions: t0 = time.time() diff --git a/prompting/rewards/ordinal.py b/prompting/rewards/ordinal.py index 34f749621..bf2801a79 100644 --- a/prompting/rewards/ordinal.py +++ b/prompting/rewards/ordinal.py @@ -2,7 +2,7 @@ import torch from typing import List from prompting.rewards import BaseRewardModel, BatchRewardOutput - +from prompting.dendrite import DendriteResponseEvent class OrdinalRewardModel(BaseRewardModel): @property @@ -18,11 +18,13 @@ def __init__(self, **kwargs): "negative", ] - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Compute difference scores given a completion and reference pair.""" rewards = [] timings = [] classes = self.sentiments + completions: List[str] = response_event.completions + for completion in completions: t0 = time.time() completion = completion.lower() diff --git a/prompting/rewards/pipeline.py b/prompting/rewards/pipeline.py index 244ec0b3e..a05ea137d 100644 --- a/prompting/rewards/pipeline.py +++ b/prompting/rewards/pipeline.py @@ -9,6 +9,7 @@ FloatDiffModel, DateRewardModel, OrdinalRewardModel, + StreamingRewardModel ) REWARD_MODELS = { @@ -18,6 +19,7 @@ "float_diff": FloatDiffModel, "date": DateRewardModel, "ordinal": OrdinalRewardModel, + "streaming": StreamingRewardModel, } diff --git a/prompting/rewards/relevance.py b/prompting/rewards/relevance.py index b754ae330..e063a582c 100644 --- a/prompting/rewards/relevance.py +++ b/prompting/rewards/relevance.py @@ -6,8 +6,9 @@ from prompting.rewards import ( BaseRewardModel, BatchRewardOutput, - RewardModelTypeEnum, ) +from prompting.dendrite import DendriteResponseEvent + class RelevanceRewardModel(BaseRewardModel): @@ -25,7 +26,7 @@ def __init__(self, threshold=None, device=None, pooling_strategy="cls"): # This line is necessary to pass the model to the device defined at its initialization self.model = self.model.cuda() - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Calculates the cosine similarity between sentence embeddings of the reference and completions. We subtract a baseline score which is what an empty string would get (a failed completion). This is usually around 0.35 We also clip the rewards between 0 and 1. The maximum effective score is around 0.65 @@ -33,6 +34,7 @@ def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: reference_embedding = self.model.encode(reference, to_numpy=False) rewards = [] timings = [] + completions: List[str] = response_event.completions # baseline is the cosine similarity between the reference and an empty string baseline = cosine_similarity( reference_embedding.reshape(1, -1), diff --git a/prompting/rewards/reward.py b/prompting/rewards/reward.py index bf5d2bc0b..7a1f33832 100644 --- a/prompting/rewards/reward.py +++ b/prompting/rewards/reward.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass from enum import Enum - +from prompting.dendrite import DendriteResponseEvent class RewardModelTypeEnum(Enum): WEIGHTED_REWARD = "reward" @@ -151,12 +151,12 @@ def __init__(self, **kwargs): pass @abstractmethod - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: pass - def apply(self, reference: str, response_event, reward_type) -> RewardEvent: + def apply(self, reference: str, response_event: DendriteResponseEvent, reward_type: RewardModelTypeEnum) -> RewardEvent: t0 = time.time() - batch_rewards_output = self.reward(reference, response_event.completions) + batch_rewards_output = self.reward(reference, response_event) batch_rewards_time = time.time() - t0 return RewardEvent( diff --git a/prompting/rewards/rouge.py b/prompting/rewards/rouge.py index c06d236f8..cd56bec0a 100644 --- a/prompting/rewards/rouge.py +++ b/prompting/rewards/rouge.py @@ -5,8 +5,8 @@ from prompting.rewards import ( BaseRewardModel, BatchRewardOutput, - RewardModelTypeEnum, ) +from prompting.dendrite import DendriteResponseEvent class RougeRewardModel(BaseRewardModel): @@ -28,10 +28,11 @@ def rouge_score(self, reference, completion): self.ngram ][self.metric] - def reward(self, reference: str, completions: List[str]) -> BatchRewardOutput: + def reward(self, reference: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Compute ROUGE scores given a completion and reference pair.""" rewards = [] timings = [] + completions: List[str] = response_event.completions for completion in completions: t0 = time.time() diff --git a/prompting/rewards/streaming.py b/prompting/rewards/streaming.py new file mode 100644 index 000000000..380a03c41 --- /dev/null +++ b/prompting/rewards/streaming.py @@ -0,0 +1,22 @@ +import time +import torch +from typing import List +from prompting.rewards import ( + BaseRewardModel, + BatchRewardOutput, +) +from prompting.dendrite import DendriteResponseEvent + + +class StreamingRewardModel(BaseRewardModel): + @property + def name(self) -> str: + return "streaming" + + def __init__(self, max_tokens_per_chunk:int, **kwargs): + super().__init__() + self.max_tokens_per_chunk = max_tokens_per_chunk + + def reward(self, _: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: + """Compute difference scores given a completion and reference pair.""" + pass From b2693652ba8f20e1fb33497fd8c6b8b3be9e0748 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Fri, 17 May 2024 18:12:12 +0000 Subject: [PATCH 06/59] adds tokenizer to flow + dendrite refactor --- prompting/dendrite.py | 51 +++++++++++++++++++------------------------ prompting/forward.py | 32 ++++++++++++++------------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/prompting/dendrite.py b/prompting/dendrite.py index 12cdac562..824e4c948 100644 --- a/prompting/dendrite.py +++ b/prompting/dendrite.py @@ -11,6 +11,7 @@ class SynapseStreamResult: uid: int = None accumulated_chunks: List[str] = None accumulated_chunks_timings: List[float] = None + tokens_per_chunk: List[int] = None synapse: StreamPromptingSynapse = None @@ -23,44 +24,36 @@ def __init__( self.status_messages = [] self.status_codes = [] self.timings = [] + self.stream_results_uids = [] + self.stream_results_exceptions = [] + self.stream_results_all_chunks = [] + self.stream_results_all_chunks_timings = [] + self.stream_results_all_tokens_per_chunk = [] - synapses = [stream_result.synapse for stream_result in stream_results] + for stream_result in stream_results: + synapse = stream_result.synapse - for synapse in synapses: self.completions.append(synapse.completion) self.status_messages.append(synapse.dendrite.status_message) + status_code = synapse.dendrite.status_code - if len(synapse.completion) == 0 and synapse.dendrite.status_code == 200: - synapse.dendrite.status_code = 204 + if len(synapse.completion) == 0 and status_code == 200: + status_code = 204 - self.status_codes.append(synapse.dendrite.status_code) - - if (synapse.dendrite.process_time) and ( - synapse.dendrite.status_code == 200 - or synapse.dendrite.status_code == 204 - ): - self.timings.append(synapse.dendrite.process_time) - elif synapse.dendrite.status_code == 408: + self.status_codes.append(status_code) + process_time = synapse.dendrite.process_time or 0 + if status_code == 200 or status_code == 204: + self.timings.append(process_time) + elif status_code == 408: self.timings.append(timeout) else: - self.timings.append(0) # situation where miner is not alive + self.timings.append(0) - self.completions = [synapse.completion for synapse in synapses] - self.timings = [ - synapse.dendrite.process_time or timeout for synapse in synapses - ] - self.status_messages = [ - synapse.dendrite.status_message for synapse in synapses - ] - self.status_codes = [synapse.dendrite.status_code for synapse in synapses] - - self.stream_results_uids = [stream_result.uid for stream_result in stream_results] - self.stream_results_exceptions = [ - serialize_exception_to_string(stream_result.exception) - for stream_result in stream_results - ] - self.stream_results_all_chunks = [stream_result.accumulated_chunks for stream_result in stream_results] - self.stream_results_all_chunks_timings = [stream_result.accumulated_chunks_timings for stream_result in stream_results] + self.stream_results_uids.append(stream_result.uid) + self.stream_results_exceptions.append(serialize_exception_to_string(stream_result.exception)) + self.stream_results_all_chunks.append(stream_result.accumulated_chunks) + self.stream_results_all_chunks_timings.append(stream_result.accumulated_chunks_timings) + self.stream_results_all_tokens_per_chunk.append(stream_result.tokens_per_chunk) def __state_dict__(self): return { diff --git a/prompting/forward.py b/prompting/forward.py index 40ec9465b..ed4affb41 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -13,8 +13,7 @@ # THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -# DEALINGS IN -# THE SOFTWARE. +# DEALINGS IN THE SOFTWARE. import sys import time @@ -33,7 +32,7 @@ from prompting.utils.uids import get_random_uids from prompting.utils.logging import log_event from prompting.utils.misc import async_log, serialize_exception_to_string -from dataclasses import dataclass +from transformers import PreTrainedTokenizerFast as Tokenizer @async_log async def generate_reference(agent): @@ -47,13 +46,14 @@ async def execute_dendrite_call(dendrite_call): return responses -async def process_stream(uid: int, async_iterator: Awaitable) -> SynapseStreamResult: +async def process_stream(uid: int, async_iterator: Awaitable, tokenizer: Tokenizer) -> SynapseStreamResult: """Process a single response asynchronously.""" synapse = None # Initialize chunk with a default value exception = None accumulated_chunks = [] - accumulated_chunks_timings = [] - start_time = time.time() + accumulated_chunks_timings = [] + accumulated_tokens_per_chunk = [] + start_time = time.time() try: async for chunk in async_iterator: # most important loop, as this is where we acquire the final synapse. @@ -61,6 +61,9 @@ async def process_stream(uid: int, async_iterator: Awaitable) -> SynapseStreamRe accumulated_chunks.append(chunk) accumulated_chunks_timings.append(time.time() - start_time) + tokens_in_chunk = len(tokenizer.tokenize(chunk)) + accumulated_tokens_per_chunk.append(tokens_in_chunk) + bt.logging.debug(f"\nchunk for uid {uid}: {chunk}") # Assuming last chunk of async_iterator holds the last value yielded as a StreamingSynapse @@ -95,7 +98,7 @@ async def process_stream(uid: int, async_iterator: Awaitable) -> SynapseStreamRe @async_log -async def handle_response(stream_results: Dict[int, Awaitable]) -> List[SynapseStreamResult]: +async def handle_response(stream_results_dict: Dict[int, Awaitable], tokenizer: Tokenizer) -> List[SynapseStreamResult]: """The handle_response function is responsible for creating asyncio tasks around acquiring streamed miner chunks and processing them asynchronously. It then pairs the results with their original UIDs and returns a list of StreamResults. @@ -109,14 +112,14 @@ async def handle_response(stream_results: Dict[int, Awaitable]) -> List[SynapseS List[StreamResult]: DataClass containing the synapse, exception, and uid """ tasks_with_uid = [ - (uid, stream_results[uid]) for uid, _ in stream_results.items() + (uid, stream_results_dict[uid]) for uid, _ in stream_results_dict.items() ] # Pair UIDs with their tasks # Start tasks, preserving order and their associated UIDs - process_stream_tasks = [process_stream(uid, resp) for uid, resp in tasks_with_uid] - stream_results = await asyncio.gather(*process_stream_tasks, return_exceptions=True) + process_stream_tasks = [process_stream(uid, resp, tokenizer) for uid, resp in tasks_with_uid] + processed_stream_results = await asyncio.gather(*process_stream_tasks, return_exceptions=True) - return stream_results + return processed_stream_results @async_log @@ -174,7 +177,6 @@ async def run_step( timeout (float): The timeout for the queries. exclude (list, optional): The list of uids to exclude from the query. Defaults to []. """ - bt.logging.debug("run_step", agent.task.name) # Record event start time. @@ -195,9 +197,9 @@ async def run_step( ) # Prepare the task for handling stream responses - handle_stream_responses_task = asyncio.create_task( - handle_response(stream_results=dict(zip(uids_cpu, streams_responses))) - ) + stream_results_dict = dict(zip(uids_cpu, streams_responses)) + tokenizer = self.llm_pipeline.tokenizer + handle_stream_responses_task = asyncio.create_task(handle_response(stream_results_dict, tokenizer)) if not agent.task.static_reference: reference_generation_task = generate_reference(agent) From 7227814ba8b9140d06337780d8f224e90162c3ed Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Fri, 17 May 2024 18:12:33 +0000 Subject: [PATCH 07/59] implements streaming reward function --- prompting/llms/base_llm.py | 1 + prompting/llms/vllm_llm.py | 1 + prompting/rewards/streaming.py | 37 +++++++++++++++++++++++++++++----- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/prompting/llms/base_llm.py b/prompting/llms/base_llm.py index 0ed7b1398..e5df5d319 100644 --- a/prompting/llms/base_llm.py +++ b/prompting/llms/base_llm.py @@ -22,6 +22,7 @@ def __init__( self.model_kwargs = model_kwargs self.messages = [] self.times = [] + self.tokenizer = None def query( self, diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index 2c88ba37e..d054e98f2 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -73,6 +73,7 @@ def __init__(self, model_id: str, llm_max_allowed_memory_in_gb:int, device: str self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock self.gpus = gpus + self.tokenizer = self.llm.llm_engine.tokenizer def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: if self.mock: diff --git a/prompting/rewards/streaming.py b/prompting/rewards/streaming.py index 380a03c41..44bb1c23f 100644 --- a/prompting/rewards/streaming.py +++ b/prompting/rewards/streaming.py @@ -1,13 +1,12 @@ import time import torch -from typing import List +from prompting.dendrite import DendriteResponseEvent from prompting.rewards import ( BaseRewardModel, BatchRewardOutput, ) -from prompting.dendrite import DendriteResponseEvent - +## TODO: Create unit tests class StreamingRewardModel(BaseRewardModel): @property def name(self) -> str: @@ -16,7 +15,35 @@ def name(self) -> str: def __init__(self, max_tokens_per_chunk:int, **kwargs): super().__init__() self.max_tokens_per_chunk = max_tokens_per_chunk + self.max_penalty = 0.2 def reward(self, _: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: - """Compute difference scores given a completion and reference pair.""" - pass + """Compute difference scores given a completion and reference pair.""" + rewards = [] + timings = [] + penalty_per_exceeding_chunk = 0.01 + + # Iterate through each chunk of response tokens + for response_tokens_per_chunks in response_event.stream_results_all_tokens_per_chunk: + start_time = time.time() + + # Calculate the accumulated penalty for the current chunk + accumulated_penalty = sum( + penalty_per_exceeding_chunk if tokens_per_chunk > self.max_tokens_per_chunk else 0 + for tokens_per_chunk in response_tokens_per_chunks + ) + + # Record the timing for this computation + timings.append(time.time() - start_time) + + # Calculate the reward and ensure it does not go below max_penalty + rewards.append(max(1 - accumulated_penalty, self.max_penalty)) + + # Create the output object with rewards, timings, and extra information + output = BatchRewardOutput( + rewards=torch.FloatTensor(rewards), + timings=torch.FloatTensor(timings), + extra_info={"type": "streaming"} + ) + return output + From a47bdf124c5d8a880871ee31f4b6958d14d641b4 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Fri, 17 May 2024 20:35:56 +0000 Subject: [PATCH 08/59] adds tokens_per_chunk in forward flow + adds global penalties --- prompting/dendrite.py | 1 + prompting/forward.py | 5 ++++- prompting/rewards/pipeline.py | 1 + prompting/rewards/reward.py | 9 ++++++++- prompting/rewards/streaming.py | 8 ++++---- prompting/tasks/task.py | 4 ++++ 6 files changed, 22 insertions(+), 6 deletions(-) diff --git a/prompting/dendrite.py b/prompting/dendrite.py index 824e4c948..7b9981a94 100644 --- a/prompting/dendrite.py +++ b/prompting/dendrite.py @@ -66,6 +66,7 @@ def __state_dict__(self): "stream_results_exceptions": self.stream_results_exceptions, "stream_results_all_chunks": self.stream_results_all_chunks, "stream_results_all_chunks_timings": self.stream_results_all_chunks_timings, + "stream_results_all_tokens_per_chunk": self.stream_results_all_tokens_per_chunk, } def __repr__(self): diff --git a/prompting/forward.py b/prompting/forward.py index ed4affb41..6189bc552 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -74,7 +74,9 @@ async def process_stream(uid: int, async_iterator: Awaitable, tokenizer: Tokeniz ) accumulated_chunks.append(synapse.completion) - accumulated_chunks_timings.append(time.time() - start_time) + accumulated_chunks_timings.append(time.time() - start_time) + tokens_in_completion = len(tokenizer.tokenize(synapse.completion)) + accumulated_tokens_per_chunk.append(tokens_in_completion) except Exception as e: exception = e traceback_details = traceback.format_exc() @@ -91,6 +93,7 @@ async def process_stream(uid: int, async_iterator: Awaitable, tokenizer: Tokeniz return SynapseStreamResult( accumulated_chunks=accumulated_chunks, accumulated_chunks_timings=accumulated_chunks_timings, + tokens_per_chunk=accumulated_tokens_per_chunk, synapse=synapse, uid=uid, exception=exception diff --git a/prompting/rewards/pipeline.py b/prompting/rewards/pipeline.py index a05ea137d..2b126c401 100644 --- a/prompting/rewards/pipeline.py +++ b/prompting/rewards/pipeline.py @@ -92,6 +92,7 @@ def load_reward_pipeline(self): for task in self.selected_tasks: active_reward_models += TASKS[task].reward_definition active_reward_models += TASKS[task].penalty_definition + active_reward_models += TASKS[task].global_penalty_definition # Instantiate only the required reward models reward_models = {} diff --git a/prompting/rewards/reward.py b/prompting/rewards/reward.py index 7a1f33832..beed5ad9f 100644 --- a/prompting/rewards/reward.py +++ b/prompting/rewards/reward.py @@ -52,16 +52,23 @@ def __init__(self, reward_pipeline, agent, response_event, device): self.device = device self.task_rewards = agent.task.reward_definition self.task_penalties = agent.task.penalty_definition + self.global_task_penalties = agent.task.global_penalty_definition self.reward_events = self.reward_responses( reference=agent.task.reference, models=self.task_rewards, reward_type=RewardModelTypeEnum.WEIGHTED_REWARD, ) - self.penalty_events = self.reward_responses( + task_penalties= self.reward_responses( reference=agent.challenge, models=self.task_penalties, reward_type=RewardModelTypeEnum.PENALTY, ) + global_task_penalties = self.reward_responses( + reference=agent.challenge, + models=self.global_task_penalties, + reward_type=RewardModelTypeEnum.PENALTY, + ) + self.penalty_events = task_penalties + global_task_penalties self.rewards = self.total_reward() def __state_dict__(self, full=False): diff --git a/prompting/rewards/streaming.py b/prompting/rewards/streaming.py index 44bb1c23f..039326a66 100644 --- a/prompting/rewards/streaming.py +++ b/prompting/rewards/streaming.py @@ -15,13 +15,13 @@ def name(self) -> str: def __init__(self, max_tokens_per_chunk:int, **kwargs): super().__init__() self.max_tokens_per_chunk = max_tokens_per_chunk - self.max_penalty = 0.2 + def reward(self, _: str, response_event: DendriteResponseEvent) -> BatchRewardOutput: """Compute difference scores given a completion and reference pair.""" rewards = [] timings = [] - penalty_per_exceeding_chunk = 0.01 + penalty_per_exceeding_chunk = 0.25 # Iterate through each chunk of response tokens for response_tokens_per_chunks in response_event.stream_results_all_tokens_per_chunk: @@ -36,8 +36,8 @@ def reward(self, _: str, response_event: DendriteResponseEvent) -> BatchRewardOu # Record the timing for this computation timings.append(time.time() - start_time) - # Calculate the reward and ensure it does not go below max_penalty - rewards.append(max(1 - accumulated_penalty, self.max_penalty)) + # Calculate the reward and ensure it does not go above 1 + rewards.append(min(accumulated_penalty, 1)) # Create the output object with rewards, timings, and extra information output = BatchRewardOutput( diff --git a/prompting/tasks/task.py b/prompting/tasks/task.py index d837aac46..379c4d3cf 100644 --- a/prompting/tasks/task.py +++ b/prompting/tasks/task.py @@ -50,6 +50,10 @@ class Task(ABC): query_prompt = "" cleaner = None challenge_type = 'inference' + + global_penalty_definition = [ + dict(name="streaming", max_tokens_per_chunk=200, weight=0.2) + ] def __str__(self): return f"{self.__class__.__name__}(name={self.name!r}, desc={self.desc!r}, goal={self.goal!r}, query={self.query!r}, reference={self.reference!r}, topic={self.topic!r}, subtopic={self.subtopic!r}, tags={self.tags!r})" From c5d9d069d2c221ddd0c0d57600657b560ab27c5f Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sat, 18 May 2024 18:05:15 +0000 Subject: [PATCH 09/59] Increase default max memory to 80 GB --- prompting/llms/vllm_llm.py | 74 +++++++++++++++++++++++--------------- prompting/utils/config.py | 2 +- 2 files changed, 46 insertions(+), 30 deletions(-) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index 382982d47..3304bff12 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -24,18 +24,12 @@ from prompting.llms import BasePipeline, BaseLLM from prompting.mock import MockPipeline from prompting.llms.utils import calculate_gpu_requirements -from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel -def clean_gpu_cache(): - destroy_model_parallel() +def _clear_cache(): + """Explicitly run Python and Torch garbage collection""" gc.collect() torch.cuda.empty_cache() - if torch.distributed.is_initialized(): - torch.distributed.destroy_process_group() - - # Wait for the GPU to clean up - time.sleep(10) torch.cuda.synchronize() @@ -44,7 +38,7 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory if mock or model_id == "mock": return MockPipeline(model_id) - # Calculates the gpu memory utilization required to run the model within 20GB of GPU + # Calculates the gpu memory utilization required to run the model max_allowed_memory_allocation_in_bytes = max_allowed_memory_in_gb * 1e9 gpu_mem_utilization = calculate_gpu_requirements( device, gpus, max_allowed_memory_allocation_in_bytes @@ -52,7 +46,7 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory try: # Attempt to initialize the LLM - llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus) + llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus) # This solution implemented by @bkb2135 sets the eos_token_id directly for efficiency in vLLM usage. # This approach avoids the overhead of loading a tokenizer each time the custom eos token is needed. # Using the Hugging Face pipeline, the eos token specific to llama models was fetched and saved (128009). @@ -68,11 +62,20 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory class vLLMPipeline(BasePipeline): - def __init__(self, model_id: str, llm_max_allowed_memory_in_gb: int, device: str = None, gpus: int = 1, mock: bool = False): + def __init__( + self, + model_id: str, + llm_max_allowed_memory_in_gb: int, + device: str = None, + gpus: int = 1, + mock: bool = False + ): super().__init__() self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock self.gpus = gpus + self._inference_counter = 0 + self._clean_frequency = 20 def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: if self.mock: @@ -88,6 +91,14 @@ def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: ) output = self.llm.generate(composed_prompt, sampling_params, use_tqdm=True) response = output[0].outputs[0].text + + # vLLM cuda memory leak workaround + self._inference_counter += 1 + if self._inference_counter % self._clean_frequency == 0: + bt.logging.debug(f"Cleaning cache after {self._inference_counter} inferences") + _clear_cache() + self._inference_counter = 0 + return response @@ -109,7 +120,13 @@ def __init__( # Keep track of generation data using messages and times self.messages = [{"content": self.system_prompt, "role": "system"}] - self.times = [0] + self.times: List[float] = [0] + self._role_template = { + "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", + "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", + "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", + "end": "<|start_header_id|>assistant<|end_header_id|>", + } def query( self, @@ -125,29 +142,25 @@ def query( response = self.forward(messages=messages) response = self.clean_response(cleaner, response) - self.messages = messages + [{"content": response, "role": "assistant"}] - self.times = self.times + [0, time.time() - t0] + self.messages = messages + self.messages.append({"content": response, "role": "assistant"}) + self.times.extend((0, time.time() - t0)) return response - def _make_prompt(self, messages: List[Dict[str, str]]): - composed_prompt = "" + def _make_prompt(self, messages: List[Dict[str, str]]) -> str: + composed_prompt: List[str] = [] for message in messages: - if message["role"] == "system": - composed_prompt += ( - f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' - ) - elif message["role"] == "user": - composed_prompt += f'<|start_header_id|>user<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' - elif message["role"] == "assistant": - composed_prompt += ( - f'<|start_header_id|>assistant<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' - ) + role = message["role"] + if role not in self._role_template: + continue + content = message["content"] + composed_prompt.append(self._role_template[role].format(content)) # Adds final tag indicating the assistant's turn - composed_prompt += "<|start_header_id|>assistant<|end_header_id|>" - return composed_prompt + composed_prompt.append(self._role_template["end"]) + return "".join(composed_prompt) def forward(self, messages: List[Dict[str, str]]): # make composed prompt from messages @@ -164,7 +177,10 @@ def forward(self, messages: List[Dict[str, str]]): if __name__ == "__main__": # Example usage llm_pipeline = vLLMPipeline( - model_id="HuggingFaceH4/zephyr-7b-beta", device="cuda", mock=False + model_id="casperhansen/llama-3-70b-instruct-awq", + device="cuda", + llm_max_allowed_memory_in_gb=80, + gpus=2, ) llm = vLLM_LLM(llm_pipeline, system_prompt="You are a helpful AI assistant") diff --git a/prompting/utils/config.py b/prompting/utils/config.py index 2f4c148f6..2f995d36f 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -83,7 +83,7 @@ def add_args(cls, parser): "--neuron.llm_max_allowed_memory_in_gb", type=int, help="The max gpu memory utilization set for initializing the model. This parameter currently reflects on the property `gpu_memory_utilization` of vllm", - default=60, + default=80, ) parser.add_argument( From 02cd837d31f43d9460c4141379ede5d4dd51ad2d Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sat, 18 May 2024 18:09:30 +0000 Subject: [PATCH 10/59] Change example run to 1 GPU --- prompting/llms/vllm_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index 3304bff12..81cd999a3 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -180,7 +180,7 @@ def forward(self, messages: List[Dict[str, str]]): model_id="casperhansen/llama-3-70b-instruct-awq", device="cuda", llm_max_allowed_memory_in_gb=80, - gpus=2, + gpus=1, ) llm = vLLM_LLM(llm_pipeline, system_prompt="You are a helpful AI assistant") From 107a4d694160b4728fcd03111a9b2a778d9a15e2 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 21:01:29 +0000 Subject: [PATCH 11/59] Fix validator hotkeys IndexError --- prompting/base/validator.py | 10 ++++++---- prompting/llms/vllm_llm.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 893bf2055..91c8119f2 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -297,13 +297,15 @@ def resync_metagraph(self): "Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages" ) # Zero out all hotkeys that have been replaced. - for uid, hotkey in enumerate(self.hotkeys): - if hotkey != self.metagraph.hotkeys[uid]: - self.scores[uid] = 0 # hotkey has been replaced + min_len = min(len(self.hotkeys), len(self.metagraph.hotkeys)) + for uid in range(min_len): + if self.hotkeys[uid] != self.metagraph.hotkeys[uid]: + # hotkey has been replaced + self.scores[uid] = 0 # Check to see if the metagraph has changed size. # If so, we need to add new hotkeys and moving averages. - if len(self.hotkeys) < len(self.metagraph.hotkeys): + if len(self.hotkeys) != len(self.metagraph.hotkeys): # Update the size of the moving average scores. new_moving_average = torch.zeros((self.metagraph.n)).to(self.device) min_len = min(len(self.hotkeys), len(self.scores)) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index 81cd999a3..b7c259b3b 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -74,8 +74,8 @@ def __init__( self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock self.gpus = gpus - self._inference_counter = 0 - self._clean_frequency = 20 + self._inference_counter = 15 + self._clean_frequency = 1 def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: if self.mock: @@ -95,8 +95,10 @@ def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: # vLLM cuda memory leak workaround self._inference_counter += 1 if self._inference_counter % self._clean_frequency == 0: - bt.logging.debug(f"Cleaning cache after {self._inference_counter} inferences") + bt.logging.info(f"Cleaning cache after {self._inference_counter} inferences") + print(f"Memory allocated before: {torch.cuda.memory_allocated()}") _clear_cache() + print(f"Memory allocated after: {torch.cuda.memory_allocated()}") self._inference_counter = 0 return response From 807f162ccfd3b84c24db30e167329f163fa9011b Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 21:30:39 +0000 Subject: [PATCH 12/59] Revert vllm changes --- prompting/llms/vllm_llm.py | 76 +++++++++++++++----------------------- 1 file changed, 29 insertions(+), 47 deletions(-) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index b7c259b3b..382982d47 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -24,12 +24,18 @@ from prompting.llms import BasePipeline, BaseLLM from prompting.mock import MockPipeline from prompting.llms.utils import calculate_gpu_requirements +from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel -def _clear_cache(): - """Explicitly run Python and Torch garbage collection""" +def clean_gpu_cache(): + destroy_model_parallel() gc.collect() torch.cuda.empty_cache() + if torch.distributed.is_initialized(): + torch.distributed.destroy_process_group() + + # Wait for the GPU to clean up + time.sleep(10) torch.cuda.synchronize() @@ -38,7 +44,7 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory if mock or model_id == "mock": return MockPipeline(model_id) - # Calculates the gpu memory utilization required to run the model + # Calculates the gpu memory utilization required to run the model within 20GB of GPU max_allowed_memory_allocation_in_bytes = max_allowed_memory_in_gb * 1e9 gpu_mem_utilization = calculate_gpu_requirements( device, gpus, max_allowed_memory_allocation_in_bytes @@ -46,7 +52,7 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory try: # Attempt to initialize the LLM - llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus) + llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus) # This solution implemented by @bkb2135 sets the eos_token_id directly for efficiency in vLLM usage. # This approach avoids the overhead of loading a tokenizer each time the custom eos token is needed. # Using the Hugging Face pipeline, the eos token specific to llama models was fetched and saved (128009). @@ -62,20 +68,11 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory class vLLMPipeline(BasePipeline): - def __init__( - self, - model_id: str, - llm_max_allowed_memory_in_gb: int, - device: str = None, - gpus: int = 1, - mock: bool = False - ): + def __init__(self, model_id: str, llm_max_allowed_memory_in_gb: int, device: str = None, gpus: int = 1, mock: bool = False): super().__init__() self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock self.gpus = gpus - self._inference_counter = 15 - self._clean_frequency = 1 def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: if self.mock: @@ -91,16 +88,6 @@ def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: ) output = self.llm.generate(composed_prompt, sampling_params, use_tqdm=True) response = output[0].outputs[0].text - - # vLLM cuda memory leak workaround - self._inference_counter += 1 - if self._inference_counter % self._clean_frequency == 0: - bt.logging.info(f"Cleaning cache after {self._inference_counter} inferences") - print(f"Memory allocated before: {torch.cuda.memory_allocated()}") - _clear_cache() - print(f"Memory allocated after: {torch.cuda.memory_allocated()}") - self._inference_counter = 0 - return response @@ -122,13 +109,7 @@ def __init__( # Keep track of generation data using messages and times self.messages = [{"content": self.system_prompt, "role": "system"}] - self.times: List[float] = [0] - self._role_template = { - "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", - "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", - "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", - "end": "<|start_header_id|>assistant<|end_header_id|>", - } + self.times = [0] def query( self, @@ -144,25 +125,29 @@ def query( response = self.forward(messages=messages) response = self.clean_response(cleaner, response) - self.messages = messages - self.messages.append({"content": response, "role": "assistant"}) - self.times.extend((0, time.time() - t0)) + self.messages = messages + [{"content": response, "role": "assistant"}] + self.times = self.times + [0, time.time() - t0] return response - def _make_prompt(self, messages: List[Dict[str, str]]) -> str: - composed_prompt: List[str] = [] + def _make_prompt(self, messages: List[Dict[str, str]]): + composed_prompt = "" for message in messages: - role = message["role"] - if role not in self._role_template: - continue - content = message["content"] - composed_prompt.append(self._role_template[role].format(content)) + if message["role"] == "system": + composed_prompt += ( + f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' + ) + elif message["role"] == "user": + composed_prompt += f'<|start_header_id|>user<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' + elif message["role"] == "assistant": + composed_prompt += ( + f'<|start_header_id|>assistant<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' + ) # Adds final tag indicating the assistant's turn - composed_prompt.append(self._role_template["end"]) - return "".join(composed_prompt) + composed_prompt += "<|start_header_id|>assistant<|end_header_id|>" + return composed_prompt def forward(self, messages: List[Dict[str, str]]): # make composed prompt from messages @@ -179,10 +164,7 @@ def forward(self, messages: List[Dict[str, str]]): if __name__ == "__main__": # Example usage llm_pipeline = vLLMPipeline( - model_id="casperhansen/llama-3-70b-instruct-awq", - device="cuda", - llm_max_allowed_memory_in_gb=80, - gpus=1, + model_id="HuggingFaceH4/zephyr-7b-beta", device="cuda", mock=False ) llm = vLLM_LLM(llm_pipeline, system_prompt="You are a helpful AI assistant") From 77c913e2e58543b09f99912ae5931cabd05f6152 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 21:34:19 +0000 Subject: [PATCH 13/59] Remove redundant validator code --- prompting/base/validator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 91c8119f2..4034f94b9 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -307,10 +307,7 @@ def resync_metagraph(self): # If so, we need to add new hotkeys and moving averages. if len(self.hotkeys) != len(self.metagraph.hotkeys): # Update the size of the moving average scores. - new_moving_average = torch.zeros((self.metagraph.n)).to(self.device) - min_len = min(len(self.hotkeys), len(self.scores)) - new_moving_average[:min_len] = self.scores[:min_len] - self.scores = new_moving_average + self.scores = self.scores[:min_len] # Update the hotkeys. self.hotkeys = copy.deepcopy(self.metagraph.hotkeys) From 8c2cfced5f6d138672ff82a9699eb1c7331d936f Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 22:08:53 +0000 Subject: [PATCH 14/59] Fix comment style --- prompting/base/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 4034f94b9..1b964eeb8 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -300,7 +300,7 @@ def resync_metagraph(self): min_len = min(len(self.hotkeys), len(self.metagraph.hotkeys)) for uid in range(min_len): if self.hotkeys[uid] != self.metagraph.hotkeys[uid]: - # hotkey has been replaced + # Hotkey has been replaced. self.scores[uid] = 0 # Check to see if the metagraph has changed size. From a90839221e602359152cfbcc90749f5dc4e2d4c1 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 23:19:04 +0000 Subject: [PATCH 15/59] Restore validator changes --- prompting/base/validator.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 91c8119f2..893bf2055 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -297,15 +297,13 @@ def resync_metagraph(self): "Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages" ) # Zero out all hotkeys that have been replaced. - min_len = min(len(self.hotkeys), len(self.metagraph.hotkeys)) - for uid in range(min_len): - if self.hotkeys[uid] != self.metagraph.hotkeys[uid]: - # hotkey has been replaced - self.scores[uid] = 0 + for uid, hotkey in enumerate(self.hotkeys): + if hotkey != self.metagraph.hotkeys[uid]: + self.scores[uid] = 0 # hotkey has been replaced # Check to see if the metagraph has changed size. # If so, we need to add new hotkeys and moving averages. - if len(self.hotkeys) != len(self.metagraph.hotkeys): + if len(self.hotkeys) < len(self.metagraph.hotkeys): # Update the size of the moving average scores. new_moving_average = torch.zeros((self.metagraph.n)).to(self.device) min_len = min(len(self.hotkeys), len(self.scores)) From 6c9eed9fae3b0b9baf01f2a111d45282a418643b Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 23:20:23 +0000 Subject: [PATCH 16/59] Restore config.py --- prompting/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/utils/config.py b/prompting/utils/config.py index 2f995d36f..2f4c148f6 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -83,7 +83,7 @@ def add_args(cls, parser): "--neuron.llm_max_allowed_memory_in_gb", type=int, help="The max gpu memory utilization set for initializing the model. This parameter currently reflects on the property `gpu_memory_utilization` of vllm", - default=80, + default=60, ) parser.add_argument( From 44ec400378cb174bff4762ab32fb3f5de5983e65 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 23:21:41 +0000 Subject: [PATCH 17/59] Remove clean function --- prompting/llms/vllm_llm.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index b7c259b3b..18f503deb 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -26,13 +26,6 @@ from prompting.llms.utils import calculate_gpu_requirements -def _clear_cache(): - """Explicitly run Python and Torch garbage collection""" - gc.collect() - torch.cuda.empty_cache() - torch.cuda.synchronize() - - def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory_in_gb: int, mock=False): """Loads the VLLM pipeline for the LLM, or a mock pipeline if mock=True""" if mock or model_id == "mock": @@ -74,8 +67,6 @@ def __init__( self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock self.gpus = gpus - self._inference_counter = 15 - self._clean_frequency = 1 def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: if self.mock: @@ -91,16 +82,6 @@ def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: ) output = self.llm.generate(composed_prompt, sampling_params, use_tqdm=True) response = output[0].outputs[0].text - - # vLLM cuda memory leak workaround - self._inference_counter += 1 - if self._inference_counter % self._clean_frequency == 0: - bt.logging.info(f"Cleaning cache after {self._inference_counter} inferences") - print(f"Memory allocated before: {torch.cuda.memory_allocated()}") - _clear_cache() - print(f"Memory allocated after: {torch.cuda.memory_allocated()}") - self._inference_counter = 0 - return response From 07ff31a5c98a9b7702b9f789640f7ccec6ff718a Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Sun, 19 May 2024 23:22:45 +0000 Subject: [PATCH 18/59] Update comments --- prompting/llms/vllm_llm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index 18f503deb..58fca8af0 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -31,7 +31,7 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory if mock or model_id == "mock": return MockPipeline(model_id) - # Calculates the gpu memory utilization required to run the model + # Calculates the gpu memory utilization required to run the model. max_allowed_memory_allocation_in_bytes = max_allowed_memory_in_gb * 1e9 gpu_mem_utilization = calculate_gpu_requirements( device, gpus, max_allowed_memory_allocation_in_bytes From 154a5efbf28973425d7852d18e0aea762ab75466 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Mon, 20 May 2024 14:05:25 +0000 Subject: [PATCH 19/59] Update miner requirements, increase minumum storage --- min_compute.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/min_compute.yml b/min_compute.yml index bc2adcc3f..ae6d77f72 100644 --- a/min_compute.yml +++ b/min_compute.yml @@ -22,12 +22,12 @@ compute_spec: gpu: required: True # Does the application require a GPU? - min_vram: 20 # Minimum GPU VRAM (GB) - recommended_vram: 24 # Recommended GPU VRAM (GB) + min_vram: 80 # Minimum GPU VRAM (GB) + recommended_vram: 80 # Recommended GPU VRAM (GB) cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) min_compute_capability: 6.0 # Minimum CUDA compute capability recommended_compute_capability: 7.0 # Recommended CUDA compute capability - recommended_gpu: "NVIDIA A10" # Recommended GPU to purchase/rent + recommended_gpu: "NVIDIA A100" # Recommended GPU to purchase/rent memory: min_ram: 16 # Minimum RAM (GB) @@ -36,7 +36,7 @@ compute_spec: ram_type: "DDR4" # RAM type (e.g., DDR4, DDR3, etc.) storage: - min_space: 24 # Minimum free storage space (GB) + min_space: 60 # Minimum free storage space (GB) recommended_space: 100 # Recommended free storage space (GB) type: "SSD" # Preferred storage type (e.g., SSD, HDD) min_iops: 1000 # Minimum I/O operations per second (if applicable) @@ -71,7 +71,7 @@ compute_spec: ram_type: "DDR4" # RAM type (e.g., DDR4, DDR3, etc.) storage: - min_space: 40 # Minimum free storage space (GB) + min_space: 60 # Minimum free storage space (GB) recommended_space: 100 # Recommended free storage space (GB) type: "SSD" # Preferred storage type (e.g., SSD, HDD) min_iops: 1000 # Minimum I/O operations per second (if applicable) From 919780de5d344ecc1a1b68520e1b6a7e2de77200 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Mon, 20 May 2024 20:59:27 +0000 Subject: [PATCH 20/59] drops synapse final data out of accumulated chunks --- prompting/forward.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/prompting/forward.py b/prompting/forward.py index 6189bc552..264d6a450 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -72,11 +72,6 @@ async def process_stream(uid: int, async_iterator: Awaitable, tokenizer: Tokeniz raise ValueError( f"Something went wrong with miner uid {uid}, Synapse is not StreamPromptingSynapse." ) - - accumulated_chunks.append(synapse.completion) - accumulated_chunks_timings.append(time.time() - start_time) - tokens_in_completion = len(tokenizer.tokenize(synapse.completion)) - accumulated_tokens_per_chunk.append(tokens_in_completion) except Exception as e: exception = e traceback_details = traceback.format_exc() From 28fe4bc1510f400541aa13ea1a12d987c685ce8e Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 21 May 2024 13:08:45 +0000 Subject: [PATCH 21/59] Set maximum GPU mem to 65 --- prompting/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/utils/config.py b/prompting/utils/config.py index 2f995d36f..17175506b 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -83,7 +83,7 @@ def add_args(cls, parser): "--neuron.llm_max_allowed_memory_in_gb", type=int, help="The max gpu memory utilization set for initializing the model. This parameter currently reflects on the property `gpu_memory_utilization` of vllm", - default=80, + default=65, ) parser.add_argument( From 43862f66add4010e89caad48f4b5b19f2413e566 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 21 May 2024 13:23:19 +0000 Subject: [PATCH 22/59] Set maximum GPU mem to 62 --- prompting/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/utils/config.py b/prompting/utils/config.py index 17175506b..b869d4fbd 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -83,7 +83,7 @@ def add_args(cls, parser): "--neuron.llm_max_allowed_memory_in_gb", type=int, help="The max gpu memory utilization set for initializing the model. This parameter currently reflects on the property `gpu_memory_utilization` of vllm", - default=65, + default=62, ) parser.add_argument( From 5337dde762724313070bcfc535ca04cabfb15f3b Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 21 May 2024 13:25:38 +0000 Subject: [PATCH 23/59] Decrease min memory to 62 --- min_compute.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/min_compute.yml b/min_compute.yml index ae6d77f72..99991ef2f 100644 --- a/min_compute.yml +++ b/min_compute.yml @@ -22,7 +22,7 @@ compute_spec: gpu: required: True # Does the application require a GPU? - min_vram: 80 # Minimum GPU VRAM (GB) + min_vram: 62 # Minimum GPU VRAM (GB) recommended_vram: 80 # Recommended GPU VRAM (GB) cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) min_compute_capability: 6.0 # Minimum CUDA compute capability @@ -57,7 +57,7 @@ compute_spec: gpu: required: True # Does the application require a GPU? - min_vram: 80 # Minimum GPU VRAM (GB) + min_vram: 62 # Minimum GPU VRAM (GB) recommended_vram: 80 # Recommended GPU VRAM (GB) cuda_cores: 1024 # Minimum number of CUDA cores (if applicable) min_compute_capability: 6.0 # Minimum CUDA compute capability From 4b862e762364a2f41aeffe5fb63e2a6b2d2e2a5a Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 21 May 2024 20:27:53 +0000 Subject: [PATCH 24/59] adds unit tests for streaming reward model --- tests/test_reward.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 tests/test_reward.py diff --git a/tests/test_reward.py b/tests/test_reward.py new file mode 100644 index 000000000..d074fb9fe --- /dev/null +++ b/tests/test_reward.py @@ -0,0 +1,30 @@ +import pytest +import torch +from unittest.mock import MagicMock +from prompting.rewards import StreamingRewardModel + +@pytest.mark.parametrize( + "all_tokens_per_chunk, expected_rewards", + [ + ([[1, 1, 1, 1, 1]], [0]), # No penalties + ([[2, 1, 1, 1, 1]], [0.25]), # First chunk exceeds + ([[2, 2, 1, 1, 1]], [0.5]), # Two chunks exceed + ([[2, 2, 2, 1, 1]], [0.75]), # Three chunks exceed + ([[2, 2, 2, 2, 1]], [1]), # Four chunks exceed + ([[2, 2, 2, 2, 2, 2]], [1]), # Sum of chunks > 1, clipped at 1 + ] +) +def test_streaming_reward_model(all_tokens_per_chunk, expected_rewards): + max_tokens_per_chunk = 1 + response_event = MagicMock() + response_event.stream_results_all_tokens_per_chunk = all_tokens_per_chunk + + model = StreamingRewardModel(max_tokens_per_chunk) + + output = model.reward("", response_event) + + assert torch.allclose(output.rewards, torch.tensor(expected_rewards, dtype=torch.float)), \ + f"Expected rewards {expected_rewards} but got {output.rewards.tolist()}" + +if __name__ == "__main__": + pytest.main() \ No newline at end of file From da4a066e3be5a52763a482e01a42e3e18ba0fed9 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 21 May 2024 21:25:52 +0000 Subject: [PATCH 25/59] Properly update removed or replaced hotkeys --- prompting/base/validator.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 1b964eeb8..acf575f22 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -296,18 +296,18 @@ def resync_metagraph(self): bt.logging.info( "Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages" ) - # Zero out all hotkeys that have been replaced. - min_len = min(len(self.hotkeys), len(self.metagraph.hotkeys)) - for uid in range(min_len): - if self.hotkeys[uid] != self.metagraph.hotkeys[uid]: - # Hotkey has been replaced. - self.scores[uid] = 0 - - # Check to see if the metagraph has changed size. - # If so, we need to add new hotkeys and moving averages. - if len(self.hotkeys) != len(self.metagraph.hotkeys): - # Update the size of the moving average scores. - self.scores = self.scores[:min_len] + + # Create a dictionary for quick lookup of hotkey indices. + hotkey_to_index = {hotkey: idx for idx, hotkey in enumerate(self.hotkeys)} + + # Initialize a tensor for the updated scores. + new_scores = torch.zeros(len(self.metagraph.hotkeys), dtype=self.scores.dtype) + + # Identify the hotkeys that have been deleted or replaced. + for idx, hotkey in enumerate(self.metagraph.hotkeys): + if hotkey in hotkey_to_index: + new_scores[idx] = self.scores[hotkey_to_index[hotkey]] + self.scores = new_scores # Update the hotkeys. self.hotkeys = copy.deepcopy(self.metagraph.hotkeys) From 4d292f52a442cb21f5535541c39f5af6610c89a2 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 21 May 2024 21:35:30 +0000 Subject: [PATCH 26/59] Set scores device --- prompting/base/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index acf575f22..3708bc5f9 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -301,7 +301,7 @@ def resync_metagraph(self): hotkey_to_index = {hotkey: idx for idx, hotkey in enumerate(self.hotkeys)} # Initialize a tensor for the updated scores. - new_scores = torch.zeros(len(self.metagraph.hotkeys), dtype=self.scores.dtype) + new_scores = torch.zeros(len(self.metagraph.hotkeys), dtype=self.scores.dtype).to(self.scores.device) # Identify the hotkeys that have been deleted or replaced. for idx, hotkey in enumerate(self.metagraph.hotkeys): From aa61f86fe9edd87781bd5d0138539e4574e124e2 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 21 May 2024 23:30:07 +0000 Subject: [PATCH 27/59] Optimize device setting --- prompting/base/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 3708bc5f9..2b6b8d934 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -301,7 +301,7 @@ def resync_metagraph(self): hotkey_to_index = {hotkey: idx for idx, hotkey in enumerate(self.hotkeys)} # Initialize a tensor for the updated scores. - new_scores = torch.zeros(len(self.metagraph.hotkeys), dtype=self.scores.dtype).to(self.scores.device) + new_scores = torch.zeros(len(self.metagraph.hotkeys), dtype=self.scores.dtype, device=self.scores.device) # Identify the hotkeys that have been deleted or replaced. for idx, hotkey in enumerate(self.metagraph.hotkeys): From 0b8dc10a995b96bb1b6581988ddb44fd5328dfd7 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Wed, 22 May 2024 00:01:55 +0000 Subject: [PATCH 28/59] Revert vllm_llm to staging --- prompting/llms/vllm_llm.py | 53 +++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index 9ae5d5169..58fca8af0 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -24,7 +24,6 @@ from prompting.llms import BasePipeline, BaseLLM from prompting.mock import MockPipeline from prompting.llms.utils import calculate_gpu_requirements -from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory_in_gb: int, mock=False): @@ -40,7 +39,7 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory try: # Attempt to initialize the LLM - llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus) + llm = LLM(model=model_id, gpu_memory_utilization = gpu_mem_utilization, quantization="AWQ", tensor_parallel_size=gpus) # This solution implemented by @bkb2135 sets the eos_token_id directly for efficiency in vLLM usage. # This approach avoids the overhead of loading a tokenizer each time the custom eos token is needed. # Using the Hugging Face pipeline, the eos token specific to llama models was fetched and saved (128009). @@ -56,7 +55,14 @@ def load_vllm_pipeline(model_id: str, device: str, gpus: int, max_allowed_memory class vLLMPipeline(BasePipeline): - def __init__(self, model_id: str, llm_max_allowed_memory_in_gb: int, device: str = None, gpus: int = 1, mock: bool = False): + def __init__( + self, + model_id: str, + llm_max_allowed_memory_in_gb: int, + device: str = None, + gpus: int = 1, + mock: bool = False + ): super().__init__() self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock @@ -97,7 +103,13 @@ def __init__( # Keep track of generation data using messages and times self.messages = [{"content": self.system_prompt, "role": "system"}] - self.times = [0] + self.times: List[float] = [0] + self._role_template = { + "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", + "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", + "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>", + "end": "<|start_header_id|>assistant<|end_header_id|>", + } def query( self, @@ -113,29 +125,25 @@ def query( response = self.forward(messages=messages) response = self.clean_response(cleaner, response) - self.messages = messages + [{"content": response, "role": "assistant"}] - self.times = self.times + [0, time.time() - t0] + self.messages = messages + self.messages.append({"content": response, "role": "assistant"}) + self.times.extend((0, time.time() - t0)) return response - def _make_prompt(self, messages: List[Dict[str, str]]): - composed_prompt = "" + def _make_prompt(self, messages: List[Dict[str, str]]) -> str: + composed_prompt: List[str] = [] for message in messages: - if message["role"] == "system": - composed_prompt += ( - f'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' - ) - elif message["role"] == "user": - composed_prompt += f'<|start_header_id|>user<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' - elif message["role"] == "assistant": - composed_prompt += ( - f'<|start_header_id|>assistant<|end_header_id|>\n{{{{ {message["content"]} }}}}<|eot_id|>' - ) + role = message["role"] + if role not in self._role_template: + continue + content = message["content"] + composed_prompt.append(self._role_template[role].format(content)) # Adds final tag indicating the assistant's turn - composed_prompt += "<|start_header_id|>assistant<|end_header_id|>" - return composed_prompt + composed_prompt.append(self._role_template["end"]) + return "".join(composed_prompt) def forward(self, messages: List[Dict[str, str]]): # make composed prompt from messages @@ -152,7 +160,10 @@ def forward(self, messages: List[Dict[str, str]]): if __name__ == "__main__": # Example usage llm_pipeline = vLLMPipeline( - model_id="HuggingFaceH4/zephyr-7b-beta", device="cuda", mock=False + model_id="casperhansen/llama-3-70b-instruct-awq", + device="cuda", + llm_max_allowed_memory_in_gb=80, + gpus=1, ) llm = vLLM_LLM(llm_pipeline, system_prompt="You are a helpful AI assistant") From b4fd266b0b73277dd82e67c64ae211af7fad0c53 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Wed, 22 May 2024 00:04:41 +0000 Subject: [PATCH 29/59] Update README with 62 GB VRAM --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 42e9be125..51ba8ffe0 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,8 @@ bash install.sh # Compute Requirements -1. To run a **validator**, you will need at least 24GB of VRAM. -2. To run the default huggingface **miner**, you will need at least 18GB of VRAM. +1. To run a **validator**, you will need at least 62GB of VRAM. +2. To run the default huggingface **miner**, you will need at least 62GB of VRAM. From 6c851891790869c886d3687adf4b5554724f3f04 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Wed, 22 May 2024 13:43:26 +0000 Subject: [PATCH 30/59] Change readme description to Llama3 --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 51ba8ffe0..59ae86b0d 100644 --- a/README.md +++ b/README.md @@ -77,9 +77,9 @@ For ease of use, you can run the scripts as well with PM2. Installation of PM2 i sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm install pm2 -g && pm2 update ``` -Example of running a SOLAR miner: +Example of running a Llama3 miner: ```bash -pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name solar_miner -- --netuid 1 --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug +pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 1 --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug ``` # Testnet @@ -90,7 +90,7 @@ In order to run on testnet, you will need to go through the same hotkey registra To run: ```bash -pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name solar_miner -- --netuid 61 --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug +pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 61 --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug ``` # Limitations From 9d6796307dffa586e8c81bfd916639525d0fabc9 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Wed, 22 May 2024 18:25:23 +0000 Subject: [PATCH 31/59] adjust reward calculation to include global penalties --- prompting/rewards/reward.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/prompting/rewards/reward.py b/prompting/rewards/reward.py index beed5ad9f..d51ce4c77 100644 --- a/prompting/rewards/reward.py +++ b/prompting/rewards/reward.py @@ -28,12 +28,16 @@ class RewardEvent: # implement custom asdict to return a dict with the same keys as the dataclass using the model name def asdict(self) -> dict: return { - f"{self.model_name}_raw_{self.model_type.value}": self.rewards.tolist(), - f"{self.model_name}_{self.model_type.value}": self.rewards_normalized.tolist(), - f"{self.model_name}_{self.model_type.value}_timings": self.timings.tolist(), + f"{self.model_name}_raw_{self.model_type.value}": self.tensor_to_rounded_list(self.rewards), + f"{self.model_name}_{self.model_type.value}": self.tensor_to_rounded_list(self.rewards_normalized, 4), + f"{self.model_name}_{self.model_type.value}_timings": self.tensor_to_rounded_list(self.timings), f"{self.model_name}_{self.model_type.value}_batch_time": self.batch_time, f"{self.model_name}_{self.model_type.value}_extra_info": self.extra_info, } + + def tensor_to_rounded_list(self, tensor, decimals=6): + # Convert the tensor elements to floats and round them to 6 decimal places + return [round(float(element), decimals) for element in tensor] class RewardResult: @@ -51,24 +55,17 @@ def __init__(self, reward_pipeline, agent, response_event, device): self.response_event = response_event self.device = device self.task_rewards = agent.task.reward_definition - self.task_penalties = agent.task.penalty_definition - self.global_task_penalties = agent.task.global_penalty_definition + self.task_penalties = agent.task.penalty_definition + agent.task.global_penalty_definition self.reward_events = self.reward_responses( reference=agent.task.reference, models=self.task_rewards, reward_type=RewardModelTypeEnum.WEIGHTED_REWARD, ) - task_penalties= self.reward_responses( + self.penalty_events = self.reward_responses( reference=agent.challenge, models=self.task_penalties, reward_type=RewardModelTypeEnum.PENALTY, ) - global_task_penalties = self.reward_responses( - reference=agent.challenge, - models=self.global_task_penalties, - reward_type=RewardModelTypeEnum.PENALTY, - ) - self.penalty_events = task_penalties + global_task_penalties self.rewards = self.total_reward() def __state_dict__(self, full=False): From 369166d646b0d3658bb5bd82dc751a0aa51d0165 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Wed, 22 May 2024 23:53:00 +0000 Subject: [PATCH 32/59] Revert validator changes --- prompting/base/validator.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 2b6b8d934..893bf2055 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -296,18 +296,19 @@ def resync_metagraph(self): bt.logging.info( "Metagraph updated, re-syncing hotkeys, dendrite pool and moving averages" ) - - # Create a dictionary for quick lookup of hotkey indices. - hotkey_to_index = {hotkey: idx for idx, hotkey in enumerate(self.hotkeys)} - - # Initialize a tensor for the updated scores. - new_scores = torch.zeros(len(self.metagraph.hotkeys), dtype=self.scores.dtype, device=self.scores.device) - - # Identify the hotkeys that have been deleted or replaced. - for idx, hotkey in enumerate(self.metagraph.hotkeys): - if hotkey in hotkey_to_index: - new_scores[idx] = self.scores[hotkey_to_index[hotkey]] - self.scores = new_scores + # Zero out all hotkeys that have been replaced. + for uid, hotkey in enumerate(self.hotkeys): + if hotkey != self.metagraph.hotkeys[uid]: + self.scores[uid] = 0 # hotkey has been replaced + + # Check to see if the metagraph has changed size. + # If so, we need to add new hotkeys and moving averages. + if len(self.hotkeys) < len(self.metagraph.hotkeys): + # Update the size of the moving average scores. + new_moving_average = torch.zeros((self.metagraph.n)).to(self.device) + min_len = min(len(self.hotkeys), len(self.scores)) + new_moving_average[:min_len] = self.scores[:min_len] + self.scores = new_moving_average # Update the hotkeys. self.hotkeys = copy.deepcopy(self.metagraph.hotkeys) From 03e37bdba88a77fa3e02159a9eb9ec3bab9de1e7 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Thu, 23 May 2024 12:40:34 +0000 Subject: [PATCH 33/59] Update miner deps --- install.sh | 5 +++++ requirements.txt | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/install.sh b/install.sh index 5b46c863c..14ac77cea 100644 --- a/install.sh +++ b/install.sh @@ -5,3 +5,8 @@ pip uninstall mathgenerator -y # Installing package from the current directory pip install -e . + +# Install AutoAWQ without dependencies, to avoid conflicts with lower version of transformers. +pip install zstandard --no-deps +pip installautoawq-kernels --no-deps +pip install autoawq==0.2.5 --no-deps diff --git a/requirements.txt b/requirements.txt index 9a91e8df0..94fd7b9cc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,6 @@ datasets==2.14.6 deprecation==2.1.0 torch==2.1.1 torchmetrics -transformers==4.36.2 pre-commit==3.3.2 git+https://github.com/synapse-alpha/mathgenerator.git@main#egg=mathgenerator numpy==1.22.0 @@ -24,4 +23,5 @@ python-dotenv wikipedia_sections vllm loguru -argostranslate \ No newline at end of file +argostranslate +transformers==4.41.1 \ No newline at end of file From ffeb58dc4c636d576b0baf8c41b924482cf25960 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Thu, 23 May 2024 13:05:50 +0000 Subject: [PATCH 34/59] Fix install script --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 14ac77cea..1cc496e3f 100644 --- a/install.sh +++ b/install.sh @@ -8,5 +8,5 @@ pip install -e . # Install AutoAWQ without dependencies, to avoid conflicts with lower version of transformers. pip install zstandard --no-deps -pip installautoawq-kernels --no-deps +pip install autoawq-kernels --no-deps pip install autoawq==0.2.5 --no-deps From c8ef7a4934fb33c726b86bd1836a9dddb8ac9eeb Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Thu, 23 May 2024 13:07:53 +0000 Subject: [PATCH 35/59] Edit install.sh comments --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 1cc496e3f..b7d4270c3 100644 --- a/install.sh +++ b/install.sh @@ -6,7 +6,7 @@ pip uninstall mathgenerator -y # Installing package from the current directory pip install -e . -# Install AutoAWQ without dependencies, to avoid conflicts with lower version of transformers. +# Miner requirements: AutoAWQ without dependencies, to avoid conflicts with other modules. pip install zstandard --no-deps pip install autoawq-kernels --no-deps pip install autoawq==0.2.5 --no-deps From 34b52f8d307aa501db32614fa7c25592393d095f Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Thu, 23 May 2024 14:38:45 +0000 Subject: [PATCH 36/59] Simplify setup --- README.md | 7 ++++--- install.sh | 5 ----- requirements.txt | 3 ++- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 59ae86b0d..22fee573e 100644 --- a/README.md +++ b/README.md @@ -77,10 +77,11 @@ For ease of use, you can run the scripts as well with PM2. Installation of PM2 i sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm install pm2 -g && pm2 update ``` -Example of running a Llama3 miner: +Example of running a Llama3 miner: + ```bash pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 1 --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug -``` +``` # Testnet We highly recommend that you run your miners on testnet before deploying on main. This is give you an opportunity to debug your systems, and ensure that you will not lose valuable immunity time. The SN1 testnet is **netuid 61**. @@ -90,7 +91,7 @@ In order to run on testnet, you will need to go through the same hotkey registra To run: ```bash -pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 61 --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug +pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 61 --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug ``` # Limitations diff --git a/install.sh b/install.sh index b7d4270c3..5b46c863c 100644 --- a/install.sh +++ b/install.sh @@ -5,8 +5,3 @@ pip uninstall mathgenerator -y # Installing package from the current directory pip install -e . - -# Miner requirements: AutoAWQ without dependencies, to avoid conflicts with other modules. -pip install zstandard --no-deps -pip install autoawq-kernels --no-deps -pip install autoawq==0.2.5 --no-deps diff --git a/requirements.txt b/requirements.txt index 94fd7b9cc..c0d558a6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,4 +24,5 @@ wikipedia_sections vllm loguru argostranslate -transformers==4.41.1 \ No newline at end of file +transformers==4.41.1 +autoawq==0.2.5 From 241437d5b455092240b3af3738d1c8595119ce34 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Thu, 23 May 2024 15:19:01 +0000 Subject: [PATCH 37/59] Add load_in_4bits flag to miner --- README.md | 4 ++-- prompting/llms/hf.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 22fee573e..3728bb401 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ sudo apt update && sudo apt install jq && sudo apt install npm && sudo npm insta Example of running a Llama3 miner: ```bash -pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 1 --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug +pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 1 --subtensor.network finney --wallet.name my_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --neuron.load_in_4bit True --axon.port 21988 --logging.debug ``` # Testnet @@ -91,7 +91,7 @@ In order to run on testnet, you will need to go through the same hotkey registra To run: ```bash -pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 61 --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --axon.port 21988 --logging.debug +pm2 start neurons/miners/huggingface/miner.py --interpreter python3 --name llama3_miner -- --netuid 61 --subtensor.network test --wallet.name my_test_wallet --wallet.hotkey m1 --neuron.model_id casperhansen/llama-3-70b-instruct-awq --neuron.load_in_4bit True --axon.port 21988 --logging.debug ``` # Limitations diff --git a/prompting/llms/hf.py b/prompting/llms/hf.py index 869b32411..be46dc1b8 100644 --- a/prompting/llms/hf.py +++ b/prompting/llms/hf.py @@ -19,7 +19,7 @@ from typing import List, Dict import bittensor as bt -from transformers import Pipeline, pipeline, AutoTokenizer, TextIteratorStreamer +from transformers import BitsAndBytesConfig, pipeline, AutoTokenizer, TextIteratorStreamer from prompting.mock import MockPipeline from prompting.cleaners.cleaner import CleanerPipeline from transformers import pipeline, TextIteratorStreamer, AutoTokenizer @@ -83,12 +83,15 @@ def load_hf_pipeline( streamer=streamer, ) else: + kwargs = model_kwargs.copy() + kwargs["bnb_4bit_compute_dtype"] = kwargs.pop("torch_dtype") + quant_config = BitsAndBytesConfig(**kwargs) llm_pipeline = pipeline( "text-generation", model=model_id, tokenizer=tokenizer, device_map=device, - model_kwargs=model_kwargs, + quant_config=quant_config, streamer=streamer, ) From 921e85890cc8b68b69e179bf09c68be224c21e97 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Tue, 28 May 2024 12:56:41 +0000 Subject: [PATCH 38/59] Exit on CUDA OOM --- prompting/base/validator.py | 3 --- prompting/forward.py | 28 +++++++++++++++------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 893bf2055..3ce252ee1 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -142,9 +142,6 @@ def run(self): self.loop.run_until_complete( asyncio.wait_for(task, timeout=forward_timeout) ) - except torch.cuda.OutOfMemoryError as e: - bt.logging.error(f"Out of memory error: {e}") - continue except MaxRetryError as e: bt.logging.error(f"MaxRetryError: {e}") continue diff --git a/prompting/forward.py b/prompting/forward.py index 170f85254..adf0d1785 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -16,30 +16,28 @@ # DEALINGS IN # THE SOFTWARE. +import asyncio +import random import sys import time -import random -import asyncio import traceback -import numpy as np +from dataclasses import dataclass +from typing import Awaitable, Dict, List + import bittensor as bt -from typing import List, Dict, Awaitable +import numpy as np +import torch + from prompting.agent import HumanAgent -from prompting.dendrite import DendriteResponseEvent from prompting.conversation import create_task +from prompting.dendrite import DendriteResponseEvent from prompting.protocol import StreamPromptingSynapse from prompting.rewards import RewardResult from prompting.tasks import QuestionAnsweringTask -from prompting.utils.uids import get_random_uids from prompting.utils.logging import log_event from prompting.utils.misc import async_log, serialize_exception_to_string -from dataclasses import dataclass +from prompting.utils.uids import get_random_uids -@async_log -async def generate_reference(agent): - loop = asyncio.get_running_loop() - result = await loop.run_in_executor(None, agent.task.generate_reference, agent.llm_pipeline) - return result @async_log async def execute_dendrite_call(dendrite_call): @@ -167,7 +165,6 @@ def log_stream_results(stream_results: List[StreamResult]): f"Failed response for uid {failed_response.uid}: {formatted_exception}" ) - async def run_step( self, agent: HumanAgent, roles: List[str], messages: List[str], k: int, timeout: float, exclude: list = None ): @@ -354,6 +351,11 @@ async def forward(self): messages.append(agent.challenge) turn += 1 + except torch.cuda.OutOfMemoryError as err: + bt.logging.error("Out of memory during validation", str(err)) + bt.logging.debug(traceback.print_exception(type(err), err, err.__traceback__)) + return + except BaseException as e: unexpected_errors = serialize_exception_to_string(e) bt.logging.error( From 854b7ad29b6cd56cbdcad03cb725728d346f4034 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 28 May 2024 20:54:15 +0000 Subject: [PATCH 39/59] remove unit test todo comment --- prompting/rewards/streaming.py | 1 - 1 file changed, 1 deletion(-) diff --git a/prompting/rewards/streaming.py b/prompting/rewards/streaming.py index 039326a66..f6443915a 100644 --- a/prompting/rewards/streaming.py +++ b/prompting/rewards/streaming.py @@ -6,7 +6,6 @@ BatchRewardOutput, ) -## TODO: Create unit tests class StreamingRewardModel(BaseRewardModel): @property def name(self) -> str: From d1b73eb4227565d3cf54121c77e16feae0b47644 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 28 May 2024 20:58:41 +0000 Subject: [PATCH 40/59] updates versioning --- prompting/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/__init__.py b/prompting/__init__.py index 476009f42..8a3357f46 100644 --- a/prompting/__init__.py +++ b/prompting/__init__.py @@ -16,7 +16,7 @@ # DEALINGS IN THE SOFTWARE. # Define the version of the template module. -__version__ = "2.3.1" +__version__ = "2.3.2" version_split = __version__.split(".") __spec_version__ = ( (10000 * int(version_split[0])) From 3808f4e8101889c976dcaab8588f8ce304c2c657 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 28 May 2024 21:09:25 +0000 Subject: [PATCH 41/59] drops deprecated prompting synapse from mock code --- prompting/mock.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/prompting/mock.py b/prompting/mock.py index e8fd32193..25d92a843 100644 --- a/prompting/mock.py +++ b/prompting/mock.py @@ -287,15 +287,10 @@ async def forward( deserialize: bool = True, run_async: bool = True, streaming: bool = False, - ): - if streaming: - assert isinstance( - synapse, StreamPromptingSynapse - ), "Synapse must be a StreamPromptingSynapse object when is_stream is True." - else: - assert isinstance( - synapse, PromptingSynapse - ), "Synapse must be a PromptingSynapse object when is_stream is False." + ): + assert isinstance( + synapse, StreamPromptingSynapse + ), "Synapse must be a StreamPromptingSynapse object when is_stream is True." async def query_all_axons(is_stream: bool): """Queries all axons for responses.""" From bf61e977983dbc13755eb9be9deebece54c3e84d Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 28 May 2024 21:54:14 +0000 Subject: [PATCH 42/59] fix mock pipeline --- prompting/mock.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/prompting/mock.py b/prompting/mock.py index 25d92a843..cc8ea8167 100644 --- a/prompting/mock.py +++ b/prompting/mock.py @@ -7,6 +7,7 @@ from functools import partial from typing import Dict, List, Union, AsyncGenerator, Any, Iterator +from types import SimpleNamespace class MockTokenizer: @@ -44,6 +45,12 @@ class MockPipeline: @property def tokenizer(self): return self.model.tokenizer + + @property + def llm_engine(self): + return SimpleNamespace( + tokenizer=self.model.tokenizer + ) def __init__( self, From 68b9a28a2e00b674391e5b3e6d30696547dafc90 Mon Sep 17 00:00:00 2001 From: bkb2135 <98138173+bkb2135@users.noreply.github.com> Date: Wed, 29 May 2024 15:26:28 -0400 Subject: [PATCH 43/59] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 42e9be125..db8f1698c 100644 --- a/README.md +++ b/README.md @@ -47,8 +47,11 @@ bash install.sh # Compute Requirements -1. To run a **validator**, you will need at least 24GB of VRAM. -2. To run the default huggingface **miner**, you will need at least 18GB of VRAM. +1. To run a **validator**, you will need at least 62GB of VRAM. +2. To run the default huggingface **miner**, you will need at least 62GB of VRAM. + + +**It is important to note that the baseminers are not recommended for main, and exist purely as an example. Running a base miner on main will result in no emissions and a loss in your registration fee.** From 0a78c48059aad9f69ff62d63d0a118cda2906e31 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Fri, 31 May 2024 05:11:40 +0800 Subject: [PATCH 44/59] Update prompting/forward.py --- prompting/forward.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prompting/forward.py b/prompting/forward.py index adf0d1785..2cc2ff74f 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -353,8 +353,7 @@ async def forward(self): except torch.cuda.OutOfMemoryError as err: bt.logging.error("Out of memory during validation", str(err)) - bt.logging.debug(traceback.print_exception(type(err), err, err.__traceback__)) - return + raise err except BaseException as e: unexpected_errors = serialize_exception_to_string(e) From dacfc6ac8738017871022b2623c050623e7e09b9 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Thu, 30 May 2024 21:35:14 +0000 Subject: [PATCH 45/59] Address Pedro's comments --- prompting/forward.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/prompting/forward.py b/prompting/forward.py index 2cc2ff74f..a92cb80bd 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -271,6 +271,8 @@ async def forward(self): """ Encapsulates a full conversation between the validator and miners. Contains one or more rounds of request-response. + Raises: + torch.cuda.OutOfMemoryError: CUDA out of memory error. """ bt.logging.info("🚀 Starting forward loop...") forward_start_time = time.time() @@ -352,7 +354,7 @@ async def forward(self): turn += 1 except torch.cuda.OutOfMemoryError as err: - bt.logging.error("Out of memory during validation", str(err)) + bt.logging.error("CUDA out of memory", str(err)) raise err except BaseException as e: From 6840d1fc4ca9acbb9c8d25c38378e287c1f171e0 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Fri, 31 May 2024 11:39:58 +0000 Subject: [PATCH 46/59] Bittensor upgrade to 7.0.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 9a91e8df0..994bdaaec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ angle_emb -bittensor==6.10.1 +bittensor==7.0.0 bs4 click==8.1.3 datasets==2.14.6 From e62c99fc6d1c270db447ab4b17059b088e0c02a7 Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Fri, 31 May 2024 11:42:16 +0000 Subject: [PATCH 47/59] Merge with staging --- prompting/base/validator.py | 3 +++ prompting/forward.py | 29 +++++++++++++---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/prompting/base/validator.py b/prompting/base/validator.py index 3ce252ee1..893bf2055 100644 --- a/prompting/base/validator.py +++ b/prompting/base/validator.py @@ -142,6 +142,9 @@ def run(self): self.loop.run_until_complete( asyncio.wait_for(task, timeout=forward_timeout) ) + except torch.cuda.OutOfMemoryError as e: + bt.logging.error(f"Out of memory error: {e}") + continue except MaxRetryError as e: bt.logging.error(f"MaxRetryError: {e}") continue diff --git a/prompting/forward.py b/prompting/forward.py index a92cb80bd..170f85254 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -16,28 +16,30 @@ # DEALINGS IN # THE SOFTWARE. -import asyncio -import random import sys import time +import random +import asyncio import traceback -from dataclasses import dataclass -from typing import Awaitable, Dict, List - -import bittensor as bt import numpy as np -import torch - +import bittensor as bt +from typing import List, Dict, Awaitable from prompting.agent import HumanAgent -from prompting.conversation import create_task from prompting.dendrite import DendriteResponseEvent +from prompting.conversation import create_task from prompting.protocol import StreamPromptingSynapse from prompting.rewards import RewardResult from prompting.tasks import QuestionAnsweringTask +from prompting.utils.uids import get_random_uids from prompting.utils.logging import log_event from prompting.utils.misc import async_log, serialize_exception_to_string -from prompting.utils.uids import get_random_uids +from dataclasses import dataclass +@async_log +async def generate_reference(agent): + loop = asyncio.get_running_loop() + result = await loop.run_in_executor(None, agent.task.generate_reference, agent.llm_pipeline) + return result @async_log async def execute_dendrite_call(dendrite_call): @@ -165,6 +167,7 @@ def log_stream_results(stream_results: List[StreamResult]): f"Failed response for uid {failed_response.uid}: {formatted_exception}" ) + async def run_step( self, agent: HumanAgent, roles: List[str], messages: List[str], k: int, timeout: float, exclude: list = None ): @@ -271,8 +274,6 @@ async def forward(self): """ Encapsulates a full conversation between the validator and miners. Contains one or more rounds of request-response. - Raises: - torch.cuda.OutOfMemoryError: CUDA out of memory error. """ bt.logging.info("🚀 Starting forward loop...") forward_start_time = time.time() @@ -353,10 +354,6 @@ async def forward(self): messages.append(agent.challenge) turn += 1 - except torch.cuda.OutOfMemoryError as err: - bt.logging.error("CUDA out of memory", str(err)) - raise err - except BaseException as e: unexpected_errors = serialize_exception_to_string(e) bt.logging.error( From 425ab9b02d8d4ac074d1efb1f5c8aa26bea54b83 Mon Sep 17 00:00:00 2001 From: bkb2135 <98138173+bkb2135@users.noreply.github.com> Date: Mon, 3 Jun 2024 09:52:48 -0400 Subject: [PATCH 48/59] Add bit about discord --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index db8f1698c..885cf4c03 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ bash install.sh **It is important to note that the baseminers are not recommended for main, and exist purely as an example. Running a base miner on main will result in no emissions and a loss in your registration fee.** - +If you have any questions please reach out in the SN1 Discord. # How to Run From 86bdbbb41105a6b5aa8be9436179bb1e1490ea76 Mon Sep 17 00:00:00 2001 From: bkb2135 <98138173+bkb2135@users.noreply.github.com> Date: Mon, 3 Jun 2024 09:53:17 -0400 Subject: [PATCH 49/59] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 885cf4c03..a5352cad1 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ bash install.sh **It is important to note that the baseminers are not recommended for main, and exist purely as an example. Running a base miner on main will result in no emissions and a loss in your registration fee.** -If you have any questions please reach out in the SN1 Discord. +If you have any questions please reach out in the SN1 channel in the Bittensor Discord. # How to Run From 596e687d57d131c18722779b111fe30263f9c4cd Mon Sep 17 00:00:00 2001 From: Dmytro Bobrenko <17252809+dbobrenko@users.noreply.github.com> Date: Mon, 3 Jun 2024 13:59:21 +0000 Subject: [PATCH 50/59] Upgrade all requirements for vllm 0.4.2, specify versions --- requirements.txt | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/requirements.txt b/requirements.txt index 994bdaaec..285483974 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,27 +1,29 @@ -angle_emb +angle_emb==0.4.4 bittensor==7.0.0 -bs4 +bs4==0.0.2 click==8.1.3 datasets==2.14.6 deprecation==2.1.0 -torch==2.1.1 +torch==2.3.0 torchmetrics -transformers==4.36.2 +transformers==4.41.2 pre-commit==3.3.2 git+https://github.com/synapse-alpha/mathgenerator.git@main#egg=mathgenerator numpy==1.22.0 -rouge +rouge==1.0.1 scipy==1.10.1 sentencepiece wandb==0.15.10 tenacity antlr4-python3-runtime==4.11 -wikipedia +wikipedia==1.4.0 openai==1.9.0 -langchain==0.1.5 +langchain==0.2.1 +langchain_core==0.2.3 langchainhub==0.1.14 -python-dotenv -wikipedia_sections -vllm -loguru -argostranslate \ No newline at end of file +python-dotenv==1.0.1 +wikipedia_sections==2.0.0 +vllm==0.4.3 +loguru==0.7.2 +argostranslate==1.9.6 + From ff0de980c5f3ca8911a94310142748ac8c510da6 Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 4 Jun 2024 14:48:34 +0000 Subject: [PATCH 51/59] fix stream synpase import --- tests/test_mock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_mock.py b/tests/test_mock.py index 12e5dfa6c..75cc6a0b5 100644 --- a/tests/test_mock.py +++ b/tests/test_mock.py @@ -2,7 +2,7 @@ import asyncio import bittensor as bt from prompting.mock import MockDendrite, MockMetagraph, MockSubtensor -from prompting.protocol import PromptingSynapse +from prompting.protocol import StreamPromptingSynapse wallet = bt.MockWallet() wallet.create(coldkey_use_password=False) @@ -70,7 +70,7 @@ def test_mock_dendrite_timings(timeout, min_time, max_time, n): async def run(): return await mock_dendrite( axons, - synapse=PromptingSynapse( + synapse=StreamPromptingSynapse( roles=["user"], messages=["What is the capital of France?"] ), timeout=timeout, From 1d3e6bcf2a02227b5dc209fdbcd2781adf4dfe8b Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Tue, 4 Jun 2024 15:07:16 +0000 Subject: [PATCH 52/59] fix mock dendrite call --- tests/test_mock.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_mock.py b/tests/test_mock.py index 75cc6a0b5..7ae6f8604 100644 --- a/tests/test_mock.py +++ b/tests/test_mock.py @@ -74,6 +74,7 @@ async def run(): roles=["user"], messages=["What is the capital of France?"] ), timeout=timeout, + deserialize=False, ) eps = 0.2 From e88e69480d341a3d0c141002dcc46e017760aad3 Mon Sep 17 00:00:00 2001 From: bkb2135 <98138173+bkb2135@users.noreply.github.com> Date: Tue, 4 Jun 2024 18:16:52 -0400 Subject: [PATCH 53/59] Sample all available uids --- prompting/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/utils/config.py b/prompting/utils/config.py index 2f4c148f6..9a03b321a 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -328,7 +328,7 @@ def add_validator_args(cls, parser): "--neuron.sample_size", type=int, help="The number of miners to query in a single step.", - default=50, + default=1024, ) parser.add_argument( From ee72240c0a808f304cbdc5d40cc63394b0c7626c Mon Sep 17 00:00:00 2001 From: bkb2135 Date: Tue, 4 Jun 2024 23:22:00 +0000 Subject: [PATCH 54/59] Do not exclude uids that were just queried --- prompting/forward.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/prompting/forward.py b/prompting/forward.py index 170f85254..cefafb571 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -330,8 +330,6 @@ async def forward(self): event["forward_time"] = time.time() - forward_start_time event["turn"] = turn log_event(self, event) - - exclude_uids += event["uids"] task.complete = True accepted_answer = event["best"] if random.random() < 0.5 else agent.task.reference From 1961f6c41f2c0f269ddcf2fbabf1f33fbaaadded Mon Sep 17 00:00:00 2001 From: p-ferreira <38992619+p-ferreira@users.noreply.github.com> Date: Wed, 5 Jun 2024 09:55:32 -0400 Subject: [PATCH 55/59] updates versioning --- prompting/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/__init__.py b/prompting/__init__.py index 8a3357f46..643f4f1ac 100644 --- a/prompting/__init__.py +++ b/prompting/__init__.py @@ -16,7 +16,7 @@ # DEALINGS IN THE SOFTWARE. # Define the version of the template module. -__version__ = "2.3.2" +__version__ = "2.4.0" version_split = __version__.split(".") __spec_version__ = ( (10000 * int(version_split[0])) From 54932c7ba7c0792ff515b879713ca2f3c311d05e Mon Sep 17 00:00:00 2001 From: bkb2135 <98138173+bkb2135@users.noreply.github.com> Date: Wed, 5 Jun 2024 11:37:40 -0400 Subject: [PATCH 56/59] Update config.py --- prompting/utils/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/utils/config.py b/prompting/utils/config.py index 9a03b321a..1bef2d2ee 100644 --- a/prompting/utils/config.py +++ b/prompting/utils/config.py @@ -328,7 +328,7 @@ def add_validator_args(cls, parser): "--neuron.sample_size", type=int, help="The number of miners to query in a single step.", - default=1024, + default=100, ) parser.add_argument( From ec2b764df2e0c76c2429f6f74630e34141762b6f Mon Sep 17 00:00:00 2001 From: Steffen Cruz Date: Wed, 5 Jun 2024 16:43:08 +0100 Subject: [PATCH 57/59] Update forward.py Reduces max conversation turns from 3 to 2. --- prompting/forward.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prompting/forward.py b/prompting/forward.py index 170f85254..e774cea1d 100644 --- a/prompting/forward.py +++ b/prompting/forward.py @@ -339,7 +339,7 @@ async def forward(self): messages.append(accepted_answer) # 50% chance of single turn conversation, 25% of two turns, 12.5% chance of 3 turns, 6.25% chance of 4 turns, 3.63% chance of 5... - if random.random()<0.5 or turn>=2: + if random.random()<0.5 or turn>=1: break history = '\n'.join([f"{role}: {message}" for role, message in zip(roles, messages)]) From 8d38f120132c0bc2c6afbe0b908e78d8a5aec1b9 Mon Sep 17 00:00:00 2001 From: p-ferreira <38992619+p-ferreira@users.noreply.github.com> Date: Wed, 5 Jun 2024 14:17:47 -0400 Subject: [PATCH 58/59] update bittensor requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1a7a3748b..7d11d95fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ angle_emb==0.4.4 -bittensor==7.0.0 +bittensor==7.0.1 bs4==0.0.2 click==8.1.3 datasets==2.14.6 From ea0843d51b8f1ab0073208f6749c1e68c503b09a Mon Sep 17 00:00:00 2001 From: p-ferreira Date: Wed, 5 Jun 2024 19:19:30 +0000 Subject: [PATCH 59/59] fix tokenizer issue, fix logging issue, adapt mock miner for unit test --- prompting/llms/hf.py | 6 +++++- prompting/llms/vllm_llm.py | 2 +- prompting/mock.py | 8 +++++--- prompting/utils/logging.py | 5 +---- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/prompting/llms/hf.py b/prompting/llms/hf.py index be46dc1b8..d2a095405 100644 --- a/prompting/llms/hf.py +++ b/prompting/llms/hf.py @@ -207,7 +207,11 @@ def __call__(self, messages: List[Dict[str, str]]): return self.forward(messages=messages) def _make_prompt(self, messages: List[Dict[str, str]]): - return self.llm_pipeline.tokenizer.apply_chat_template( + # The tokenizer.tokenizer is used for a integration with vllm and the mock pipeline, for real hf application, use: + # return self.llm_pipeline.tokenizer.apply_chat_template( + # messages, tokenize=False, add_generation_prompt=True + # ) + return self.llm_pipeline.tokenizer.tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) diff --git a/prompting/llms/vllm_llm.py b/prompting/llms/vllm_llm.py index a8bf9e095..85ee7b974 100644 --- a/prompting/llms/vllm_llm.py +++ b/prompting/llms/vllm_llm.py @@ -67,7 +67,7 @@ def __init__( self.llm = load_vllm_pipeline(model_id, device, gpus, llm_max_allowed_memory_in_gb, mock) self.mock = mock self.gpus = gpus - self.tokenizer = self.llm.llm_engine.tokenizer + self.tokenizer = self.llm.llm_engine.tokenizer.tokenizer def __call__(self, composed_prompt: str, **model_kwargs: Dict) -> str: if self.mock: diff --git a/prompting/mock.py b/prompting/mock.py index cc8ea8167..34b172a82 100644 --- a/prompting/mock.py +++ b/prompting/mock.py @@ -30,14 +30,16 @@ class MockModel(torch.nn.Module): def __init__(self, phrase): super().__init__() - self.tokenizer = MockTokenizer() + self.tokenizer = SimpleNamespace( + tokenizer=MockTokenizer() + ) self.phrase = phrase def __call__(self, messages): return self.forward(messages) def forward(self, messages): - role_tag = self.tokenizer.role_expr.format(role="assistant") + role_tag = self.tokenizer.tokenizer.role_expr.format(role="assistant") return f"{role_tag} {self.phrase}" @@ -73,7 +75,7 @@ def forward(self, messages, **kwargs): return self.postprocess(output) def postprocess(self, output, **kwargs): - output = output.split(self.model.tokenizer.role_expr.format(role="assistant"))[ + output = output.split(self.model.tokenizer.tokenizer.role_expr.format(role="assistant"))[ -1 ].strip() return output diff --git a/prompting/utils/logging.py b/prompting/utils/logging.py index 381d68b62..d7029b89c 100644 --- a/prompting/utils/logging.py +++ b/prompting/utils/logging.py @@ -92,10 +92,7 @@ def init_wandb(self, reinit=False): tags=tags, notes=self.config.wandb.notes, ) - bt.logging.success( - prefix="Started a new wandb run", - sufix=f" {self.wandb.name} ", - ) + bt.logging.success(f"Started a new wandb run {self.wandb.name} ") def reinit_wandb(self):