diff --git a/prompting/llms/model_manager.py b/prompting/llms/model_manager.py
index f62919478..79d7f878d 100644
--- a/prompting/llms/model_manager.py
+++ b/prompting/llms/model_manager.py
@@ -250,7 +250,7 @@ async def _vram_cleanup(self):
                 torch.cuda.reset_accumulated_memory_stats()
                 await asyncio.sleep(1.0)
             except BaseException as e:
-                logger.error(f"Error during CUDA empty cache: {e}")
+                logger.warning(f"Error during CUDA empty cache: {e}")
         else:
             logger.warning("CUDA is not available")
diff --git a/prompting/rewards/exact_match.py b/prompting/rewards/exact_match.py
index 3ad5f4c80..84be107da 100644
--- a/prompting/rewards/exact_match.py
+++ b/prompting/rewards/exact_match.py
@@ -86,6 +86,7 @@ async def reward(
         all_chunks: list[list[str]] = response_event.stream_results_all_chunks
         all_chunk_dicts_raw: list[list[ChatCompletionChunk]] = response_event.stream_results_all_chunk_dicts_raw
+        uids: np.ndarray | list[float] = response_event.uids
         all_timings: list[list[float]] = response_event.stream_results_all_chunks_timings
         completions: list[str] = response_event.completions
         timeout: float = response_event.timeout
@@ -118,7 +119,7 @@ async def reward(
 
         # Iterate over each response event
-        for chunks, timings, chunk_dicts_raw in zip(all_chunks, all_timings, all_chunk_dicts_raw):
+        for chunks, timings, chunk_dicts_raw, uid in zip(all_chunks, all_timings, all_chunk_dicts_raw, uids):
             try:
                 # If no response is provided, apply full penalty
                 if not chunks:
@@ -133,19 +134,21 @@ async def reward(
                 for idx in verify_indices:
                     check_idx = min(idx, completion_length - 1)
                     if not chunk_dicts_raw[check_idx].choices[0].logprobs:
+                        logger.debug(f"Miner {uid} failed to provide logprobs: {chunk_dicts_raw[check_idx]}")
                         verification_scores.append(0.0)
                         continue
                     if chunk_dicts_raw[check_idx].choices[0].logprobs.content is None:
-                        logger.debug(f"Miner failed to provide logits: {chunk_dicts_raw[check_idx]}")
+                        logger.debug(f"Miner {uid} failed to provide logprobs content: {chunk_dicts_raw[check_idx]}")
                         verification_scores.append(0.0)
                         continue
+
                     original_logits = {
                         info.token: info.logprob
                         for info in chunk_dicts_raw[check_idx].choices[0].logprobs.content[0].top_logprobs
                     }
-                    verification_output, prompt = await self.model_manager.generate_logits(
+                    verification_output, prompt = await model_manager.generate_logits(
                         model=task.llm_model_id,
                         messages=task.task_messages + [{"role": "assistant", "content": "".join(chunks[:check_idx])}],
                         sampling_params=sampling_parameters,
@@ -169,8 +172,8 @@ async def reward(
                 rewards.append(float(final_score > VERIFICATION_THRESHOLD) * timing_reward)
                 timing_outputs.append(np.array(valid_chunks).mean())
             except Exception as e:
-                logger.warning(f"Error in reward calculation: {e}")
-                rewards.append(-INCORRECT_PENALTY)
+                logger.debug(f"Miner {uid} failed to provide logits chunk, setting reward to 0: {e}")
+                rewards.append(0.0)
                 timing_outputs.append(0.0)
 
         reward_output = BatchRewardOutput(
diff --git a/prompting/rewards/inference_reward_model.py b/prompting/rewards/inference_reward_model.py
index eccb5fe56..a0d2ed828 100644
--- a/prompting/rewards/inference_reward_model.py
+++ b/prompting/rewards/inference_reward_model.py
@@ -16,16 +16,12 @@ async def reward(
         **kwargs,
     ) -> BatchRewardOutput:
         """Gives an exact reward of 1 if the response matches the reference, 0 otherwise"""
-        # Use self.model_manager if model_manager is None
-        model_manager = model_manager or self.model_manager
         if model_manager is None:
             raise ValueError("Model manager must be set")
 
         if model_id:
             logits_reward_model = LogitsRewardModel()
-            logits_reward_model.model_manager = model_manager
             return await logits_reward_model.reward(reference, response_event, task, model_manager=model_manager)
 
         relevance_reward_model = RelevanceRewardModel()
-        relevance_reward_model.model_manager = model_manager
         return await relevance_reward_model.reward(reference, response_event, model_manager=model_manager)
diff --git a/prompting/rewards/reward.py b/prompting/rewards/reward.py
index 59589a09b..9cffabe57 100644
--- a/prompting/rewards/reward.py
+++ b/prompting/rewards/reward.py
@@ -67,11 +67,12 @@ def rewards_normalized(self) -> np.ndarray:
 
 
 class BaseRewardModel(ABC, BaseModel):
-    model_manager: ModelManager = None
     weight: float = 1.0
 
     @abstractmethod
-    async def reward(self, reference: str, response_event: DendriteResponseEvent, **kwargs) -> BatchRewardOutput:
+    async def reward(
+        self, reference: str, response_event: DendriteResponseEvent, model_manager: ModelManager = None, **kwargs
+    ) -> BatchRewardOutput:
         raise NotImplementedError("You must implement the reward method")
 
     async def apply(
@@ -81,11 +82,14 @@
         self,
         response_event: DendriteResponseEvent,
         reference: str | None = None,
         challenge: str | None = None,
         reward_type: Literal["reward", "penalty"] = "reward",
         task: BaseTextTask | None = None,
+        model_manager: ModelManager | None = None,
         **kwargs,
     ) -> WeightedRewardEvent:
         t0 = time.time()
         comparator = reference if reward_type == "reward" else challenge
-        batch_rewards_output: BatchRewardOutput = await self.reward(comparator, response_event, task=task, **kwargs)
+        batch_rewards_output: BatchRewardOutput = await self.reward(
+            comparator, response_event, task=task, model_manager=model_manager, **kwargs
+        )
         batch_rewards_time = time.time() - t0
 
         return WeightedRewardEvent(
@@ -123,7 +127,6 @@ class BaseRewardConfig(ABC, BaseModel):
     and weight it with <1.
     """
 
-    model_manager: ModelManager = None
     reward_definitions: ClassVar[list[BaseRewardModel]]
     penalty_definitions: ClassVar[list[BaseRewardModel]] = []
 
@@ -150,10 +153,6 @@ async def apply(
     ) -> list[WeightedRewardEvent]:
         reward_events = []
         for weighted_reward in cls.reward_definitions:
-            # Set the model_manager on the weighted_reward if it's None
-            if weighted_reward.model_manager is None and model_manager is not None:
-                weighted_reward.model_manager = model_manager
-
             reward_events.append(
                 await weighted_reward.apply(
                     reference=reference,
@@ -162,6 +161,7 @@
                     challenge=challenge,
                     response_event=response_event,
                     reward_type="reward",
                     model_id=model_id,
                     task=task,
+                    model_manager=model_manager,
                 ),
             )
         return reward_events
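Taken together, these hunks stop storing the ModelManager as an attribute on reward models and reward configs and instead pass it explicitly through reward() and apply(). A minimal usage sketch under that assumption; the config class name and the pre-built task, response_event, and model_manager objects below are illustrative, not part of this diff:

# Hypothetical call site: after this change the model manager is injected per call,
# rather than being assigned to reward-model instances ahead of time.
reward_events = await SomeTaskRewardConfig.apply(   # illustrative subclass of BaseRewardConfig
    response_event=response_event,                  # DendriteResponseEvent with miner completions
    reference=reference,
    model_id=task.llm_model_id,
    task=task,
    model_manager=model_manager,                    # forwarded down to BaseRewardModel.reward(...)
)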