Merged
Changes from all commits
78 commits
bc915db
readme updates
Dec 11, 2024
e290211
fix links
Dec 12, 2024
72b9b9e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 8, 2025
3a20480
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 17, 2025
f58154e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 22, 2025
601c074
enable passing args and kwrgs from data loader to model
Jan 22, 2025
efde053
make model args consistent
Jan 22, 2025
9b9c5c0
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 24, 2025
e3a295c
enable chat mode inference
Jan 24, 2025
ef3238f
linting
Jan 24, 2025
2d6f72b
tmp sample usage
Jan 24, 2025
7c63409
merge main
Jan 24, 2025
2282fe1
fix failing tests
Jan 24, 2025
cdc87bf
abort chat after first failure
Jan 25, 2025
83445bc
fix datetime decoding issue
Jan 25, 2025
19c57bb
formatting
Jan 25, 2025
ce1b2fe
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 29, 2025
b2a8376
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 29, 2025
b2391c1
Merge branch 'main' into dl_updates
Jan 29, 2025
645eefa
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 31, 2025
08c4fbe
Merge branch 'main' into aime-seq
Jan 31, 2025
a90494b
added prompt templates for sequential scenario
Jan 31, 2025
b622c03
makes model output col configurable
Jan 31, 2025
5df66a2
seq user conf
Jan 31, 2025
db64ccc
fix obv issues
Jan 31, 2025
b03cc11
cleanup
Jan 31, 2025
89daef5
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Feb 11, 2025
4669530
flow fix
Feb 14, 2025
77ebe46
prompt fixes
Feb 14, 2025
410cfe2
rm redundant line
Feb 14, 2025
d9988df
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Feb 27, 2025
12dffdf
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Mar 1, 2025
d832380
merge conflict
Mar 1, 2025
2e60522
union all iter results
Mar 4, 2025
da05363
cleanup
Mar 5, 2025
18c589f
emptiness
Mar 5, 2025
111ea88
typo fix
Mar 5, 2025
2db7e46
aime extractor fix
Mar 5, 2025
7053f65
formatting
Mar 5, 2025
173538a
resolve concurrency issues
Mar 11, 2025
d77dfad
formatting
Mar 12, 2025
382f452
thread safety
Mar 13, 2025
8fa0931
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Mar 14, 2025
03c848c
merge with main
Mar 14, 2025
5c790b1
bug fixes
Mar 15, 2025
2c45d8b
revert to single model uinstance
Mar 15, 2025
b88b67f
merge conflict res
Mar 15, 2025
37f2f6f
aggregation bug fix
Mar 15, 2025
a738891
bug fix
Mar 15, 2025
4123c89
col name simplification
Mar 15, 2025
e81748a
dedup subset
Mar 19, 2025
1b6af70
formatting
Mar 19, 2025
536b6f7
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 2, 2025
10b0aba
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 4, 2025
f9c2a48
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 4, 2025
e2947ba
allow sys msg as part of model config
Apr 4, 2025
57504cb
merge
Apr 7, 2025
f676e6e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 8, 2025
9efef0d
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 9, 2025
269d296
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 9, 2025
fd84d48
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 9, 2025
81504c2
pull
Apr 9, 2025
3a1cd28
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 15, 2025
ca1bfea
merge with main
Apr 15, 2025
f758195
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 13, 2025
3d53322
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 14, 2025
f9b2046
merge with main
May 14, 2025
b4b56da
merge with main
May 14, 2025
ec366ed
merge with main
May 14, 2025
559d495
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 22, 2025
4f5e487
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 22, 2025
f71926e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 22, 2025
a54fe84
merge main
May 22, 2025
f7c1c5b
sync with latest aime
May 22, 2025
4b95930
aime seq test
May 23, 2025
10b14bb
new chat test model
May 23, 2025
d5a2e2c
made EndpointModel importable
May 23, 2025
71fc25c
revert
May 23, 2025
9 changes: 6 additions & 3 deletions eureka_ml_insights/data_utils/aime_utils.py
@@ -21,13 +21,16 @@ def parse_output_answer(response):
Parse the input string to extract answer of a given AIME question.
Parameters:
response (str): Input string containing answer X in the form of "Final Answer: X".
Returns:
Returns:
numerical_value (float): A numeric value representing the model's answer.
"""
numerical_value = None

# Try to find an answer in the "Final Answer: X" format
match = re.search(r"Final Answer:\s*([\$]?-?[\d,]+(?:\.\d+)?%?)", response)
# If not found, try to find an answer in the "Final Answer: [X]" format
if not match:
match = re.search(r"Final Answer:\s*\[([\$]?-?[\d,]+(?:\.\d+)?%?)\]", response)
if match:
answer_str = match.group(1)
# Remove $ and commas, handle percentages for numerical comparison
@@ -37,7 +40,7 @@ def parse_output_answer(response):
else:
try:
numerical_value = float(answer_str)
except ValueError as e:
except ValueError:
numerical_value = None

return numerical_value
return numerical_value
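
The hunk above adds a fallback pattern so answers written as "Final Answer: [X]" are also extracted. A minimal sketch of how the two accepted formats behave (the helper name and sample strings are invented for illustration; the regexes are the ones from this hunk):

import re

def extract_final_answer(response: str):
    # Try the plain "Final Answer: X" format first.
    match = re.search(r"Final Answer:\s*([\$]?-?[\d,]+(?:\.\d+)?%?)", response)
    # Fall back to the bracketed "Final Answer: [X]" format added in this PR.
    if not match:
        match = re.search(r"Final Answer:\s*\[([\$]?-?[\d,]+(?:\.\d+)?%?)\]", response)
    return match.group(1) if match else None

print(extract_final_answer("Final Answer: 204"))    # -> "204"
print(extract_final_answer("Final Answer: [204]"))  # -> "204" (previously missed)
print(extract_final_answer("no final answer"))      # -> None
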
2 changes: 2 additions & 0 deletions eureka_ml_insights/models/__init__.py
@@ -5,6 +5,7 @@
ClaudeReasoningModel,
DirectOpenAIModel,
DirectOpenAIOModel,
EndpointModel,
GeminiModel,
HuggingFaceModel,
KeyBasedAuthMixIn,
@@ -25,6 +26,7 @@
__all__ = [
AzureOpenAIOModel,
DirectOpenAIOModel,
EndpointModel,
HuggingFaceModel,
KeyBasedAuthMixIn,
LLaVAHuggingFaceModel,
8 changes: 8 additions & 0 deletions eureka_ml_insights/prompt_templates/aime_templates/hint_creation.jinja
@@ -0,0 +1,8 @@
- You are a teacher providing hints to guide a student.
- The answer the student gave is INCORRECT. FIRST, list all the incorrect numerical answers so far in an enumerated list.
- Reflect verbally on what likely went wrong and offer a hint to the student, but not the solution.
<question>{{ prompt }}</question>

<studentanswer and past teacher hints>
previous_messages[1:]
</studentanswer and past teacher hints>
1 change: 1 addition & 0 deletions eureka_ml_insights/prompt_templates/aime_templates/prompt_w_hint.jinja
@@ -0,0 +1 @@
That's incorrect. You have made {{attempt_id}} attempts, and they were all wrong. Here are some thoughts\n {{teacher_hint}}.\n\n Try harder, you can do it!
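
For reference, a quick sketch of how this retry prompt renders with jinja2 (the render values are invented; in the pipeline, attempt_id is added by AddColumnAndData during verification and teacher_hint is the renamed teacher model output; the file path is assumed to match the repo layout referenced in aime_seq.py):

from jinja2 import Template

with open("eureka_ml_insights/prompt_templates/aime_templates/prompt_w_hint.jinja") as f:
    retry_prompt = Template(f.read())

print(retry_prompt.render(attempt_id=2, teacher_hint="Recheck the final modular arithmetic step."))
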
2 changes: 2 additions & 0 deletions eureka_ml_insights/user_configs/__init__.py
@@ -4,6 +4,7 @@
AIME_HYBRIDEXTRACT_PIPELINE,
AIME_PIPELINE,
)
from .aime_seq import AIME_SEQ_PIPELINE
from .ba_calendar import (
BA_Calendar_Parallel_PIPELINE,
BA_Calendar_PIPELINE,
@@ -151,6 +152,7 @@
IFEval_Nondeterminism,
Kitab_Nondeterminism,
AIME_PIPELINE,
AIME_SEQ_PIPELINE,
AIME2025_PIPELINE,
AIME_HYBRIDEXTRACT_PIPELINE,
AIME2025_HYBRIDEXTRACT_PIPELINE,
231 changes: 231 additions & 0 deletions eureka_ml_insights/user_configs/aime_seq.py
@@ -0,0 +1,231 @@
import os
from typing import Any

from eureka_ml_insights.configs import (
DataProcessingConfig,
DataSetConfig,
DataUnionConfig,
InferenceConfig,
ModelConfig,
PipelineConfig,
PromptProcessingConfig,
)
from eureka_ml_insights.core import (
DataProcessing,
DataUnion,
Inference,
PromptProcessing,
)
from eureka_ml_insights.data_utils import (
AddColumnAndData,
ColumnRename,
CopyColumn,
DataReader,
RunPythonTransform,
SamplerTransform,
SequenceTransform,
)
from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
from eureka_ml_insights.data_utils.data import MMDataLoader
from eureka_ml_insights.metrics.metrics_base import (
ExactMatch,
MetricBasedVerifier,
)

from .aime import AIME_PIPELINE

DEFAULT_N_ITER = 3
RESULT_COLS = [
"attempt_id",
"model_output",
"uid",
"prompt",
"ground_truth",
"Year",
"Part",
"ID",
"extracted_answer",
"verification_result",
"usage",
]
resume_from_dict = {}


class AIME_SEQ_PIPELINE(AIME_PIPELINE):
"""This class specifies the config for running AIME benchmark on any model"""

def configure_pipeline(
self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
) -> PipelineConfig:

# This call to super configures the initial prompt processing and final eval reporting components so they can be reused.
super().configure_pipeline(model_config, resume_from, **kwargs)

n_iter = kwargs.get("n_iter", DEFAULT_N_ITER)
# Uncomment if you want to sample a subset of the data for debugging
#self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
# SamplerTransform(sample_count=2, random_seed=42)
#)
component_configs = [self.data_processing_comp]
for i in range(1, n_iter + 1):
# Student inference component, reads prompts from the last prompt processing component
last_prompt_proc_comp = component_configs[-1]
self.student_inference_comp = InferenceConfig(
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
MMDataLoader,
{
"path": os.path.join(last_prompt_proc_comp.output_dir, "transformed_data.jsonl"),
# if this is not the first iteration, we need to add the previous messages to the data loader config
"misc_columns": ["previous_messages"] if i > 1 else None,
},
),
output_dir=os.path.join(self.log_dir, f"student_inference_result_{i}"),
resume_from=resume_from_dict.get(i, None),
chat_mode=True,
)

component_configs.append(self.student_inference_comp)

# Answer extraction and metric-based verification
self.verification_comp = DataProcessingConfig(
component_type=DataProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.student_inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": SequenceTransform(
[
# extract and verify the student answer
AIMEExtractAnswer("model_output", "extracted_answer"),
MetricBasedVerifier(ExactMatch, "extracted_answer"),
AddColumnAndData("attempt_id", i),
CopyColumn(column_name_src="model_output", column_name_dst="student_output"),
]
),
},
),
output_dir=os.path.join(self.log_dir, f"verification_{i}"),
)
component_configs.append(self.verification_comp)

# Variable maintaining a link to the most recent inference results to be used for evaluation
# This will be updated to point to the concatenation of results from all iterations

if i > 1:
self.last_inference_result_join_comp = DataUnionConfig(
component_type=DataUnion,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.verification_comp.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
other_data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(last_agg_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
output_data_columns=RESULT_COLS,
dedupe_cols=["ID", "attempt_id"],
output_dir=os.path.join(self.log_dir, f"last_inference_result_join_{i}"),
)
last_agg_dir = self.last_inference_result_join_comp.output_dir
component_configs.append(self.last_inference_result_join_comp)
else:
last_agg_dir = self.verification_comp.output_dir

# Filter out rows with a correct answer
self.filtering_comp = DataProcessingConfig(
component_type=DataProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.verification_comp.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
"transform": RunPythonTransform(python_code="df = df[df['verification_result'] != 'correct']"),
},
),
output_dir=os.path.join(self.log_dir, f"filtering_{i}"),
)
component_configs.append(self.filtering_comp)

# Create a new prompt to ask the teacher model to provide hints.
self.hint_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.filtering_comp.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__), "../prompt_templates/aime_templates/hint_creation.jinja"
),
output_dir=os.path.join(self.log_dir, f"hint_processing_output_{i}"),
)
component_configs.append(self.hint_processing_comp)

# Inference component to ask teacher model to provide hints
self.teacher_inference_comp = InferenceConfig(
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
MMDataLoader,
{
"path": os.path.join(self.hint_processing_comp.output_dir, "transformed_data.jsonl"),
"misc_columns": ["previous_messages"],
},
),
output_dir=os.path.join(self.log_dir, f"teacher_inference_result_{i}"),
max_concurrent=10,
chat_mode=False,
)
component_configs.append(self.teacher_inference_comp)

# Prompt processing to ask the student to try again
self.prompt_processing_with_hint = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.teacher_inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": ColumnRename(name_mapping={"model_output": "teacher_hint"}),
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__), "../prompt_templates/aime_templates/prompt_w_hint.jinja"
),
output_dir=os.path.join(self.log_dir, f"teacher_hint_prompt_{i}"),
)
component_configs.append(self.prompt_processing_with_hint)

# Pass the combined results from all iterations to the eval reporting component
self.final_preeval_data_processing.data_reader_config.init_args["path"] = os.path.join(
last_agg_dir, "transformed_data.jsonl"
)

component_configs.extend(
[
self.final_preeval_data_processing,
self.evalreporting_comp,
self.data_post_processing_addmv,
self.mv_evalreporting_comp,
self.posteval_data_post_processing_comp,
self.bon_evalreporting_comp,
self.won_evalreporting_comp,
]
)

# Configure the pipeline
return PipelineConfig(
component_configs,
self.log_dir,
)
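
Each iteration above chains: student inference -> answer extraction and verification -> union with prior attempts -> filtering out already-solved problems -> teacher hint prompt -> teacher inference -> retry prompt for the next attempt. A minimal sketch of instantiating the config with a stand-in chat model, mirroring how TEST_AIME_SEQ_PIPELINE does it in tests/pipeline_tests.py below (EchoChatModel and ECHO_AIME_SEQ_PIPELINE are hypothetical names, not part of this PR):

from eureka_ml_insights.configs import ModelConfig
from eureka_ml_insights.models import EndpointModel
from eureka_ml_insights.user_configs import AIME_SEQ_PIPELINE


class EchoChatModel(EndpointModel):
    # Stand-in endpoint model that always returns the same answer.
    chat_mode = None

    def create_request(self, text_prompt, query_images=None, system_message=None, previous_messages=None):
        return text_prompt

    def get_response(self, request):
        return {"model_output": "Final Answer: 0", "is_valid": True, "response_time": 0, "n_output_tokens": 0}

    def handle_request_error(self, e):
        return True


class ECHO_AIME_SEQ_PIPELINE(AIME_SEQ_PIPELINE):
    def configure_pipeline(self, **kwargs):
        # Two student attempts instead of the default three, just to keep the sketch small.
        return super().configure_pipeline(model_config=ModelConfig(EchoChatModel, {}), n_iter=2, **kwargs)


pipeline_config = ECHO_AIME_SEQ_PIPELINE().pipeline_config  # same construction pattern as the test below
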
21 changes: 21 additions & 0 deletions tests/pipeline_tests.py
@@ -23,6 +23,7 @@
)
from eureka_ml_insights.user_configs import (
AIME_PIPELINE,
AIME_SEQ_PIPELINE,
DNA_PIPELINE,
GEOMETER_PIPELINE,
GSM8K_PIPELINE,
@@ -55,6 +56,7 @@
KitabTestModel,
MultipleChoiceTestModel,
SpatialReasoningTestModel,
TestChatModel,
TestDataLoader,
TestHFDataReader,
TestKitabMetric,
@@ -353,6 +355,18 @@ def configure_pipeline(self):
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config

class TEST_AIME_SEQ_PIPELINE(AIME_SEQ_PIPELINE):
# Test config for the sequential AIME benchmark with TestChatModel on a small subset of AIME
def configure_pipeline(self):
config = super().configure_pipeline(
model_config=ModelConfig(TestChatModel, {})
)
self.data_processing_comp.data_reader_config.init_args["transform"].transforms.extend(
[
SamplerTransform(sample_count=N_ITER, random_seed=99),
]
)
return config

class TEST_GSM8K_PIPELINE(GSM8K_PIPELINE):
# Test config the GSM8K benchmark with TestModel and TestDataLoader
@@ -624,6 +638,13 @@ def test_outputs_exist(self) -> None:
super().test_outputs_exist()
self.verify_n_aggregators(self.conf.component_configs[-2])

class AIME_SEQ_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_AIME_SEQ_PIPELINE().pipeline_config

def test_outputs_exist(self) -> None:
self.eval_config = self.conf.component_configs[-6]
super().test_outputs_exist()

class NPHARD_TSP_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
13 changes: 12 additions & 1 deletion tests/test_utils.py
@@ -8,7 +8,7 @@
MMDataLoader,
)
from eureka_ml_insights.metrics import ClassicMetric, CompositeMetric

from eureka_ml_insights.models import EndpointModel

class TestModel:
def __init__(self, model_name="generic_test_model"):
@@ -18,6 +18,17 @@ def generate(self, text_prompt, *args, **kwargs):
time.sleep(0.1)
return {"model_output": "model output", "is_valid": True, "response_time": 0, "n_output_tokens": 0}

class TestChatModel(EndpointModel):
chat_mode = None

def create_request(self, text_prompt, query_images=None, system_message=None, previous_messages=None):
return text_prompt

def get_response(self, request):
return {"model_output": "This is a test model output.", "is_valid": True, "response_time": 0, "n_output_tokens": 0}

def handle_request_error(self, e):
return True

class TestHFDataReader(HFDataReader):
def __init__(self, path, **kwargs):