Merged
Changes from all commits
78 commits
bc915db
readme updates
Dec 11, 2024
e290211
fix links
Dec 12, 2024
72b9b9e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 8, 2025
3a20480
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 17, 2025
f58154e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 22, 2025
601c074
enable passing args and kwrgs from data loader to model
Jan 22, 2025
efde053
make model args consistent
Jan 22, 2025
9b9c5c0
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 24, 2025
e3a295c
enable chat mode inference
Jan 24, 2025
ef3238f
linting
Jan 24, 2025
2d6f72b
tmp sample usage
Jan 24, 2025
7c63409
merge main
Jan 24, 2025
2282fe1
fix failing tests
Jan 24, 2025
cdc87bf
abort chat after first failure
Jan 25, 2025
83445bc
fix datetime decoding issue
Jan 25, 2025
19c57bb
formatting
Jan 25, 2025
ce1b2fe
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 29, 2025
b2a8376
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 29, 2025
b2391c1
Merge branch 'main' into dl_updates
Jan 29, 2025
645eefa
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Jan 31, 2025
08c4fbe
Merge branch 'main' into aime-seq
Jan 31, 2025
a90494b
added prompt templates for sequential scenario
Jan 31, 2025
b622c03
makes model output col configurable
Jan 31, 2025
5df66a2
seq user conf
Jan 31, 2025
db64ccc
fix obv issues
Jan 31, 2025
b03cc11
cleanup
Jan 31, 2025
89daef5
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Feb 11, 2025
4669530
flow fix
Feb 14, 2025
77ebe46
prompt fixes
Feb 14, 2025
410cfe2
rm redundant line
Feb 14, 2025
d9988df
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Feb 27, 2025
12dffdf
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Mar 1, 2025
d832380
merge conflict
Mar 1, 2025
2e60522
union all iter results
Mar 4, 2025
da05363
cleanup
Mar 5, 2025
18c589f
emptiness
Mar 5, 2025
111ea88
typo fix
Mar 5, 2025
2db7e46
aime extractor fix
Mar 5, 2025
7053f65
formatting
Mar 5, 2025
173538a
resolve concurrency issues
Mar 11, 2025
d77dfad
formatting
Mar 12, 2025
382f452
thread safety
Mar 13, 2025
8fa0931
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Mar 14, 2025
03c848c
merge with main
Mar 14, 2025
5c790b1
bug fixes
Mar 15, 2025
2c45d8b
revert to single model uinstance
Mar 15, 2025
b88b67f
merge conflict res
Mar 15, 2025
37f2f6f
aggregation bug fix
Mar 15, 2025
a738891
bug fix
Mar 15, 2025
4123c89
col name simplification
Mar 15, 2025
e81748a
dedup subset
Mar 19, 2025
1b6af70
formatting
Mar 19, 2025
536b6f7
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 2, 2025
10b0aba
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 4, 2025
f9c2a48
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 4, 2025
e2947ba
allow sys msg as part of model config
Apr 4, 2025
57504cb
merge
Apr 7, 2025
f676e6e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 8, 2025
9efef0d
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 9, 2025
269d296
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 9, 2025
fd84d48
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 9, 2025
81504c2
pull
Apr 9, 2025
3a1cd28
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
Apr 15, 2025
ca1bfea
merge with main
Apr 15, 2025
f758195
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 13, 2025
3d53322
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 14, 2025
f9b2046
merge with main
May 14, 2025
b4b56da
merge with main
May 14, 2025
ec366ed
merge with main
May 14, 2025
559d495
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 22, 2025
4f5e487
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 22, 2025
f71926e
Merge branch 'main' of https://github.com/microsoft/eureka-ml-insights
May 22, 2025
a54fe84
merge main
May 22, 2025
f7c1c5b
sync with latest aime
May 22, 2025
4b95930
aime seq test
May 23, 2025
10b14bb
new chat test model
May 23, 2025
d5a2e2c
made EndpointModel importable
May 23, 2025
71fc25c
revert
May 23, 2025
9 changes: 6 additions & 3 deletions eureka_ml_insights/data_utils/aime_utils.py
@@ -21,13 +21,16 @@ def parse_output_answer(response):
Parse the input string to extract answer of a given AIME question.
Parameters:
response (str): Input string containing answer X in the form of "Final Answer: X".
Returns:
Returns:
numerical_value (float): A numeric value representing the model's answer.
"""
numerical_value = None

# Try to find an answer in the "Final Answer: X" format
match = re.search(r"Final Answer:\s*([\$]?-?[\d,]+(?:\.\d+)?%?)", response)
# If not found, try to find an answer in the "Final Answer: [X]" format
if not match:
match = re.search(r"Final Answer:\s*\[([\$]?-?[\d,]+(?:\.\d+)?%?)\]", response)
if match:
answer_str = match.group(1)
# Remove $ and commas, handle percentages for numerical comparison
@@ -37,7 +40,7 @@ def parse_output_answer(response):
else:
try:
numerical_value = float(answer_str)
except ValueError as e:
except ValueError:
numerical_value = None

return numerical_value
return numerical_value
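
The hunk above adds a fallback pattern so answers written as "Final Answer: [X]" are also extracted. A minimal sketch of how the two accepted formats behave (the helper name and sample strings are invented for illustration; the regexes are the ones from this hunk):

import re

def extract_final_answer(response: str):
    # Try the plain "Final Answer: X" format first.
    match = re.search(r"Final Answer:\s*([\$]?-?[\d,]+(?:\.\d+)?%?)", response)
    # Fall back to the bracketed "Final Answer: [X]" format added in this PR.
    if not match:
        match = re.search(r"Final Answer:\s*\[([\$]?-?[\d,]+(?:\.\d+)?%?)\]", response)
    return match.group(1) if match else None

print(extract_final_answer("Final Answer: 204"))    # -> "204"
print(extract_final_answer("Final Answer: [204]"))  # -> "204" (previously missed)
print(extract_final_answer("no final answer"))      # -> None
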
2 changes: 2 additions & 0 deletions eureka_ml_insights/models/__init__.py
@@ -5,6 +5,7 @@
ClaudeReasoningModel,
DirectOpenAIModel,
DirectOpenAIOModel,
EndpointModel,
GeminiModel,
HuggingFaceModel,
KeyBasedAuthMixIn,
@@ -25,6 +26,7 @@
__all__ = [
AzureOpenAIOModel,
DirectOpenAIOModel,
EndpointModel,
HuggingFaceModel,
KeyBasedAuthMixIn,
LLaVAHuggingFaceModel,
8 changes: 8 additions & 0 deletions eureka_ml_insights/prompt_templates/aime_templates/hint_creation.jinja
@@ -0,0 +1,8 @@
- You are a teacher providing hints to guide a student.
- The answer the student gave is INCORRECT. FIRST, list all the incorrect numerical answers so far in an enumerated list.
- Reflect verbally on what likely went wrong and offer a hint to the student, but not the solution.
<question>{{ prompt }}</question>

<studentanswer and past teacher hints>
previous_messages[1:]
</studentanswer and past teacher hints>
1 change: 1 addition & 0 deletions eureka_ml_insights/prompt_templates/aime_templates/prompt_w_hint.jinja
@@ -0,0 +1 @@
That's incorrect. You have made {{attempt_id}} attempts, and they were all wrong. Here are some thoughts\n {{teacher_hint}}.\n\n Try harder, you can do it!
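
For reference, a quick sketch of how this retry prompt renders with jinja2 (the render values are invented; in the pipeline, attempt_id is added by AddColumnAndData during verification and teacher_hint is the renamed teacher model output; the file path is assumed to match the repo layout referenced in aime_seq.py):

from jinja2 import Template

with open("eureka_ml_insights/prompt_templates/aime_templates/prompt_w_hint.jinja") as f:
    retry_prompt = Template(f.read())

print(retry_prompt.render(attempt_id=2, teacher_hint="Recheck the final modular arithmetic step."))
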
2 changes: 2 additions & 0 deletions eureka_ml_insights/user_configs/__init__.py
@@ -4,6 +4,7 @@
AIME_HYBRIDEXTRACT_PIPELINE,
AIME_PIPELINE,
)
from .aime_seq import AIME_SEQ_PIPELINE
from .ba_calendar import (
BA_Calendar_Parallel_PIPELINE,
BA_Calendar_PIPELINE,
@@ -151,6 +152,7 @@
IFEval_Nondeterminism,
Kitab_Nondeterminism,
AIME_PIPELINE,
AIME_SEQ_PIPELINE,
AIME2025_PIPELINE,
AIME_HYBRIDEXTRACT_PIPELINE,
AIME2025_HYBRIDEXTRACT_PIPELINE,
231 changes: 231 additions & 0 deletions eureka_ml_insights/user_configs/aime_seq.py
@@ -0,0 +1,231 @@
import os
from typing import Any

from eureka_ml_insights.configs import (
DataProcessingConfig,
DataSetConfig,
DataUnionConfig,
InferenceConfig,
ModelConfig,
PipelineConfig,
PromptProcessingConfig,
)
from eureka_ml_insights.core import (
DataProcessing,
DataUnion,
Inference,
PromptProcessing,
)
from eureka_ml_insights.data_utils import (
AddColumnAndData,
ColumnRename,
CopyColumn,
DataReader,
RunPythonTransform,
SamplerTransform,
SequenceTransform,
)
from eureka_ml_insights.data_utils.aime_utils import AIMEExtractAnswer
from eureka_ml_insights.data_utils.data import MMDataLoader
from eureka_ml_insights.metrics.metrics_base import (
ExactMatch,
MetricBasedVerifier,
)

from .aime import AIME_PIPELINE

DEFAULT_N_ITER = 3
RESULT_COLS = [
"attempt_id",
"model_output",
"uid",
"prompt",
"ground_truth",
"Year",
"Part",
"ID",
"extracted_answer",
"verification_result",
"usage",
]
resume_from_dict = {}


class AIME_SEQ_PIPELINE(AIME_PIPELINE):
"""This class specifies the config for running AIME benchmark on any model"""

def configure_pipeline(
self, model_config: ModelConfig, resume_from: str = None, **kwargs: dict[str, Any]
) -> PipelineConfig:

# This call to super configures the initial prompt processing and final eval reporting components so they can be reused.
super().configure_pipeline(model_config, resume_from, **kwargs)

n_iter = kwargs.get("n_iter", DEFAULT_N_ITER)
# Uncomment if you want to sample a subset of the data for debugging
#self.data_processing_comp.data_reader_config.init_args["transform"].transforms.append(
# SamplerTransform(sample_count=2, random_seed=42)
#)
component_configs = [self.data_processing_comp]
for i in range(1, n_iter + 1):
# Student inference component, reads prompts from the last prompt processing component
last_prompt_proc_comp = component_configs[-1]
self.student_inference_comp = InferenceConfig(
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
MMDataLoader,
{
"path": os.path.join(last_prompt_proc_comp.output_dir, "transformed_data.jsonl"),
# if this is not the first iteration, we need to add the previous messages to the data loader config
"misc_columns": ["previous_messages"] if i > 1 else None,
},
),
output_dir=os.path.join(self.log_dir, f"student_inference_result_{i}"),
resume_from=resume_from_dict.get(i, None),
chat_mode=True,
)

component_configs.append(self.student_inference_comp)

# Answer extraction and metric-based verification
self.verification_comp = DataProcessingConfig(
component_type=DataProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.student_inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": SequenceTransform(
[
# extract and verify the student answer
AIMEExtractAnswer("model_output", "extracted_answer"),
MetricBasedVerifier(ExactMatch, "extracted_answer"),
AddColumnAndData("attempt_id", i),
CopyColumn(column_name_src="model_output", column_name_dst="student_output"),
]
),
},
),
output_dir=os.path.join(self.log_dir, f"verification_{i}"),
)
component_configs.append(self.verification_comp)

# Variable maintaining a link to the most recent inference results to be used for evaluation
# This will be updated to point to the concatenation of results from all iterations

if i > 1:
self.last_inference_result_join_comp = DataUnionConfig(
component_type=DataUnion,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.verification_comp.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
other_data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(last_agg_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
output_data_columns=RESULT_COLS,
dedupe_cols=["ID", "attempt_id"],
output_dir=os.path.join(self.log_dir, f"last_inference_result_join_{i}"),
)
last_agg_dir = self.last_inference_result_join_comp.output_dir
component_configs.append(self.last_inference_result_join_comp)
else:
last_agg_dir = self.verification_comp.output_dir

# Filter out rows with a correct answer
self.filtering_comp = DataProcessingConfig(
component_type=DataProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.verification_comp.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
"transform": RunPythonTransform(python_code="df = df[df['verification_result'] != 'correct']"),
},
),
output_dir=os.path.join(self.log_dir, f"filtering_{i}"),
)
component_configs.append(self.filtering_comp)

# Create a new prompt to ask the teacher model to provide hints.
self.hint_processing_comp = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.filtering_comp.output_dir, "transformed_data.jsonl"),
"format": ".jsonl",
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__), "../prompt_templates/aime_templates/hint_creation.jinja"
),
output_dir=os.path.join(self.log_dir, f"hint_processing_output_{i}"),
)
component_configs.append(self.hint_processing_comp)

# Inference component to ask teacher model to provide hints
self.teacher_inference_comp = InferenceConfig(
component_type=Inference,
model_config=model_config,
data_loader_config=DataSetConfig(
MMDataLoader,
{
"path": os.path.join(self.hint_processing_comp.output_dir, "transformed_data.jsonl"),
"misc_columns": ["previous_messages"],
},
),
output_dir=os.path.join(self.log_dir, f"teacher_inference_result_{i}"),
max_concurrent=10,
chat_mode=False,
)
component_configs.append(self.teacher_inference_comp)

# Prompt processing to ask the student to try again
self.prompt_processing_with_hint = PromptProcessingConfig(
component_type=PromptProcessing,
data_reader_config=DataSetConfig(
DataReader,
{
"path": os.path.join(self.teacher_inference_comp.output_dir, "inference_result.jsonl"),
"format": ".jsonl",
"transform": ColumnRename(name_mapping={"model_output": "teacher_hint"}),
},
),
prompt_template_path=os.path.join(
os.path.dirname(__file__), "../prompt_templates/aime_templates/prompt_w_hint.jinja"
),
output_dir=os.path.join(self.log_dir, f"teacher_hint_prompt_{i}"),
)
component_configs.append(self.prompt_processing_with_hint)

# Pass the combined results from all iterations to the eval reporting component
self.final_preeval_data_processing.data_reader_config.init_args["path"] = os.path.join(
last_agg_dir, "transformed_data.jsonl"
)

component_configs.extend(
[
self.final_preeval_data_processing,
self.evalreporting_comp,
self.data_post_processing_addmv,
self.mv_evalreporting_comp,
self.posteval_data_post_processing_comp,
self.bon_evalreporting_comp,
self.won_evalreporting_comp,
]
)

# Configure the pipeline
return PipelineConfig(
component_configs,
self.log_dir,
)
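
Each iteration above chains: student inference -> answer extraction and verification -> union with prior attempts -> filtering out already-solved problems -> teacher hint prompt -> teacher inference -> retry prompt for the next attempt. A minimal sketch of instantiating the config with a stand-in chat model, mirroring how TEST_AIME_SEQ_PIPELINE does it in tests/pipeline_tests.py below (EchoChatModel and ECHO_AIME_SEQ_PIPELINE are hypothetical names, not part of this PR):

from eureka_ml_insights.configs import ModelConfig
from eureka_ml_insights.models import EndpointModel
from eureka_ml_insights.user_configs import AIME_SEQ_PIPELINE


class EchoChatModel(EndpointModel):
    # Stand-in endpoint model that always returns the same answer.
    chat_mode = None

    def create_request(self, text_prompt, query_images=None, system_message=None, previous_messages=None):
        return text_prompt

    def get_response(self, request):
        return {"model_output": "Final Answer: 0", "is_valid": True, "response_time": 0, "n_output_tokens": 0}

    def handle_request_error(self, e):
        return True


class ECHO_AIME_SEQ_PIPELINE(AIME_SEQ_PIPELINE):
    def configure_pipeline(self, **kwargs):
        # Two student attempts instead of the default three, just to keep the sketch small.
        return super().configure_pipeline(model_config=ModelConfig(EchoChatModel, {}), n_iter=2, **kwargs)


pipeline_config = ECHO_AIME_SEQ_PIPELINE().pipeline_config  # same construction pattern as the test below
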
21 changes: 21 additions & 0 deletions tests/pipeline_tests.py
@@ -23,6 +23,7 @@
)
from eureka_ml_insights.user_configs import (
AIME_PIPELINE,
AIME_SEQ_PIPELINE,
DNA_PIPELINE,
GEOMETER_PIPELINE,
GSM8K_PIPELINE,
@@ -55,6 +56,7 @@
KitabTestModel,
MultipleChoiceTestModel,
SpatialReasoningTestModel,
TestChatModel,
TestDataLoader,
TestHFDataReader,
TestKitabMetric,
@@ -353,6 +355,18 @@ def configure_pipeline(self):
self.inference_comp.data_loader_config.init_args["n_iter"] = N_ITER
return config

class TEST_AIME_SEQ_PIPELINE(AIME_SEQ_PIPELINE):
# Test config for the sequential AIME benchmark with TestChatModel on a small subset of AIME
def configure_pipeline(self):
config = super().configure_pipeline(
model_config=ModelConfig(TestChatModel, {})
)
self.data_processing_comp.data_reader_config.init_args["transform"].transforms.extend(
[
SamplerTransform(sample_count=N_ITER, random_seed=99),
]
)
return config

class TEST_GSM8K_PIPELINE(GSM8K_PIPELINE):
# Test config the GSM8K benchmark with TestModel and TestDataLoader
@@ -624,6 +638,13 @@ def test_outputs_exist(self) -> None:
super().test_outputs_exist()
self.verify_n_aggregators(self.conf.component_configs[-2])

class AIME_SEQ_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
return TEST_AIME_SEQ_PIPELINE().pipeline_config

def test_outputs_exist(self) -> None:
self.eval_config = self.conf.component_configs[-6]
super().test_outputs_exist()

class NPHARD_TSP_PipelineTest(PipelineTest, unittest.TestCase):
def get_config(self):
13 changes: 12 additions & 1 deletion tests/test_utils.py
@@ -8,7 +8,7 @@
MMDataLoader,
)
from eureka_ml_insights.metrics import ClassicMetric, CompositeMetric

from eureka_ml_insights.models import EndpointModel

class TestModel:
def __init__(self, model_name="generic_test_model"):
@@ -18,6 +18,17 @@ def generate(self, text_prompt, *args, **kwargs):
time.sleep(0.1)
return {"model_output": "model output", "is_valid": True, "response_time": 0, "n_output_tokens": 0}

class TestChatModel(EndpointModel):
chat_mode = None

def create_request(self, text_prompt, query_images=None, system_message=None, previous_messages=None):
return text_prompt

def get_response(self, request):
return {"model_output": "This is a test model output.", "is_valid": True, "response_time": 0, "n_output_tokens": 0}

def handle_request_error(self, e):
return True

class TestHFDataReader(HFDataReader):
def __init__(self, path, **kwargs):