Merged
Changes from all commits
30 commits
9a572e5
feat(benchmarks): add concurrency support for LLM calls
M1n9X Aug 25, 2025
c74f2e7
Merge dev branch updates
M1n9X Aug 25, 2025
9fdd2f7
refactor(benchmarks): remove staggered execution in concurrent evalua…
M1n9X Aug 25, 2025
6d94d02
feat(benchmarks): refactor evaluation process for concurrency and eff…
M1n9X Aug 26, 2025
48e3f4f
feat(benchmarks): integrate concurrent evaluation with retrieval trac…
savourylie Sep 1, 2025
7795cc7
fix(benchmarks): restore enhanced retrieval metrics in concurrent eva…
savourylie Sep 1, 2025
5840be7
feat(benchmarks): add detailed retrieval content tracking with verbos…
savourylie Sep 1, 2025
525eb07
Merge pull request #14 from memfuse/feat/110-retrieval-tracking
savourylie Sep 1, 2025
67b721c
feat(client): implement idempotent agent creation with singleflight p…
M1n9X Sep 2, 2025
7fa090b
chore: update .gitignore for specstory integration
M1n9X Sep 2, 2025
41bc077
Merge pull request #15 from memfuse/bug/118-none-agentname
savourylie Sep 2, 2025
fab8cd2
feat: add version field to health endpoint for SDK compatibility chec…
savourylie Sep 2, 2025
a7a50d6
Merge pull request #16 from memfuse/feat/116-add-version-health-endpoint
savourylie Sep 2, 2025
7baa182
refactor: replace openrouter with openai compatible client in MSC acc…
savourylie Sep 2, 2025
f7ec416
Merge pull request #17 from memfuse/feat/120-replace-litellm-with-openai
savourylie Sep 2, 2025
55c1363
fix: update OpenAI model to use environment variable for compatibility
savourylie Sep 12, 2025
099b3cf
feat: add metadata support to UsersApi and AsyncMemory, deprecate leg…
savourylie Sep 12, 2025
1ecdcf2
feat: enhance version parsing to support release candidates and multi…
savourylie Sep 12, 2025
5af0bd6
feat: add metadata support to messages and memory APIs, update reques…
savourylie Sep 12, 2025
d486e63
feat: update model references to use environment variable for compati…
savourylie Sep 12, 2025
038db17
Merge pull request #18 from memfuse/feat/146-prompts-handle-new-query…
savourylie Sep 12, 2025
861217b
feat: add debug logging support controlled by MEMFUSE_DEBUG environme…
savourylie Sep 16, 2025
2881bc1
Merge pull request #19 from memfuse/feat/148-add-debug-logging-for-ap…
savourylie Sep 16, 2025
43a0810
feat: update MemFuse base URL to use port 8765 across examples and tests
savourylie Sep 24, 2025
d6e0b9b
Merge pull request #20 from memfuse/fix/change-port-and-query-schema
savourylie Sep 24, 2025
69c3cc9
feat: add optional UI dependencies and update examples to handle miss…
savourylie Sep 24, 2025
933a8f3
Merge pull request #21 from memfuse/feat/move-gradio-dep-to-extras
savourylie Sep 24, 2025
958354a
feat: streamline server health checks during session initialization
savourylie Sep 29, 2025
50f376e
Merge pull request #22 from memfuse/fix/health-check-on-every-request
savourylie Sep 29, 2025
c13add7
Merge remote-tracking branch 'origin/main' into dev
savourylie Oct 10, 2025
5 changes: 4 additions & 1 deletion .env.example
@@ -13,4 +13,7 @@ ANTHROPIC_API_KEY=sk-ant-your-anthropic-api-key-here
GEMINI_API_KEY=your-google-gemini-api-key-here

# MemFuse API Key
MEMFUSE_API_KEY=your-memfuse-api-key-here
MEMFUSE_API_KEY=your-memfuse-api-key-here

# MemFuse Debug Mode
MEMFUSE_DEBUG=0
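
For illustration, a minimal sketch of honouring this flag in a script — the accepted truthy values and the `memfuse` logger name are assumptions for this sketch, not the SDK's documented behaviour:

```python
import logging
import os

# Illustrative only: treat MEMFUSE_DEBUG=1/true/yes as "on"; the default of 0 leaves logging untouched.
if os.getenv("MEMFUSE_DEBUG", "0").strip().lower() in {"1", "true", "yes"}:
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger("memfuse").setLevel(logging.DEBUG)  # assumed logger name
```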
7 changes: 6 additions & 1 deletion .gitignore
@@ -46,6 +46,7 @@ htmlcov/
nosetests.xml
coverage.xml
*.cover
.cursorindexingignore
*.py,cover
.hypothesis/
.pytest_cache/
@@ -175,9 +176,13 @@ cython_debug/

.DS_Store
CLAUDE.md
AGENTS.md

.cursor/
results/

benchmarks/results/
.claude/
.claude/
.vscode/

.specstory/**
14 changes: 14 additions & 0 deletions README.md
@@ -82,6 +82,20 @@ cd memfuse-python
pip install -e .
```

### Optional extras

Some features are optional and shipped as extras:

- UI (Gradio demo UIs)
  - pip: `pip install "memfuse[ui]"`
  - poetry: add the `ui` extra

- Full (includes UI)
  - pip: `pip install "memfuse[full]"`
  - poetry: add the `full` extra

The Gradio-based examples in `examples/` require the `ui` extra. If you run those scripts without the extra installed, they will raise: `RuntimeError('Install memfuse[ui] to use the demo UI.')`.
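
As a rough sketch of this guard (the helper name and the `gr.Interface` call are illustrative assumptions; only the error message is taken from this section), a demo script can defer the Gradio import until it is actually needed:

```python
def _require_gradio():
    """Import gradio lazily and fail with the documented hint if the `ui` extra is missing."""
    try:
        import gradio as gr  # provided by `pip install "memfuse[ui]"`
    except ImportError:
        raise RuntimeError('Install memfuse[ui] to use the demo UI.')
    return gr


if __name__ == "__main__":
    gr = _require_gradio()
    # Only build the UI once the optional dependency is known to be present.
    demo = gr.Interface(fn=lambda text: text, inputs="text", outputs="text")
    demo.launch()
```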

## Quick Start

Here's a comprehensive example demonstrating how to use the MemFuse Python SDK with OpenAI:
86 changes: 58 additions & 28 deletions benchmarks/run_benchmark.py
@@ -52,7 +52,7 @@ def get_default_model(llm_provider):
}


def save_individual_results(results, dataset_name: str, llm_provider: str):
def save_individual_results(results, dataset_name: str, llm_provider: str, retrieval_verbose: bool = False):
"""Save detailed individual results to a file for analysis."""
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"individual_results_{dataset_name}_{llm_provider}_{timestamp}.txt"
@@ -145,7 +145,21 @@ def save_individual_results(results, dataset_name: str, llm_provider: str):
f.write(f" Recall: {recall:.3f}\n")
f.write(f" F1: {f1:.3f}\n")
f.write(f" Retrieved memories: {retrieved_memories_count}\n\n")


# Display retrieved memories content if verbose and available
if retrieval_verbose and 'retrieved_memories_content' in result:
retrieved_memories = result['retrieved_memories_content']
f.write("🧠 RETRIEVED MEMORIES CONTENT:\n")
for i, memory in enumerate(retrieved_memories, 1):
f.write(f" Memory {i} (Score: {memory['score']:.4f}):\n")
content = memory['content']
# Show first 500 characters, add ellipsis if truncated
if len(content) > 500:
f.write(f" \"{content[:500]}...\"\n")
else:
f.write(f" \"{content}\"\n")
f.write("\n")

f.write("=" * 80 + "\n\n")

# Summary
@@ -169,27 +183,29 @@ def save_individual_results(results, dataset_name: str, llm_provider: str):
def print_benchmark_summary(results, dataset_name):
"""Print detailed benchmark summary with histogram visualization."""

# Collect incorrect question IDs
incorrect_question_ids = []
# Collect incorrect question IDs with recall flags
incorrect_question_data = []
for result in results.question_results:
if 'is_correct' in result and not result['is_correct']:
question_id = result.get('question_id', 'N/A')
if question_id != 'N/A':
incorrect_question_ids.append(question_id)
recall = result.get('recall', 0.0)
recall_flag = 1 if recall > 0.0 else 0
incorrect_question_data.append((question_id, recall_flag))

# Write incorrect question IDs to file if any exist
if incorrect_question_ids:
# Write incorrect question IDs with recall flags to CSV file if any exist
if incorrect_question_data:
import datetime
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = os.path.join(os.path.dirname(__file__), 'results')
filename = f"incorrect_questions_{dataset_name}_{timestamp}.txt"
filename = f"incorrect_questions_{dataset_name}_{timestamp}.csv"
filepath = os.path.join(results_dir, filename)

try:
os.makedirs(results_dir, exist_ok=True)
with open(filepath, 'w') as f:
for question_id in incorrect_question_ids:
f.write(f"{question_id}\n")
for question_id, recall_flag in incorrect_question_data:
f.write(f"{question_id},{recall_flag}\n")
logger.info(f"Incorrect question IDs saved to: {filepath}")
except Exception as e:
logger.error(f"Failed to write incorrect question IDs to file: {e}")
@@ -205,8 +221,8 @@ def print_benchmark_summary(results, dataset_name):
print(f" Model: {result.get('model_choice_idx')} | Correct: {result.get('correct_choice_idx')}")
if 'retrieval_time_ms' in result:
print(f" Retrieval: {result['retrieval_time_ms']:.2f}ms")
# Show retrieval metrics for LME dataset
if dataset_name == "lme" and 'precision' in result:
# Show retrieval metrics for LME and MSC datasets
if dataset_name in ["lme", "msc"] and 'precision' in result:
print(f" Retrieval Metrics - P: {result['precision']:.3f}, R: {result['recall']:.3f}, F1: {result['f1']:.3f}")
else:
print(f"Q{i+1}: {result.get('question_id', 'N/A')} - ⚠️ {result.get('status', 'UNKNOWN')}")
@@ -229,18 +245,19 @@ def print_benchmark_summary(results, dataset_name):
else:
print(f"⚠️ {results.total_count - results.success_count} questions failed evaluation")

# Retrieval evaluation metrics (LME only)
if results.retrieval_metrics_available and dataset_name == "lme":
# Retrieval evaluation metrics (LME and MSC)
if results.retrieval_metrics_available and dataset_name in ["lme", "msc"]:
print(f"\n🎯 RETRIEVAL EVALUATION METRICS:")
print(f" Average Precision: {results.avg_precision:.3f}")
print(f" Average Recall: {results.avg_recall:.3f}")
print(f" Average F1 Score: {results.avg_f1:.3f}")

# Show incorrect question IDs if any
if incorrect_question_ids:
if incorrect_question_data:
incorrect_question_ids = [question_id for question_id, _ in incorrect_question_data]
print(f"\n❌ Incorrect Question IDs ({len(incorrect_question_ids)} total):")
print(", ".join(incorrect_question_ids))
print(f"💾 Incorrect question IDs also saved to benchmarks/results/")
print(f"💾 Incorrect question IDs with recall flags saved to benchmarks/results/")

# Retrieval time statistics
if results.query_times:
@@ -283,20 +300,22 @@ async def main():
parser.add_argument("--question-types", nargs="+", help="Filter by question types (LME only)")
parser.add_argument("--question-ids-file", type=str, help="File containing question IDs to test (one per line)")
parser.add_argument("--top-k", type=int, help="Override default TOP_K value for memory retrieval")
parser.add_argument("--llm-provider", type=str, choices=["gemini", "openai", "anthropic"],
default="gemini", help="LLM provider to use (default: gemini)")

# Parse args partially to get the provider first
known_args, _ = parser.parse_known_args()
default_model = get_default_model(known_args.llm_provider)

parser.add_argument("--model", type=str, default=default_model,
help=f"Model name (default for {known_args.llm_provider}: {default_model})")
parser.add_argument("--no-data-loading", action="store_true",
parser.add_argument("--llm-provider", type=str, choices=["gemini", "openai", "anthropic"],
default="openai", help="LLM provider to use (default: openai)")
parser.add_argument("--model", type=str, help="Model name (provider-specific default will be used if not specified)")
parser.add_argument("--no-data-loading", action="store_true",
help="Skip loading haystack data per question (assumes data already loaded)")
parser.add_argument("--concurrent", type=int, default=1,
help="Number of concurrent evaluations (default: 1)")
parser.add_argument("--retrieval-verbose", action="store_true",
help="Save and display retrieved memory content in detailed results")

args = parser.parse_args()

# Set provider-specific default model if not specified
if not args.model:
args.model = get_default_model(args.llm_provider)

# Validate question-types argument
if args.question_types and args.dataset != "lme":
logger.warning(f"--question-types is only supported for LME dataset, ignoring for {args.dataset}")
Expand All @@ -307,7 +326,16 @@ async def main():
if args.question_ids_file:
try:
with open(args.question_ids_file, 'r') as f:
question_ids_from_file = [line.strip() for line in f if line.strip()]
question_ids_from_file = []
for line in f:
line = line.strip()
if line:
# Handle CSV format (question_id,recall_flag) or plain text (question_id only)
if ',' in line:
question_id = line.split(',')[0].strip()
else:
question_id = line
question_ids_from_file.append(question_id)
logger.info(f"Loaded {len(question_ids_from_file)} question IDs from {args.question_ids_file}")

# When using question-ids-file, override conflicting options
@@ -372,11 +400,13 @@
model_name=model_name,
llm_provider=args.llm_provider,
skip_data_loading=args.no_data_loading,
concurrent=args.concurrent,
retrieval_verbose=args.retrieval_verbose,
logger=logger
)

# Save individual results to file
save_individual_results(results, args.dataset, args.llm_provider)
save_individual_results(results, args.dataset, args.llm_provider, args.retrieval_verbose)

# Print detailed benchmark summary with visualization
print_benchmark_summary(results, args.dataset)
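
Taken together, the summary step above writes `question_id,recall_flag` rows, and the `--question-ids-file` handling accepts either that CSV form or plain one-ID-per-line files. A small standalone sketch of that round-trip (the IDs and file name below are made up for illustration):

```python
# Write the CSV form emitted by print_benchmark_summary (made-up IDs).
rows = [("lme_q042", 1), ("lme_q077", 0)]
with open("incorrect_questions_example.csv", "w") as f:
    for question_id, recall_flag in rows:
        f.write(f"{question_id},{recall_flag}\n")

# Read it back the way the --question-ids-file parsing does:
# take the ID before the first comma if present, otherwise the whole line.
question_ids = []
with open("incorrect_questions_example.csv") as f:
    for line in f:
        line = line.strip()
        if line:
            question_ids.append(line.split(",")[0].strip() if "," in line else line)

print(question_ids)  # ['lme_q042', 'lme_q077']
```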