# 🚀 MAS Evaluation Framework Demo

This notebook implements a complete **Multi-Agent System (MAS)** evaluation pipeline using **GEMMAS** (Graph-based Metrics) and **MAST** (Failure Taxonomy).

## Pipeline Overview
1. **Trace Capture**: Instrumentation using OpenTelemetry & Google ADK.
2. **Execution**: Running a multi-agent scenario (Researcher & Writer).
3. **Graph Construction**: Building a Causal Reasoning Graph (CRG) from traces.
4. **GEMMAS Evaluation**: Computing Information Diversity Score (IDS) and Unnecessary Path Ratio (UPR).
5. **MAST Analysis**: Self-correcting failure diagnosis (Fine-tune -> Classify).
6. **Advisory**: Generating architectural recommendations using Gemini 2.5.

In [1]:
import os
import asyncio
import json
from typing import List, Dict, Any
from datetime import datetime
import networkx as nx
import matplotlib.pyplot as plt

# Ensure google-generativeai is installed
# !pip install google-generativeai google-adk networkx sentence-transformers scikit-learn

# Import MAS Eval Framework
from mas_eval.core.types import Span, TraceData
from mas_eval.adapters.adk_adapter import ADKAdapter, ADKTracingCallback
from mas_eval.graph.crg_builder import CRGModule
from mas_eval.metrics.gemmas import GEMMAS_Evaluator
from mas_eval.mast.fine_tuning import MASTFineTuner
from mas_eval.mast.classifier import MASTClassifier, ClassifierMode
from mas_eval.suggestions.advisor import MASAdvisor
from mas_eval.graph.visualizer import GraphVisualizer

# Set API Key (User should replace this or set env var)
# os.environ["GOOGLE_API_KEY"] = "YOUR_API_KEY"
# The check is advisory only: it prints a warning and lets the cell continue.
if not os.environ.get("GOOGLE_API_KEY"):
    # The warning sign was previously stored as mis-decoded UTF-8 bytes.
    print("⚠️ Please set GOOGLE_API_KEY environment variable!")

ModuleNotFoundError: No module named 'mas_eval'

## 1. Trace Capture & Agent Setup
We use `ADKTracingCallback` to capture every thought, action, and output from our agents.

In [None]:
from google.adk.agents import Agent
from google.adk.runners import InMemoryRunner

# Define Agent Instructions
RESEARCHER_PROMPT = """
You are a Research Agent.
Role: Gather technical information on the given topic.
Output: a concise list of key facts.
"""

WRITER_PROMPT = """
You are a Writer Agent.
Role: Synthesize the research into a short, engaging paragraph.
Input: Research facts.
Output: Final summary.
"""

# Create Agents
# Both agents share one model id so the demo only depends on a single model.
model_name = "gemini-2.5-flash"
# ADK agents take their system-level prompt through `instruction`; `system_prompt`
# is not a declared field of `Agent`, so passing it fails at construction time.
researcher = Agent(name="Researcher", model=model_name, instruction=RESEARCHER_PROMPT)
writer = Agent(name="Writer", model=model_name, instruction=WRITER_PROMPT)

# Initialize Tracing Callback
# A single module-level tracer is shared by the cells that follow.
tracer = ADKTracingCallback(service_name="mas-demo", verbose=True)

## 2. Execution Loop
We run the agents in a simplified sequence to generate interaction traces.

In [None]:
async def run_scenario(topic: str):
    """Replay a scripted Researcher -> Writer exchange through the shared tracer.

    No agent or model is invoked here: every thought, action and output is a
    fixed template string built from ``topic`` and fed to the module-level
    ``tracer`` by hand, so the resulting trace is deterministic.

    Args:
        topic: Subject interpolated into the simulated agent messages.

    Returns:
        Whatever ``tracer.get_spans()`` returns after the scripted events.
    """
    print(f"▶️ Starting Scenario: {topic}")
    # Drop anything recorded by a previous run so the spans belong to this topic only.
    tracer.clear()

    # 1. Researcher Step
    print("\n--- Researcher Working ---")
    # Manually simulating ADK events for demo structure clarity
    # In real usage, you'd attach tracer to the runner

    # The value returned by on_agent_start is not needed by this demo, so it is
    # intentionally not bound to a name.
    tracer.on_agent_start("Researcher", RESEARCHER_PROMPT)
    tracer.on_thought("Researcher", f"I need to find facts about {topic}.")
    tracer.on_action("Researcher", "Searching internal knowledge base...")
    research_output = f"Key facts about {topic}: 1. It is a complex system. 2. It involves multiple agents."
    tracer.on_output("Researcher", research_output)
    # Private tracer method, called directly because the hand-off is simulated
    # rather than produced by a real runner.
    tracer._create_transfer_span("Researcher", "Writer")
    tracer.on_agent_end("Researcher", research_output)

    # 2. Writer Step
    print("\n--- Writer Working ---")
    tracer.on_agent_start("Writer", WRITER_PROMPT)
    tracer.on_thought("Writer", "I have received facts. Now I must summarize them.")
    final_summary = f"The {topic} is characterized by its complexity and multi-agent nature."
    tracer.on_output("Writer", final_summary)
    tracer.on_agent_end("Writer", final_summary)

    print("\n✅ Scenario Complete")
    return tracer.get_spans()

# Run the scenario
# Top-level `await` is not valid in a plain Python script; this line relies on
# being executed as a notebook cell. The resulting `spans` are reused by the
# graph, GEMMAS and MAST cells below.
spans = await run_scenario("Autonomous Swarms")

## 3. Graph Construction (DAG)
Convert linear traces into a Causal Reasoning Graph (CRG).

In [None]:
def build_interaction_graph(spans: List[Span], similarity_threshold: float = 0.5) -> nx.DiGraph:
    """Build the Causal Reasoning Graph for a trace and report its size.

    Args:
        spans: Trace spans to hand to ``CRGModule.build``.
        similarity_threshold: Value forwarded to
            ``CRGModule.add_semantic_edges``. Defaults to 0.5, the value this
            function previously hard-coded.

    Returns:
        The graph object returned by ``CRGModule.build``.
    """
    crg_module = CRGModule()
    graph = crg_module.build(spans)

    # Add semantic edges for richer analysis
    # NOTE(review): the size printed below is read from `graph` after this call;
    # presumably add_semantic_edges mutates that same graph in place - confirm.
    num_semantic = crg_module.add_semantic_edges(similarity_threshold=similarity_threshold)
    print(f"Graph constructed: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")
    print(f"Added {num_semantic} semantic edges")

    return graph

# Build the graph from the spans captured by the scenario cell; `graph` is
# reused by the GEMMAS evaluation cell.
graph = build_interaction_graph(spans)

# Visualize
# The plot call is given an output path; the same file name is the default
# `graph_path` of generate_advisory_report further down.
visualizer = GraphVisualizer(graph)
visualizer.plot(output_path="interaction_graph.png")
# Optional: uncomment the three lines below to show the saved image inline.
# plt.imshow(plt.imread("interaction_graph.png"))
# plt.axis('off')
# plt.show()

## 4. GEMMAS Evaluation
Calculate Information Diversity Score (IDS) and Unnecessary Path Ratio (UPR).

In [None]:
def _metric_line(label: str, key: str) -> str:
    """Format one GEMMAS score and its interpretation, both read from `metrics`."""
    score = metrics[key]
    verdict = metrics[f"{key}_interpretation"]
    return f"{label} {score:.4f} ({verdict})"


# Score the graph and spans produced by the earlier cells.
evaluator = GEMMAS_Evaluator()
metrics = evaluator.evaluate(graph, spans)

print("=== GEMMAS Metrics ===")
# The UPR label is padded so both scores line up in the printed output.
print(_metric_line("IDS (Information Diversity):", "IDS"))
print(_metric_line("UPR (Unnecessary Paths):    ", "UPR"))

## 5. MAST Analysis (Self-Correction)
Automatically check for a fine-tuned judge. If missing, self-train (fine-tune) one using the MAST dataset, then classify failures.

In [None]:
async def run_mast_analysis(
    spans: List[Span],
    config_path: str = "tuned_mast_judge.json",
    dataset_path: str = "mast_dataset/MAD_human_labelled_dataset.json",
    fallback_model: str = "gemini-2.5-flash",
    max_examples: int = 50,
):
    """Pick a MAST judge (saved, freshly trained, or few-shot) and classify ``spans``.

    Judge selection, in order:
      1. ``config_path`` exists: load the saved fine-tuned judge configuration.
      2. ``dataset_path`` exists: fine-tune a new judge, then save its config
         to ``config_path``.
      3. Otherwise: use ``fallback_model`` in few-shot ICL mode, no training.

    Args:
        spans: Trace spans passed to ``MASTClassifier.classify``.
        config_path: Location of the saved judge configuration.
        dataset_path: Location of the labelled dataset used for fine-tuning.
        fallback_model: Model name used when neither a saved judge nor the
            dataset is available.
        max_examples: Forwarded to ``prepare_training_data``; kept small by
            default so the demo stays quick.

    Returns:
        The object returned by ``MASTClassifier.classify``.
    """
    # 1. Check for existing model
    if os.path.exists(config_path):
        print("✅ Found existing fine-tuned MAST judge.")
        saved_judge = MASTFineTuner.load_model_config(config_path)
        model_name = saved_judge.model_name
        mode = ClassifierMode.FINE_TUNED
    else:
        print("⚠️ No fine-tuned judge found. Initiating self-training...")

        if not os.path.exists(dataset_path):
            # Fallback if dataset not found on disk (mock logic for demo reliability)
            print("   Dataset not found. Using Few-Shot ICL mode instead of training.")
            model_name = fallback_model
            mode = ClassifierMode.FEW_SHOT_ICL
        else:
            # The tuner is only constructed on the path that actually trains, so
            # the fallback above does not depend on the tuner being constructible.
            tuner = MASTFineTuner()
            # Note: Running training in a notebook cell blocks until completion
            print("   Preparing dataset and starting fine-tuning job...")
            tuner.prepare_training_data(dataset_path, max_examples=max_examples)
            training_result = tuner.start_training(wait=True)
            tuner.save_model_config(config_path)
            model_name = training_result.model_name
            mode = ClassifierMode.FINE_TUNED

    # 2. Classify Failures
    print(f"\n🔍 Analyzing traces using {model_name} ({mode.value})...")
    classifier = MASTClassifier(model=model_name, mode=mode)
    classification = classifier.classify(spans)

    print(classifier.summary(classification))
    return classification

# Top-level `await` relies on this line being executed as a notebook cell.
# `mast_result` is passed to generate_advisory_report in the final cell.
mast_result = await run_mast_analysis(spans)

## 6. Final Report & Advisor (Gemini 2.5)
Synthesize all findings into a report and ask the "Senior MAS Architect" (Gemini 2.5) for specific improvements.

In [None]:
import google.generativeai as genai
from IPython.display import display, Markdown

def generate_advisory_report(metrics, mast_result, graph_path="interaction_graph.png"):
    """Ask Gemini for an architecture advisory memo and render it as Markdown.

    The GEMMAS metrics and MAST findings are serialised to JSON and embedded in
    the prompt. When ``graph_path`` points to a readable file, its bytes are
    sent along with the prompt so the "graph is attached" statement in the
    prompt holds; otherwise only the text prompt is sent.

    Args:
        metrics: Mapping of GEMMAS metric names to values.
        mast_result: Object exposing ``failure_modes`` (an iterable) and
            ``trace_summary``.
        graph_path: Path of the rendered interaction graph image. A missing
            file is not an error.

    Returns:
        None. The memo is displayed; a failure while calling the model is
        printed rather than raised.
    """
    import mimetypes  # local import: only needed to label the optional image attachment

    # 1. Compile Data
    report_data = {
        "metrics": metrics,
        "failures": [
            f.to_dict() if hasattr(f, 'to_dict') else str(f)
            for f in mast_result.failure_modes
        ],
        "trace_summary": mast_result.trace_summary
    }

    # default=str is deliberate: this JSON is prompt context for the model, not
    # data to be parsed back, so values json cannot encode natively are
    # stringified instead of aborting the report with a TypeError.
    report_json = json.dumps(report_data, indent=2, default=str)

    # 2. Prompt Gemini 2.5 as "Senior MAS Architect"
    architect_prompt = f"""
    You are an expert Multi-Agent System (MAS) Architect.
    
    Analyze the following evaluation report for a MAS execution:
    {report_json}
    
    The Interaction Graph is attached (if available).
    
    Task:
    1. Analyze the GEMMAS metrics (IDS, UPR).
    2. Review the MAST failure modes detected.
    3. Recommend 3 specific, actionable changes to the agent prompts, topology, or logic to improve efficiency and reduce failures.
    
    Format your response as a professional Engineering Advisory Memo.
    """

    # 3. Attach the graph image when it is available; fall back to text only.
    contents = architect_prompt
    if graph_path and os.path.isfile(graph_path):
        try:
            with open(graph_path, "rb") as image_file:
                image_bytes = image_file.read()
        except OSError as e:
            print(f"Could not read interaction graph '{graph_path}': {e}")
        else:
            mime_type = mimetypes.guess_type(graph_path)[0] or "image/png"
            # Inline image part expressed as a plain mime_type/data mapping.
            contents = [architect_prompt, {"mime_type": mime_type, "data": image_bytes}]

    print("🤔 Asking Senior MAS Architect for advice...")

    try:
        model = genai.GenerativeModel("gemini-2.5-flash")
        response = model.generate_content(contents)

        display(Markdown("# 🏗️ Senior MAS Architect Advisory Report"))
        display(Markdown(response.text))

    except Exception as e:
        # Broad catch is intentional at this notebook boundary: any failure in
        # the model call or rendering is reported instead of breaking the cell.
        print(f"Error generating advisory: {e}")

# Produce the advisory from the GEMMAS metrics and the MAST result computed in
# the earlier cells; graph_path is left at its default.
generate_advisory_report(metrics, mast_result)