# Chosen Workflow Pattern: Orchestrator - Worker

1. Generate a complex question to ask all models
2. Receive Models' Answers
3. Store Model Name and Answer in a List of LLMResult Objects
4. Have a reasoning model review all answers
5. Return a List of Model Names with Ranking and brief Explanation of why they were given their rank

__Potential JSON Responses__
```json
{
    "result": {
        "o4-mini": {
            "rank": "1",
            "reason": "This model showed a deep understanding of..."
        },
        "gemma3:12b": {
            "rank": "2",
            "reason": "This model had a good understanding of..."
        }
    }
}
```

In [3]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel
from IPython.display import Markdown, display

load_dotenv(override=True)

True

In [4]:
# Globals
# Connection settings for the three backends. Values looked up in the
# environment are None when the variable is not set.

# Gemini
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
GEMINI_BASE_URL = os.environ.get("GEMINI_BASE_URL")

# OpenAI
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Ollama (the key is hard-coded rather than read from the environment)
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL")
OLLAMA_API_KEY = "ollama"

In [None]:
# Generate Question
# Meta-prompt: ask for exactly one hard question and nothing else, so the
# reply can be forwarded verbatim to the models under evaluation.
prompt = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Please respond only with the question, no explanation."
)

def generator(prompt: str, *, model: str = "gpt-4o-mini") -> str:
    """Send *prompt* as a single user message and return the reply text.

    Args:
        prompt: Text sent as the only message of the chat request.
        model: Name of the chat model to query. Defaults to
            ``"gpt-4o-mini"``, the model this function previously had
            hard-coded, so existing ``generator(prompt)`` calls behave the
            same. Pass another name to use a different model, e.g. for the
            judging step.

    Returns:
        The content of the first choice, or ``""`` when that content is
        empty or ``None``.
    """
    # No key is passed explicitly; presumably the client picks up its
    # credentials from the environment loaded by load_dotenv() -- confirm.
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content or ""

# Run the meta-prompt through generator(); the returned text is the question
# that later cells send to every model and show in the final summary.
question = generator(prompt)

Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. Please respond only with the question, no explanation.


In [None]:
# Data Structures

class LLMResult(BaseModel):
    """One model's reply to the shared question."""

    # Name of the model that produced the answer, e.g. "gpt-4o-mini".
    model: str
    # Text of the reply; the answer functions store "" when the API returns
    # no content.
    answer: str

# Conversation sent unchanged to every model: a single user turn holding the
# generated question.
messages: list = [{"role": "user", "content": question}]

# One LLMResult per model that answered, appended by the *_answer functions.
results: list[LLMResult] = []

# Scratch lists; nothing in the visible cells fills them.
models = []
ranks = []

In [None]:
# Create Models
# Append Model Name and Answer to Results List

def _record_answer(client: "OpenAI", model: str) -> None:
    """Ask *model* the shared question via *client* and record its reply.

    Sends the module-level ``messages`` and, on success, appends an
    ``LLMResult`` with the model name and reply text to the module-level
    ``results`` list. An empty or ``None`` reply is stored as ``""``.

    Any exception raised while requesting or recording the answer is
    printed together with the model name and then dropped, so one failing
    backend does not stop the remaining models from being queried.

    Args:
        client: OpenAI-style client already configured for the backend.
        model: Model name to request and to store alongside the answer.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
        )
        answer = response.choices[0].message.content or ""
        results.append(LLMResult(model=model, answer=answer))
    except Exception as e:
        # Deliberately best-effort; include the model so the failing backend
        # can be identified among the three.
        print(f"Error ({model}): {e}")


def gemini_answer() -> None:
    """Query "gemini-2.5-flash-lite" using GEMINI_API_KEY / GEMINI_BASE_URL."""
    gemini = OpenAI(
        api_key=GEMINI_API_KEY,
        base_url=GEMINI_BASE_URL,
    )
    _record_answer(gemini, "gemini-2.5-flash-lite")


def openai_answer() -> None:
    """Query "gpt-4o-mini" with a default-configured OpenAI client."""
    _record_answer(OpenAI(), "gpt-4o-mini")


def ollama_answer() -> None:
    """Query "gemma3:12b" using OLLAMA_API_KEY / OLLAMA_BASE_URL."""
    ollama = OpenAI(
        api_key=OLLAMA_API_KEY,
        base_url=OLLAMA_BASE_URL,
    )
    _record_answer(ollama, "gemma3:12b")


# Query the backends one after another; the call order fixes the order in
# which answers are added to `results`.
for ask in (openai_answer, gemini_answer, ollama_answer):
    ask()

In [None]:
# Format Answer Together

# Concatenate every recorded answer under a "## Response from <model>:"
# heading, each section followed by a blank line. Built with str.join so the
# text is assembled in one pass; `together` is "" when `results` is empty.
together = "".join(
    f"## Response from {res.model}:\n{res.answer}\n\n"
    for res in results
)


In [None]:
# {   # as a Dict of Dicts
#     "result": {
#         "<MODEL_NAME>": {
#             "rank": "<RANK>",
#             "why": "<BRIEF EXPLANATION>"
#         },
#         "<MODEL_NAME>": {
#             "rank": "<RANK>",
#             "why": "<BRIEF EXPLANATION>"
#         },
#         ...
#     }
# }
# {    # as a List of Dicts (each entry is a single-key dict)
#     "result": [
#         {"<MODEL_NAME>": {
#             "rank": "1",
#             "why": "<BRIEF EXPLANATION>"
#         }},
#         {"<MODEL_NAME>": {
#             "rank": "2",
#             "why": "<BRIEF EXPLANATION>"
#         }},
#         ...
#     ]
# }

In [None]:
# Compare Prompt

# Judging prompt: states how many models answered, repeats the question,
# describes the expected JSON reply, and ends with all collected answers.
#
# NOTE: this is an f-string, so the literal braces of the JSON template are
# doubled ({{ and }}); single braces are reserved for interpolated values.
# The explanation key is "reason", the key the result-parsing cell reads.
compare = f"""You are juding the responses of {len(results)} models. Each model was given the following question:

{question}

Please rank each model between 1 and {len(results)}, with 1 being the best, and the lowest being the worst.
The evaluation will be based off of: intelligence, reasoning, and response quality.
Please output your response in JSON and only in JSON. It should contain the rank, and a brief explaination why, following this format:

```json
{{
    "result": {{
        "<MODEL_NAME>": {{
            "rank": "<RANK>",
            "reason": "<BREIF EXPLANATION>"
        }},
        "<MODEL_NAME>": {{
            "rank": "<RANK>",
            "reason": "<BREIF EXPLANATION>"
        }},
        "<MODEL_NAME>": {{
            "rank": "<RANK>",
            "reason": "<BREIF EXPLANATION>"
        }}
    }}
}}
```

Thank you, and here is the Content to review for ranking:

{together}
"""

In [None]:
# Get Score Response
# Send the judging prompt through the same generator() helper used to create
# the question; the reply text is expected to be JSON and is parsed in the
# next cell.
ranking_response = generator(compare)


In [None]:
# Get Results

# 'ranking_response' is a string that should contain a JSON object.
# It is converted to a Python dict, sorted by rank and rendered as Markdown.
# If it cannot be interpreted, the raw reply is shown instead.


def _extract_json_object(text: str) -> str:
    """Return the outermost ``{...}`` span of *text*.

    The judging prompt shows its template inside a Markdown ``json`` fence,
    so the reply may come back wrapped the same way or with extra text
    around it. When no brace pair is found, *text* is returned unchanged
    and ``json.loads`` reports the problem.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return text
    return text[start:end + 1]


try:
    json_ranking: dict = json.loads(_extract_json_object(ranking_response))
    rankings: dict = json_ranking["result"]

    # Sort by rank for better display
    sorted_rankings = sorted(rankings.items(), key=lambda item: int(item[1]["rank"]))

    entries = []
    for model_name, ranking_data in sorted_rankings:
        rank = ranking_data["rank"]
        # The explanation may arrive under "reason" or under "why", depending
        # on the template the judge was shown; a KeyError is raised (and
        # handled below) only when neither key is present.
        reason = ranking_data["reason"] if "reason" in ranking_data else ranking_data["why"]
        entries.append(f"**Rank {rank}: {model_name}**\n*Reason:* {reason}\n\n")
    output = "".join(entries)

    # Also prepare the original question and summary
    # (sorted_rankings[0] raises IndexError when the judge ranked nothing)
    summary_output = f"""
## Original Question
{question}

## Summary
- **Total Models Evaluated:** {len(results)}
- **Winner:** {sorted_rankings[0][0]} 
- **Evaluation Criteria:** Intelligence, reasoning, and response quality
"""
except (ValueError, KeyError, TypeError, AttributeError, IndexError) as e:
    # ValueError covers json.JSONDecodeError and a non-numeric rank;
    # TypeError / AttributeError cover a reply whose shape is not the
    # expected dict of dicts; IndexError covers an empty ranking.
    print(f"Error parsing ranking response: {e}")
    print(f"Raw response: {ranking_response}")

    # Fallback display
    output = "**Error in ranking - showing raw response:**\n\n"
    output += ranking_response
    display(Markdown(output))
else:
    # Render only after everything parsed, so a bad reply never produces a
    # half-finished report.
    display(Markdown(output))
    display(Markdown(summary_output))
