# Evaluate LLM results

Install Dependencies

In [1]:
# %%capture
# !pip install datasets==2.20.0
# !pip install -U langsmith==0.1.99
# !pip install langchain_openai==0.2.0 #0.1.22
# !pip install langchain==0.3.0 #0.2.13
# !pip install langchain_community==0.3.0 #0.2.12  
# !pip install langchain-huggingface==0.1.0                      
# !pip install transformers==4.44.0
# !pip install torch==2.1.0
# !pip install termcolor==2.4.0
# !pip install accelerate==0.33.0
# !pip install pandas==2.2.2
# !pip install openpyxl==3.1.5
# !pip install python-dotenv==1.0.1
# !pip install einops==0.8.0
# !pip install wheel==0.44.0
# !pip install sentencepiece==0.2.0
# !pip install protobuf==5.27.3 #Mistral models needs this
# !pip install groq==0.10.0 #Groq models needs this
# !pip install matplotlib==3.9.2
# !pip install seaborn==0.13.2
# !pip install scipy==1.14.1
# !pip install statsmodels==0.14.4
# !pip install anthropic==0.40.0 #Anthropic models needs this
# !pip install together==1.3.14 #Together models needs this
# !pip install google-generativeai==0.8.4
# !pip install google-genai==0.8.0
# !pip install sentence-transformers==3.3.1

# !pip install flash-attn==2.6.3 #Install it at the end after wheel has been installed

# #Only if CPU is used
# !pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

In [2]:
# !jupyter lab --ServerApp.iopub_data_rate_limit=1e10

In [3]:
import warnings
warnings.filterwarnings('ignore')

RunPod specific parameters

In [4]:
#For RunPod change to persistent storage directory
import os
os.chdir('/workspace')

Specify Path and Load API Keys

In [5]:
file_path ='/workspace/DRACO_test_basic_questions.xlsx'#tough_modified.xlsx' #test_basic_questions.xlsx'# #Dataset generated with the help of GPT-4o - Has to be an excel file with 'input' and 'output' columns
#'/Users/nikolaossourlo/Desktop/Example_QA_data_raw.xlsx' #For MacOS
#'C:/Users/soyrl/Desktop/Example_QA_data_raw.xlsx' #For Windows
#'/content/drive/My Drive/Example_QA_data_raw.xlsx' #For Google Colab
#'/home/nikolaossourlo/Example_QA_data_raw.xlsx' #For Delft Blue
#'/workspace/Example_QA_data_raw.xlsx' #For RunPod

custom_cache_dir="/workspace/cache/huggingface" #Save models here so that we don't have to download them again
#"/scratch/nikolaossourlo/cache" in Delft Blue

# Check if custom_cache_dir is defined, otherwise use default behavior
try:
    cache_dir=custom_cache_dir
except:
    cache_dir=None

from dotenv import load_dotenv
import os
import json
import numpy as np
import traceback
import time
import subprocess

# Load environment variables from .env file
load_dotenv(dotenv_path=os.getcwd()+"/env")

# Get the OpenAI API key
openai_api_key = os.getenv('OPENAI_API_KEY_DRACO')
langsmith_api_key = os.getenv('LANGSMITH_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY_DRACO')
together_api_key = os.getenv('TOGETHER_API_KEY_DRACO')
open_router_api_key = os.getenv('OPEN_ROUTER_API_KEY')
gemini_api_key = os.getenv('GEMINI_API_KEY')
groq_api_key=os.getenv('GROQ_API_KEY')

#Login to Hugging Face
from huggingface_hub import login
# Log in with your Hugging Face token
login(token=os.getenv('HF_TOKEN'))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Select model and name for the experiment

In [6]:
#Model to generate responses to questions - Sometimes we might have to restart session and comment out the models that have already been run
models=[ #Reasoning models (deepseek only for now) should have longer max_tokens to include the reasoning steps

    # "openai/o3-2025-04-16", #200K context length, 100K output tokens
    # "openai/o4-mini", #200K context length, 100K output tokens
    # "together/Qwen/QwQ-32B", #131072 context length    # "Qwen/QwQ-32B-AWQ",
    # "together/deepseek-ai/DeepSeek-R1", #164K context length
    # "openai/gpt-4.1",
    # "openai/o1", #200K context length, Max Output Tokens 100K #o1-2024-12-17
    # "openai/o1-mini", #16384 completion tokens 128K context length, Max Output Tokens 65536 #o1-mini-2024-09-12
    # "openai/o3-mini", #200K context length, Max Output Tokens 100K #o3-mini-2025-01-31
    # "openai/gpt-4o-2024-08-06",
    "gemini/gemini-2.0-flash-exp",
    # "gemini/gemini-2.5-pro-exp-03-25", #1,048,576 input tokens length
    # "together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    # "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", #128K context length
    # 'microsoft/phi-4', #14B parameters

    'openai/gpt-4o-mini',

    # "together/deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
    # "Qwen/Qwen2.5-7B-Instruct",
    # "meta-llama/Llama-3.2-3B-Instruct",
    # "meta-llama/Meta-Llama-3.1-8B-Instruct", #A4500 (20GB VRAM) and in Delft Blue (V100 32GB)
    # "microsoft/Phi-3.5-mini-instruct", #A40 with 48GB VRAM, A4500 with 20GB VRAM, Delft Blue 
    # "mistralai/Mistral-7B-Instruct-v0.3", #A40 with 48GB VRAM, A4500 with 20GB VRAM and in Delft Blue
    # "Qwen/Qwen2-7B-Instruct", #A40 with 48GB VRAM, A4500 with 20GB VRAM, Delft Blue
    # 'AI-MO/NuminaMath-7B-TIR', #A4500 with 20GB VRAM and in Delft Blue - We can also try 01-ai/Yi-Coder-9B-Chat
    # 'microsoft/Phi-3-mini-4k-instruct', #RTX3090
    # "google/gemma-2-9b-it", #More than 20GB of GPU memory needed - Works with A40 with 48GB VRAM, but not with A4500 - 20GB, and V100 - 32GB, Delft Blue
    # 'mistralai/Mistral-Nemo-Instruct-2407', #12B parameters, 2 RTX3090, V100 with 32GB VRAM
    # "anthropic/claude-3-5-sonnet-20241022",
    # 'openai/gpt-4o-mini' #Costs very low ~0.01$ for 9 Q&A pairs.
    # "gemini/gemini-2.0-flash-thinking-exp", #Thoughts only in Google studio, not in API - https://discuss.ai.google.dev/t/thoughts-are-missing-cot-not-included-anymore/63653/8
    ] #Takes 7+hours in A40 for the above 13 models with 7Q&A paris and 4 resamples. Cost ±3$ (±180GB)

# Groq models are defined as: groq_website/model_name e.g. 'groq_website/llama-3.1-70b-versatile'
# OpenAI models are defined as: 'openai/model_name', e.g. 'openai/gpt-4o-mini'
# Anthropic models are defined as 'anthropic/model_name', e.g. 'anthropic/claude-3-haiku-20240307' - Couldn't use due to billing issues
# Together models are defined as 'together/model_name', e.g. 'together/meta-llama/Llama-3.3-70B-Instruct-Turbo-Free'
# OpenRouter models are defined as 'openrouter/model_name', e.g. 'openrouter/deepseek/deepseek-r1:free' - Do not work due to extremely limited quota
# Gemini models are defined as 'gemini/model_name', e.g. 'gemini/gemini-2.0-flash-exp'

# I couldn't run 'nvidia/Mistral-NeMo-Minitron-8B-Base', "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4" (Conflicting dependencies),
# 'google/recurrentgemma-9b-it' # RecurrentGemmaForCausalLM.forward() got an unexpected keyword argument 'position_ids'
#Large models take more time (2min/generation for Mistral 12B)

#Define model to act as a judge
judge_model='openai/gpt-4o-mini' #If used with Llama, only 0.01$ for 9 Q&A pairs for gpt-4o-mini, and 0.22$ for gpt-4o
judge_model_2="gemini/gemini-2.5-flash-preview-04-17"
judges=[judge_model,judge_model_2]

#Used below to distinguish commercial and Hugging Face models
commercial_api_providers=['openai','groq_website','anthropic','together', 'openrouter', 'gemini']

#Define maximum number of tokes in the judge LLM output
max_output_tokens=2000

#Limit of tokens in the generated response from LLM - For reasoning models we increase it to 16000 - had to define it below. 
generate_max_tokens=1000
generation_max_tokens_thinking=16000 #This is the output generation tokens. We have to make sure that this along with input tokens not exceed context length

#Domain - Chemical/Water Engineering or anything else
domain="Water"

#Inference on whole dataset?
inference_on_whole_dataset=True

#Number of times to resample the dataset
n_resamples=1 #4 reduces the variance to 50%

#Decide if in our dataset we want to enable tool usage to answer questions
tool_usage=True

Define prompts for custom evaluation metrics

In [7]:
common_prompt=""" 
You are an autoregressive language model that acts as a judge in comparing a predicted vs an actual answer to a questions.
Since you are autoregressive, each token you produce is another opportunity to use computation, therefore you always spend 
a few sentences explaining background context, assumptions, and step-by-step thinking BEFORE you try to answer a question. 
Your users are experts in"""+ domain +"""engineering, so they already know you're a language model and your capabilities and limitations, so don't 
remind them of that. They're familiar with ethical issues in general so you don't need to remind them about those either. 
Don't be verbose in your answers, but do provide details and examples where it might help the explanation. 
""" #This is common for all prompts below - change with 'chemical engineering' for chemical engineer questions

In [8]:
completeness_descr = """
Your task is to evaluate responses predicted by an LLM with regards to completeness compared to the completeness of a given actual, golden standard answer. 
The completeness metric evaluates the extent to which the user's question is answered in full in the predicted response. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: There is no response.
2: No parts of a suitable answer are present.
3: Few elements of a complete answer are present.
4: Most elements of a complete answer are present.
5: The response covers all elements of a complete answer.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

relevance_descr = """
Your task is to evaluate responses predicted by an LLM with regards to relevance compared to the relevance of a given actual, golden standard answer. 
The relevance metric evaluates the amount of irrelevant information in the predicted response considering the user's original question. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: The response answers something else, not the user's question.
2: The response answers the user's question but the information provided is mostly irrelevant.
3: The response answers the user's question but contains more irrelevant information than relevant information.
4: The response answers the user's question, and shares a bit of irrelevant information.
5: The response answers the user's question and contains no irrelevant information.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

conciseness_descr = """
Your task is to evaluate responses predicted by an LLM with regards to conciseness compared to the conciseness of a given actual, golden standard answer. 
The conciseness metric evaluates the amount of unexpected extra information in the predicted response considering the user's original question. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: The response is too long and stops before completion or enters an infinite loop.
2: The response includes a lot of extra information and uses flowery language.
3: The response includes a lot of extra information or uses flowery language.
4: The response is short and includes a small amount of extra information.
5: The response is as short as possible while still answering the prompt.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

confidence_descr = """
Your task is to evaluate responses predicted by an LLM with regards to confidence compared to the confidence of a given actual, golden standard answer. 
The condifence metric evaluates the degree of assurance that is conveyed the response that the predicted answer is correct. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: Complete Rejection. The response makes it clear that the given answer is incorrect or that no correct answer can be provided.
2: Doubt and Disagreement. The response suggests that the answer is likely incorrect or raises significant concerns.
3: Uncertainty. The response indicates that the answer could be correct, but there is significant doubt or insufficient evidence.
4: Moderate Agreement. The response leans towards the answer being correct but acknowledges some uncertainty.
5: Full Endorsement. The reponse confidentely asserts that the given answer is correct.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

factuality_descr = """
Your task is to evaluate responses predicted by an LLM with regards to factuality compared to the factuality of a given actual, golden standard answer.
 The factuality metric evaluates the degree of hallucination contained in a response or, in other words, how accurate a given response is.
You can assign a score from 1 to 5, with the following interpretations:
1: The response is a complete hallucination
2: The response is mostly a hallucination but does not change key information from the prompt
3: The response contains large amounts of both hallucinations and factual information.
4: The response includes mostly factual information with slight hallucinations.
5: The response only includes factual information.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
""" #2: The response is mostly a hallucination but does not change key information from the prompt"""+ # (such as chemical identifiers).


judgement_descr = """
Your task is to evaluate responses predicted by an LLM with regards to judgement compared to the judgement of a given actual, golden standard answer.
The judgment metric assesses how strongly the response implies its correctness, taking into account the actual accuracy of the answer.
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: The response confidently claims a hallucination as truth.
2: The response misinterprets information received in the prompt.
3: The response shows that the model is unsure about the answer or states that information is theoretical.
4: The response is wrong but it is made clear that the answer is wrong or that the model is unable to provide a correct answer.
5: The response is correct.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

general_descr = """
You are a strict but fair expert in water engineering, acting as a judge. You will be given a question and an answer. 
Your task is to evaluate the answer on a scale from 0 to 5, where 5 indicates a fully correct and complete response, and 0 indicates a fully incorrect or irrelevant answer. 
If the question asks for a specific number or set of numbers, assign a score of 5 only if the answer matches exactly or is accurate within a tolerance of ±0.01 
(correct up to two decimal places). If any required number is outside this margin, assign a score of 0. For conceptual or open-ended questions, evaluate based on accuracy, 
completeness, and clarity, using the full 1–5 scale as appropriate.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

Define tools and functions to decide if they should be used

In [9]:
tool_definitions = [
    # {
    #     "type": "function",
    #     "function": {
    #         "name": "add",
    #         "description": "Adds two numbers together",
    #         "parameters": {
    #             "type": "object",
    #             "properties": {
    #                 "number1": {
    #                     "type": "number",
    #                     "description": "The first number to add",
    #                 },
    #                 "number2": {
    #                     "type": "number",
    #                     "description": "The second number to add",
    #                 },
    #             },
    #             "required": ["number1", "number2"],
    #         },
    #     },
    # },

    {
        "type": "function",
        "function": {
            "name": "extract_code",
            "description": "Determines whether the provided text contains code. Use this tool whenever a user asks for a function, script, or programming-related response.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_output": {
                        "type": "string",
                        "description": "The text to analyze for the presence of code.",
                    },
                },
                "required": ["model_output"],
            },
        },
    },

    {
        "type": "function",
        "function": {
            "name": "send_to_LLM",
            "description": "Send user question to an LLM to receive a response.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_output": {
                        "type": "string",
                        "description": "The text received back from the LLM.",
                    },
                },
                "required": ["model_output"],
            },
        },
    },

    {
        "type": "function",
        "function": {
            "name": "run_simulation",
            "description": "Decide whether a INP file needs to be created and/or a simulation should be run.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_output": {
                        "type": "string",
                        "description": "the content of the INP file to run the simulation",
                    },
                },
                "required": ["model_output"],
            },
        },
    },

    {
        "type": "function",
        "function": {
            "name": "no_tool_needed",
            "description": "Use this when no specialized tool is needed to answer the user's question. The LLM can respond directly with general knowledge.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_output": {
                        "type": "string",
                        "description": "The direct response from the LLM without using any specialized tools.",
                    },
                },
                "required": ["model_output"],
            },
        },
    },

]

In [10]:
def decide_tool_usage(query, tools=tool_definitions, judge_model=judge_model, openai_api_key=openai_api_key):
    """Decide if a tool should be used based on the query, and if yes, output the tool name(s)."""

    # Construct prompt for the judge
    tool_descriptions = "\n".join(
        f"Tool Name: {tool['function']['name']}\nDescription: {tool['function']['description']}\nParameters: {', '.join(tool['function']['parameters']['properties'].keys())}"
        for tool in tools
    )
    
    prompt = f"""Given a user question, determine if any tool from the provided list should be used to answer the question.
    Consider:
    1. The capability of each tool, based on its name, description, and parameters, to provide useful information for answering the question
    2. If using no tool might be better than using a potentially misleading tool

    User Question: {query}

    Available Tools:
    {tool_descriptions}

    Should a tool be used for answering the question? If yes, specify the tool name(s). Respond with 'No' or the tool name(s).
    """
    
    messages = [
        {"role": "system", "content": "You are a helpful assistant that determines tool usage."},
        {"role": "user", "content": prompt}
    ]
    
    # Use OpenAI to judge tool usage
    import openai
    from langsmith.wrappers import wrap_openai
    client = wrap_openai(openai.Client(api_key=openai_api_key))
    
    response = client.chat.completions.create(
        messages=messages,
        temperature=0,
        model=judge_model.split('/')[1],
        seed=42
    )
    
    tool_decision = response.choices[0].message.content.strip()
    print("Tool Decision:", tool_decision)
    
    if tool_decision.lower() == 'no':
        return ['no_tool_needed']#None
    else:
        return tool_decision.split(', ') #This returns a list of tools

In [11]:
# aaaa

this was installed in the runpod_init_llm!

In [12]:
# #Create environment if tool will be used
# # Create venv instead
# venv_name = "test_LLM"
# command_install = f"python -m venv /workspace/{venv_name}"
# print("Command install", command_install)

# try:
#     # Execute the command and capture the output
#     result_install = subprocess.check_output(command_install, shell=True, stderr=subprocess.STDOUT, text=True)
#     print("Venv installation:", result_install)
# except subprocess.CalledProcessError as e:
#     result_install = f"Error in installation: {e.output}"
#     print(result_install)

# # Activate the virtual environment
# activate_cmd = f"source /workspace/{venv_name}/bin/activate"

# #Install requirements
# requirements_file="code_execution.txt"
# try:
#     result = subprocess.run(f"bash -c '{activate_cmd} && pip install -r {requirements_file}'", shell=True, stderr=subprocess.STDOUT, text=True)
#     print("Requirements installation:",result)
# except subprocess.CalledProcessError as e:
#     result = f"Error in pip install: {e.output}"
#     print(result)

just added in runpod llm init. check first if works and then delete from here

In [13]:
# #For WNTR package

# venv_name = "test_LLM"
# # Activate the virtual environment
# activate_cmd = f"source /workspace/{venv_name}/bin/activate"

# try:
#     result = subprocess.run(f"bash -c '{activate_cmd} && pip install --upgrade pip setuptools wheel && pip install --upgrade --force-reinstall numpy && \
#                             pip install --upgrade --force-reinstall wntr'", shell=True, stderr=subprocess.STDOUT, text=True)
#     print("WNTR installation:",result)
# except subprocess.CalledProcessError as e:
#     result = f"Error in pip install: {e.output}"
#     print(result)

In [14]:
# import wntr
# import numpy as np

# def load_and_simulate(inp_file):
#   try:
#       wn = wntr.network.WaterNetworkModel(inp_file)
#       sim = wntr.sim.EpanetSimulator(wn)
#       results = sim.run_sim()
#       return wn, results
#   except Exception as e:
#       raise RuntimeError(f"Failed to load or simulate {inp_file}: {e}")

# def compare_results(res_trial, res_ref, tolerance=1e-3):
#   comparisons = {}

#   # Pressure comparison
#   pressure_trial = res_trial.node['pressure']
#   pressure_ref = res_ref.node['pressure']
#   pressure_diff = abs(pressure_trial - pressure_ref).max().max()
#   comparisons['pressure'] = pressure_diff < tolerance
#   print(pressure_trial)

#   # Demand comparison
#   demand_trial = res_trial.node['demand']
#   demand_ref = res_ref.node['demand']
#   demand_diff = abs(demand_trial - demand_ref).max().max()
#   comparisons['demand'] = demand_diff < tolerance

#   # Flow comparison
#   flow_trial = res_trial.link['flowrate']
#   flow_ref = res_ref.link['flowrate']
#   flow_diff = abs(flow_trial - flow_ref).max().max()
#   comparisons['flowrate'] = flow_diff < tolerance

#   return comparisons

# def run(trial_file, benchmark_file):
#   print(f"Comparing trial file {trial_file} and benchmark file {benchmark_file}")
#   try:
#       wn_trial, res_trial = load_and_simulate(trial_file)
#   except RuntimeError as e:
#       print(str(e))
#       return

#   try:
#       wn_ref, res_ref = load_and_simulate(benchmark_file)
#   except RuntimeError as e:
#       print(f"Reference network could not be loaded: {e}")
#       return

#   results = compare_results(res_trial, res_ref)

#   passed_all = all(results.values())
#   print("\n--- UNIT TEST RESULTS ---")
#   for key, passed in results.items():
#       print(f"{key} comparison: {'PASSED' if passed else 'FAILED'}")

#   if passed_all:
#       print("\n✅ All unit tests passed. The models are essentially the same.")
#   else:
#       print("\n❌ Some unit tests failed. Differences were detected.")

# run('network_0.inp','benchmark.inp') #worked with 13GB RAM in Colab

this works!!

In [15]:
# venv_name = "test_LLM"
# venv_path = f"/workspace/{venv_name}/bin/activate"
# script_path = "compare_networks.py"

# bash_command = f"source {venv_path} && MPLBACKEND=Agg python {script_path}"

# try:
#     result = subprocess.run(['bash', '-c', bash_command], capture_output=True, text=True, check=True)
#     print("Script output:\n", result.stdout)
# except subprocess.CalledProcessError as e:
#     print("Error running script:\n", e.stdout, "\n", e.stderr)

In [16]:
venv_name = "test_LLM"
venv_path = f"/workspace/{venv_name}/bin/activate"
script_path = "compare_networks_test.py"

bash_command = f"source {venv_path} && MPLBACKEND=Agg python {script_path}"

try:
    result = subprocess.run(['bash', '-c', bash_command], capture_output=True, text=True, check=True)
    print("Script output:\n", result.stdout)
except subprocess.CalledProcessError as e:
    print("Error running script:\n", e.stdout, "\n", e.stderr)

Script output:
 Comparing trial file network_0.inp and benchmark file benchmark.inp
Reference network could not be loaded: Failed to load or simulate benchmark.inp: [Errno 2] No such file or directory: 'benchmark.inp'



In [17]:
# result.stdout

In [18]:
# import wntr
# import numpy as np

# run('network_0.inp','benchmark.inp') #worked with 13GB RAM in Colab

In [19]:
#Check if works
# run('network_0.inp','benchmark.inp') #worked with 13GB RAM in Colab

In [20]:
# import sys
# from io import StringIO

# # Redirect standard output to capture it
# captured_output = StringIO()
# sys.stdout = captured_output

# # Run the command
# run('network_0.inp', 'benchmark.inp')

# # Reset standard output
# sys.stdout = sys.__stdout__

# # Check if 'All unit tests passed' is in the output
# if 'All unit tests passed' in captured_output.getvalue():
#     print("All unit tests passed!")
#     print(captured_output)
# else:
#     print("Unit tests did not pass.")
#     print(captured_output)

In [21]:
# aaaa

Get dataset

In [22]:
#How the dataset will be named in Langsmith
def get_dataset_name(model_name, judge_model):
    # try: #For Hugging Face models
        if not any(provider in model_name for provider in commercial_api_providers): #For Hugging Face models
            return domain+"_Engineering_Evaluation_"+model_name.split('/')[1]+'_with_judge_'+judge_model+'_beam_search_tool_usage_'+str(tool_usage) #Chemical
        else: #For Commercial API providers
            return domain+"_Engineering_Evaluation_"+model_name+'_with_judge_'+judge_model+'_beam_search_tool_usage_'+str(tool_usage) #Chemical
    # except: #For OpenAI models
    #     return "Chemical_Engineering_Evaluation_"+model_name+'_with_judge_'+judge_model+'_beam_search_statistics_all'

Check if GPU is available

In [23]:
import torch
print(torch.version.cuda)

12.1


Google Drive mount (If run in Colab)

In [24]:
if 'content/drive/My Drive' in file_path:
    from google.colab import drive
    drive.mount('/content/drive')

Read Excel File

In [25]:
import pandas as pd
qa=pd.read_excel(file_path) #Read Excel
qa=qa[['input','output']]

Create Dataset from df

In [26]:
from datasets import Dataset
loaded_dataset=Dataset.from_pandas(qa)

if inference_on_whole_dataset==False:
    loaded_dataset = loaded_dataset.train_test_split(test_size=0.2, seed=42) #Used if going to fine-tune in part of the dataset

In [27]:
if inference_on_whole_dataset==False:
    dataset_train=loaded_dataset['train']
    dataset_test=loaded_dataset['test']
else:
    dataset_test=loaded_dataset #When we use the whole dataset

Create Langsmith Test Dataset

In [28]:
#https://docs.smith.langchain.com/old/evaluation/faq/manage-datasets

from langsmith import Client

example_inputs = [(x['input'],x['output']) for x in dataset_test]
print(example_inputs)

def create_langsmith_dataset(dataset_name, example_inputs, langsmith_api_key):

    client = Client(api_key=langsmith_api_key)

    try:
        #Load the dataset if already exists
        for existing_dataset in client.list_datasets():
            if existing_dataset.name==dataset_name:
                dataset_langsmith=existing_dataset
        for x in dataset_langsmith:
            print("Dataset Loaded")
            break

    except: #Otherwise create it
        print("Dataset not found. Creating new dataset")
        # Storing inputs in a dataset lets us run chains and LLMs over a shared set of examples.
        dataset_langsmith = client.create_dataset(dataset_name=dataset_name,
                                                description="Q&A"+ domain + "engineering.") #chemical

        for input_prompt, output_answer in example_inputs:
            client.create_example(
                inputs={"question": input_prompt.replace('\n', ' ')},
                outputs={"answer": output_answer.replace('\n', ' ')},
                # metadata={"source": "Wikipedia"},
                dataset_id=dataset_langsmith.id,
            )

    return dataset_langsmith

[('How are flow corrections applied around each loop in the Hardy Cross method, and why must flow continuity be maintained at every junction?', 'In the Hardy Cross method, one first selects individual loops in the network and makes an initial guess of the flow in each pipe. Then, each loop is corrected iteratively: the head loss around the loop is summed, and a flow correction is applied to reduce the total head-loss error to zero. This correction is added (or subtracted) to the assumed flow in every pipe forming that loop. Flow continuity must be maintained at every junction to ensure that all flow entering a node is accounted for by either leaving through connecting pipes or satisfying demand. If continuity is not enforced, the model would not reflect true network behavior, causing errors to propagate in subsequent iterations.'), ('Which factors influence the choice of initial flow guesses in the Hardy Cross approach, and how can poor choices affect convergence?', 'Typical choices fo

Custom Evaluation Metrics

In [29]:
# from langchain_core.prompts import ChatPromptTemplate
# list_of_metrics=['completeness_descr','relevance_descr','conciseness_descr','confidence_descr','factuality_descr','judgement_descr', 'general_descr']
# import os
# os.chdir('/workspace')
# from dotenv import load_dotenv
# from termcolor import colored
# judge_model_2="gemini/gemini-2.5-flash-preview-04-17"

# load_dotenv(dotenv_path=os.getcwd()+"/env")

# gemini_api_key = os.getenv('GEMINI_API_KEY')

# # for metric_name in list_of_metrics:  # Iterate through all metrics
# #     print("Evaluating based on:", metric_name)
#     # metric_value = common_prompt + eval(metric_name)  # Get the actual description of the metric
#     # Define roles and placeholders
# metric_value="""
# You are an autoregressive language model that acts as a judge in comparing a predicted vs an actual answer to a questions.
# Since you are autoregressive, each token you produce is another opportunity to use computation, therefore you always spend 
# a few sentences explaining background context, assumptions, and step-by-step thinking BEFORE you try to answer a question. 
# Your users are experts inWaterengineering, so they already know you're a language model and your capabilities and limitations, so don't 
# remind them of that. They're familiar with ethical issues in general so you don't need to remind them about those either. 
# Don't be verbose in your answers, but do provide details and examples where it might help the explanation. 

# Your task is to evaluate responses predicted by an LLM with regards to completeness compared to the completeness of a given actual, golden standard answer. 
# The completeness metric evaluates the extent to which the user's question is answered in full in the predicted response. 
# You can assign a score from 1 to 5 to the predicted response with the following interpretations:
# 1: There is no response.
# 2: No parts of a suitable answer are present.
# 3: Few elements of a complete answer are present.
# 4: Most elements of a complete answer are present.
# 5: The response covers all elements of a complete answer.
# IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
# """
# question="""
# What aspects of the Newton-Raphson technique generally lead to faster convergence compared to loop-based iterative methods?
# """

# actual_answer="""
# Newton-Raphson methods solve all governing equations simultaneously by using partial derivatives (the Jacobian) that link changes in flows or heads to changes in hydraulic equations. 
# This approach allows the solver to make a more direct beeline toward the solution, whereas loop-based methods adjust flows one loop at a time. By incorporating global information into 
# each iteration, Newton-Raphson typically reaches high accuracy in fewer iterations, especially for large or complex systems.
# """

# predicted_answer="""
# The Newton-Raphson technique generally exhibits faster convergence compared to loop-based iterative methods in water engineering (and other fields) due to several key aspects:

# *   **Quadratic Convergence:** This is the most significant factor. Newton-Raphson exhibits quadratic convergence, meaning that the number of correct digits in the solution approximately doubles with each iteration.  In other words, the error decreases proportionally to the square of the previous error.  Loop-based methods, such as simple fixed-point iteration or successive substitution, typically have linear convergence, where the error decreases linearly with each iteration.  This difference in convergence rate makes Newton-Raphson significantly faster, especially when high accuracy is required.

# *   **Use of Derivative Information:** Newton-Raphson utilizes the derivative (or Jacobian in multi-dimensional problems) of the function being solved. This provides information about the slope and curvature of the function near the root.  This information allows the algorithm to make a more informed "jump" towards the root in each iteration. Loop-based methods often lack this derivative information and rely on simpler, less informed updates.

# *   **Tangent Approximation:** The method approximates the function with its tangent line at the current guess. The root of this tangent line is then used as the next guess. This tangent approximation is generally a good representation of the function near the root, leading to a more efficient approach than simply iterating based on the function value itself, as is common in loop-based methods.

# *   **Adaptive Step Size:**  The step size in Newton-Raphson is implicitly determined by the derivative.  If the derivative is large, the step size will be smaller, preventing overshooting the root.  If the derivative is small, the step size will be larger, allowing for faster progress towards the root.  Loop-based methods often use a fixed or manually adjusted step size, which may not be optimal for all parts of the solution space.

# **In the context of water engineering, this translates to:**

# *   **Faster solutions for complex hydraulic models:**  Many water engineering problems, such as pipe network analysis or open channel flow calculations, involve solving systems of nonlinear equations. Newton-Raphson can significantly reduce the computational time required to obtain accurate solutions for these models.
# *   **More efficient calibration of models:**  Model calibration often involves iteratively adjusting parameters to match observed data.  Using Newton-Raphson to solve the underlying equations within the model can speed up the calibration process.
# *   **Improved real-time control systems:**  In applications such as reservoir operation or wastewater treatment, real-time control systems often need to solve equations quickly to make timely decisions.  The faster convergence of Newton-Raphson can be crucial in these scenarios.

# **However, it's important to note the limitations of Newton-Raphson:**

# *   **Requires derivative information:**  Calculating the derivative (or Jacobian) can be computationally expensive or even impossible for some functions.
# *   **Sensitivity to initial guess:**  Newton-Raphson can diverge or converge to a different root if the initial guess is too far from the actual root.
# *   **Potential for oscillations:**  In some cases, the method can oscillate around the root, especially if the function has a high degree of nonlinearity or the initial guess is poor.

# In summary, while Newton-Raphson has its drawbacks, its quadratic convergence, use of derivative information, and adaptive step size generally make it a much faster and more efficient method than loop-based iterative techniques for solving nonlinear equations in water engineering applications, provided that the derivative information is available and a reasonable initial guess can be made.
# """

# chat_template = ChatPromptTemplate.from_messages(
#     [("system", "help to answer the fdollowing question:"),
#         ("user", "Question: {question}, Actual answer: {actual_answer}, Predicted answer: {predicted_answer}"),
#         # ("ai", "It's sunny and warm outside."), #Use this if we want to use few shot prompts
#     ]
# )

# messages = chat_template.format_messages(question=question, actual_answer=actual_answer, predicted_answer=predicted_answer)
# print("Messages:", messages)

# formatted_messages = [(role, msg.content) for role, msg in zip(["system", "user"], messages)]
# print("Formatted messages:", formatted_messages)  # [('system', 'You are an autoregressive lan....', 'user':.....)]

# # Initialize the Gemini model and get response
# try:
#     from langchain_google_genai import ChatGoogleGenerativeAI
    
#     llm_gemini = ChatGoogleGenerativeAI(
#         model=judge_model_2.split('/')[1],#"gemini-2.5-flash-preview-04-17",
#         temperature=0,
#         max_output_tokens=2000,
#         convert_system_message_to_human=True,
#         google_api_key=gemini_api_key
#     )
    
#     ai_response_gemini = llm_gemini.invoke(formatted_messages)
    
#     # print(colored("User message:" + messages[1].content, 'green'))
#     print(colored("Gemini response:" + ai_response_gemini.content, 'blue'))
# except Exception as e:
#     print("eer",e)

In [30]:
# https://docs.smith.langchain.com/old/cookbook/introduction
# https://docs.smith.langchain.com/old/evaluation/faq/custom-evaluators
# https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#use-a-summary-evaluator

from langsmith.schemas import Run, Example
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from termcolor import colored
from langchain_google_genai import ChatGoogleGenerativeAI

list_of_metrics=['completeness_descr','relevance_descr','conciseness_descr','confidence_descr','factuality_descr','judgement_descr', 'general_descr']

# #Function that compares the real answer with the predicted answer of an LLM and returns a score based on the evaluation using OpenAI
# def openai_evaluator(run: Run, example: Example) -> dict: 
#     # print("Run:",run)

#     question=run.inputs.get("inputs")['question']
#     # print("Question:",question)
#     actual_answer = example.outputs.get("answer")
#     # print("Real answer:",example.outputs.get("answer"))
#     predicted_answer = run.outputs.get("output")
#     # print("Predicted Answer:",answer)

#     scores_openai={} #Store scores for OpenAI judge model
#     descriptions_openai={} #Store descriptions for OpenAI judge model
    
#     # Check if there is output from LLM
#     if not predicted_answer:
#         print("No output from LLM")
#         keys=[]
#         for metric_name in list_of_metrics: #o1 sometimes doesn't give an answer
#             keys.append(metric_name.split('_descr')[0])
#             scores_openai[metric_name]=0
#             descriptions_openai[metric_name]='-'
#         results = {
#             "results": [
#                 {"key": key, "score": scores_openai[key + "_descr"], "value": descriptions_openai[key + "_descr"]} for key in keys
#             ]}
#         return results

#     else:
#         scores_openai = {}  # Store scores for OpenAI judge model
#         descriptions_openai = {}  # Store descriptions for OpenAI judge model
        
#         for metric_name in list_of_metrics:  # Iterate through all metrics
#             print("Evaluating based on:", metric_name)
#             metric_value = common_prompt + eval(metric_name)  # Get the actual description of the metric

#             # Define roles and placeholders
#             chat_template = ChatPromptTemplate.from_messages(
#                 [("system", metric_value),
#                  ("user", "Question: {question}, Actual answer: {actual_answer}, Predicted answer: {predicted_answer}"),
#                  # ("ai", "It's sunny and warm outside."), #Use this if we want to use few shot prompts
#                 ]
#             )

#             messages = chat_template.format_messages(question=question, actual_answer=actual_answer, predicted_answer=predicted_answer)
#             # print("Messages:", messages)

#             formatted_messages = [(role, msg.content) for role, msg in zip(["system", "user"], messages)]
#             # print("Formatted messages:", formatted_messages)  # [('system', 'You are an autoregressive lan....', 'user':.....)]

#             # Initialize the OpenAI model and get response
#             llm_openai = ChatOpenAI(model_name=judge_model.split('/')[1], api_key=openai_api_key, temperature=0, max_tokens=max_output_tokens, seed=42)
#             ai_response_openai = llm_openai.invoke(formatted_messages)

#             # Output for OpenAI
#             # print(colored("System message:"+ messages[0].content,'blue'))
#             print(colored("User message:" + messages[1].content, 'green'))
#             print(colored("OpenAI response:" + ai_response_openai.content, 'red'))

#             # Decide what the final score is based on OpenAI output
#             if "FINAL SCORE:" in ai_response_openai.content:
#                 score_openai = int(ai_response_openai.content.split("FINAL SCORE:")[1])
#             else:
#                 print("Invalid response from OpenAI:", ai_response_openai.content)
#                 score_openai = 0  # For cases where the LLM doesn't return a score

#             scores_openai[metric_name] = score_openai
#             descriptions_openai[metric_name] = ai_response_openai.content
            
#             print("OpenAI Scores:", scores_openai)
#             print("\n")

#     return {
#         "results": [  # We always need 'key', 'score' pairs
#             {"key": "completeness", "score": scores_openai['completeness_descr'], "value": descriptions_openai['completeness_descr']},
#             {"key": "relevance", "score": scores_openai['relevance_descr'], "value": descriptions_openai['relevance_descr']},
#             {"key": "conciseness", "score": scores_openai['conciseness_descr'], "value": descriptions_openai['conciseness_descr']},
#             {"key": "confidence", "score": scores_openai['confidence_descr'], "value": descriptions_openai['confidence_descr']},
#             {"key": "factuality", "score": scores_openai['factuality_descr'], "value": descriptions_openai['factuality_descr']},
#             {"key": "judgement", "score": scores_openai['judgement_descr'], "value": descriptions_openai['judgement_descr']},
#             {"key": "general", "score": scores_openai['general_descr'], "value": descriptions_openai['general_descr']},
#         ]
#     }

# #Function that compares the real answer with the predicted answer of an LLM and returns a score based on the evaluation using Gemini
# def gemini_evaluator(run: Run, example: Example) -> dict: 
#     # print("Run:",run)

#     question=run.inputs.get("inputs")['question']
#     # print("Question:",question)
#     actual_answer = example.outputs.get("answer")
#     # print("Real answer:",example.outputs.get("answer"))
#     predicted_answer = run.outputs.get("output")
#     # print("Predicted Answer:",answer)

#     scores_gemini={} #Store scores for Gemini judge model
#     descriptions_gemini={} #Store descriptions for Gemini judge model
    
#     # Check if there is output from LLM
#     if not predicted_answer:
#         print("No output from LLM")
#         keys=[]
#         for metric_name in list_of_metrics: #o1 sometimes doesn't give an answer
#             keys.append(metric_name.split('_descr')[0])
#             scores_gemini[metric_name]=0
#             descriptions_gemini[metric_name]='-'
#         results = {
#             "results": [
#                 {"key": key + "_gemini", "score": scores_gemini[key + "_descr"], "value": descriptions_gemini[key + "_descr"]} for key in keys
#             ]}
#         return results

#     else:
#         scores_gemini = {}  # Store scores for Gemini judge model
#         descriptions_gemini = {}  # Store descriptions for Gemini judge model
        
#         for metric_name in list_of_metrics:  # Iterate through all metrics
#             print("Evaluating based on:", metric_name)
#             metric_value = common_prompt + eval(metric_name)  # Get the actual description of the metric

#             # Define roles and placeholders
#             chat_template = ChatPromptTemplate.from_messages(
#                 [("system", metric_value),
#                  ("user", "Question: {question}, Actual answer: {actual_answer}, Predicted answer: {predicted_answer}"),
#                  # ("ai", "It's sunny and warm outside."), #Use this if we want to use few shot prompts
#                 ]
#             )

#             messages = chat_template.format_messages(question=question, actual_answer=actual_answer, predicted_answer=predicted_answer)
#             # print("Messages:", messages)

#             formatted_messages = [(role, msg.content) for role, msg in zip(["system", "user"], messages)]
#             # print("Formatted messages:", formatted_messages)  # [('system', 'You are an autoregressive lan....', 'user':.....)]

#             # Initialize the Gemini model and get response
#             try:
                
#                 llm_gemini = ChatGoogleGenerativeAI(
#                     model=judge_model_2.split('/')[1],
#                     temperature=0,
#                     max_output_tokens=max_output_tokens,
#                     convert_system_message_to_human=True,
#                     google_api_key=gemini_api_key
#                 )

#                 ai_response_gemini = llm_gemini.invoke(formatted_messages)
                
#                 print(colored("User message:" + messages[1].content, 'green'))
#                 print(colored("Gemini response:" + ai_response_gemini.content, 'blue'))
                
#                 # Decide what the final score is based on Gemini output
#                 if "FINAL SCORE:" in ai_response_gemini.content:
#                     score_gemini = int(ai_response_gemini.content.split("FINAL SCORE:")[1])
#                 else:
#                     print("Invalid response from Gemini:", ai_response_gemini.content)
#                     score_gemini = 0  # For cases where the LLM doesn't return a score
                    
#             except Exception as e:
#                 print(f"Error with Gemini model: {e}")
#                 score_gemini = 0
#                 ai_response_gemini = type('obj', (object,), {'content': f"Error: {e}"})
            
#             scores_gemini[metric_name] = score_gemini
#             descriptions_gemini[metric_name] = ai_response_gemini.content
            
#             print("Gemini Scores:", scores_gemini)
#             print("\n")

#     return {
#         "results": [  # We always need 'key', 'score' pairs
#             {"key": "completeness_gemini", "score": scores_gemini['completeness_descr'], "value": descriptions_gemini['completeness_descr']},
#             {"key": "relevance_gemini", "score": scores_gemini['relevance_descr'], "value": descriptions_gemini['relevance_descr']},
#             {"key": "conciseness_gemini", "score": scores_gemini['conciseness_descr'], "value": descriptions_gemini['conciseness_descr']},
#             {"key": "confidence_gemini", "score": scores_gemini['confidence_descr'], "value": descriptions_gemini['confidence_descr']},
#             {"key": "factuality_gemini", "score": scores_gemini['factuality_descr'], "value": descriptions_gemini['factuality_descr']},
#             {"key": "judgement_gemini", "score": scores_gemini['judgement_descr'], "value": descriptions_gemini['judgement_descr']},
#             {"key": "general_gemini", "score": scores_gemini['general_descr'], "value": descriptions_gemini['general_descr']},
#         ]
#     }

# # Original combined function for backward compatibility
# def factor_evaluator(run: Run, example: Example) -> dict:
#     openai_results = openai_evaluator(run, example)
#     gemini_results = gemini_evaluator(run, example)
    
#     # Combine results from both evaluators
#     combined_results = {
#         "results": openai_results["results"] + gemini_results["results"]
#     }
    
#     return combined_results

# Function that compares the real answer with the predicted answer of an LLM and returns a score based on the evaluation
def factor_evaluator(run: Run, example: Example, judge_model: str) -> dict: 
    # print("Run:",run)

    question = run.inputs.get("inputs")['question']
    # print("Question:",question)
    actual_answer = example.outputs.get("answer")
    # print("Real answer:",example.outputs.get("answer"))
    predicted_answer = run.outputs.get("output")
    # print("Predicted Answer:",answer)

    # Parse judge model string to determine model type and name
    model_parts = judge_model.split('/')
    model_type = model_parts[0].lower()  # e.g., "openai", "gemini"
    # model_name = judge_model  # Keep the full string for reference
    
    # # Set prefix for result keys based on model type
    # prefix = "_" + model_type

    scores = {}  # Store scores for judge model
    descriptions = {}  # Store descriptions for judge model

    # Check if there is output from LLM
    if not predicted_answer:
        print("No output from LLM")
        keys = []
        for metric_name in list_of_metrics:  # o1 sometimes doesn't give an answer
            keys.append(metric_name.split('_descr')[0])
            scores[metric_name] = 0
            descriptions[metric_name] = '-'
        results = {
            "results": [
                {"key": key, "score": scores[key + "_descr"], "value": descriptions[key + "_descr"]} for key in keys
            ]}
        return results

    else:
        scores = {}  # Store scores for judge model
        descriptions = {}  # Store descriptions for judge model
        
        for metric_name in list_of_metrics:  # Iterate through all metrics
            print("Evaluating based on:", metric_name)
            metric_value = common_prompt + eval(metric_name)  # Get the actual description of the metric

            # Define roles and placeholders
            chat_template = ChatPromptTemplate.from_messages(
                [("system", metric_value),
                 ("user", "Question: {question}, Actual answer: {actual_answer}, Predicted answer: {predicted_answer}"),
                 # ("ai", "It's sunny and warm outside."), #Use this if we want to use few shot prompts
                ]
            )

            messages = chat_template.format_messages(question=question, actual_answer=actual_answer, predicted_answer=predicted_answer)
            # print("Messages:", messages)

            formatted_messages = [(role, msg.content) for role, msg in zip(["system", "user"], messages)]
            # print("Formatted messages:", formatted_messages)  # [('system', 'You are an autoregressive lan....', 'user':.....)]

            # Initialize the model and get response based on model type
            try:
                if model_type == "openai":
                    llm = ChatOpenAI(model_name=model_parts[1], api_key=openai_api_key, temperature=0, max_tokens=max_output_tokens, seed=42)
                    ai_response = llm.invoke(formatted_messages)
                    model_color = 'red'
                elif model_type == "gemini":
                    llm = ChatGoogleGenerativeAI(
                        model=model_parts[1],
                        temperature=0,
                        max_output_tokens=max_output_tokens,
                        convert_system_message_to_human=True,
                        google_api_key=gemini_api_key
                    )
                    ai_response = llm.invoke(formatted_messages)
                    model_color = 'blue'
                else:
                    raise ValueError(f"Unsupported model type: {model_type}")
                
                print(colored("User message:" + messages[1].content, 'green'))
                print(colored(f"{model_type} response:" + ai_response.content, model_color))
                
                # Decide what the final score is based on model output
                if "FINAL SCORE:" in ai_response.content:
                    score = int(ai_response.content.split("FINAL SCORE:")[1])
                else:
                    print(f"Invalid response from {model_type}:", ai_response.content)
                    score = 0  # For cases where the LLM doesn't return a score
                    
            except Exception as e:
                print(f"Error with {model_type} model: {e}")
                score = 0
                ai_response = type('obj', (object,), {'content': f"Error: {e}"})
            
            scores[metric_name] = score
            descriptions[metric_name] = ai_response.content
            
            print(f"{model_type} Scores:", scores)
            print("\n")

    # Create result keys with appropriate prefix
    return {
        "results": [  # We always need 'key', 'score' pairs
            {"key": "completeness", "score": scores['completeness_descr'], "value": descriptions['completeness_descr']},
            {"key": "relevance", "score": scores['relevance_descr'], "value": descriptions['relevance_descr']},
            {"key": "conciseness", "score": scores['conciseness_descr'], "value": descriptions['conciseness_descr']},
            {"key": "confidence", "score": scores['confidence_descr'], "value": descriptions['confidence_descr']},
            {"key": "factuality", "score": scores['factuality_descr'], "value": descriptions['factuality_descr']},
            {"key": "judgement", "score": scores['judgement_descr'], "value": descriptions['judgement_descr']},
            {"key": "general", "score": scores['general_descr'], "value": descriptions['general_descr']},
        ]
    }

def multiple_judges_evaluator(my_variable):
    def wrapped_evaluator(run, example):
        return factor_evaluator(run, example, my_variable)
    return wrapped_evaluator

Define Models that Generate Responses

In [31]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.random.manual_seed(0) #Set for reproducibility

def initialize_model(model_id):
    # # Check if mps acceleration is available (For MacOS)
    # device = "mps" if torch.backends.mps.is_available() else "cpu"
    # print(f"Using device {device}")
    # model.to(device)

    # transformers.set_seed(42) #Tried for reproducibility but didn't work
    
    pipeline = transformers.pipeline( 
            "text-generation",
            model=model_id,
            model_kwargs={"torch_dtype": torch.bfloat16, "cache_dir":cache_dir},
            # trust_remote_code=True,
            device_map="auto" #Use 'cuda' if one GPU available (works in Delft Blue with 32GB VRAM) - 'auto' the alternative for distributed over all available GPUs
        )
    return pipeline

def get_model(model_id):
    """Given a model name, return the loaded model, tokenizer, and pipeline"""

    # if 'openai' not in model_id and 'groq_website' not in model_id and 'anthropic' not in model_id and 'together' not in model_id: #For Hugging Face models
    if not any(provider in model_id for provider in commercial_api_providers): #For Hugging Face models
        pipeline=initialize_model(model_id)

    #Returns below variables if defined, and returns None for any that are not.
    model = locals().get('model', None)
    tokenizer = locals().get('tokenizer', None)
    pipeline = locals().get('pipeline', None)

    return model, tokenizer, pipeline

Generate Responses

In [32]:
# import time

# def predict(inputs: dict) -> dict:
#     """Given a question, return the answer from the model"""
    
#     #Get these variables from the global scope
#     global model_name, generate_max_tokens, generation_max_tokens_thinking
    
#     messages = [ #Only use the questions to ask the model to generate the response
#       {"role": "user", "content": inputs['question']},
#     ]

#     #They do not support system message
#     if 'gemma' not in model_name and 'anthropic' not in model_name and 'openrouter' not in model_name and 'gemini' not in model_name and '/o1' not in model_name: 
#       messages.insert(0, {"role": "system", "content": "You are a language model specialized in "+ domain + " engineering. Answer the following question:"}) #chemical
#     elif 'gemini' in model_name: #Google Gemini has this format since it also gets images
#       messages = {"role": "user", "parts": [{"text": "You are a language model specialized in "+ domain + "engineering. Answer the following question: " + messages[0]['content']}]}
#     else: #For gemma add system prompt in user message
#       messages[0]['content']="You are a language model specialized in "+ domain + " engineering. Answer the following question: " + messages[0]['content']
#     # print("Prompt:",messages)

#     #Reasoning model with CoT should have longer max_tokens to include the reasoning steps - For now deepseek and Gemini Flash Thinking
#     if 'deepseek' in model_name or 'thinking' in model_name or '/o1' in model_name or '/o3' in model_name or 'QwQ-32B' in model_name:
#       generate_max_tokens=generation_max_tokens_thinking
#       print("Generation limit increased due to reasoning model:", model_name, "to:",generate_max_tokens)
#     else: #Since it's global it might stay to 16000 otherwise
#       generate_max_tokens=1000

#     generation_args = { 
#         "max_new_tokens": generate_max_tokens,
#         "return_full_text": False, 
#         "temperature": 0.05, #Has to be positive number - not considered from model when do_sample is False (reproducible results)
#         "do_sample": True, #Selects highest probability token if sets to False
#         "num_beams" : 5, #3 can also work if computationally intensive - more info on https://huggingface.co/blog/how-to-generate
#         #Warnings will be raised by some models

#         #If we only set temp!=0 or if we also set do_sample=False then warning: `do_sample` is set to `False`. However, `temperature` is set to `1e-08` 
#         # -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
#         # That means that the temperature is probably ignored
#         # Sometimes, results not reproducible if only temp is set
#         # A temparature of 0.01 or lower results in: "Error running target function: probability tensor contains either `inf`, `nan` or element < 0"
#       } 
    
#     if not any(provider in model_name for provider in commercial_api_providers): #For Hugging Face models
#       response=pipeline(messages, **generation_args)[0]['generated_text']
#       print(model_name,':',response)

#     else: 
#       if 'openai' in model_name:
#         try:
#           import openai
#           from langsmith.wrappers import wrap_openai
                  
#           # Define OpenAI client
#           openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
          
#           if '/o1' not in model_name and '/o3' not in model_name:
#             response = openai_client.chat.completions.create(messages=messages, temperature=0, model=model_name.split('/')[1], max_tokens=generate_max_tokens, seed=42) 
#           else: #For thinking models
#             print("Thinking....")
#             response = openai_client.chat.completions.create(messages=messages, model=model_name.split('/')[1], max_completion_tokens=generate_max_tokens, seed=42) 

#           # print("Full Response from OpenAI:",response)
#           response=response.choices[0].message.content #That's the response without formatting
#           print("Response from OpenAI:",response)
#           time.sleep(5) #To avoid rate limiting

#         except Exception as e:
#           print("Error:",e)
#           print("OpenAI Model ID:",model_name)

#       elif 'groq_website' in model_name:
#         try:
#           from groq import Groq
#           client = Groq(api_key=groq_api_key)
#           actual_model_name=model_name.split('/')[1]
#           response = client.chat.completions.create(
#               model=actual_model_name,
#               max_tokens=generate_max_tokens,
#               temperature=0,
#               messages=messages)
          
#           response=response.choices[0].message.content #That's the response without formatting
#           print("Response from Groq:",response)#.choices[0].message.content)
#           time.sleep(5) #To avoid rate limiting

#         except Exception as e:
#           print("Error:",e)
#           print("Groq Model ID:",model_name)

#       elif 'anthropic' in model_name:
#         try:
#           import anthropic
#           client = anthropic.Anthropic(api_key=anthropic_api_key)
#           response = client.messages.create(
#               model=model_name.split('/')[1],
#               messages=messages,
#               temperature=0,
#               max_tokens=generate_max_tokens,
#           )
#           response=response.content[0].text #That's the response without formatting
#           print("Response from Anthropic:",response)#.content[0].text)
#           time.sleep(5) #To avoid rate limiting

#         except Exception as e:
#           print("Error:",e)
#           print("Anthropic Model ID:",model_name)

#       elif 'together' in model_name:
#         try:
#           from together import Together
#           client = Together(api_key=together_api_key)
#           response = client.chat.completions.create(
#               model="/".join(model_name.split("/")[1:]),
#               messages=messages,
#               temperature=0,
#               max_tokens=generate_max_tokens
#           )
#           # print("Full Response from Together:",response_full)
#           response=response.choices[0].message.content #That's the response without formatting
#           print("Response from Together:",response)
#           time.sleep(5) #To avoid rate limiting
#           if "<think>" in response:
#             time.sleep(180) #To avoid rate limiting need to wait 3 minutes

#         except Exception as e:
#           print("Error:",e)
#           print("Together Model ID:",model_name)
      
#       elif 'openrouter' in model_name: #Very limited quota for free models
#         try:
#           from openai import OpenAI

#           client = OpenAI(
#             base_url="https://openrouter.ai/api/v1",
#             api_key=open_router_api_key,
#           )
#           response = client.chat.completions.create(
#             model="/".join(model_name.split("/")[1:]),
#             messages=messages,
#             temperature=0,
#             max_tokens=generate_max_tokens,
#           )
#           response=response.choices[0].message.content #That's the response without formatting
#           print("Response from OpenRouter:",response)#.choices[0].message.content)
#           time.sleep(5) #To avoid rate limiting

#         except Exception as e:
#           print("Error:",e)
#           print("OpenRouter Model ID:",model_name)

#       elif 'gemini' in model_name:
#         try:
#           if 'thinking' in model_name or 'gemini-2.5-pro' in model_name: #Thinking model has different call
#             from google.generativeai.types import GenerationConfig
#             import google.generativeai as genai
#             # Explicitly define API key
#             genai.configure(api_key=gemini_api_key)
#             model = genai.GenerativeModel(model_name.split('/')[1])
#             response = model.generate_content(
#                 contents=messages,
#                 generation_config=GenerationConfig(
#                     temperature=0,
#                     max_output_tokens=generate_max_tokens,
#                 )
#             )
#             print("Full Response from Gemini ('thinking') model:",response)
#             response=response.text
#             print("Response from Gemini ('thinking') model:",response)
#             time.sleep(5) #To avoid rate limiting
#           else: #for the rest of the models
#             from google import genai
#             from google.genai import types
#             client = genai.Client(api_key=gemini_api_key)
#             response = client.models.generate_content(
#               model=model_name.split('/')[1],
#               contents=messages,
#               config=types.GenerateContentConfig(
#                   temperature=0,
#                   max_output_tokens=generate_max_tokens,
#               )
#             )
#             response=response.text
#             print("Response from Gemini:",response)
#             time.sleep(5) #To avoid rate limiting
#         except Exception as e:
#           print("Error:",e)
#           print("Gemini Model ID:",model_name)


#     return {"output": response}

In [33]:
# def predict(inputs: dict) -> dict:
#     """Given a question, return the answer from the model, optionally using tools if tool_usage is True"""
    
#     #Get these variables from the global scope
#     global model_name, generate_max_tokens, generation_max_tokens_thinking, tool_usage

#     #Reasoning model with CoT should have longer max_tokens to include the reasoning steps - For now deepseek and Gemini Flash Thinking
#     if 'deepseek' in model_name or 'thinking' in model_name or '/o1' in model_name or '/o3' in model_name or 'QwQ-32B' in model_name:
#       generate_max_tokens=generation_max_tokens_thinking
#       print("Generation limit increased due to reasoning model:", model_name, "to:",generate_max_tokens)
#     else: #Since it's global it might stay to 16000 otherwise
#       generate_max_tokens=1000

#     generation_args = { 
#         "max_new_tokens": generate_max_tokens,
#         "return_full_text": False, 
#         "temperature": 0.05, #Has to be positive number - not considered from model when do_sample is False (reproducible results)
#         "do_sample": True, #Selects highest probability token if sets to False
#         "num_beams" : 5, #3 can also work if computationally intensive - more info on https://huggingface.co/blog/how-to-generate
#         #Warnings will be raised by some models

#         #If we only set temp!=0 or if we also set do_sample=False then warning: `do_sample` is set to `False`. However, `temperature` is set to `1e-08` 
#         # -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
#         # That means that the temperature is probably ignored
#         # Sometimes, results not reproducible if only temp is set
#         # A temparature of 0.01 or lower results in: "Error running target function: probability tensor contains either `inf`, `nan` or element < 0"
#     } 
    
    
#     # If tool_usage is enabled, check if we should use a tool for this question
#     if tool_usage:
#         tool_name = decide_tool_usage(inputs['question'])
#         if tool_name:
#             print(f"Using tool: {tool_name}")
            
#             # Start with just the question
#             messages = [
#                 {"role": "user", "content": inputs['question']},
#             ]

#             # Set the appropriate system message based on tool type
#             system_message = "Return only the code from the following message to be directly copy pasted in a py file. Do not return it in quotes, just plain code. \
#                 Correct any typos, errors and undefined variables. Example of errors are KeyErrors of variable not defined or not properly accessed. \
#                 Some other steps that might be needed: Add checks for edge direction using .has_edge(u, v) because some pipes in the loops might be defined in the opposite direction \
#                 in our graph. When a pipe is flowing in the opposite direction of how it's defined in the graph, we need to Use negative flow value (e.g. -G[v][u]['Q']). The message is:" \
#                 if tool_name[0] == 'extract_code' else \
#                 "Return only the text from the following message to be directly copy pasted into a file. Do not return it in quotes, just plain text. The message is:"

#             # Add system message based on model type
#             if 'gemma' not in model_name and 'anthropic' not in model_name and 'openrouter' not in model_name and 'gemini' not in model_name and '/o1' not in model_name:
#                 messages.insert(0, {"role": "system", "content": system_message})
#             elif 'gemini' in model_name:
#                 messages = {"role": "user", "parts": [{"text": system_message + " " + messages[0]['content']}]}
#             else:  # For gemma add system prompt in user message
#                 messages[0]['content'] = system_message + " " + messages[0]['content']

#             # Common API call section
#             try:
#                 if 'openai' in model_name:
#                     try:
#                         import openai
#                         from langsmith.wrappers import wrap_openai
                                
#                         # Define OpenAI client
#                         openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
                        
#                         if '/o1' not in model_name and '/o3' not in model_name:
#                             response = openai_client.chat.completions.create(messages=messages, temperature=0, model=model_name.split('/')[1], max_tokens=generate_max_tokens, seed=42) 
#                         else: #For thinking models
#                             print("Thinking....")
#                             response = openai_client.chat.completions.create(messages=messages, model=model_name.split('/')[1], max_completion_tokens=generate_max_tokens, seed=42) 

#                         # print("Full Response from OpenAI:",response)
#                         response=response.choices[0].message.content #That's the response without formatting
#                         print("Response from OpenAI:",response)
#                         time.sleep(5) #To avoid rate limiting

#                     except Exception as e:
#                         print("Error:",e)
#                         print("OpenAI Model ID:",model_name)

#                 elif 'groq_website' in model_name:
#                     try:
#                         from groq import Groq
#                         client = Groq(api_key=groq_api_key)
#                         actual_model_name=model_name.split('/')[1]
#                         response = client.chat.completions.create(
#                             model=actual_model_name,
#                             max_tokens=generate_max_tokens,
#                             temperature=0,
#                             messages=messages)
                        
#                         response=response.choices[0].message.content #That's the response without formatting
#                         print("Response from Groq:",response)#.choices[0].message.content)
#                         time.sleep(5) #To avoid rate limiting

#                     except Exception as e:
#                         print("Error:",e)
#                         print("Groq Model ID:",model_name)

#                 elif 'anthropic' in model_name:
#                     try:
#                         import anthropic
#                         client = anthropic.Anthropic(api_key=anthropic_api_key)
#                         response = client.messages.create(
#                             model=model_name.split('/')[1],
#                             messages=messages,
#                             temperature=0,
#                             max_tokens=generate_max_tokens,
#                         )
#                         response=response.content[0].text #That's the response without formatting
#                         print("Response from Anthropic:",response)#.content[0].text)
#                         time.sleep(5) #To avoid rate limiting

#                     except Exception as e:
#                         print("Error:",e)
#                         print("Anthropic Model ID:",model_name)

#                 elif 'together' in model_name:
#                     try:
#                         from together import Together
#                         client = Together(api_key=together_api_key)
#                         response = client.chat.completions.create(
#                             model="/".join(model_name.split("/")[1:]),
#                             messages=messages,
#                             temperature=0,
#                             max_tokens=generate_max_tokens
#                         )
#                         # print("Full Response from Together:",response_full)
#                         response=response.choices[0].message.content #That's the response without formatting
#                         print("Response from Together:",response)
#                         time.sleep(5) #To avoid rate limiting
#                         if "<think>" in response:
#                             time.sleep(180) #To avoid rate limiting need to wait 3 minutes

#                     except Exception as e:
#                         print("Error:",e)
#                         print("Together Model ID:",model_name)
                
#                 elif 'openrouter' in model_name: #Very limited quota for free models
#                     try:
#                         from openai import OpenAI

#                         client = OpenAI(
#                             base_url="https://openrouter.ai/api/v1",
#                             api_key=open_router_api_key,
#                         )
#                         response = client.chat.completions.create(
#                             model="/".join(model_name.split("/")[1:]),
#                             messages=messages,
#                             temperature=0,
#                             max_tokens=generate_max_tokens,
#                         )
#                         response=response.choices[0].message.content #That's the response without formatting
#                         print("Response from OpenRouter:",response)#.choices[0].message.content)
#                         time.sleep(5) #To avoid rate limiting

#                     except Exception as e:
#                         print("Error:",e)
#                         print("OpenRouter Model ID:",model_name)

#                 elif 'gemini' in model_name:
#                     try:
#                         if 'thinking' in model_name or 'gemini-2.5-pro' in model_name: #Thinking model has different call
#                             from google.generativeai.types import GenerationConfig
#                             import google.generativeai as genai
#                             # Explicitly define API key
#                             genai.configure(api_key=gemini_api_key)
#                             model = genai.GenerativeModel(model_name.split('/')[1])
#                             response = model.generate_content(
#                                 contents=messages,
#                                 generation_config=GenerationConfig(
#                                     temperature=0,
#                                     max_output_tokens=generate_max_tokens,
#                                 )
#                             )
#                             print("Full Response from Gemini ('thinking') model:",response)
#                             response=response.text
#                             print("Response from Gemini ('thinking') model:",response)
#                             time.sleep(5) #To avoid rate limiting
#                         else: #for the rest of the models
#                             from google import genai
#                             from google.genai import types
#                             client = genai.Client(api_key=gemini_api_key)
#                             response = client.models.generate_content(
#                             model=model_name.split('/')[1],
#                             contents=messages,
#                             config=types.GenerateContentConfig(
#                                 temperature=0,
#                                 max_output_tokens=generate_max_tokens,
#                             )
#                             )
#                             response=response.text
#                             print("Response from Gemini:",response)
#                             time.sleep(5) #To avoid rate limiting
#                     except Exception as e:
#                         print("Error:",e)
#                         print("Gemini Model ID:",model_name)
                    


# #Tool specific
#                 if tool_name[0] == 'extract_code':
#                     code_result = response
#                     print("Type", type(code_result))
#                     print("Resulting code:", code_result)

#                     # Save code_result as a py file
#                     with open("code_result.py", "w") as file:
#                         file.write(code_result)

#                     # Run above created script with generated code
#                     command = f"bash -c '{activate_cmd} && python code_result.py'"
#                     print("Command to run script:", command)

#                     try:
#                         # Execute the command and capture the output
#                         result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
#                         final_answer = result
#                         print("Execution Result:", result)
#                     except subprocess.CalledProcessError as e:
#                         result = f"Error in execution: {e.output}"
#                         print(result)

#                         # Check if there was an error in execution
#                         max_attempts = 3  # Original + 2 retries
#                         attempt = 1
#                         current_code = code_result
#                         current_result = result

#                         while current_result.startswith("Error in execution:") and attempt < max_attempts:
#                             print(f"\nAttempt {attempt} failed, trying correction...")
                            
#                             # Send error and code to LLM for correction
#                             error_prompt = f"""
#                             The following code resulted in an error:
                            
#                             {current_code}
                            
#                             Error message:
#                             {current_result}
                            
#                             Please correct the code to fix this error. Return only the code from the following message to be directly copy pasted in a py file. \
#                             Do not return it in quotes, just plain code.
#                             """
                            
#                             try:
#                                 # Use the same model for error correction
#                                 if not any(provider in model_name for provider in commercial_api_providers):
#                                     response = pipeline([{"role": "user", "content": error_prompt}], **generation_args)[0]['generated_text']
#                                 else:
#                                     if 'openai' in model_name:
#                                         try:
#                                             import openai
#                                             from langsmith.wrappers import wrap_openai
                                                    
#                                             # Define OpenAI client
#                                             openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
                                            
#                                             if '/o1' not in model_name and '/o3' not in model_name:
#                                                 response = openai_client.chat.completions.create(messages=messages, temperature=0, model=model_name.split('/')[1], max_tokens=generate_max_tokens, seed=42) 
#                                             else: #For thinking models
#                                                 print("Thinking....")
#                                                 response = openai_client.chat.completions.create(messages=messages, model=model_name.split('/')[1], max_completion_tokens=generate_max_tokens, seed=42) 

#                                             # print("Full Response from OpenAI:",response)
#                                             response=response.choices[0].message.content #That's the response without formatting
#                                             print("Response from OpenAI:",response)
#                                             time.sleep(5) #To avoid rate limiting

#                                         except Exception as e:
#                                             print("Error:",e)
#                                             print("OpenAI Model ID:",model_name)

#                                     elif 'groq_website' in model_name:
#                                         try:
#                                             from groq import Groq
#                                             client = Groq(api_key=groq_api_key)
#                                             actual_model_name=model_name.split('/')[1]
#                                             response = client.chat.completions.create(
#                                                 model=actual_model_name,
#                                                 max_tokens=generate_max_tokens,
#                                                 temperature=0,
#                                                 messages=messages)
                                            
#                                             response=response.choices[0].message.content #That's the response without formatting
#                                             print("Response from Groq:",response)#.choices[0].message.content)
#                                             time.sleep(5) #To avoid rate limiting

#                                         except Exception as e:
#                                             print("Error:",e)
#                                             print("Groq Model ID:",model_name)

#                                     elif 'anthropic' in model_name:
#                                         try:
#                                             import anthropic
#                                             client = anthropic.Anthropic(api_key=anthropic_api_key)
#                                             response = client.messages.create(
#                                                 model=model_name.split('/')[1],
#                                                 messages=messages,
#                                                 temperature=0,
#                                                 max_tokens=generate_max_tokens,
#                                             )
#                                             response=response.content[0].text #That's the response without formatting
#                                             print("Response from Anthropic:",response)#.content[0].text)
#                                             time.sleep(5) #To avoid rate limiting

#                                         except Exception as e:
#                                             print("Error:",e)
#                                             print("Anthropic Model ID:",model_name)

#                                     elif 'together' in model_name:
#                                         try:
#                                             from together import Together
#                                             client = Together(api_key=together_api_key)
#                                             response = client.chat.completions.create(
#                                                 model="/".join(model_name.split("/")[1:]),
#                                                 messages=messages,
#                                                 temperature=0,
#                                                 max_tokens=generate_max_tokens
#                                             )
#                                             # print("Full Response from Together:",response_full)
#                                             response=response.choices[0].message.content #That's the response without formatting
#                                             print("Response from Together:",response)
#                                             time.sleep(5) #To avoid rate limiting
#                                             if "<think>" in response:
#                                                 time.sleep(180) #To avoid rate limiting need to wait 3 minutes

#                                         except Exception as e:
#                                             print("Error:",e)
#                                             print("Together Model ID:",model_name)
                                    
#                                     elif 'openrouter' in model_name: #Very limited quota for free models
#                                         try:
#                                             from openai import OpenAI

#                                             client = OpenAI(
#                                                 base_url="https://openrouter.ai/api/v1",
#                                                 api_key=open_router_api_key,
#                                             )
#                                             response = client.chat.completions.create(
#                                                 model="/".join(model_name.split("/")[1:]),
#                                                 messages=messages,
#                                                 temperature=0,
#                                                 max_tokens=generate_max_tokens,
#                                             )
#                                             response=response.choices[0].message.content #That's the response without formatting
#                                             print("Response from OpenRouter:",response)#.choices[0].message.content)
#                                             time.sleep(5) #To avoid rate limiting

#                                         except Exception as e:
#                                             print("Error:",e)
#                                             print("OpenRouter Model ID:",model_name)

#                                     elif 'gemini' in model_name:
#                                         try:
#                                             if 'thinking' in model_name or 'gemini-2.5-pro' in model_name: #Thinking model has different call
#                                                 from google.generativeai.types import GenerationConfig
#                                                 import google.generativeai as genai
#                                                 # Explicitly define API key
#                                                 genai.configure(api_key=gemini_api_key)
#                                                 model = genai.GenerativeModel(model_name.split('/')[1])
#                                                 response = model.generate_content(
#                                                     contents=messages,
#                                                     generation_config=GenerationConfig(
#                                                         temperature=0,
#                                                         max_output_tokens=generate_max_tokens,
#                                                     )
#                                                 )
#                                                 print("Full Response from Gemini ('thinking') model:",response)
#                                                 response=response.text
#                                                 print("Response from Gemini ('thinking') model:",response)
#                                                 time.sleep(5) #To avoid rate limiting
#                                             else: #for the rest of the models
#                                                 from google import genai
#                                                 from google.genai import types
#                                                 client = genai.Client(api_key=gemini_api_key)
#                                                 response = client.models.generate_content(
#                                                 model=model_name.split('/')[1],
#                                                 contents=messages,
#                                                 config=types.GenerateContentConfig(
#                                                     temperature=0,
#                                                     max_output_tokens=generate_max_tokens,
#                                                 )
#                                                 )
#                                                 response=response.text
#                                                 print("Response from Gemini:",response)
#                                                 time.sleep(5) #To avoid rate limiting
#                                         except Exception as e:
#                                             print("Error:",e)
#                                             print("Gemini Model ID:",model_name)
                                
#                                 # Get corrected code
#                                 current_code = response
                                
#                                 # Try running corrected code
#                                 with open("code_result.py", "w") as file:
#                                     file.write(current_code)
                                    
#                                 try:
#                                     current_result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
#                                     print(f"Execution with corrected code (attempt {attempt}):", current_result)
#                                     if not current_result.startswith("Error in execution:"):
#                                         code_result = current_code
#                                         final_answer = current_result
#                                         break
#                                 except subprocess.CalledProcessError as e:
#                                     current_result = f"Error in execution: {e.output}"
#                                     print(current_result)
                                
#                             except Exception as e:
#                                 print(f"Error getting correction: {e}")
#                                 break
                                
#                             attempt += 1
                            
#                         if current_result.startswith("Error in execution:"):
#                             print("\nError still persists after maximum correction attempts")

#                     print("Code final output:", code_result)
#                     return {"output": final_answer}
#                 else:  # For other tools
#                     print("Returned response from", model_name, ':', response)
#                     return {"output": response}
                    
#             except Exception as e:
#                 error_msg = f"Error in code extraction: {str(e)}" if tool_name[0] == 'extract_code' else f"Tool {tool_name} execution failed: {str(e)}"
#                 print("Error:", e)
#                 print("Model ID:", model_name)
#                 return {"output": error_msg}


# #Initial predict when no tools are used
#     else:

#         messages = [ #Only use the questions to ask the model to generate the response
#             {"role": "user", "content": inputs['question']},
#         ]

#         #They do not support system message
#         if 'gemma' not in model_name and 'anthropic' not in model_name and 'openrouter' not in model_name and 'gemini' not in model_name and '/o1' not in model_name: 
#             messages.insert(0, {"role": "system", "content": "You are a language model specialized in "+ domain + " engineering. Answer the following question:"}) #chemical
#         elif 'gemini' in model_name: #Google Gemini has this format since it also gets images
#             messages = {"role": "user", "parts": [{"text": "You are a language model specialized in "+ domain + "engineering. Answer the following question: " + messages[0]['content']}]}
#         else: #For gemma add system prompt in user message
#             messages[0]['content']="You are a language model specialized in "+ domain + " engineering. Answer the following question: " + messages[0]['content']
#         # print("Prompt:",messages)

#         if not any(provider in model_name for provider in commercial_api_providers): #For Hugging Face models
#             response=pipeline(messages, **generation_args)[0]['generated_text']
#             print(model_name,':',response)

#         else: 
#             if 'openai' in model_name:
#                 try:
#                     import openai
#                     from langsmith.wrappers import wrap_openai
                  
#                     # Define OpenAI client
#                     openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
                    
#                     if '/o1' not in model_name and '/o3' not in model_name:
#                         response = openai_client.chat.completions.create(messages=messages, temperature=0, model=model_name.split('/')[1], max_tokens=generate_max_tokens, seed=42) 
#                     else: #For thinking models
#                         print("Thinking....")
#                         response = openai_client.chat.completions.create(messages=messages, model=model_name.split('/')[1], max_completion_tokens=generate_max_tokens, seed=42) 

#                     # print("Full Response from OpenAI:",response)
#                     response=response.choices[0].message.content #That's the response without formatting
#                     print("Response from OpenAI:",response)
#                     time.sleep(5) #To avoid rate limiting

#                 except Exception as e:
#                     print("Error:",e)
#                     print("OpenAI Model ID:",model_name)

#             elif 'groq_website' in model_name:
#                 try:
#                     from groq import Groq
#                     client = Groq(api_key=groq_api_key)
#                     actual_model_name=model_name.split('/')[1]
#                     response = client.chat.completions.create(
#                         model=actual_model_name,
#                         max_tokens=generate_max_tokens,
#                         temperature=0,
#                         messages=messages)
                    
#                     response=response.choices[0].message.content #That's the response without formatting
#                     print("Response from Groq:",response)#.choices[0].message.content)
#                     time.sleep(5) #To avoid rate limiting

#                 except Exception as e:
#                     print("Error:",e)
#                     print("Groq Model ID:",model_name)

#             elif 'anthropic' in model_name:
#                 try:
#                     import anthropic
#                     client = anthropic.Anthropic(api_key=anthropic_api_key)
#                     response = client.messages.create(
#                         model=model_name.split('/')[1],
#                         messages=messages,
#                         temperature=0,
#                         max_tokens=generate_max_tokens,
#                     )
#                     response=response.content[0].text #That's the response without formatting
#                     print("Response from Anthropic:",response)#.content[0].text)
#                     time.sleep(5) #To avoid rate limiting

#                 except Exception as e:
#                     print("Error:",e)
#                     print("Anthropic Model ID:",model_name)

#             elif 'together' in model_name:
#                 try:
#                     from together import Together
#                     client = Together(api_key=together_api_key)
#                     response = client.chat.completions.create(
#                         model="/".join(model_name.split("/")[1:]),
#                         messages=messages,
#                         temperature=0,
#                         max_tokens=generate_max_tokens
#                     )
#                     # print("Full Response from Together:",response_full)
#                     response=response.choices[0].message.content #That's the response without formatting
#                     print("Response from Together:",response)
#                     time.sleep(5) #To avoid rate limiting
#                     if "<think>" in response:
#                         time.sleep(180) #To avoid rate limiting need to wait 3 minutes

#                 except Exception as e:
#                     print("Error:",e)
#                     print("Together Model ID:",model_name)
        
#             elif 'openrouter' in model_name: #Very limited quota for free models
#                 try:
#                     from openai import OpenAI

#                     client = OpenAI(
#                         base_url="https://openrouter.ai/api/v1",
#                         api_key=open_router_api_key,
#                     )
#                     response = client.chat.completions.create(
#                         model="/".join(model_name.split("/")[1:]),
#                         messages=messages,
#                         temperature=0,
#                         max_tokens=generate_max_tokens,
#                     )
#                     response=response.choices[0].message.content #That's the response without formatting
#                     print("Response from OpenRouter:",response)#.choices[0].message.content)
#                     time.sleep(5) #To avoid rate limiting

#                 except Exception as e:
#                     print("Error:",e)
#                     print("OpenRouter Model ID:",model_name)

#             elif 'gemini' in model_name:
#                 try:
#                     if 'thinking' in model_name or 'gemini-2.5-pro' in model_name: #Thinking model has different call
#                         from google.generativeai.types import GenerationConfig
#                         import google.generativeai as genai
#                         # Explicitly define API key
#                         genai.configure(api_key=gemini_api_key)
#                         model = genai.GenerativeModel(model_name.split('/')[1])
#                         response = model.generate_content(
#                             contents=messages,
#                             generation_config=GenerationConfig(
#                                 temperature=0,
#                                 max_output_tokens=generate_max_tokens,
#                             )
#                         )
#                         print("Full Response from Gemini ('thinking') model:",response)
#                         response=response.text
#                         print("Response from Gemini ('thinking') model:",response)
#                         time.sleep(5) #To avoid rate limiting
#                     else: #for the rest of the models
#                         from google import genai
#                         from google.genai import types
#                         client = genai.Client(api_key=gemini_api_key)
#                         response = client.models.generate_content(
#                         model=model_name.split('/')[1],
#                         contents=messages,
#                         config=types.GenerateContentConfig(
#                             temperature=0,
#                             max_output_tokens=generate_max_tokens,
#                             )
#                         )
#                         response=response.text
#                         print("Response from Gemini:",response)
#                         time.sleep(5) #To avoid rate limiting
#                 except Exception as e:
#                         print("Error:",e)
#                         print("Gemini Model ID:",model_name)

#         return {"output": response}

In [34]:
def predict(inputs: dict) -> dict:
    """Given a question, return the answer from the model, optionally using tools if tool_usage is True"""
    
    # Get these variables from the global scope
    global model_name, generate_max_tokens, generation_max_tokens_thinking, tool_usage

    # Configure token limits based on model type
    if 'deepseek' in model_name or 'thinking' in model_name or '/o1' in model_name or '/o3' in model_name or 'gemini-2.5-pro' in model_name or 'QwQ-32B' in model_name or 'o4' in model_name:
        generate_max_tokens = generation_max_tokens_thinking
        # if 'DeepSeek-R1-Distill-Llama-70B-free' in model_name: #This has a limit of 8193
        #     generate_max_tokens=6000
        print("Generation limit increased due to reasoning model:", model_name, "to:", generate_max_tokens)
    else:
        generate_max_tokens = 1000

    # Standard generation arguments
    generation_args = { 
        "max_new_tokens": generate_max_tokens,
        "return_full_text": False, 
        "temperature": 0.05,
        "do_sample": True,
        "num_beams": 5,
    }
    
    # API call handlers for different providers
    def call_openai_api(messages):
        try:
            import openai
            from langsmith.wrappers import wrap_openai
            
            openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
            
            if '/o1' not in model_name and '/o3' not in model_name and '/o4' not in model_name:
                response = openai_client.chat.completions.create(
                    messages=messages, 
                    temperature=0, 
                    model=model_name.split('/')[1], 
                    max_tokens=generate_max_tokens, 
                    seed=42
                ) 
            else:  # For thinking models
                print("Thinking....")
                response = openai_client.chat.completions.create(
                    messages=messages, 
                    model=model_name.split('/')[1], 
                    max_completion_tokens=generate_max_tokens, 
                    seed=42
                ) 

            result = response.choices[0].message.content
            print("Response from OpenAI:", result)
            time.sleep(5)  # To avoid rate limiting
            return result
        except Exception as e:
            print("Error:", e)
            print("OpenAI Model ID:", model_name)
            print("Traceback:", traceback.format_exc())
            return f"Error with OpenAI API: {str(e)}"

    def call_groq_api(messages):
        try:
            from groq import Groq
            client = Groq(api_key=groq_api_key)
            actual_model_name = model_name.split('/')[1]
            response = client.chat.completions.create(
                model=actual_model_name,
                max_tokens=generate_max_tokens,
                temperature=0,
                messages=messages
            )
            
            result = response.choices[0].message.content
            print("Response from Groq:", result)
            time.sleep(5)  # To avoid rate limiting
            return result
        except Exception as e:
            print("Error:", e)
            print("Groq Model ID:", model_name)
            print("Traceback:", traceback.format_exc())
            return f"Error with Groq API: {str(e)}"

    def call_anthropic_api(messages):
        try:
            import anthropic
            client = anthropic.Anthropic(api_key=anthropic_api_key)
            response = client.messages.create(
                model=model_name.split('/')[1],
                messages=messages,
                temperature=0,
                max_tokens=generate_max_tokens,
            )
            result = response.content[0].text
            print("Response from Anthropic:", result)
            time.sleep(5)  # To avoid rate limiting
            return result
        except Exception as e:
            print("Error:", e)
            print("Anthropic Model ID:", model_name)
            print("Traceback:", traceback.format_exc())
            return f"Error with Anthropic API: {str(e)}"

    def call_together_api(messages):
        try:
            from together import Together
            client = Together(api_key=together_api_key)
            response = client.chat.completions.create(
                model="/".join(model_name.split("/")[1:]),
                messages=messages,
                temperature=0,
                max_tokens=generate_max_tokens
            )
            result = response.choices[0].message.content
            print("Response from Together:", result)
            time.sleep(5)  # To avoid rate limiting
            if "<think>" in result:
                time.sleep(180)  # To avoid rate limiting need to wait 3 minutes
            return result
        except Exception as e:
            print("Error:", e)
            print("Together Model ID:", model_name)
            print("Traceback:", traceback.format_exc())
            return f"Error with Together API: {str(e)}"

    def call_openrouter_api(messages):
        try:
            from openai import OpenAI
            client = OpenAI(
                base_url="https://openrouter.ai/api/v1",
                api_key=open_router_api_key,
            )
            response = client.chat.completions.create(
                model="/".join(model_name.split("/")[1:]),
                messages=messages,
                temperature=0,
                max_tokens=generate_max_tokens,
            )
            result = response.choices[0].message.content
            print("Response from OpenRouter:", result)
            time.sleep(5)  # To avoid rate limiting
            return result
        except Exception as e:
            print("Error:", e)
            print("OpenRouter Model ID:", model_name)
            print("Traceback:", traceback.format_exc())
            return f"Error with OpenRouter API: {str(e)}"

    def call_gemini_api(messages):
        try:
            if 'thinking' in model_name or 'gemini-2.5-pro' in model_name:  # Thinking model has different call
                from google.generativeai.types import GenerationConfig
                import google.generativeai as genai
                genai.configure(api_key=gemini_api_key)
                model = genai.GenerativeModel(model_name.split('/')[1])
                response = model.generate_content(
                    contents=messages,
                    generation_config=GenerationConfig(
                        temperature=0,
                        max_output_tokens=generate_max_tokens,
                    )
                )
                # print("Full response from Gemini ('thinking') model:", response)
                result = response.text
                print("Response from Gemini ('thinking') model:", result)
                time.sleep(5)  # To avoid rate limiting
                return result
            else:  # for the rest of the models
                from google import genai
                from google.genai import types
                client = genai.Client(api_key=gemini_api_key)
                response = client.models.generate_content(
                    model=model_name.split('/')[1],
                    contents=messages,
                    config=types.GenerateContentConfig(
                        temperature=0,
                        max_output_tokens=generate_max_tokens,
                    )
                )
                print("Full response from Gemini model:", response)
                result = response.text
                print("Response from Gemini:", result)
                time.sleep(5)  # To avoid rate limiting
                return result
        except Exception as e:
            print("Error:", e)
            print("Gemini Model ID:", model_name)
            print("Traceback:", traceback.format_exc())
            return f"Error with Gemini API: {str(e)}"

    def call_huggingface_api(messages):
        response = pipeline(messages, **generation_args)[0]['generated_text']
        print(model_name, ':', response)
        return response

    # Main API caller function
    def get_model_response(messages):
        # Use the original commercial_api_providers list
        if not any(provider in model_name for provider in commercial_api_providers):
            print("Using HuggingFace model...")
            return call_huggingface_api(messages)
        
        if 'openai' in model_name:
            print("Using OpenAI model...")
            return call_openai_api(messages)
        elif 'groq_website' in model_name:
            print("Using Groq model...")
            return call_groq_api(messages)
        elif 'anthropic' in model_name:
            print("Using Anthropic model...")
            return call_anthropic_api(messages)
        elif 'together' in model_name:
            print("Using Together AI model...")
            return call_together_api(messages)
        elif 'openrouter' in model_name:
            print("Using OpenRouter model...")
            return call_openrouter_api(messages)
        elif 'gemini' in model_name:
            print("Using Gemini model...")
            return call_gemini_api(messages)
        else:
            print("Error: Not known model provider")
            return "Unknown model provider"

    # Function to handle code extraction and execution
    def handle_code_extraction(code_result):
        print("Type", type(code_result))
        print("Resulting code:", code_result)

        # Save code_result as a py file
        with open("code_result.py", "w") as file:
            file.write(code_result)

        # Run above created script with generated code
        venv_name = "test_LLM"
        activate_cmd = f"source /workspace/{venv_name}/bin/activate"
        command = f"bash -c '{activate_cmd} && python code_result.py'"
        print("Command to run script:", command)

        try:
            # Execute the command and capture the output
            result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
            final_answer = result
            print("Execution Result:", result)
            return final_answer, code_result
        except subprocess.CalledProcessError as e:
            result = f"Error in execution: {e.output}"
            print("Output error was:",result)

            # Check if there was an error in execution
            max_attempts = 3  # Original + 2 retries
            attempt = 1
            current_code = code_result
            current_result = result

            while current_result.startswith("Error in execution:") and attempt < max_attempts:
                print(f"\nAttempt {attempt} failed, trying correction...")
                
                # Send error and code to LLM for correction
                error_prompt = f"""
                The following code resulted in an error:
                
                {current_code}
                
                Error message:
                {current_result}
                
                Please correct the code to fix this error. Return only the code from the following message to be directly copy pasted in a py file. \
                Do not return it in quotes, just plain code.
                """
                
                # Create error correction messages
                error_messages = [{"role": "user", "content": error_prompt}]
                # print("Error prompt:", error_prompt, "\n")
                
                try:
                    # Get corrected code
                    # current_code = get_model_response(error_messages)

                    import openai
                    from langsmith.wrappers import wrap_openai
                    
                    openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
                    response = openai_client.chat.completions.create(
                        messages=error_messages, 
                        temperature=0, 
                        model=judge_model.split('/')[1], 
                        max_tokens=generate_max_tokens, 
                        seed=42
                    ) 
                    current_code=response.choices[0].message.content
                    
                    # Try running corrected code
                    with open("code_result.py", "w") as file:
                        file.write(current_code)
                        
                    try:
                        current_result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
                        print(f"Execution with corrected code (attempt {attempt}):", current_result)
                        if not current_result.startswith("Error in execution:"):
                            code_result = current_code
                            final_answer = current_result
                            break
                    except subprocess.CalledProcessError as e:
                        current_result = f"Error in execution: {e.output}"
                        print(current_result)
                    print("\n")
                    
                except Exception as e:
                    print(f"Error getting correction: {e}")
                    break
                    
                attempt += 1
                
            if current_result.startswith("Error in execution:"):
                print("\nError still persists after maximum correction attempts")

            # # If we have a successful execution, set final_answer to current_result
            # if 'final_answer' not in locals() and not current_result.startswith("Error in execution:"):
            #     final_answer = current_result
                
            # return final_answer, code_result
            return locals().get('final_answer', '-'), locals().get('code_result', '-')
        
    def text_for_simulation(response):
        """
        Handle simulation execution with the provided INP file content.
        Attempts to run the simulation and corrects errors if needed.
        
        Args:
            response (str): The INP file content to be executed
            
        Returns:
            tuple: (final_answer, inp_content) where final_answer is the execution result
                  and inp_content is the final INP file content
        """
        print("Running simulation with provided INP file content...")

        def extract_inp_section(inp_text):
            lines = inp_text.strip().splitlines()
            extracting = False
            extracted_lines = []

            for line in lines:
                if '[TITLE]' in line:
                    extracting = True
                if extracting:
                    extracted_lines.append(line)
                if '[END]' in line:
                    break  # Stop after reaching [END]

            return '\n'.join(extracted_lines)
        
        # Initialize variables
        max_attempts = 3
        attempt = 1
        final_answer = None
        inp_content = extract_inp_section(response)
        current_result = "Error in execution: Initial attempt"
        
        # Write the initial INP file
        with open("simulation.inp", "w") as file:
            file.write(inp_content)

        # Load the an example inp file content for comparison
        try:
            with open("network_test.inp", "r") as benchmark_file:
                example_inp = benchmark_file.read()
                print("Successfully loaded benchmark.inp file")
        except FileNotFoundError:
            print("Warning: benchmark.inp file not found")
            example_inp = ""
        except Exception as e:
            print(f"Error loading benchmark.inp file: {e}")
            example_inp = ""


        venv_name = "test_LLM"
        venv_path = f"/workspace/{venv_name}/bin/activate"
        script_path = "compare_networks.py"

        bash_command = f"source {venv_path} && MPLBACKEND=Agg python {script_path}"

        # try:
        #     result = subprocess.run(['bash', '-c', bash_command], capture_output=True, text=True, check=True)
        #     print("Script output:\n", result.stdout)
        # except subprocess.CalledProcessError as e:
        #     print("Error running script:\n", e.stdout, "\n", e.stderr)


        #  # Activate env
        # command = f"bash -c '{activate_cmd}'"
        # print("Command to activate env:", command)

        # try:
        #     # Execute the command and capture the output
        #     install_result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
        #     print("Execution Result of env installation:", install_result)
        # except subprocess.CalledProcessError as e:
        #     result = f"Error in execution: {e.output}"
        #     print(result)

        
        # # Command to run the simulation
        # command = "python -c \"import wntr; import numpy as np; from run import run; from io import StringIO; import sys; captured_output = StringIO(); sys.stdout = captured_output; run('network_0.inp', 'simulation.inp'); sys.stdout = sys.__stdout__; print(captured_output.getvalue())\""

        while attempt <= max_attempts:
            print(f"\nAttempt {attempt} to run simulation:")
            
            try:
                # Run the simulation
                # current_result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
                # print(f"Execution result (attempt {attempt}):", current_result)
                current_result=subprocess.run(['bash', '-c', bash_command], capture_output=True, text=True, check=True)
                print("Script output:\n", current_result.stdout)
                print("Script error:\n", current_result.stderr)
                print("Continue....")
                
                # Check if the simulation was successful
                if "All unit tests passed" in current_result.stdout:
                    final_answer = current_result.stdout
                    break
                else:
                    print("Trying to fix error....")
                    # If there's an error, prepare error correction prompt
                    error_prompt = f"""I tried to run a simulation with the following INP file content, but it failed. The content of it was:

                    {inp_content}

                    The output was:
                    {current_result.stdout}

                    An example of the format of a random INP file is: {example_inp}

                    Please provide a corrected version of the content of the ORIGINAL INP file that will pass all unit tests. Make sure all numbers and values are aligned, as in the example.
                    Return only the corrected INP file content without any explanations or quotes. 
                    Keep the same sections and format as in the example INP file. Do not add any additional sections and do not change their order. 
                    Make sure everything has the same alignment as in the example file.
                    
                    DO NOT use quotes,  or things like e.g. ```plaintext! All the columns should be aligned.
                    """ #semicolons (;) Replace semicolons with empty characters ('').
                    # An example of how an INP file should look like is: {example_inp}

                    # The error message was:
                    # {current_result.stderr}

                    # These sections are:
                    # Required or Common Sections
                    #     •	[TITLE]
                    #     •	[JUNCTIONS]
                    #     •	[RESERVOIRS]
                    #     •	[PIPES]
                    #     •	[COORDINATES]
                    #     •	[OPTIONS]
                    #     •	[END]

                    # Optional but Common Sections
                    #     •	[TANKS]
                    #     •	[PUMPS]
                    #     •	[VALVES]
                    #     •	[DEMANDS]
                    #     •	[PATTERNS]
                    #     •	[CURVES]
                    #     •	[CONTROLS]
                    #     •	[RULES]
                    #     •	[QUALITY]
                    #     •	[ENERGY]
                    #     •	[TIMES]
                    #     •	[EMITTERS]
                    #     •	[REPORT] 

                    # print("Checking error prompt:", error_prompt)
                    # print("______________________________________________________________________________________________________________")
                    # Create error correction messages
                    error_messages = [{"role": "user", "content": error_prompt}]
                    
                    try:
                        # Get corrected INP file content
                        import openai
                        from langsmith.wrappers import wrap_openai
                        
                        openai_client = wrap_openai(openai.Client(api_key=openai_api_key))
                        response = openai_client.chat.completions.create(
                            messages=error_messages, 
                            temperature=0, 
                            model=judge_model.split('/')[1], 
                            max_tokens=generate_max_tokens, 
                            seed=42
                        ) 
                        inp_content = extract_inp_section(response.choices[0].message.content)
                        # print("Content before extraction:", response.choices[0].message.content)
                        print("Extracted inp content:", inp_content)
                        
                        # Try running with corrected INP file
                        with open("simulation.inp", "w") as file:
                            file.write(inp_content)
                            
                        try:
                            current_result=subprocess.run(['bash', '-c', bash_command], capture_output=True, text=True, check=True)
                            # current_result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT, text=True)
                            print(f"Execution with corrected INP file (attempt {attempt}):", current_result.stdout)
                            print("Error of the current execution was:", current_result.stderr)
                            if "All unit tests passed" in current_result.stdout:
                                final_answer = current_result.stdout
                                break
                        except subprocess.CalledProcessError as e:
                            # current_result = f"Error in execution: {e.output}"
                            # print(current_result)
                            print("Error running script:\n", e.stdout, "\n", e.stderr)
                        
                    except Exception as e:
                        print(f"Error getting correction: {e}")
                        break
                        
            except subprocess.CalledProcessError as e:
                # current_result = f"Error in execution: {e.output}"
                # print(current_result)
                print("Major Error running script:\n", e.stdout, "\n", e.stderr)
            
            attempt += 1
            
        if not final_answer or "All unit tests passed" not in current_result.stdout:
            print("\nError still persists after maximum correction attempts")

        # # If final_answer is None, set it to current_result.stdout
        # if final_answer is None and 'current_result' in locals() and hasattr(current_result, 'stdout'):
        #     final_answer = current_result.stdout
        #     print("Setting final answer to current result stdout")
            
        return locals().get('final_answer', '-'), locals().get('inp_content', '-')


    # MAIN LOGIC STARTS HERE
    
    # If tool_usage is enabled, check if we should use a tool for this question
    if tool_usage:
        tool_name = decide_tool_usage(inputs['question'])
        if tool_name[0]!='no_tool_needed':
            print(f"Using tool: {tool_name}")
            
            # Start with just the question
            messages = [
                {"role": "user", "content": inputs['question']},
            ]

            # Set the appropriate system message based on tool type
            if tool_name[0] == 'extract_code':
                system_message = "Return only the code from the following message to be directly copy pasted in a py file. Do not return it in quotes, just plain code. \
                    Correct any typos, errors and undefined variables. Example of errors are KeyErrors of variable not defined or not properly accessed. \
                    Some other steps that might be needed: Add checks for edge direction using .has_edge(u, v) because some pipes in the loops might be defined in the opposite direction \
                    in our graph. When a pipe is flowing in the opposite direction of how it's defined in the graph, we need to Use negative flow value (e.g. -G[v][u]['Q']). The message is:"
            elif tool_name[0] == 'run_simulation':
                system_message = "Return only the text corresponding to the content of the INP file to run a simulation. Do not return it in quotes, just plain text.  \
                     Make sure all the columns are aligned. The message is:" #Do not use semicolons or other strange characters. Replace them with empty character ('').
            else:
                system_message = "Return only the text from the following message to be directly copy pasted into a file. Do not return it in quotes, just plain text. The message is:"

            # Add system message based on model type
            if 'gemma' not in model_name and 'anthropic' not in model_name and 'openrouter' not in model_name and 'gemini' not in model_name and '/o1' not in model_name:
                messages.insert(0, {"role": "system", "content": system_message})
            elif 'gemini' in model_name:
                messages = {"role": "user", "parts": [{"text": system_message + " " + messages[0]['content']}]}
            else:  # For gemma add system prompt in user message
                messages[0]['content'] = system_message + " " + messages[0]['content']
            
            # Get response from API
            response = get_model_response(messages)

            # Process based on tool type
            if tool_name[0] == 'extract_code':
                final_answer, output_code = handle_code_extraction(response)
                print("Detailed output:", response)
                print("Code output:", output_code)
                with open(f"code_output_log_{model_name.split('/')[1]}.txt", "a") as log_file:
                    log_file.write(f"Code output:\n{output_code}\n\n")
                print("Final answer:", final_answer)
                print("\n")
                if os.path.exists("code_result.py"):
                    os.remove("code_result.py")
                return {"output": final_answer} #we return '-' if didn't work, even if correct code
            elif tool_name[0] == 'run_simulation':
                final_answer, inp_content = text_for_simulation(response)
                print("Printing responses....")
                print("Detailed output for simulation:", response)
                print("Code output for simulation:", inp_content)
                print("Final answer for simulation:", final_answer)
                with open(f"inp_final_answer_output_log_{model_name.split('/')[1]}.txt", "a") as log_file:
                    log_file.write(f"Final answer output:\n{final_answer}\n\n")
                print("\n")
                return {"output": inp_content} #inp_content if we want the text to be fed in the simulation software to be evaluated instead
            else:  # For other tools
                print("Returned response from", model_name, ':', response)
                return {"output": response}
       
        else:
            print("No tool will be used")
             # Default case when not using tools - use the original message format
            messages = [
                {"role": "user", "content": inputs['question']},
            ]

            # Add system message based on model type - same as original
            if 'gemma' not in model_name and 'anthropic' not in model_name and 'openrouter' not in model_name and 'gemini' not in model_name and '/o1' not in model_name:
                messages.insert(0, {"role": "system", "content": "You are a language model specialized in "+ domain + " engineering. Answer the following question:"})
            elif 'gemini' in model_name:
                messages = {"role": "user", "parts": [{"text": "You are a language model specialized in "+ domain + " engineering. Answer the following question: " + messages[0]['content']}]}
            else:  # For gemma add system prompt in user message
                messages[0]['content'] = "You are a language model specialized in "+ domain + " engineering. Answer the following question: " + messages[0]['content']
            
            response = get_model_response(messages)
            
            return {"output": response}
    
    else:
        # Default case when not using tools - use the original message format
        messages = [
            {"role": "user", "content": inputs['question']},
        ]

        # Add system message based on model type - same as original
        if 'gemma' not in model_name and 'anthropic' not in model_name and 'openrouter' not in model_name and 'gemini' not in model_name and '/o1' not in model_name:
            messages.insert(0, {"role": "system", "content": "You are a language model specialized in "+ domain + " engineering. Answer the following question:"})
        elif 'gemini' in model_name:
            messages = {"role": "user", "parts": [{"text": "You are a language model specialized in "+ domain + " engineering. Answer the following question: " + messages[0]['content']}]}
        else:  # For gemma add system prompt in user message
            messages[0]['content'] = "You are a language model specialized in "+ domain + " engineering. Answer the following question: " + messages[0]['content']
        
        response = get_model_response(messages)
        
        return {"output": response}

In [35]:
# import wntr

# # Load INP file
# wn = wntr.network.WaterNetworkModel('network_0.inp')

# # Step 1: Inspect network components
# print(f"Number of junctions: {len(wn.junctions())}")
# print(f"Number of pipes: {len(wn.pipes())}")
# print(f"Number of reservoirs: {len(wn.reservoirs())}")
# print(f"Junctions: {list(wn.junctions().keys())}")
# print(f"Pipes: {list(wn.pipes().keys())}")

# # Step 2: Check and assign minimal elevations (for junctions with elevation <= 0)
# for name, j in wn.junctions():
#     if j.elevation <= 0:
#         print(f"Assigning minimal elevation to junction {name}")
#         j.elevation = 0.01

# # Step 3: Check demands and set them to 0 temporarily for J5 and J6
# wn.get_node('J5').base_demand = 0  # Temporarily zero demand for J5
# wn.get_node('J6').base_demand = 0  # Temporarily zero demand for J6

# # Step 4: Set headloss formula to 'H-W' for simplicity
# wn.options.hydraulic.headloss = 'H-W'

# # Step 5: Set roughness coefficient to 10 for all pipes
# for name, p in wn.pipes():
#     p.roughness = 10.0

# # Step 6: Steady-state simulation only
# wn.options.time.duration = 0  # Steady-state only
# wn.options.time.hydraulic_timestep = 3600

# # Step 7: Run EPANET simulator
# try:
#     sim = wntr.sim.EpanetSimulator(wn)
#     results = sim.run_sim()

#     # Step 8: Print results
#     print("Node Pressures (first 5 values):")
#     print(results.node['pressure'].head())

#     print("Link Flows (first 5 values):")
#     print(results.link['flowrate'].head())
# except Exception as e:
#     print(f"Simulation failed: {e}")

In [36]:
# def predict_tools(inputs: dict, tool_usage=tool_usage) -> dict:
#     """Given a question, return the answer from the model, optionally using tools if tool_usage is True"""
    
#     # If tool_usage is enabled, check if we should use a tool for this question
#     if tool_usage:
#         tool_name = decide_tool_usage(inputs['question'])
#         if tool_name:
#             print(f"Using tool: {tool_name}")
#             code, answer = execute_tool(tool_name[0], inputs['question'])
#             print("Code to obtain answer:", code)
#             print("Response based on code execution:", answer)
#             return {"output": answer}
#     else: 
#         # If no tool is needed, use the regular predict function
#         return predict(inputs)

In [37]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import numpy as np
from collections import Counter

def calculate_statistics(values):
    """Calculate mean, standard error, and confidence intervals."""
    mean_value = np.mean(values)  # Mean of the metric over single run and over single metric (but over all questions)
    std_error = np.std(values, ddof=1) / np.sqrt(len(values))  # ddof=1 to divide by n-1 to calculate the sample sd
    
    assert np.std(values, ddof=1) == np.sqrt(np.sum((values-mean_value)**2)/(len(values)-1)), "Standard deviation calculation mismatch"
    
    margin_of_error = 1.96 * std_error  # didn't use t_critical=t.ppf(0.975, df=len(values)-1) since we're using sample standard deviation

    return {
        'mean': mean_value,
        'std_error': std_error,
        'ci_low': mean_value - margin_of_error,
        'ci_high': mean_value + margin_of_error
    }

def plot_metric_distributions(metric_values, axes, colors, bin_edges, metric_names):
    """Plot individual metric distributions with error bars."""
    error_bars = []
    run_stats = {}
    
    for metric_idx, (metric_name, values) in enumerate(metric_values.items()):  # Loop over runs' metric names and values
        clean_metric_name = metric_name.replace('_descr', '')  # This is over one run and over one metric (but over all questions)
        metric_name = metric_names[metric_idx]
        assert clean_metric_name == metric_name, "Metric name mismatch"
        
        stats = calculate_statistics(values)
        sns.histplot(values, bins=bin_edges, color=colors[metric_idx], ax=axes[metric_idx], kde=False)
        
        #Store error bars
        if metric_idx == 0:
            error_bars = []
        error_bars.append((stats['mean'], axes[metric_idx].get_ylim()[1]/2, stats['ci_high'] - stats['mean']))
        
        run_stats[metric_name] = stats

        axes[metric_idx].set_title(f'{metric_name} (Mean: {stats["mean"]:.2f} ± {stats["std_error"]:.2f} SE, CI: {stats["ci_low"]:.2f}-{stats["ci_high"]:.2f})')
        axes[metric_idx].set_xlim(0, 5.5)  # Keep 0 in case of errors
        axes[metric_idx].set_ylabel('Frequency')
        axes[metric_idx].set_xlabel('Values' if metric_idx == len(metric_values)-1 else '')
        
    return error_bars, run_stats

def plot_question_scores(metric_names, grouped_values, colors):
    """Plot scores for each question across metrics."""
    
    plt.figure(figsize=(10, 6))

    # Define colors for each metric
    colors = plt.cm.Set3(np.linspace(0, 1, len(metric_names)))

    # First count all frequencies per score (1-5) per metric for one run over all questions
    question_scores_by_metric = {metric: [] for metric in metric_names}
    score_metric_counts = {}

    #Plot each metric's values and store question scores
    for i, (metric, question_scores) in enumerate(zip(metric_names, grouped_values)):
        width = 0.8 / len(question_scores)  # Width of each metric's bar
        
        for j, val in enumerate(question_scores): #Create a bar for each question's score
            plt.bar(i + j * width, val, width=width, color=colors[i], alpha=0.5, 
                    label=metric if j == 0 else "")
                    # i is the index of metric and determines the base position of a group of bars corresponding to that metric.
                    # j*width adds an offset to the base position to separate individual bars within the same group (metric). 
                    # Each j corresponds to a different value in question_scores, creating distinct bars for the values of question_scores for the same metric.
                    # By combining the above two, we get the exact x-position of a specific bar     
            question_scores_by_metric[metric].append((j, val))

        counts = Counter(question_scores)  # Count frequency of each score in question_scores (e.g. {4: 1, 3: 2, 2: 2, 1: 1, 0: 1}, where key is score)
        for score, freq in counts.items():
            if score not in score_metric_counts:
                score_metric_counts[score] = {}
            score_metric_counts[score][metric] = freq  #Keeps track of how many times each metric gets a specific score over all questions (for one run)
            # {4: {'completeness': 1, 'confidence': 1, 'factuality': 1, 'judgement': 1}, 3: {'completeness': 1, 'relevance': 2, 'conciseness': 2, ....}

    return question_scores_by_metric, score_metric_counts

def plot_ordered_scores(metric_names, question_scores_by_metric, colors):
    """Plot metrics ordered by score values."""
    plt.figure(figsize=(15, 10))
    
    for i, metric in enumerate(metric_names):
        plt.subplot(len(metric_names), 1, i+1)
        sorted_questions = sorted(question_scores_by_metric[metric], key=lambda x: x[1]) #Sort questions by score
        
        #Plot bars
        x_pos = range(len(sorted_questions))
        scores = [q[1] for q in sorted_questions] #q[1] is the score, q[0] is the index
        plt.bar(x_pos, scores, color=colors[i], alpha=0.5)

        #Add question indices as x-axis labels
        plt.xticks(x_pos, [str(q[0]) for q in sorted_questions])
        
        plt.ylabel(metric)
        plt.ylim(0, 5.5)
        plt.yticks(range(6))  # Set y-axis ticks from 0 to 5

        if i == len(metric_names)-1:
            plt.xlabel('Question number (ordered by score)')

def plot_accumulated_distributions(score_metric_counts, metric_names, colors):
    """Plot accumulated distribution of scores by metric."""
    legend_added = set()

    #For each score, plot metrics in order of frequency (highest frequency at bottom)
    for score in sorted(score_metric_counts.keys()):
        #Sort metrics by frequency for this score
        sorted_metrics = sorted(score_metric_counts[score].items(),
                            key=lambda x: x[1], #Use the frequency (second element of each tuple) as the sorting key
                            reverse=True)  # highest frequency first
        bottom = 0
        for metric, freq in sorted_metrics:
            i = metric_names.index(metric) #get index for color
            plt.bar(score, freq,
                    width=0.4,
                    color=colors[i],
                    alpha=0.5,
                    label=metric if metric not in legend_added else "",
                    bottom=bottom)
            bottom += freq
            legend_added.add(metric)

           
def plot_figures_metrics(all_runs_model_metrics, metric_names, model_name, judge_model):
    """
    Creates visualizations and calculates statistics for evaluation metrics across multiple runs.

    Args:
        all_runs_model_metrics (dict): Nested dictionary containing evaluation metrics for each model and run.
            Structure: {model_id: [{metric1_descr_run1: [q1_score, q2_score, ...], 
                                  metric2_descr_run1: [q1_score, q2_score, ...], ...}, 
                                 {metric1_descr_run2: [q1_score, q2_score, ...],
                                  metric2_descr_run2: [q1_score, q2_score, ...], ...},
                                 ...num_runs]}
            Example: {'model1': [{'completeness_descr_run1': [4.5, 3.0, 4.0], 
                                'relevance_descr_run1': [3.5, 4.0, 3.0]}, ...,
                               {'completeness_descr_run2': [4.0, 3.5, 4.5],
                                'relevance_descr_run2': [3.0, 4.5, 3.5], ...},
                               ...num_runs]}
            Where each inner dictionary represents one run containing scores for each metric across all questions
        metric_names (list): Names of metrics to analyze and plot (e.g. ['completeness', 'relevance'])
        model_name (str): Name/identifier of the model being evaluated
        judge_model (str): Name/identifier of the model used for judging the evaluations

    Returns:
        dict: Summary statistics for each model, run and metric.
            Structure: {model_name: {run_idx: {metric_name: {
                'mean': float,
                'std_error': float, 
                'ci_low': float,
                'ci_high': float
            }}}}
            Example: {'anthropic/claude-3-5-sonnet': {
                '0': {'completeness': {'mean': 4.5, 'std_error': 0.5, 
                                     'ci_low': 3.52, 'ci_high': 5.48},
                      'relevance': {'mean': 3.5, 'std_error': 0.5,
                                  'ci_low': 2.52, 'ci_high': 4.48} , ...},
                '1': {'completeness': {'mean': 4.5, 'std_error': 0.5,
                                     'ci_low': 3.52, 'ci_high': 5.48},
                      'relevance': {'mean': 3.5, 'std_error': 0.5,
                                  'ci_low': 2.52, 'ci_high': 4.48}, ...},
                ...num_runs}}

    The function generates several visualization types:
    - Individual histograms for each metric showing score distributions
    - Error bars indicating means and confidence intervals
    - Overlapping bar plots comparing metrics
    - Stacked distribution plots showing relative frequencies of scores

    All plots are saved as PNG files with names indicating the judge model,
    evaluated model, run index, and plot type.
    """

    summary_stats_all_runs = {}  # Keep track of summary statistics over all runs

    for run_idx, metric_values_run in enumerate(all_runs_model_metrics[model_name]): #Loop over runs

        colors = sns.color_palette("Set3", len(metric_names))
        
        # Create two figures - one with separate subplots and one overlaid
        fig, axes = plt.subplots(len(metric_names), 1, figsize=(10, 18))
        plt.subplots_adjust(hspace=0.6, top=0.94)
        fig.suptitle(f'Metric Distributions for {model_name} (Run {run_idx})', fontsize=16)
        
        bin_edges = np.arange(0.0, 5.6, 0.2)  # Bins for range 0-5
        metric_names = [name.replace('_descr', '') for name in metric_values_run]
        
        error_bars, run_stats = plot_metric_distributions(metric_values_run, axes, colors, bin_edges, metric_names)
        
        # Save version without error bars
        plt.figure(fig.number)
        plt.savefig(f"{judge_model.split('/')[1]}_judge_with_{model_name.replace('/', '_')}_run_{run_idx}_metric_distributions_no_error_bars.png")
        
        # Add error bars and save updated version
        for i, (mean, ylim, margin) in enumerate(error_bars):
            axes[i].errorbar(mean, ylim, xerr=margin, color='black', capsize=5, 
                           capthick=1, elinewidth=2, marker='o')
        
        plt.savefig(f"{judge_model.split('/')[1]}_judge_with_{model_name.replace('/', '_')}_run_{run_idx}_metric_distributions.png")
        plt.close('all')

        # Print summary statistics - Can also be seen in txt file. 
        print(f"\nSummary Statistics over run {run_idx}:")
        print("-" * 50)
        for metric, stats in run_stats.items():
            print(f"{metric}:")
            for key, value in stats.items():
                print(f"  {key}: {value:.2f}")
            print("-" * 50)

        summary_stats_all_runs[run_idx] = run_stats #For one run

        grouped_values=list(metric_values_run.values()) #Values of all metrics for one run over all questions. There are num_metrics lists in that list. 
        values = [val for sublist in grouped_values for val in sublist] #Flatten the list - Size is num_questions*num_metrics (1st metric questions, 2nd metric questions, etc)
        
        question_scores_by_metric, score_metric_counts = plot_question_scores(metric_names, grouped_values, colors)
        plt.xlabel('Metrics')
        plt.ylabel('Score')
        plt.title('Per-Metric Question Scores Distribution')
        plt.xticks(np.arange(len(metric_names)) + 0.1, metric_names)
        plt.yticks(range(6)) #Set y-ticks to 0-5
        plt.savefig(f"{judge_model.split('/')[1]}_judge_with_{model_name.replace('/', '_')}_run_{run_idx}_per_metric_question_scores.png")
        plt.close('all')

        # Plot ordered scores
        plot_ordered_scores(metric_names, question_scores_by_metric, colors)
        plt.suptitle('Question indices ordered by metric value')
        plt.tight_layout()
        plt.savefig(f"{judge_model.split('/')[1]}_judge_with_{model_name.replace('/', '_')}_run_{run_idx}_question_indices_ordered_by_metric_value.png")
        plt.close('all')

        # Plot accumulated distributions
        plt.figure(figsize=(10, 6))
        plot_accumulated_distributions(score_metric_counts, metric_names, colors)
        plt.xlabel('Score')
        plt.ylabel('Frequency')
        plt.title('Score Distribution Histogram by Metric') 
        plt.legend(title='Metrics', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.xticks(np.arange(0, 6))
        plt.tight_layout()
        plt.savefig(f"{judge_model.split('/')[1]}_judge_with_{model_name.replace('/', '_')}_run_{run_idx}_score_distribution_histogram_by_metric.png")
        plt.close('all')

    return summary_stats_all_runs

def plot_model_comparison(models, metrics, metric_means, metric_stds, save_prefix=""):
    """
    Plot comparison charts for multiple models across different metrics.
    
    Args:
        models (list): List of model names to compare
        metrics (list): List of metric names to display
        metric_means (dict): Dictionary mapping metrics to lists of mean values for each model
        metric_stds (dict): Dictionary mapping metrics to lists of standard deviation values for each model
        save_prefix (str, optional): Prefix for saved image filenames
    
    Returns:
        None: Plots are displayed and saved to files
    """
    # Your preferred colors first
    base_colors = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b','#e377c2']

    # If we need more colors than in the base list
    if len(models) > len(base_colors):
        # Generate the additional colors needed
        extra_needed = len(models) - len(base_colors)
        hsv_colors = [(i/extra_needed, 0.8, 0.8) for i in range(extra_needed)]
        rgb_colors = [mcolors.hsv_to_rgb(hsv) for hsv in hsv_colors]
        extra_colors = [mcolors.to_hex(rgb) for rgb in rgb_colors]
        
        # Combine base colors with extra colors
        model_colors = base_colors + extra_colors
    else:
        # Use just the base colors up to the number needed
        model_colors = base_colors[:len(models)]

    # Plot 1: Grid of metrics
    # Calculate the number of rows and columns needed for the subplots
    num_metrics = len(metrics)
    num_cols = 3  # Keep 3 columns as in original
    num_rows = (num_metrics + num_cols - 1) // num_cols  # Ceiling division to ensure enough subplots
    
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(16, 5 * num_rows))
    axs = axs.flatten()

    for i, metric in enumerate(metrics):
        means = metric_means[metric]
        stds = metric_stds[metric]
        
        # Filter out zero values
        valid_indices = [j for j, mean in enumerate(means) if mean > 0]
        valid_means = [means[j] for j in valid_indices]
        valid_stds = [stds[j] for j in valid_indices]
        valid_colors = [model_colors[j] for j in valid_indices]
        valid_model_labels = [models[j] for j in valid_indices]
        
        # Create new x positions without gaps
        valid_x = np.arange(len(valid_indices))
        
        bars = axs[i].bar(valid_x, valid_means, yerr=valid_stds, capsize=5, color=valid_colors)
        axs[i].set_title(metric)
        axs[i].set_xticks(valid_x)
        axs[i].set_xticklabels(valid_model_labels, rotation=45, ha='right')
        axs[i].set_ylim(0, 6.2)  # higher to accommodate error bar labels
        axs[i].set_yticks(np.arange(0, 6, 1))
        axs[i].grid(axis='y', linestyle='dotted', color='gray', linewidth=0.8)
        if i in [0, 3]:
            axs[i].set_ylabel("Score")
        if i >= 3:
            axs[i].set_xlabel("LLM")
        for j, bar in enumerate(bars):
            top = valid_means[j] + valid_stds[j]
            axs[i].text(bar.get_x() + bar.get_width() / 2, top + 0.05, f"{valid_means[j]:.2f}",
                        ha='center', va='bottom', fontsize=9, rotation=90)

    # Hide any unused subplots
    for i in range(num_metrics, len(axs)):
        axs[i].set_visible(False)

    fig.suptitle("Metric Comparison Across LLMs (± std dev)", fontsize=16)
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    plt.savefig(f"{save_prefix}metric_comparison_grid.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Plot 2: Grouped bar chart
    width = 0.12
    fig, ax = plt.subplots(figsize=(18, 7))

    # Ensure we have enough colors for all metrics
    metric_colors = ['#1f77b4','#ff7f0e','#2ca02c','#d62728','#9467bd','#8c564b']
    if len(metrics) > len(metric_colors):
        # Generate additional colors if needed
        extra_needed = len(metrics) - len(metric_colors)
        hsv_colors = [(i/extra_needed, 0.8, 0.8) for i in range(extra_needed)]
        rgb_colors = [mcolors.hsv_to_rgb(hsv) for hsv in hsv_colors]
        extra_colors = [mcolors.to_hex(rgb) for rgb in rgb_colors]
        metric_colors = metric_colors + extra_colors
    else:
        metric_colors = metric_colors[:len(metrics)]
        
    max_y = 0

    # Get all valid model indices (models with at least one non-zero metric)
    all_valid_indices = set()
    for metric in metrics:
        for j, mean in enumerate(metric_means[metric]):
            if mean > 0:
                all_valid_indices.add(j)

    all_valid_indices = sorted(list(all_valid_indices))
    valid_models = [models[j] for j in all_valid_indices]

    # Create new x positions without gaps
    x = np.arange(len(all_valid_indices))

    for i, metric in enumerate(metrics):
        means = metric_means[metric]
        stds = metric_stds[metric]
        
        # Filter out zero values but maintain position for valid models
        valid_means = []
        valid_stds = []
        valid_positions = []
        
        for idx, j in enumerate(all_valid_indices):
            if means[j] > 0:
                valid_means.append(means[j])
                valid_stds.append(stds[j])
                valid_positions.append(x[idx])
        
        # Skip if no valid data for this metric
        if not valid_means:
            continue
            
        offset = (i - len(metrics)/2 + 0.5) * width
        positions = [pos + offset for pos in valid_positions]
        
        bars = ax.bar(positions, valid_means, width, yerr=valid_stds, label=metric, color=metric_colors[i], capsize=4)
        for j, bar in enumerate(bars):
            top = valid_means[j] + valid_stds[j]
            max_y = max(max_y, top)
            ax.text(bar.get_x() + bar.get_width() / 2, top + 0.1, f"{valid_means[j]:.2f}",
                    ha='center', va='bottom', fontsize=9, rotation=90)

    ax.set_ylabel('Score')
    ax.set_title('LLM Metric Comparison (Mean ± Std Dev)')
    ax.set_xticks(x)
    ax.set_xticklabels(valid_models, rotation=45, ha='right')
    ax.set_yticks(np.arange(0, 6, 1))
    ax.set_ylim(0, max_y + 0.5)
    ax.grid(axis='y', linestyle='dotted', color='gray', linewidth=0.8)
    ax.legend(title='Metric', bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
    plt.tight_layout(rect=[0, 0, 0.88, 0.97])
    plt.savefig(f"{save_prefix}summary_by_LLM.png", dpi=300, bbox_inches='tight')
    plt.show()

Perform the Evaluation over all models

In [38]:
#https://python.langchain.com/v0.2/docs/integrations/chat/openai/
from langsmith.evaluation import evaluate

def load_model_stats(judge_model): #In case we had to restart the loop - some models didn't run - Keep track of all model stats
    """Load existing stats from files or initialize empty dictionaries."""
    try:
        with open(f'stats_{judge_model.split("/")[1]}.json', 'r') as f:
            all_models_stats = json.load(f)
    except FileNotFoundError:
        all_models_stats = {}  # Used in comparison between models

    try:
        with open(f'all_runs_model_metrics_{judge_model.split("/")[1]}.json', 'r') as f:
            all_runs_model_metrics = json.load(f)
    except FileNotFoundError:
        all_runs_model_metrics = {}  # Used in plotting metrics
        
    return all_models_stats, all_runs_model_metrics

def perform_evaluation(model_id, judge_model, n_resamples, example_inputs, factor_evaluator, langsmith_api_key):
    """Perform evaluation runs and collect results."""
    dataset_name = get_dataset_name(model_id, judge_model) #How the dataset will be named in Langsmith
    dataset_langsmith = create_langsmith_dataset(dataset_name, example_inputs, langsmith_api_key)

    evaluation_all_resamples = [] #Used below to obtain the unique questions/answers and also the results of each resample
    
    begin = time.time()
    for resample_idx in range(n_resamples):
        print(f"\nPerforming evaluation of resample {resample_idx+1}/{n_resamples} of {model_id}")

        evaluation_results = evaluate(
            predict, #Function that call our LLM and returns its output
            data=dataset_langsmith.name, #Just using dataset_langsmith doesn't work 
            evaluators=[factor_evaluator], #Evaluators to use
            max_concurrency=1, #Run one question through langsmith each time - Other values will give errors in resulting excels
            # metadata={"revision_id": "the version of your pipeline you are testing"},
            experiment_prefix=str(judge_model)+'_judge_with_'+str(model_id)+'_resample_'+str(resample_idx) # A prefix for your experiment names to easily identify them
        )
        evaluation_all_resamples.extend(evaluation_results) #Used below to get unique questions/answers and to select the predicted answers
        #This has n_resamples*num_questions elements, for just one model

    with open('evaluation_all_resamples_'+str(model_id.split('/')[1])+'_'+str(judge_model.split('/')[1])+'.txt', 'w') as f:
        f.write(str(evaluation_all_resamples))

    assert len(evaluation_all_resamples)==n_resamples*len(example_inputs), f"Number of evaluation results not matching num_resamples*num_questions. \
        Got {len(evaluation_all_resamples)} evaluation results but expected {n_resamples*len(example_inputs)}"
    
    print(f"Total time for evaluation: {time.time() - begin}")

    return evaluation_all_resamples, dataset_langsmith

def process_evaluation_results(langsmith_api_key, dataset_langsmith):
    """Extract questions and answers from evaluation results."""
    #https://docs.smith.langchain.com/tutorials/Developers/evaluation

    # Get unique questions/answers
    client = Client(api_key=langsmith_api_key)
    questions_answers=[x for x in client.list_examples(dataset_id=dataset_langsmith.id)]
    list_of_questions=[x.inputs['question'] for x in questions_answers]
    list_of_answers=[x.outputs['answer'] for x in questions_answers]
        
    # with open('list_of_questions.txt', 'w') as f:
    #     f.write(str(list_of_questions))
        
    # with open('list_of_answers.txt', 'w') as f:
    #     f.write(str(list_of_answers))
    
    results_df = pd.DataFrame({
        'questions': list_of_questions,
        'answers': list_of_answers
    })
    return results_df, list_of_questions

def process_metrics(resample_results, list_of_metrics, list_of_questions, resample_idx, results_df, model_name):
    """
    Process metrics for a single resample and update results DataFrame.
    
    Args:
        resample_results: Results from current resample
        list_of_metrics: List of metrics to process
        resample_idx: Current resample index
        results_df: DataFrame to update with metrics
        model_name: Name of the model being evaluated
        
    Returns:
        individual_run_metric_scores, metrics, results_df
    """

    metrics = [] #This should be the same as resample_results (list) except when there are 'traceback' errors where it will be 0.
    # metrics format will be:[[EvaluationResult(key='completeness', score=4, value='To evaluate the .... - It has num_questions sublists, each with num_metrics values

    for result in resample_results:
        if result['run'].outputs['output'] is None or not result['evaluation_results']['results']: #or result['run'].error is not None - Same as first condition
            metrics.append(0)  # Use 0 to indicate failed evaluation - We might even get in here when LangSmith API connection issues
            print("Error: No metric value found!")
            #Also print which condition is true
            print("result['run'].outputs['output'] is None",result['run'].outputs['output'] is None)
            print("not result['evaluation_results']['results']",not result['evaluation_results']['results'])
            # print("result['run'].error is not None",result['run'].error is not None)
        else:
            metrics.append(result['evaluation_results']['results'])

    with open('resample_results_'+str(resample_idx)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
        f.write(str(resample_results))

    assert len(resample_results)==len(list_of_questions), f"Number of resample results not matching num_questions. Got {len(resample_results)} resample \
        results but expected {len(list_of_questions)}"
    
    with open('metrics_'+str(resample_idx)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
        f.write(str(metrics))

    assert len(metrics)==len(list_of_questions), f"Number of metrics not matching num_questions. Got {len(metrics)} metrics but expected {len(list_of_questions)}"
    
    #This is at the end a dict with num_metrics keys and each key has num_questions values.
    #Example: {'completeness_descr': [4, 3, 3, 5, 5, 4, 3], 'relevance_descr': [4, 3, 3, 3, 4, 3, 1], ....} assuming 7 questions
    individual_run_metric_scores = {} #Keep track of scores of all metrics over all questions for one resample

    for metric_idx, metric_name in enumerate(list_of_metrics): #Get specific metric name and values over all questions for the current resample

        clean_metric_names, metric_scores, metric_prompts = [], [], [] #Metric scores and prompts for all questions for a given resample - Should be num_questions elements each time
        
        #Get all metric keys for the current resample over all questions, handling potential missing keys (values set to 0 for those - they are errors)
        for m in metrics:
            if m == 0: #If there is an error
                key = metric_name.replace('_descr','')
                score = 0
                prompt=""
            else:
                try:
                    key = m[metric_idx].key #Metric name
                    score = m[metric_idx].score ##Scores of a given metric over all questions for a given resample
                    prompt = m[metric_idx].value #Prompt used for the evaluation
                except:
                    print("Error: Metric not found - Shouldn't get here")
                    key = metric_name.replace('_descr','')
                    score = 0
                    prompt = ""
                
            clean_metric_names.append(key)
            metric_scores.append(score)
            metric_prompts.append(prompt)
            
        assert all(name == metric_name.replace('_descr','') for name in clean_metric_names), f"Metric keys not matching: clean_metric_names={clean_metric_names}, \
            expected={metric_name.replace('_descr','')} and their values: {metric_scores}"
            
        # with open('metric_scores_'+str(resample_idx)+'_'+str(metric_name)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
        #     f.write(str(metric_scores))

        assert len(metric_scores)==len(list_of_questions), f"Number of metric scores not matching num_questions. Got {len(metric_scores)} metric scores \
            but expected {len(list_of_questions)}"
            
        # with open('metric_prompts_'+str(resample_idx)+'_'+str(metric_name)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
        #     f.write(str(metric_prompts))

        assert len(metric_prompts)==len(list_of_questions), f"Number of metric prompts not matching num_questions. Got {len(metric_prompts)} metric prompts \
            but expected {len(list_of_questions)}"

        # Update results DataFrame
        clean_metric_name = clean_metric_names[0] #Just one metric name without the _descr
        results_df[f'metric_{clean_metric_name}_{resample_idx+1}'] = metric_scores
        results_df[f'prompt_{clean_metric_name}_{resample_idx+1}'] = metric_prompts
        
        # Store scores for return
        individual_run_metric_scores[metric_name] = metric_scores #len is num_metrics
    
    # with open('individual_run_metric_scores_'+str(resample_idx)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
    #     f.write(str(individual_run_metric_scores))

    return individual_run_metric_scores, metrics, results_df

def calculate_metric_statistics(all_runs_metric_scores, list_of_metrics, num_questions, model_name):
    """Calculate statistical metrics across resamples (reduce variance - step 3.1)."""
    metric_stats_resampling = {} # Calculate mean and standard error for each metric and question across K resamples
    #The above dict will have num_metrics elements, each with metric keys (e.g. mean, std, etc), that will have num_questions values
    #Example: {'completeness': {'mean':[4, 3, 3, 5, 5, 4, 3]}, #here for a dataset with 7 questions
    #          'relevance': {'mean':[4, 3, 3, 3, 4, 3, 2]}, ...}
    
    for metric in list_of_metrics:
        metric_stats_resampling[metric] = {
            'means': [],  # Mean score across K resamples for each question
            'standard_errors': [],  # Standard error of the mean for each question
            'conditional_vars': []  # Conditional variance reduced by factor of K
        }
        
        # For each question
        for q in range(num_questions):
            # Get K scores for this metric/question across all resamples (num_resamples elements each time in that list)
            scores = [run[metric][q] for run in all_runs_metric_scores]
            K = len(scores)  # Number of resamples
            assert len(scores)==n_resamples, f"Number of scores not matching num_resamples. Got {len(scores)} scores but expected {n_resamples}"
            
            # Calculate statistics
            mean = np.mean(scores) #Average score of each question for a given metric over all resamples
            var = np.var(scores) #Variance of the scores of each question for a given metric over all resamples
            # Calculate conditional variance reduced by factor of K. Var(mean) = σ²/K where σ² is the variance of individual scores
            conditional_var = var / K if K > 0 else 0
            standard_error = np.sqrt(conditional_var)
            
            # Store results
            metric_stats_resampling[metric]['means'].append(mean)
            metric_stats_resampling[metric]['standard_errors'].append(standard_error)
            metric_stats_resampling[metric]['conditional_vars'].append(conditional_var)
    
    with open('metric_stats_resampling_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
        f.write(str(metric_stats_resampling))

    assert len(metric_stats_resampling)==len(list_of_metrics), f"Number of metric_stats_resampling not matching num_metrics. \
        Got {len(metric_stats_resampling)} metric_stats_resampling but expected {len(list_of_metrics)}"
    
    for metric in list_of_metrics:
        assert len(metric_stats_resampling[metric]['means']) == num_questions, f"Number of values for metric '{metric}' ({len(metric_stats_resampling[metric]['means'])}) \
            not matching expected number of questions ({num_questions})"

    return metric_stats_resampling

def handle_zero_values(results_df, n_resamples, list_of_metrics):
    """
    Handle zero values in results.
    
    Args:
        results_df (pd.DataFrame): DataFrame containing results
        n_resamples (int): Number of resamples
        list_of_metrics (list): List of metrics to check
        
    Returns:
        dict: Indices of rows containing zero values for each metric
    """
    zero_rows_columns = {}
    
    try:
        # Handle 0 values across all resamples - These are errors
        for resample_idx in range(n_resamples):
            for metric in list_of_metrics:
                try:
                    simple_metric_name = metric.replace('_descr','')
                    metric_col = f'metric_{simple_metric_name}_{resample_idx+1}'
                    
                    # Check if column exists
                    if metric_col not in results_df.columns:
                        print(colored(f"Warning: Column {metric_col} not found in DataFrame", 'yellow'))
                        continue
                    
                    zero_indices = results_df[metric_col] == 0 #series with True/False
                    
                    if zero_indices.any(): #If any of the values of that column are 0
                        zero_rows_columns[metric_col] = []
                        for idx in zero_indices[zero_indices].index: #Loop over True indices (rows with 0s)
                            try:
                                print(colored(f"Missing value for metric '{simple_metric_name}' in resample {resample_idx+1}", 'red'))
                                print(colored(f"Question: {results_df.loc[idx, 'questions']}", 'green'))
                                zero_rows_columns[metric_col].append(idx) #Keep track of columns and rows with zero values
                            except Exception as e:
                                print(colored(f"Unexpected error processing zero value at row {idx}: {e}", 'red'))
                
                except Exception as e:
                    print(colored(f"Error processing metric {metric} in resample {resample_idx}: {e}", 'red'))
        
        return zero_rows_columns # Return column names and rows with zero values
    
    except Exception as e:
        print(colored(f"Critical error in handle_zero_values: {e}", 'red'))
        traceback.print_exc()
        return {}  # Return empty dict in case of critical error

def process_zero_values(results_df, zero_rows_columns, list_of_metrics, model_name): #TO BE ACTIVATED
    """Process and optionally replace zero values in results."""
    row_zero_counts = {}
    col_zero_counts = {}

    # Force a copy to ensure changes are applied properly below when replace with mean value
    results_df_copy = results_df.copy()

    for column_name, row_indices in zero_rows_columns.items():
        for row_idx in row_indices:
                
            # Get values for this metric for this row and column (one resample per time)
            values = results_df.loc[row_idx, column_name]

            assert values==0, "Values should be 0"
            
            if values != 0: #We should never get here
                with open('values_'+str(model_name.split('/')[1])+'_'+str(column_name)+'_'+str(row_idx)+'.txt', 'w') as f:
                    f.write(str(values))
                
            #Given that values are 0, replace with mean of non-zero values
            df_values=results_df.loc[:, column_name].values
            non_zero_values = [x for x in df_values if x != 0]

            if len(non_zero_values) > 0:
                mean_value = np.mean(non_zero_values)

                if results_df.loc[row_idx, column_name] == 0 and mean_value != 0:
                    print(colored(f"0 value in row {row_idx}, column {column_name} should be replaced with mean {mean_value:.2f}", 'yellow'))
                    # Uncomment to actually replace values:
                    # results_df.loc[row_idx, column_name] = mean_value#round(mean_value, 1)

                    row_zero_counts[row_idx] = row_zero_counts.get(row_idx, 0) + 1
                    col_zero_counts[column_name] = col_zero_counts.get(column_name, 0) + 1

    print("\nZero values replaced per row:")
    for row in sorted(row_zero_counts):
        print(f"Row/question {row}: {row_zero_counts[row]} replacements")

    print("\nZero values replaced per column:")
    for col in sorted(col_zero_counts):
        print(f"Column/metric {col}: {col_zero_counts[col]} replacements")

def reorganize_evaluation_metrics(all_resamples_metrics, list_of_metrics, model_name, list_of_questions, n_resamples):
    """    
    This function takes evaluation metrics from multiple resampling runs and reorganizes them into
    a structured dictionary where each metric's scores are grouped together. It handles cases where
    some evaluations may have failed (represented by 0s).
    
    Args:
        all_resamples_metrics (list): List of evaluation results for each resample. Each resample contains
                                     scores for multiple questions and metrics.
        list_of_metrics (list): List of metric names to process (e.g., ['completeness_descr', 'relevance_descr']).
        model_name (str): Name of the model being evaluated, used for logging.
        list_of_questions (list): List of questions that were evaluated.
        n_resamples (int): Number of resampling iterations performed.
    
    Returns:
        dict: Dictionary where keys are metric names (without '_descr' suffix) and values are lists
              containing all scores for that metric across all resamples and questions.
              
    Note:
        The function assumes each resample has scores for all questions and metrics.
    """
    metric_scores_all_resamples = {metric.replace('_descr', ''): [] for metric in list_of_metrics}
    #The above dict will have num_metrics elements, with their value for each question, over each run (first num_questions for first run, then next num_questions for next run, etc)
    #Example: {'completeness': {'mean':[4, 3, 3, 5, 5, 4, 3, 4, 3, 3, 5, 5, 0, 3, 5, 3, 3, 5, 5, 4, 3]},  #assuming 3 runs and 7 questions
    #          'relevance': {'mean':[4, 3, 3, 3, 4, 3, 2, 4, 3, 3, 3, 3, 0, 2, 4, 3, 3, 3, 3, 3, 2]}, ...}
    #In case of error, there will be num_questions less elements in the sublist for which there was an error
    
    for metric_name in list_of_metrics:
        clean_name = metric_name.replace('_descr', '')
        
        #Each resample_metrics (num_resamples in total) has a list of num_questions lists, each having num_metrics values
        #format of each sublist: [EvaluationResult(key='completeness', score=4, value='To evaluate the ...
        #If error, instead of the above list we have just a 0.
        for resample_idx, resample_metrics in enumerate(all_resamples_metrics):

            with open('resample_metrics_'+str(resample_idx)+'_'+str(metric_name)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                f.write(str(resample_metrics))

            metric_idx = list_of_metrics.index(metric_name) #0-num_metrics the range of values of this. 

            scores = [m[metric_idx].score if m!=0 and m!=[] else 0 
                     for m in resample_metrics] #num_questions elements each time
            assert len(scores)==len(list_of_questions), "Scores length not matching num_questions"

            with open('scores_'+str(resample_idx)+'_'+str(metric_name)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                f.write(str(scores))

            metric_scores_all_resamples[clean_name].extend(scores) #Every time we add one metric for one resample (num_questions elements)

            with open('metric_stats_reorganized_'+str(resample_idx)+'_'+str(metric_name)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                f.write(str(metric_scores_all_resamples))

    assert [len(x) for x in metric_scores_all_resamples.values()]==[len(list_of_questions)*n_resamples]*len(list_of_metrics), "Metric stats length not matching"

    return metric_scores_all_resamples

def save_results(results_df, judge_model, model_id, stage="before"):
    """Save results DataFrame to Excel."""
    filename = (f"results_{judge_model.split('/')[1]}_judge_with_"
               f"{model_id.replace('/','_')}_{stage}_nan_replacement.xlsx")
    
    try:
        #Extract reasoning traces and final answers from predicted answers
        # Check for <think> tags in predicted answer columns and split them if found
        for col in results_df.columns:
            if 'predicted_answer' in col:
                # Create new column names
                run_number = col.split('_')[-1]
                reasoning_col = f'reasoning_trace_{run_number}'
                
                # Check if we need to split this column
                has_think_tags = results_df[col].astype(str).str.contains('</think>', na=False).any()
                has_think_start_tags = results_df[col].astype(str).str.contains('<think>', na=False).any()
                
                if has_think_tags or has_think_start_tags: 
                    print("Has </think> or <think> in answer")
                    # Extract reasoning traces and final answers
                    reasoning_traces = []
                    final_answers = []
                    
                    for answer in results_df[col]:
                        if isinstance(answer, str):
                            if '<think>' in answer and '</think>' in answer:
                                # Extract the reasoning trace between <think> and </think>
                                think_start = answer.find('<think>') + len('<think>')
                                think_end = answer.find('</think>')
                                
                                if think_start >= 0 and think_end >= 0:
                                    reasoning = answer[think_start:think_end].strip()
                                    print("Reasoning:",reasoning)
                                    final_answer = answer[think_end + len('</think>'):].strip()
                                    print("Final answer:",final_answer)
                                    reasoning_traces.append(reasoning)
                                    final_answers.append(final_answer)
                            elif '</think>' in answer:
                                print("Only </think> in answer")
                                # Handle case where only </think> is present
                                think_end = answer.find('</think>')
                                reasoning = answer[:think_end].strip()
                                print("Reasoning:",reasoning)
                                final_answer = answer[think_end + len('</think>'):].strip()
                                print("Final answer:",final_answer)
                                reasoning_traces.append(reasoning)
                                final_answers.append(final_answer)
                            elif '<think>' in answer:
                                print("Only <think> in answer")
                                # Handle case where only <think> is present
                                think_start = answer.find('<think>') + len('<think>')
                                reasoning = answer[think_start:].strip()
                                print("Reasoning:",reasoning)
                                final_answer = ""  # No final answer if only <think> tag is present
                                print("Final answer: (empty)")
                                reasoning_traces.append(reasoning)
                                final_answers.append(final_answer)
                            else:
                                reasoning_traces.append('')
                                final_answers.append(answer)
                    
                    if any(trace.strip() for trace in reasoning_traces): #If there is any reasoning trace
                        # Add the new columns - insert reasoning column right after the predicted answer column
                        col_idx = results_df.columns.get_loc(col)
                        results_df.insert(col_idx + 1, reasoning_col, reasoning_traces)
                        results_df[col] = final_answers
                        print("Added reasoning traces and final answers to dataframe")

    except Exception as e:
        print("Error in saving trace results:", e)
        traceback.print_exc()
    
    results_df.to_excel(filename, index=False)

In [39]:
# Main execution loop (move code below inside this function)
# def main():
all_models_stats, all_runs_model_metrics = load_model_stats(judge_model) #Try to load already saved data (if some models have already been evaluated), otherwise initialize empty dicts

for judge_model in judges: #judge models
    for model_id in models: #generation models
        global model_name, model, tokenizer, pipeline, generate_max_tokens
        model, tokenizer, pipeline = get_model(model_id)
        model_name = model_id #Since model_name defined as global variable
        
        try: #Sometimes some errors with the evaluation
            evaluation_all_resamples, dataset_langsmith = perform_evaluation(model_id, judge_model, n_resamples, example_inputs, multiple_judges_evaluator(judge_model), langsmith_api_key)
            chunk_size = len(example_inputs) #Number of questions
            results_df, list_of_questions = process_evaluation_results(langsmith_api_key, dataset_langsmith)
            
            all_resamples_metrics = [] #Keep track of all metrics over all resamples and all questions
            #There will be n_resamples lists, each with num_questions sublists (each having num_metrics sublists) (so num_questions*num_metrics elements in those in total)
            #Each question will have 6 metric values like this: [EvaluationResult(key='completeness', score=4, value='To evaluate the ....
            all_runs_metric_scores = [] #This will be appended to the input that plots metrics at the end. 
            #The format of it is [{metric1_descr_run1: [q1_score, q2_score, ...], metric2_descr_run1: [q1_score, q2_score, ...], ...}, 
            #                     {metric1_descr_run2: [q1_score, q2_score, ...], metric2_descr_run2: [q1_score, q2_score, ...], ...},
            #                     ...num_runs]
            
            # Process each resample
            for resample_idx in range(n_resamples):
                start_idx = resample_idx * chunk_size #start index of current resample (chunk size is the number of questions of each resample)
                #Resample_results saved above in the process_metrics function
                resample_results = evaluation_all_resamples[start_idx:start_idx + chunk_size] #Get results of a particular resample
                print("Resampl",resample_results)
                assert len(resample_results)==chunk_size, f"Number of resample results not matching num_questions. Got {len(resample_results)} resample results but expected {chunk_size}"

                predicted_answers = [x['run'].outputs['output'] for x in resample_results] #None if error
                assert len(predicted_answers)==chunk_size, f"Number of predicted answers not matching num_questions. Got {len(predicted_answers)} predicted answers but expected {chunk_size}"

                # with open('predicted_answers_'+str(resample_idx)+'_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                #     f.write(str(predicted_answers))

                #Add predicted answers to df
                results_df[f'predicted_answer_{resample_idx+1}'] = predicted_answers

                individual_run_metric_scores, metrics, results_df = process_metrics(
                        resample_results, 
                        list_of_metrics, 
                        list_of_questions,
                        resample_idx,
                        results_df,
                        model_name
                    )           

                # Handle zero values
                zero_rows_columns = handle_zero_values(results_df, n_resamples, list_of_metrics)
                if zero_rows_columns:
                    unique_zero_rows_columns = len(set([x for sublist in list(zero_rows_columns.values()) for x in sublist]))
                    print(colored(f"ERROR: Found missing values in {unique_zero_rows_columns} rows out of {len(results_df)}", 'red'))
                    process_zero_values(results_df, zero_rows_columns, list_of_metrics, model_name) #Replace 0s with mean of non-zero values     
                
                #In each iteration we append the metrics (6 in total) of one resample for all questions - n at the end, one for each resample
                #If there is an error, the metrics will be 0 (there will be n_errors*num_metrics less 'EvaluationResult' objects in that case)
                all_resamples_metrics.append(metrics)

                #Has n_resamples lists, each with num_metrics sublists (each sublist has scores over all questions of one metric) 
                all_runs_metric_scores.append(individual_run_metric_scores)
            
            assert len(all_runs_metric_scores)==n_resamples, f"Number of all_runs_metric_scores not matching num_resamples. \
                Got {len(all_runs_metric_scores)} all_runs_metric_scores but expected {n_resamples}"
            
            for i in range(n_resamples):
                assert len(all_runs_metric_scores[i])==len(list_of_metrics), f"Number of all_runs_metric_scores[{i}] not matching num_metrics. \
                    Got {len(all_runs_metric_scores[i])} all_runs_metric_scores[{i}] but expected {len(list_of_metrics)}"

            with open('all_runs_metric_scores_main_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                f.write(str(all_runs_metric_scores))

            with open('all_resamples_metrics_main_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                f.write(str(all_resamples_metrics))

            assert len(all_resamples_metrics)==n_resamples, f"Number of all_resamples_metrics not matching num_resamples. \
                Got {len(all_resamples_metrics)} all_resamples_metrics but expected {n_resamples}"
            
            for i in range(n_resamples): #Each one will have num_questions elements, each with num_metrics sublists (or 0 if error)
                assert len(all_resamples_metrics[i])==len(list_of_questions), f"Number of all_resamples_metrics[{i}] not matching num_questions. \
                    Got {len(all_resamples_metrics[i])} all_resamples_metrics[{i}] but expected {len(list_of_questions)}" #Each all_ressamples_metrics[i] should have num_questions elements

            # Calculate statistics
            metric_stats_resampling = calculate_metric_statistics(
                all_runs_metric_scores, 
                list_of_metrics, 
                len(list_of_questions),
                model_name
            )
            
            # Save initial results
            save_results(results_df, judge_model, model_id, "before")
            
            # # Handle zero values
            # zero_rows_columns = handle_zero_values(results_df, n_resamples, list_of_metrics)
            # if zero_rows_columns:
            #     unique_zero_rows_columns = len(set([x for sublist in list(zero_rows_columns.values()) for x in sublist]))
            #     print(colored(f"ERROR: Found missing values in {unique_zero_rows_columns} rows out of {len(results_df)}", 'red'))
            #     process_zero_values(results_df, zero_rows_columns, list_of_metrics, model_name) #Replace 0s with mean of non-zero values
            
            # Reorganize metrics - Has num_metrics keys, each with num_questions*num_resamples values (as a list)
            metric_scores_all_resamples = reorganize_evaluation_metrics(all_resamples_metrics, list_of_metrics, model_name, list_of_questions, n_resamples)

            with open('metric_scores_all_resamples_final_main_'+str(model_name.split('/')[1])+'.txt', 'w') as f:
                f.write(str(metric_scores_all_resamples))

            assert len(metric_scores_all_resamples)==len(list_of_metrics), f"Number of metric_scores_all_resamples not matching num_metrics. \
                Got {len(metric_scores_all_resamples)} metric_scores_all_resamples but expected {len(list_of_metrics)}"
            
            for i in range(len(list_of_metrics)):
                name_of_metric=list_of_metrics[i].replace('_descr','')
                assert len(metric_scores_all_resamples[name_of_metric])==len(list_of_questions)*n_resamples, f"Number of metric_scores_all_resamples[{name_of_metric}] not matching \
                    num_questions*num_resamples. Got {len(metric_scores_all_resamples[name_of_metric])} metric_scores_all_resamples[{name_of_metric}] but \
                    expected {len(list_of_questions)*n_resamples}"

            metric_names = list(metric_scores_all_resamples.keys()) #Final list of metrics for plotting
            
            # Verify metric names
            metrics_names_loop = [metric.replace('_descr','') for metric in list_of_metrics]
            assert metrics_names_loop == metric_names, "Metric names mismatch"
            
            # Save results
            all_runs_model_metrics[model_id] = all_runs_metric_scores #Used in plotting metrics
            #Dictionary in format {model_id:[{metric_1_run_1:[values], metric_2_run_1:[values], ...}, {metric_1_run_2:[values]....}]

            all_models_stats[model_id] = plot_figures_metrics(
                all_runs_model_metrics,
                metric_names,
                model_id,
                judge_model
            ) #Stats like mean, std, etc. per metric and per run over all questions
            
            # Save to files
            with open(f'stats_{judge_model.split("/")[1]}.json', 'w') as f:
                json.dump(all_models_stats, f, indent=4)
            with open(f'all_runs_model_metrics_{judge_model.split("/")[1]}.json', 'w') as f:
                json.dump(all_runs_model_metrics, f, indent=4)

            print("Model",model_id,"saved")
            print("Models saved so far:",list(all_models_stats.keys()))
                
        except Exception as e:
            print("An error occurred in evaluating model",model_id)
            print("Error Details:", e)
            traceback.print_exc()
        
        finally:
            # Clear VRAM
            del model, tokenizer, pipeline
            torch.cuda.empty_cache()
            print('-'*100)

    # if __name__ == "__main__":
    #     main()

Dataset not found. Creating new dataset

Performing evaluation of resample 1/1 of gemini/gemini-2.0-flash-exp
View the evaluation results for experiment: 'openai/gpt-4o-mini_judge_with_gemini/gemini-2.0-flash-exp_resample_0-2d0f8ecc' at:
https://smith.langchain.com/o/42866633-791c-5d79-833d-4e624439c57c/datasets/d00f6c05-1f27-4fe4-bea3-f6da9f7068af/compare?selectedSessions=b73da2a8-1081-4317-b427-7be78e641df4




0it [00:00, ?it/s]

Tool Decision: no_tool_needed
No tool will be used
Using Gemini model...
Full response from Gemini model: candidates=[Candidate(content=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text='The Newton-Raphson technique generally exhibits faster convergence compared to loop-based iterative methods in water engineering (and other fields) due to several key aspects:\n\n*   **Quadratic Convergence:** The most significant advantage of Newton-Raphson is its quadratic convergence rate. This means that the number of correct digits in the approximation roughly doubles with each iteration, *provided* the initial guess is sufficiently close to the root and the function meets certain smoothness criteria (e.g., continuous first and second derivatives).  Loop-based methods, such as simple fixed-point iteration or bisection, typically have linear convergence, meaning 

1it [00:51, 51.51s/it]

[32mUser message:Question: What aspects of the Newton-Raphson technique generally lead to faster convergence compared to loop-based iterative methods?, Actual answer: Newton-Raphson methods solve all governing equations simultaneously by using partial derivatives (the Jacobian) that link changes in flows or heads to changes in hydraulic equations. This approach allows the solver to make a more direct beeline toward the solution, whereas loop-based methods adjust flows one loop at a time. By incorporating global information into each iteration, Newton-Raphson typically reaches high accuracy in fewer iterations, especially for large or complex systems., Predicted answer: The Newton-Raphson technique generally exhibits faster convergence compared to loop-based iterative methods in water engineering (and other fields) due to several key aspects:

*   **Quadratic Convergence:** The most significant advantage of Newton-Raphson is its quadratic convergence rate. This means that the number of

2it [01:50, 56.10s/it]

[32mUser message:Question: Which factors influence the choice of initial flow guesses in the Hardy Cross approach, and how can poor choices affect convergence?, Actual answer: Typical choices for initial flows are guided by approximate network demands or simple proportional splits of total system flow. For instance, if a node has a demand, one might set pipe flows toward that node according to each pipe’s relative capacity. Poorly chosen initial flows can lead to very slow convergence or even divergence, especially if the network has pipes with drastically different diameters or roughness values. The Hardy Cross method relies on incremental loop corrections, so extreme imbalances in initial flows can produce oscillations or prolonged iteration cycles., Predicted answer: Okay, I can definitely address the factors influencing initial flow guesses in the Hardy Cross method and the impact of poor choices on convergence.

**Factors Influencing the Choice of Initial Flow Guesses in the Hard

3it [02:27, 47.03s/it]

[32mUser message:Question: How are flow corrections applied around each loop in the Hardy Cross method, and why must flow continuity be maintained at every junction?, Actual answer: In the Hardy Cross method, one first selects individual loops in the network and makes an initial guess of the flow in each pipe. Then, each loop is corrected iteratively: the head loss around the loop is summed, and a flow correction is applied to reduce the total head-loss error to zero. This correction is added (or subtracted) to the assumed flow in every pipe forming that loop. Flow continuity must be maintained at every junction to ensure that all flow entering a node is accounted for by either leaving through connecting pipes or satisfying demand. If continuity is not enforced, the model would not reflect true network behavior, causing errors to propagate in subsequent iterations., Predicted answer: Okay, let's break down flow corrections in the Hardy Cross method and the importance of flow continuit

3it [03:02, 60.90s/it]


Total time for evaluation: 185.0729537010193
Resampl [{'run': RunTree(id=UUID('8f771d33-7356-41da-8b57-212805bace2d'), name='Target', start_time=datetime.datetime(2025, 4, 22, 12, 15, 54, 917920, tzinfo=datetime.timezone.utc), run_type='chain', end_time=datetime.datetime(2025, 4, 22, 12, 16, 6, 688912, tzinfo=datetime.timezone.utc), extra={'metadata': {'revision_id': None, 'num_repetitions': 1, 'example_version': '2025-04-22T12:15:52.457154+00:00', 'ls_method': 'traceable'}, 'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.1.128', 'library': 'langsmith', 'platform': 'Linux-6.5.0-44-generic-x86_64-with-glibc2.35', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.10.12', 'langchain_version': '0.3.0', 'langchain_core_version': '0.3.55'}}, error=None, serialized={'name': 'Target', 'signature': '(inputs: dict) -> dict', 'doc': 'Given a question, return the answer from the model, optionally using tools if tool_usage is True'}, events=[], inputs={'inputs': {'ques

0it [00:00, ?it/s]

Tool Decision: no_tool_needed
No tool will be used
Using OpenAI model...
Response from OpenAI: The Newton-Raphson technique is a powerful numerical method for finding roots of real-valued functions, and it generally exhibits faster convergence compared to loop-based iterative methods due to several key aspects:

1. **Quadratic Convergence**: The Newton-Raphson method has quadratic convergence near the root, meaning that the number of correct digits approximately doubles with each iteration once close to the root. In contrast, many loop-based iterative methods, such as simple fixed-point iterations, often have linear convergence, which is significantly slower.

2. **Use of Derivatives**: The Newton-Raphson method utilizes the derivative of the function to inform the next approximation. This derivative provides information about the slope of the function, allowing for more informed and efficient steps toward the root. Loop-based methods may not leverage this information, leading to less 

1it [00:49, 49.69s/it]

[32mUser message:Question: What aspects of the Newton-Raphson technique generally lead to faster convergence compared to loop-based iterative methods?, Actual answer: Newton-Raphson methods solve all governing equations simultaneously by using partial derivatives (the Jacobian) that link changes in flows or heads to changes in hydraulic equations. This approach allows the solver to make a more direct beeline toward the solution, whereas loop-based methods adjust flows one loop at a time. By incorporating global information into each iteration, Newton-Raphson typically reaches high accuracy in fewer iterations, especially for large or complex systems., Predicted answer: The Newton-Raphson technique is a powerful numerical method for finding roots of real-valued functions, and it generally exhibits faster convergence compared to loop-based iterative methods due to several key aspects:

1. **Quadratic Convergence**: The Newton-Raphson method has quadratic convergence near the root, meani

2it [01:28, 43.58s/it]

[32mUser message:Question: Which factors influence the choice of initial flow guesses in the Hardy Cross approach, and how can poor choices affect convergence?, Actual answer: Typical choices for initial flows are guided by approximate network demands or simple proportional splits of total system flow. For instance, if a node has a demand, one might set pipe flows toward that node according to each pipe’s relative capacity. Poorly chosen initial flows can lead to very slow convergence or even divergence, especially if the network has pipes with drastically different diameters or roughness values. The Hardy Cross method relies on incremental loop corrections, so extreme imbalances in initial flows can produce oscillations or prolonged iteration cycles., Predicted answer: The Hardy Cross method is an iterative technique used for analyzing flow in pipe networks, particularly in water distribution systems. The choice of initial flow guesses is crucial for the convergence and efficiency of

3it [02:01, 38.70s/it]

[32mUser message:Question: How are flow corrections applied around each loop in the Hardy Cross method, and why must flow continuity be maintained at every junction?, Actual answer: In the Hardy Cross method, one first selects individual loops in the network and makes an initial guess of the flow in each pipe. Then, each loop is corrected iteratively: the head loss around the loop is summed, and a flow correction is applied to reduce the total head-loss error to zero. This correction is added (or subtracted) to the assumed flow in every pipe forming that loop. Flow continuity must be maintained at every junction to ensure that all flow entering a node is accounted for by either leaving through connecting pipes or satisfying demand. If continuity is not enforced, the model would not reflect true network behavior, causing errors to propagate in subsequent iterations., Predicted answer: The Hardy Cross method is an iterative technique used for analyzing flow in pipe networks, particularl

3it [02:38, 52.71s/it]


Total time for evaluation: 159.13202738761902
Resampl [{'run': RunTree(id=UUID('d8ea4d5e-7c0a-43ab-b7b4-adad603c7116'), name='Target', start_time=datetime.datetime(2025, 4, 22, 12, 19, 5, 86954, tzinfo=datetime.timezone.utc), run_type='chain', end_time=datetime.datetime(2025, 4, 22, 12, 19, 16, 946307, tzinfo=datetime.timezone.utc), extra={'metadata': {'revision_id': None, 'num_repetitions': 1, 'example_version': '2025-04-22T12:19:04.024823+00:00', 'ls_method': 'traceable'}, 'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.1.128', 'library': 'langsmith', 'platform': 'Linux-6.5.0-44-generic-x86_64-with-glibc2.35', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.10.12', 'langchain_version': '0.3.0', 'langchain_core_version': '0.3.55'}}, error=None, serialized={'name': 'Target', 'signature': '(inputs: dict) -> dict', 'doc': 'Given a question, return the answer from the model, optionally using tools if tool_usage is True'}, events=[], inputs={'inputs': {'ques

0it [00:00, ?it/s]

Tool Decision: no_tool_needed
No tool will be used
Using Gemini model...
Full response from Gemini model: candidates=[Candidate(content=Content(parts=[Part(video_metadata=None, thought=None, code_execution_result=None, executable_code=None, file_data=None, function_call=None, function_response=None, inline_data=None, text='The Newton-Raphson technique generally exhibits faster convergence compared to loop-based iterative methods in water engineering (and other fields) due to several key aspects:\n\n*   **Quadratic Convergence:** This is the most significant factor. Newton-Raphson exhibits quadratic convergence, meaning that the number of correct digits in the solution approximately doubles with each iteration.  In other words, the error decreases proportionally to the square of the previous error.  Loop-based methods, such as simple fixed-point iteration or successive substitution, typically have linear convergence, where the error decreases linearly with each iteration.  This differen

1it [01:19, 79.35s/it]

[32mUser message:Question: What aspects of the Newton-Raphson technique generally lead to faster convergence compared to loop-based iterative methods?, Actual answer: Newton-Raphson methods solve all governing equations simultaneously by using partial derivatives (the Jacobian) that link changes in flows or heads to changes in hydraulic equations. This approach allows the solver to make a more direct beeline toward the solution, whereas loop-based methods adjust flows one loop at a time. By incorporating global information into each iteration, Newton-Raphson typically reaches high accuracy in fewer iterations, especially for large or complex systems., Predicted answer: The Newton-Raphson technique generally exhibits faster convergence compared to loop-based iterative methods in water engineering (and other fields) due to several key aspects:

*   **Quadratic Convergence:** This is the most significant factor. Newton-Raphson exhibits quadratic convergence, meaning that the number of co

2it [02:17, 66.72s/it]

[32mUser message:Question: Which factors influence the choice of initial flow guesses in the Hardy Cross approach, and how can poor choices affect convergence?, Actual answer: Typical choices for initial flows are guided by approximate network demands or simple proportional splits of total system flow. For instance, if a node has a demand, one might set pipe flows toward that node according to each pipe’s relative capacity. Poorly chosen initial flows can lead to very slow convergence or even divergence, especially if the network has pipes with drastically different diameters or roughness values. The Hardy Cross method relies on incremental loop corrections, so extreme imbalances in initial flows can produce oscillations or prolonged iteration cycles., Predicted answer: Okay, I can definitely address the factors influencing initial flow guesses in the Hardy Cross method and the impact of poor choices on convergence.

**Factors Influencing the Choice of Initial Flow Guesses in the Hard

3it [03:29, 69.48s/it]

[32mUser message:Question: How are flow corrections applied around each loop in the Hardy Cross method, and why must flow continuity be maintained at every junction?, Actual answer: In the Hardy Cross method, one first selects individual loops in the network and makes an initial guess of the flow in each pipe. Then, each loop is corrected iteratively: the head loss around the loop is summed, and a flow correction is applied to reduce the total head-loss error to zero. This correction is added (or subtracted) to the assumed flow in every pipe forming that loop. Flow continuity must be maintained at every junction to ensure that all flow entering a node is accounted for by either leaving through connecting pipes or satisfying demand. If continuity is not enforced, the model would not reflect true network behavior, causing errors to propagate in subsequent iterations., Predicted answer: Okay, let's break down flow corrections in the Hardy Cross method and the importance of flow continuit

3it [04:03, 81.15s/it]


Total time for evaluation: 244.6001968383789
Resampl [{'run': RunTree(id=UUID('16fd391b-ee35-4dd2-bb13-6e709a97b475'), name='Target', start_time=datetime.datetime(2025, 4, 22, 12, 21, 53, 799784, tzinfo=datetime.timezone.utc), run_type='chain', end_time=datetime.datetime(2025, 4, 22, 12, 22, 5, 143424, tzinfo=datetime.timezone.utc), extra={'metadata': {'revision_id': None, 'num_repetitions': 1, 'example_version': '2025-04-22T12:21:52.592718+00:00', 'ls_method': 'traceable'}, 'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.1.128', 'library': 'langsmith', 'platform': 'Linux-6.5.0-44-generic-x86_64-with-glibc2.35', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.10.12', 'langchain_version': '0.3.0', 'langchain_core_version': '0.3.55'}}, error=None, serialized={'name': 'Target', 'signature': '(inputs: dict) -> dict', 'doc': 'Given a question, return the answer from the model, optionally using tools if tool_usage is True'}, events=[], inputs={'inputs': {'ques

0it [00:00, ?it/s]

Tool Decision: no_tool_needed
No tool will be used
Using OpenAI model...
Response from OpenAI: The Newton-Raphson technique is a powerful numerical method for finding roots of real-valued functions, and it generally exhibits faster convergence compared to loop-based iterative methods due to several key aspects:

1. **Quadratic Convergence**: The Newton-Raphson method has quadratic convergence near the root, meaning that the number of correct digits approximately doubles with each iteration once close to the root. This is in contrast to many loop-based methods, such as the bisection method or simple fixed-point iteration, which typically converge linearly.

2. **Use of Derivatives**: The Newton-Raphson method utilizes the first derivative (or the Jacobian in multivariable cases) of the function to inform the next guess. This derivative provides information about the slope of the function, allowing for more informed and efficient updates to the guess. Loop-based methods often do not use 

1it [01:29, 89.65s/it]

[32mUser message:Question: What aspects of the Newton-Raphson technique generally lead to faster convergence compared to loop-based iterative methods?, Actual answer: Newton-Raphson methods solve all governing equations simultaneously by using partial derivatives (the Jacobian) that link changes in flows or heads to changes in hydraulic equations. This approach allows the solver to make a more direct beeline toward the solution, whereas loop-based methods adjust flows one loop at a time. By incorporating global information into each iteration, Newton-Raphson typically reaches high accuracy in fewer iterations, especially for large or complex systems., Predicted answer: The Newton-Raphson technique is a powerful numerical method for finding roots of real-valued functions, and it generally exhibits faster convergence compared to loop-based iterative methods due to several key aspects:

1. **Quadratic Convergence**: The Newton-Raphson method has quadratic convergence near the root, meani

2it [02:43, 80.36s/it]

[32mUser message:Question: Which factors influence the choice of initial flow guesses in the Hardy Cross approach, and how can poor choices affect convergence?, Actual answer: Typical choices for initial flows are guided by approximate network demands or simple proportional splits of total system flow. For instance, if a node has a demand, one might set pipe flows toward that node according to each pipe’s relative capacity. Poorly chosen initial flows can lead to very slow convergence or even divergence, especially if the network has pipes with drastically different diameters or roughness values. The Hardy Cross method relies on incremental loop corrections, so extreme imbalances in initial flows can produce oscillations or prolonged iteration cycles., Predicted answer: The Hardy Cross method is an iterative technique used for analyzing flow in pipe networks, particularly in water distribution systems. The choice of initial flow guesses is crucial for the convergence and efficiency of

3it [03:43, 71.04s/it]

[32mUser message:Question: How are flow corrections applied around each loop in the Hardy Cross method, and why must flow continuity be maintained at every junction?, Actual answer: In the Hardy Cross method, one first selects individual loops in the network and makes an initial guess of the flow in each pipe. Then, each loop is corrected iteratively: the head loss around the loop is summed, and a flow correction is applied to reduce the total head-loss error to zero. This correction is added (or subtracted) to the assumed flow in every pipe forming that loop. Flow continuity must be maintained at every junction to ensure that all flow entering a node is accounted for by either leaving through connecting pipes or satisfying demand. If continuity is not enforced, the model would not reflect true network behavior, causing errors to propagate in subsequent iterations., Predicted answer: The Hardy Cross method is an iterative technique used for analyzing flow in pipe networks, particularl

3it [04:18, 86.12s/it]


Total time for evaluation: 259.25685930252075
Resampl [{'run': RunTree(id=UUID('3a308594-96d8-424a-9a95-55f8b7615bab'), name='Target', start_time=datetime.datetime(2025, 4, 22, 12, 26, 7, 338356, tzinfo=datetime.timezone.utc), run_type='chain', end_time=datetime.datetime(2025, 4, 22, 12, 26, 23, 77230, tzinfo=datetime.timezone.utc), extra={'metadata': {'revision_id': None, 'num_repetitions': 1, 'example_version': '2025-04-22T12:26:06.365707+00:00', 'ls_method': 'traceable'}, 'runtime': {'sdk': 'langsmith-py', 'sdk_version': '0.1.128', 'library': 'langsmith', 'platform': 'Linux-6.5.0-44-generic-x86_64-with-glibc2.35', 'runtime': 'python', 'py_implementation': 'CPython', 'runtime_version': '3.10.12', 'langchain_version': '0.3.0', 'langchain_core_version': '0.3.55'}}, error=None, serialized={'name': 'Target', 'signature': '(inputs: dict) -> dict', 'doc': 'Given a question, return the answer from the model, optionally using tools if tool_usage is True'}, events=[], inputs={'inputs': {'ques

Comparison plots between LLMs

In [42]:
aaa below only for one judge (the latest)

SyntaxError: invalid syntax (1552382455.py, line 1)

In [40]:
# Calculate mean and std of each metric over all runs for each model
aggregated_metrics_by_model = {}

for model, model_data in all_runs_model_metrics.items():
    if model not in aggregated_metrics_by_model:
        aggregated_metrics_by_model[model] = {}
    
    for run_data in model_data:
        for metric_name, metric_values in run_data.items():
            if metric_name not in aggregated_metrics_by_model[model]:
                aggregated_metrics_by_model[model][metric_name] = []
            
            if isinstance(metric_values, list) and all(isinstance(x, (int, float)) for x in metric_values):
                aggregated_metrics_by_model[model][metric_name].extend(metric_values)
            else:
                print(metric_values)

In [None]:
for model, metrics in aggregated_metrics_by_model.items():
    print(f"\nModel: {model}")
    print("-" * (len(model) + 8))
    
    for metric_name, values in metrics.items():
        if values:
            mean_value = np.mean(values)
            std_value = np.std(values)
            print(f"{metric_name}:")
            print(f"  Mean: {mean_value:.4f}")
            print(f"  Std:  {std_value:.4f}")

In [42]:
list_of_metric_names=[name.removesuffix('_descr') for name in list_of_metrics]
metric_means = {m: [] for m in list_of_metric_names}
metric_stds = {m: [] for m in list_of_metric_names}
model_names = []

for model, model_metrics in aggregated_metrics_by_model.items():
    model_names.append(model.split('/')[-1].replace('-descr', '').replace('_descr', ''))
    for m in list_of_metric_names:
        key = f"{m}_descr"
        if key in model_metrics and model_metrics[key]:
            values = model_metrics[key]
            metric_means[m].append(np.mean(values))
            metric_stds[m].append(np.std(values))
        else:
            metric_means[m].append(0.0)
            metric_stds[m].append(0.0)

In [None]:
plot_model_comparison(model_names, list_of_metric_names, metric_means, metric_stds)

Statistical comparison between models

In [None]:
def compare_model_performances(all_runs_model_metrics): #all_models_stats, 
    """
    Performs statistical comparison between models using paired differences, standard errors,
    and Pearson correlation coefficients following section 4.2 methodology.
    
    Args:
        all_models_stats (dict): Dictionary containing statistics for each model
        all_runs_model_metrics (dict): Dictionary containing raw metrics for each model/run/question
        
    Returns:
        dict: Dictionary containing pairwise comparison results
    """
    import numpy as np
    from scipy import stats
    import itertools
    
    # Get all model pairs for comparison
    models = list(all_runs_model_metrics.keys())
    model_pairs = list(itertools.combinations(models, 2))
    
    # Store results
    comparison_results = {}
    
    for model1, model2 in model_pairs:
        comparison_key = f"{model1.split('/')[-1]}_vs_{model2.split('/')[-1]}"
        comparison_results[comparison_key] = {}
        
        # Get metrics (removing '_descr' suffix)
        metrics = [metric.replace('_descr', '') for metric in list(all_runs_model_metrics[model1][0].keys())]
        
        # Create file for this model comparison
        variance_results_text = f"\n=== Variance Analysis Results for {comparison_key} ===\n"
        
        for metric in metrics:
            # Calculate differences and correlations for each resample
            resample_differences = []
            resample_ses = []
            correlations = []
            model1_variances = []  # Initialize list
            model2_variances = []  # Initialize list
            
            # Iterate through resamples - Same number for both models
            for resample_idx in range(len(all_runs_model_metrics[model1])):
                # Get scores for both models for this resample
                scores1 = all_runs_model_metrics[model1][resample_idx][f'{metric}_descr']
                scores2 = all_runs_model_metrics[model2][resample_idx][f'{metric}_descr']
                
                # Calculate differences for each question
                question_differences = np.array(scores1) - np.array(scores2)
                
                # Calculate mean difference for this resample
                mean_diff = np.mean(question_differences) #Same as the formula in the paper since mean(a-b)=mean(a)-mean(b)
                
                # Calculate standard error for this resample - Paired analysis (section 4.2)
                n = len(question_differences)
                se = np.sqrt(np.sum((question_differences - mean_diff)**2) / (n * (n-1))) if n > 1 else np.nan

                # # Calculate standard errors for each model - Unpaired analysis (section 4.1)
                # n = len(scores1)
                # sea = np.sqrt(np.sum((scores1 - np.mean(scores1))**2) / (n * (n - 1))) if n > 1 else np.nan
                # seb = np.sqrt(np.sum((scores2 - np.mean(scores2))**2) / (n * (n - 1))) if n > 1 else np.nan

                # # Calculate the combined standard error as sqrt(sea^2 + seb^2)
                # se = np.sqrt(sea**2 + seb**2)

                # Calculate variances for each model
                var1 = np.var(scores1, ddof=1)  # Using ddof=1 for sample variance
                var2 = np.var(scores2, ddof=1)
                model1_variances.append(var1)
                model2_variances.append(var2)
                
                # Calculate Pearson correlation
                correlation, _ = stats.pearsonr(scores1, scores2)
                
                resample_differences.append(mean_diff)
                resample_ses.append(se)
                correlations.append(correlation)
            
            # Convert to numpy arrays
            resample_differences = np.array(resample_differences)
            resample_ses = np.array(resample_ses)
            correlations = np.array(correlations)
            model1_variances = np.array(model1_variances)
            model2_variances = np.array(model2_variances)
            print("resample_differences",resample_differences)
            print("resample_ses",resample_ses)
            print("correlations",correlations)
            print(f"Model 1 variances: {model1_variances}")
            print(f"Model 2 variances: {model2_variances}")
          
            # Calculate overall mean difference over all resamples
            overall_mean_diff = np.mean(resample_differences)
            print("overall_mean_diff",overall_mean_diff)
            
            #We want an aggregated SE across all resamples for the same questions (same paired differences)
            #This approach accounts for the fact that each resampling provides a different estimate of the variance of the same underlying distribution, 
            # and averaging these estimates gives a better representation of the overall uncertainty.

            # Calculate pooled standard error across resamples
            R = len(resample_differences)
            pooled_se = np.sqrt(np.sum(resample_ses**2) / (R**2))
            print("pooled_se",pooled_se)
            
            # # If the resampling results are independent estimates of variance (i.e., combining uncertainty estimates from independent sources), the combined variance is
            # # the sum of all individual variances, and the combined standard error is given below (goal to capture total variability)
            # # Calculate the overall combined SE across all resamples
            # combined_se = np.sqrt(np.nansum(np.array(resample_ses)**2))

            # Calculate overall variance reduction across all resamples
            n = len(scores1)
            
            # Calculate mean variances across resamples
            mean_var1 = np.mean(model1_variances)  # Var(sA)
            mean_var2 = np.mean(model2_variances)  # Var(sB)
            
            # Calculate mean correlation across resamples
            mean_correlation = np.mean(correlations)
            
            # Calculate covariance between model scores
            mean_cov = mean_correlation * np.sqrt(mean_var1 * mean_var2)  # Cov(sA, sB)
            
            # Calculate variance for unpaired case: Var(μA-B,unpaired) = (Var(sA) + Var(sB))/n
            var_unpaired = (mean_var1 + mean_var2) / n
            
            # Calculate variance for paired case: Var(μA-B,paired) = (Var(sA) + Var(sB) - 2Cov(sA,sB))/n
            var_paired = (mean_var1 + mean_var2 - 2 * mean_cov) / n
            
            # The reduction in variance is: Var(μA-B,unpaired) - Var(μA-B,paired) = 2Cov(xA,xB)/n
            variance_reduction = 2 * mean_cov / n  # This should equal var_unpaired - var_paired
            
            # Calculate percentage reduction in variance
            percent_reduction = (variance_reduction / var_unpaired) * 100 if var_unpaired != 0 else 0

            # Add results for this metric to the text
            variance_results_text += f"\nMetric: {metric}\n"
            variance_results_text += f"Mean Model 1 variance (Var(sA)): {mean_var1:.6f}\n"
            variance_results_text += f"Mean Model 2 variance (Var(sB)): {mean_var2:.6f}\n"
            variance_results_text += f"Mean covariance (Cov(sA,sB)): {mean_cov:.6f}\n"
            variance_results_text += f"Unpaired variance: {var_unpaired:.6f}\n"
            variance_results_text += f"Paired variance: {var_paired:.6f}\n"
            variance_results_text += f"Variance reduction (2Cov(xA,xB)/n): {variance_reduction:.6f}\n"
            variance_results_text += f"Percent reduction: {percent_reduction:.1f}%\n"

            # # Calculate t-statistic and p-value
            # t_stat = overall_mean_diff / pooled_se if pooled_se != 0 else np.nan
            # df = R - 1  # degrees of freedom
            # p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df)) if not np.isnan(t_stat) else np.nan
            
            # # Calculate confidence interval
            # t_crit = stats.t.ppf(0.975, df)  # 95% CI
            # ci_margin = t_crit * pooled_se

            # Calculate z-statistic and CI using standard normal distribution
            z_stat = overall_mean_diff / pooled_se if pooled_se != 0 else np.nan
            
            # Calculate confidence interval using 1.96 for 95% CI
            ci_margin = 1.96 * pooled_se
            
            # Calculate p-value using standard normal distribution
            #For a two-tailed test p = 2 × (1 − Φ(|z|)), where Φ(z) is the cumulative distribution function (CDF) of the standard normal distribution.
            p_value = 2 * (1 - stats.norm.cdf(abs(z_stat))) if not np.isnan(z_stat) else np.nan
            
            # # Calculate average Pearson correlation - not accurate when correlations close to 1 or -1, variances differences across resamples, sample size is small.
            # avg_correlation = np.mean(correlations)

            #Apply Fisher z-transformation
            z_values = [0.5 * np.log((1 + r) / (1 - r)) for r in correlations]

            # Compute the mean Fisher z-value
            z_mean = np.mean(z_values)

            #Back-transform to Pearson correlation scale
            overall_correlation = (np.exp(2 * z_mean) - 1) / (np.exp(2 * z_mean) + 1)
            
            # Store results
            comparison_results[comparison_key][metric] = {
                "mean_difference": overall_mean_diff,
                "pooled_standard_error": pooled_se,
                "ci_low": overall_mean_diff - ci_margin,
                "ci_high": overall_mean_diff + ci_margin,
                # "t_statistic": t_stat,
                "z_statistic": z_stat,
                "p_value": p_value,
                "significant": p_value < 0.05 if not np.isnan(p_value) else None,
                "better_model": model1.split('/')[-1] if overall_mean_diff > 0 else model2.split('/')[-1],
                "pearson_correlation": overall_correlation
            }
        
        # Write all metrics results for this model comparison to a single file
        with open(f'variance_results_{comparison_key}.txt', 'w') as f:
            variance_results_text += f"Overall Variance Reduction Analysis:\n"
            f.write(variance_results_text)
    
    return comparison_results

comparison_results = compare_model_performances(all_runs_model_metrics) #all_models_stats

# Save results to file
with open('comparison_results.json', 'w') as f:
    # Convert numpy types to native Python types for JSON serialization
    def convert_to_serializable(obj):
        if isinstance(obj, (np.int_, np.intc, np.intp, np.int8,
            np.int16, np.int32, np.int64, np.uint8,
            np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, (np.bool_)):
            return bool(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        elif obj is None:
            return None
        return obj
    
    serializable_results = json.loads(
        json.dumps(comparison_results, default=convert_to_serializable)
    )
    json.dump(serializable_results, f, indent=4)

In [None]:
# Plot comparison results
# import matplotlib.pyplot as plt
# import seaborn as sns

# Extract metrics and models from comparison_results
metrics = [metric.replace('_descr', '') for metric in list_of_metrics]
model_pairs = list(comparison_results.keys())

# Create figure with subplots for each metric
# Calculate number of rows needed based on number of metrics
num_metrics = len(metrics)
num_rows = (num_metrics + 2) // 3  # Using 3 columns, calculate rows needed (ceiling division)
fig, axes = plt.subplots(num_rows, 3, figsize=(25, 20 * num_rows / 2), dpi=600)  # Adjusted figsize proportionally
fig.suptitle('Model Comparison Results by Metric', fontsize=16, y=1.05)
axes = axes.flatten()

for i, metric in enumerate(metrics):
    ax = axes[i]
    
    # Extract data for this metric
    means = []
    cis = []
    labels = []
    
    for pair in model_pairs:
        metric_data = comparison_results[pair][metric]
        means.append(metric_data['mean_difference'])
        # ci_margin = metric_data['ci_margin']
        cis.append([metric_data['ci_low'], 
                   metric_data['ci_high']])
        labels.append(pair)

    # Create bar plot
    bars = ax.bar(range(len(means)), means)
    
    # Add error bars for confidence intervals
    ax.errorbar(range(len(means)), means, 
               yerr=[[m - ci[0] for m, ci in zip(means, cis)],
                     [ci[1] - m for m, ci in zip(means, cis)]],
               fmt='none', color='black', capsize=5)
    
    # Add horizontal line at y=0
    ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    
    # Customize plot
    ax.set_title(f'{metric.capitalize()}')
    ax.set_xticks(range(len(means)))
    ax.set_xticklabels(labels, rotation=90) # Changed to vertical labels
    ax.set_ylabel('Mean Difference')
    
    # Color bars based on statistical significance
    for j, bar in enumerate(bars):
        if comparison_results[model_pairs[j]][metric]['p_value'] < 0.05:
            bar.set_color('darkred')
        else:
            bar.set_color('lightgray')

    # Hide any unused subplots
    for i in range(num_metrics, len(axes)):
        axes[i].set_visible(False)

plt.tight_layout()

# Save plot before showing with high resolution
plt.savefig('model_comparisons.png', bbox_inches='tight', dpi=600)  # Increased DPI for higher resolution

# Show plot after saving
plt.show()

Create Tables

In [None]:
def create_comparison_table(comparison_results, metrics):
    """
    Creates a formatted table from comparison results.
    
    Args:
        comparison_results (dict): The comparison results dictionary
        metrics (list): List of metrics to include
        
    Returns:
        str: Formatted markdown table
    """
    # Table header
    table = "| Metric | Model | Baseline | Model - Baseline | 95% Conf. Interval | Correlation |\n"
    table += "|--------|--------|-----------|-----------------|-------------------|-------------|\n"
    
    # Add rows for each comparison and metric
    for pair in comparison_results:
        model1, model2 = pair.split('_vs_')
        for metric in metrics:
            results = comparison_results[pair][metric]
            
            row = f"| {metric} | {model1} | {model2} | "
            row += f"{results['mean_difference']:.1%} | "
            row += f"({results['ci_low']:.1%}, {results['ci_high']:.1%}) | "
            row += f"{results['pearson_correlation']:.2f} |\n"
            
            table += row
            
    return table

# Create and print the table
metrics = [m.replace('_descr', '') for m in list_of_metrics]
comparison_table = create_comparison_table(comparison_results, metrics)
print(comparison_table)

# Save table to file
with open('comparison_table.txt', 'w') as f:
    f.write(comparison_table)

Power Analysis

In [None]:
from statsmodels.stats.power import TTestIndPower

def perform_power_analysis(effect_size=0.5, alpha=0.05, power=0.8):
    """
    Perform power analysis to determine required sample size.
    
    Args:
        effect_size (float): Expected effect size (Cohen's d)
        alpha (float): Significance level
        power (float): Desired statistical power
        
    Returns:
        int: Required sample size per group
    """
    analysis = TTestIndPower()
    sample_size = analysis.solve_power(
        effect_size=effect_size,
        alpha=alpha,
        power=power,
        alternative='two-sided'
    )
    return int(np.ceil(sample_size))

# First, determine required sample size
required_samples = perform_power_analysis(effect_size=0.1254, alpha=0.05, power=0.8)  #These parameters result in a sample size of 1000
print(f"Required samples per model for statistical power: {required_samples}")

For real-time inference (below implementation only for meta-llama/Meta-Llama-3.1-8B-Instruct)

In [48]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# import torch
# # del pipeline #Otherwise too much memory is used

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name,device_map='auto')

# #Example of real-time response generation
# messages=[{"role": "user", "content": "What is the chemical formula of water?"}]

# inputs_tokenized = tokenizer.apply_chat_template(
#     messages,
#     tokenize=True,
#     add_generation_prompt=True,
#     return_tensors="pt",
#     return_dict=True,
# ).to("cuda")

# input_ids = inputs_tokenized['input_ids']

# # Generate tokens one by one
# max_length = 256
# output_ids = input_ids
# for _ in range(256):
#     outputs = model.generate(
#         output_ids,
#         max_new_tokens=1,
#         do_sample=True,
#         top_k=50,
#         pad_token_id=tokenizer.eos_token_id
#     )
#     new_token_id = outputs[0, -1].item()
#     if new_token_id == tokenizer.eos_token_id:
#         break
#     output_ids = torch.cat([output_ids, outputs[:, -1:]], dim=1)
#     new_token = tokenizer.decode(new_token_id, skip_special_tokens=True)
#     print(new_token, end="", flush=True)

# print()

Other evaluators from Langsmith

In [49]:
# https://docs.smith.langchain.com/old/evaluation/faq/evaluator-implementations
# https://docs.smith.langchain.com/old/evaluation/quickstart

# from langsmith.evaluation import LangChainStringEvaluator

# eval_llm = ChatOpenAI(model_name=judge_model.split('/')[1], api_key=openai_api_key, temperature=0.0, seed=42)

# #Evaluators
# qa_evaluator = LangChainStringEvaluator("qa", config={"llm": eval_llm}) #LLM just gives 'correct' or 'incorrect' based on reference answer
# context_qa_evaluator = LangChainStringEvaluator("context_qa", config={"llm": eval_llm}) #Also uses reference context of example outputs to do the above
# cot_qa_evaluator = LangChainStringEvaluator("cot_qa", config={"llm": eval_llm}) #Same as above but with chain of thought 'reasoning'

#Prompts Used internally:

# 1) context_qa_evaluator: You are a teacher grading a quiz.
# You are given a question, the context the question is about, and the student's answer. You are asked to score the student's answer as either CORRECT or INCORRECT, 
# based on the context.

# Example Format:
# QUESTION: question here
# CONTEXT: context the question is about here
# STUDENT ANSWER: student's answer here
# GRADE: CORRECT or INCORRECT here

# Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. 
# It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 


# 2) cot_qa_evaluator: You are a teacher grading a quiz.
# You are given a question, the context the question is about, and the student's answer. You are asked to score the student's answer as either CORRECT or INCORRECT, 
# based on the context.
# Write out in a step by step manner your reasoning to be sure that your conclusion is correct. Avoid simply stating the correct answer at the outset.

# Example Format:
# QUESTION: question here
# CONTEXT: context the question is about here
# STUDENT ANSWER: student's answer here
# EXPLANATION: step by step reasoning here
# GRADE: CORRECT or INCORRECT here

# Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer.
#  It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 


# 3) qa_evaluator: You are a teacher grading a quiz.
# You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

# Example Format:
# QUESTION: question here
# STUDENT ANSWER: student's answer here
# TRUE ANSWER: true answer here
# GRADE: CORRECT or INCORRECT here

# Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer.
#  It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! 

Alternatively, use custom prompts as shown below (and set {"prompt": PROMPT} as additional argument inside the config above)

In [50]:
# from langchain_core.prompts.prompt import PromptTemplate

# _PROMPT_TEMPLATE = """You are an expert professor specialized in chemical engineering answers to questions.
# You are grading the following question:
# {query}
# Here is the real answer:
# {answer}
# You are grading the following predicted answer:
# {result}
# Respond with CORRECT or INCORRECT:
# """

# PROMPT = PromptTemplate(
#     input_variables=["query", "result", "answer"], template=_PROMPT_TEMPLATE
# )

Notes: Non-reproducible results, even when seed set (https://platform.openai.com/docs/api-reference/chat/create#chat-create-seed), temperature=0 (top_p should not change when we changed temperature - smaller values result in more constrained and focused response - https://medium.com/@rasithbm/chatopenai-parameters-83bef49f6384)