# Evaluate LLM results

Install Dependencies

In [1]:
# %%capture
# !pip install datasets==2.20.0
# !pip install -U langsmith==0.1.99
# !pip install langchain_openai==0.1.22
# !pip install langchain==0.2.13
# !pip install langchain_community==0.2.12                          
# !pip install transformers==4.44.0
# !pip install termcolor==2.4.0
# !pip install accelerate==0.33.0
# !pip install pandas==2.2.2
# !pip install openpyxl==3.1.5
# !pip install python-dotenv==1.0.1
# !pip install einops==0.8.0
# !pip install wheel==0.44.0
# !pip install sentencepiece==0.2.0
# !pip install protobuf==5.27.3 #Mistral models needs this
# !pip install groq==0.10.0 #Groq models needs this
# !pip install matplotlib==3.9.2
# !pip install seaborn==0.13.2

# !pip install flash-attn==2.6.3 #Install it at the end after wheel has been installed
# !pip install anthropic==0.34.1 #Anthropic models needs this

# #Only if CPU is used
# !pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

RunPod specific parameters - Check also runpod_instructions.txt

In [2]:
#RunPod only: make the persistent storage directory the working directory.
#Later cells resolve the "env" file via os.getcwd() and write result files with relative paths,
#so both end up in this directory.
import os
os.chdir('/workspace')

Specify Path and Load API Keys

In [None]:
file_path ='/workspace/Example_QA_data_raw.xlsx' #Dataset generated with the help of GPT-4o - Has to be an excel file with 'input' and 'output' columns

custom_cache_dir="/workspace/cache/huggingface" #Save models here so that we don't have to download them again if we 'stop' and reinitialize the pod

# Use custom_cache_dir when it is defined; comment out the assignment above to fall back to the
# default cache location (cache_dir=None). Only NameError is caught so that nothing else is hidden.
try:
    cache_dir=custom_cache_dir
except NameError:
    cache_dir=None

from dotenv import load_dotenv
import os

# Load environment variables from the file named "env" in the current working directory
load_dotenv(dotenv_path=os.getcwd()+"/env")

# Get the API keys (os.getenv returns None for a variable that is not set)
openai_api_key = os.getenv('OPENAI_API_KEY_DRACO')
langsmith_api_key = os.getenv('LANGSMITH_API_KEY')

#Login to Hugging Face
from huggingface_hub import login
# Log in with your Hugging Face token
login(token=os.getenv('HF_TOKEN'))

# print(openai_api_key)
# print(langsmith_api_key)

Select model and name for the experiment

In [4]:
#Models to generate responses to questions (every entry is evaluated in turn by the final loop of the notebook)
models=[ 
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Meta-Llama-3.1-8B-Instruct", #Takes 2.5-3mins in A4500 (20GB VRAM)
    "microsoft/Phi-3.5-mini-instruct", #Took 5mins in A40 with 48GB VRAM, 2mins in A4500 with 20GB VRAM
    "mistralai/Mistral-7B-Instruct-v0.3", #4mins in A40 with 48GB VRAM, 2.5mins in A4500 with 20GB VRAM
    "Qwen/Qwen2-7B-Instruct", #4mins in A40 with 48GB VRAM, 2 mins in A4500 with 20GB VRAM
    'AI-MO/NuminaMath-7B-TIR', #2.5 in A4500 with 20GB VRAM - We can also try 01-ai/Yi-Coder-9B-Chat
    'microsoft/Phi-3-mini-4k-instruct', #6 mins in RTX3090
    "google/gemma-2-9b-it", #More than 20GB of GPU memory needed - Works with A40 with 48GB VRAM (8mins), but not with A4500 - 20GB, and V100 - 32GB
    'mistralai/Mistral-Nemo-Instruct-2407', #12B parameters, 11mins in 2 RTX3090, 16mins in V100 with 32GB VRAM
    'openai/gpt-4o-mini' #Costs very low ~0.01$ for 9 Q&A pairs.
    ] #All above models need ~130GB space, the last needs ~30GB . For 44 Q&A pairs it takes ~50min/model

# Groq models are defined as: groq_website/model_name e.g. 'groq_website/llama-3.1-70b-versatile'
# OpenAI models are defined as: 'openai/model_name', e.g. 'openai/gpt-4o-mini'
# Anthropic models are defined as 'anthropic/model_name', e.g. 'anthropic/claude-3-haiku-20240307' - Couldn't use due to billing issues

# I couldn't run 'nvidia/Mistral-NeMo-Minitron-8B-Base', "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4" (Conflicting dependencies),
# 'google/recurrentgemma-9b-it' # RecurrentGemmaForCausalLM.forward() got an unexpected keyword argument 'position_ids'

#Define model to act as a judge ('provider/model_name'; the part after '/' is what is passed to the OpenAI chat model)
judge_model='openai/gpt-4o-mini' #If used with Llama, only 0.01$ for 9 Q&A pairs for gpt-4o-mini, and 0.22$ for gpt-4o

#Define maximum number of tokens in the judge LLM output
max_output_tokens=500

#Limit of tokens in the generated response from LLM
generate_max_tokens=1000

#Inference on whole dataset? If False, an 80/20 train/test split is made and only the test part is evaluated
inference_on_whole_dataset=True

Define prompts for custom evaluation metrics

In [5]:
#Shared system-prompt preamble: the evaluator prepends it to each metric-specific rubric
common_prompt="""
You are an autoregressive language model that acts as a judge in comparing a predicted vs an actual answer to a questions.
Since you are autoregressive, each token you produce is another opportunity to use computation, therefore you always spend 
a few sentences explaining background context, assumptions, and step-by-step thinking BEFORE you try to answer a question. 
Your users are experts in chemical engineering, so they already know you're a language model and your capabilities and limitations, so don't 
remind them of that. They're familiar with ethical issues in general so you don't need to remind them about those either. 
Don't be verbose in your answers, but do provide details and examples where it might help the explanation. 
""" #This is common for all prompts below

In [6]:
#Rubrics for the judge LLM, one per metric. Each defines a 1-5 scale and requires the reply to end with
#"FINAL SCORE:" followed by an integer, which is what the evaluator parses.

#Completeness: how fully the predicted response answers the question
completeness_descr = """
Your task is to evaluate responses predicted by an LLM with regards to completeness compared to the completeness of a given actual, golden standard answer. 
The completeness metric evaluates the extent to which the user's question is answered in full in the predicted response. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: There is no response.
2: No parts of a suitable answer are present.
3: Few elements of a complete answer are present.
4: Most elements of a complete answer are present.
5: The response covers all elements of a complete answer.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

#Relevance: amount of irrelevant information in the predicted response
relevance_descr = """
Your task is to evaluate responses predicted by an LLM with regards to relevance compared to the relevance of a given actual, golden standard answer. 
The relevance metric evaluates the amount of irrelevant information in the predicted response considering the user's original question. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: The response answers something else, not the user's question.
2: The response answers the user's question but the information provided is mostly irrelevant.
3: The response answers the user's question but contains more irrelevant information than relevant information.
4: The response answers the user's question, and shares a bit of irrelevant information.
5: The response answers the user's question and contains no irrelevant information.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

#Conciseness: amount of unexpected extra information.
#The top level is labelled "5:" (it used to be a second "4:", leaving the best score undefined).
conciseness_descr = """
Your task is to evaluate responses predicted by an LLM with regards to conciseness compared to the conciseness of a given actual, golden standard answer. 
The conciseness metric evaluates the amount of unexpected extra information in the predicted response considering the user's original question. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: The response is too long and stops before completion or enters an infinite loop.
2: The response includes a lot of extra information and uses flowery language.
3: The response includes a lot of extra information or uses flowery language.
4: The response is short and includes a small amount of extra information.
5: The response is as short as possible while still answering the prompt.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

#Confidence: degree of assurance conveyed by the response
confidence_descr = """
Your task is to evaluate responses predicted by an LLM with regards to confidence compared to the confidence of a given actual, golden standard answer. 
The condifence metric evaluates the degree of assurance that is conveyed the response that the predicted answer is correct. 
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: Complete Rejection. The response makes it clear that the given answer is incorrect or that no correct answer can be provided.
2: Doubt and Disagreement. The response suggests that the answer is likely incorrect or raises significant concerns.
3: Uncertainty. The response indicates that the answer could be correct, but there is significant doubt or insufficient evidence.
4: Moderate Agreement. The response leans towards the answer being correct but acknowledges some uncertainty.
5: Full Endorsement. The reponse confidentely asserts that the given answer is correct.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

#Factuality: degree of hallucination in the response
factuality_descr = """
Your task is to evaluate responses predicted by an LLM with regards to factuality compared to the factuality of a given actual, golden standard answer.
 The factuality metric evaluates the degree of hallucination contained in a response or, in other words, how accurate a given response is.
You can assign a score from 1 to 5, with the following interpretations:
1: The response is a complete hallucination
2: The response is mostly a hallucination but does not change key information from the prompt (such as chemical identifiers).
3: The response contains large amounts of both hallucinations and factual information.
4: The response includes mostly factual information with slight hallucinations.
5: The response only includes factual information.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

#Judgement: how strongly the response implies its own correctness, given its actual accuracy
judgement_descr = """
Your task is to evaluate responses predicted by an LLM with regards to judgement compared to the judgement of a given actual, golden standard answer.
The judgment metric assesses how strongly the response implies its correctness, taking into account the actual accuracy of the answer.
You can assign a score from 1 to 5 to the predicted response with the following interpretations:
1: The response confidently claims a hallucination as truth.
2: The response misinterprets information received in the prompt.
3: The response shows that the model is unsure about the answer or states that information is theoretical.
4: The response is wrong but it is made clear that the answer is wrong or that the model is unable to provide a correct answer.
5: The response is correct.
IMPORTANT: End your responses with the sentence: "FINAL SCORE:" followed by whole numbers only (1, 2, 3, 4, or 5). Do not use decimal points. This may not be disregarded!
"""

In [7]:
#How the dataset will be named in Langsmith
def get_dataset_name(model_name, judge_model):
    """Build the LangSmith dataset name for a (model, judge) pair.

    Args:
        model_name: Model identifier, either 'provider/model' (only the part after the
            first '/' is used) or a bare name without '/' (used as is).
        judge_model: Judge identifier; inserted unchanged.

    Returns:
        'Chemical_Engineering_Evaluation_<model>_with_judge_<judge_model>_beam_search'
    """
    name_parts=model_name.split('/')
    #An explicit length check replaces the former bare except around split('/')[1]
    short_name=name_parts[1] if len(name_parts)>1 else model_name
    return "Chemical_Engineering_Evaluation_"+short_name+'_with_judge_'+judge_model+'_beam_search'

Check if GPU is available

In [None]:
import torch
#torch.version.cuda is the CUDA version this PyTorch build was compiled with (None for CPU-only builds);
#it does not say whether a GPU can actually be used, so the runtime is queried as well.
print(torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

Read Excel File

In [10]:
import pandas as pd
qa=pd.read_excel(file_path) #Read Excel

#Fail early with a clear message: later cells read the 'input' and 'output' columns of every row
missing_columns={'input','output'}-set(qa.columns)
if missing_columns:
    raise ValueError("Excel file "+str(file_path)+" is missing required column(s): "+str(sorted(missing_columns)))

Create Dataset from df

In [11]:
from datasets import Dataset
#Convert the DataFrame into a Hugging Face Dataset
loaded_dataset=Dataset.from_pandas(qa)

#Only split when inference is not run on the whole dataset
if not inference_on_whole_dataset:
    loaded_dataset = loaded_dataset.train_test_split(test_size=0.2, seed=42) #Used if going to fine-tune in part of the dataset

In [12]:
#Select the examples to evaluate: the whole dataset, or only the held-out test split
if inference_on_whole_dataset:
    dataset_test=loaded_dataset #When we use the whole dataset
else:
    dataset_train=loaded_dataset['train']
    dataset_test=loaded_dataset['test']

Create Langsmith Test Dataset

In [None]:
#https://docs.smith.langchain.com/old/evaluation/faq/manage-datasets

from langsmith import Client

#Pair every question ('input' column) with its reference answer ('output' column)
example_inputs = [
    (record['input'], record['output'])
    for record in dataset_test
]
print(example_inputs)

def create_langsmith_dataset(dataset_name, example_inputs, langsmith_api_key):
    """Return the LangSmith dataset named ``dataset_name``, creating and filling it if it does not exist.

    Args:
        dataset_name: Name of the dataset to look up or create.
        example_inputs: Iterable of (question, answer) string pairs. Only used when the
            dataset has to be created; an existing dataset is returned as it is.
        langsmith_api_key: API key passed to the LangSmith client.

    Returns:
        The dataset object returned by the client (existing or newly created).

    Errors raised by the client (for instance while listing datasets) propagate to the
    caller instead of being mistaken for "dataset not found".
    """

    client = Client(api_key=langsmith_api_key)

    #Load the dataset if it already exists
    dataset_langsmith = None
    for existing_dataset in client.list_datasets():
        if existing_dataset.name==dataset_name:
            dataset_langsmith=existing_dataset

    if dataset_langsmith is not None:
        print("Dataset Loaded")
        return dataset_langsmith

    #Otherwise create it
    print("Dataset not found. Creating new dataset")
    # Storing inputs in a dataset lets us run chains and LLMs over a shared set of examples.
    dataset_langsmith = client.create_dataset(dataset_name=dataset_name,
                                            description="Q&A chemical engineering.")

    for input_prompt, output_answer in example_inputs:
        client.create_example(
            inputs={"question": input_prompt.replace('\n', ' ')},
            outputs={"answer": output_answer.replace('\n', ' ')},
            # metadata={"source": "Wikipedia"},
            dataset_id=dataset_langsmith.id,
        )

    return dataset_langsmith

Custom Evaluation Metrics

In [14]:
# https://docs.smith.langchain.com/old/cookbook/introduction
# https://docs.smith.langchain.com/old/evaluation/faq/custom-evaluators
# https://docs.smith.langchain.com/how_to_guides/evaluation/evaluate_llm_application#use-a-summary-evaluator

from langsmith.schemas import Run, Example
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from termcolor import colored

#Names of the module-level rubric strings; the text before '_descr' is the metric key reported to LangSmith
list_of_metrics=['completeness_descr','relevance_descr','conciseness_descr','confidence_descr','factuality_descr','judgement_descr']

def _parse_final_score(judge_text):
    """Return the integer following the last "FINAL SCORE:" marker in judge_text, or None if there is none.

    Whitespace and markdown asterisks between the marker and the number are skipped, so
    replies such as "FINAL SCORE: 4." or "**FINAL SCORE:** 4" are still understood.
    """
    import re
    found=re.findall(r"FINAL SCORE:[\s*]*(\d+)", judge_text)
    return int(found[-1]) if found else None

#Function that compares the real answer with the predicted answer of an LLM and returns a score based on the evaluation
def factor_evaluator(run: Run, example: Example) -> dict:
    """Score one predicted answer against the reference answer with the judge LLM, once per metric.

    Args:
        run: The run of the evaluated model; the question is read from
            run.inputs["inputs"]["question"] and the prediction from run.outputs["output"].
        example: The dataset example; the reference answer is read from example.outputs["answer"].

    Returns:
        {"results": [...]} with one {"key", "score", "value"} entry per metric in
        list_of_metrics, where "value" is the judge's full reply and "score" is 0 when no
        score could be parsed. If the run has no output, the single entry
        {"key": "custom_metric", "score": 0} is returned instead.
    """
    question=run.inputs.get("inputs")['question']
    actual_answer = example.outputs.get("answer")
    #run.outputs may be None (e.g. when the prediction step failed), hence the fallback to an empty dict
    predicted_answer = (run.outputs or {}).get("output")

    # Check if there is output from LLM
    if not predicted_answer:
        print("No output from LLM")
        return {"key": "custom_metric" , "score": 0}

    #The judge client does not depend on the metric, so it is created once per evaluated run
    llm = ChatOpenAI(model_name=judge_model.split('/')[1], api_key=openai_api_key, temperature=0, max_tokens=max_output_tokens, seed=42)

    scores={} #Store scores for each metric
    descriptions={} #Store descriptions for each metric

    for metric_name in list_of_metrics: #Iterate through all metrics
        print("Evaluating based on:",metric_name)
        #Look the rubric up by name among the module-level variables (no eval needed)
        metric_value=common_prompt+globals()[metric_name]

        # Define roles and placeholders
        chat_template = ChatPromptTemplate.from_messages(
            [("system", metric_value),
             ("user", "Question: {question}, Actual answer: {actual_answer}, Predicted answer: {predicted_answer}"),
             # ("ai", "It's sunny and warm outside."), #Use this if we want to use few shot prompts
            ]
        )

        messages = chat_template.format_messages(question=question, actual_answer=actual_answer, predicted_answer=predicted_answer)
        formatted_messages = [(role, msg.content) for role, msg in zip(["system", "user"], messages)]

        ai_response = llm.invoke(formatted_messages)

        # Output
        print(colored("User message:"+ messages[1].content, 'green'))
        print(colored("AI message:"+ ai_response.content,'red'))

        #Decide what the final score is based on output
        score=_parse_final_score(ai_response.content)
        if score is None:
            print("Invalid response from LLM:", ai_response.content)
            score = 0

        scores[metric_name]=score
        descriptions[metric_name]=ai_response.content
        print("Scores:",scores)
        print("\n")

    return {
        "results":[ #We always need 'key', 'score' pairs
            {"key": metric_name.replace('_descr',''), "score": scores[metric_name], "value": descriptions[metric_name]}
            for metric_name in list_of_metrics
        ]
    }

Define Models that Generate Responses

In [15]:
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.random.manual_seed(0) #Set for reproducibility

def initialize_model(model_id):
    """Create a Hugging Face "text-generation" pipeline for model_id.

    The model is loaded in bfloat16, uses cache_dir as its cache directory and is
    placed with device_map="auto".
    """
    # transformers.set_seed(42) #Tried for reproducibility but didn't work

    loading_options={"torch_dtype": torch.bfloat16, "cache_dir":cache_dir}

    #device_map: use 'cuda' if one GPU available (works in Delft Blue with 32GB VRAM) - 'auto' the alternative for distributed over all available GPUs
    # trust_remote_code=True is intentionally not passed
    return transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs=loading_options,
        device_map="auto",
    )

def get_model(model_id):
    """Given a model name, return the tuple (model, tokenizer, pipeline).

    Only ``pipeline`` is ever populated: for Hugging Face models it is the result of
    initialize_model(model_id). For API-served models (ids containing 'openai' or
    'groq_website') nothing is loaded locally and all three values are None.
    ``model`` and ``tokenizer`` are always None; they are kept so callers can keep
    unpacking three values.
    """
    model=None
    tokenizer=None
    pipeline=None

    if 'openai' not in model_id and 'groq_website' not in model_id: #For Hugging Face models
        pipeline=initialize_model(model_id)

    return model, tokenizer, pipeline

Generate Responses

In [16]:
import time

def predict(inputs: dict) -> dict:
    """Given a question, return the answer from the currently selected model.

    The model is chosen through the module-level ``model_name``; Hugging Face models are
    run through the module-level ``pipeline``, ids containing 'openai' or 'groq_website'
    through the respective API.

    Args:
        inputs: Dict with the question text under the key 'question'.

    Returns:
        {"output": <answer text>}; the value is None when an OpenAI/Groq call failed
        (the error is printed), so that callers can detect and drop such rows.
    """

    #Get these variables from the global scope
    global model_name

    system_prompt="You are a language model specialized in chemical engineering. Answer the following question:"

    messages = [ #Only use the questions to ask the model to generate the response
        {"role": "user", "content": inputs['question']},
    ]

    if 'gemma' not in model_name: #Gemma doesn't support system message
        messages.insert(0, {"role": "system", "content": system_prompt})
    else: #For gemma add system prompt in user message
        messages[0]['content']=system_prompt+" "+messages[0]['content']
    # print("Prompt:",messages)

    generation_args = {
        "max_new_tokens": generate_max_tokens, #Limit for generated answers (max_output_tokens is the judge's limit)
        "return_full_text": False,
        "temperature": 0.1, # 1e-8,  #Has to be positive number - not considered from model when do_sample is False (reproducible results)
        "do_sample": True, #Selects highest probability token if sets to False
        "num_beams" : 5, #3 can also work if computationally intensive - more info on https://huggingface.co/blog/how-to-generate
        #Warnings will be raised by some models

        #If we only set temp!=0 or if we also set do_sample=False then warning: `do_sample` is set to `False`. However, `temperature` is set to `1e-08`
        # -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
        # That means that the temperature is probably ignored
        # Sometimes, results not reproducible if only temp is set
    }

    #Stays None if an API call below fails, instead of leaving the name unbound at the return
    response=None

    if 'openai' not in model_name and 'groq_website' not in model_name: #For Hugging Face models
        response=pipeline(messages, **generation_args)[0]['generated_text']
        print(model_name,':',response)

    elif 'openai' in model_name:
        try:
            import openai
            from langsmith.wrappers import wrap_openai

            # Define OpenAI client
            openai_client = wrap_openai(openai.Client(api_key=openai_api_key))

            completion = openai_client.chat.completions.create(messages=messages, temperature=0, model=model_name.split('/')[1],  seed=42)
            response=completion.choices[0].message.content #That's the response without formatting
            time.sleep(5) #To avoid rate limiting

        except Exception as e:
            print("Error:",e)
            print("OpenAI Model ID:",model_name)

    elif 'groq_website' in model_name:
        try:
            from groq import Groq
            client = Groq()
            actual_model_name=model_name.split('/')[1]
            completion = client.chat.completions.create(
                model=actual_model_name,
                max_tokens=generate_max_tokens,
                temperature=0,
                messages=messages)
            #Return the answer text, not the whole completion object
            response=completion.choices[0].message.content
            time.sleep(5) #To avoid rate limiting

        except Exception as e:
            print("Error:",e)
            print("Groq Model ID:",model_name)

    return {"output": response}

In [17]:
def plot_figures_metrics(metric_names, metric_values, model_name, judge_model):
    """Plot one histogram per metric (stacked vertically) and save the figure as a PNG.

    Args:
        metric_names: Titles of the subplots, one per metric.
        metric_values: One sequence of scores per metric, in the same order as metric_names.
        model_name: Identifier of the evaluated model; '/' is replaced by '_' in the file name.
        judge_model: Judge identifier in 'provider/model' form; the part after '/' is used in the file name.

    The file is written to the current working directory as
    '<judge>_judge_with_<model>_metric_distributions.png'.
    """
    #Plot figures with distributions of metrics
    import matplotlib.pyplot as plt
    import seaborn as sns
    import numpy as np

    # Colors for separate plots
    colors = sns.color_palette("Set3", len(metric_names))

    #squeeze=False always yields a 2-D array of Axes, so indexing also works for a single metric
    fig, axes = plt.subplots(len(metric_names), 1, figsize=(10, 18), squeeze=False)
    axes = axes[:, 0]
    plt.subplots_adjust(hspace=0.6, top=0.94) #hspace=1 if 'values' below every plot, 'top' sets distance between title and subfigures

    # Set a title for all subplots
    fig.suptitle('Metric Distributions', fontsize=16)

    # Define the bin edges explicitly to ensure consistency
    bin_edges = np.arange(0.0, 5.6, 0.2)  # Bins of width 0.2 starting at 0, covering the 0-5 score range

    # Plotting each metric in separate subplots
    for i, (metric_name, values) in enumerate(zip(metric_names, metric_values)):
        sns.histplot(values, bins=bin_edges, color=colors[i], ax=axes[i], kde=False)
        axes[i].set_title(f'{metric_name}')
        axes[i].set_xlim(0, 5.5) #Keep 0 in case of errors
        axes[i].set_ylabel('Frequency')
        axes[i].set_yticks(range(0, 11, 5)) #Set y-ticks at intervals of 5

        # Only the last subplot gets an x-axis label
        if i < len(metric_names) - 1:
            axes[i].set_xlabel('')
        else:
            axes[i].set_xlabel('Values')

    # Save the plot with the judge and model names
    plt.savefig(str(judge_model.split('/')[1])+'_judge_with_'+str(model_name).replace("/","_")+'_metric_distributions.png')

Perform the Evaluation over all models

In [None]:
#https://python.langchain.com/v0.2/docs/integrations/chat/openai/
from langsmith.evaluation import evaluate

#Initialize models
#For every model: make sure its LangSmith dataset exists, load the model, run the evaluation,
#export the per-question scores to Excel, plot their distributions and free the GPU memory.
for model_id in models:
    
    dataset_name=get_dataset_name(model_id, judge_model) #How the dataset will be named in Langsmith
    dataset_langsmith=create_langsmith_dataset(dataset_name, example_inputs, langsmith_api_key)
    model, tokenizer, pipeline = get_model(model_id)
    print(f"Model: {model_id} loaded")
    model_name=model_id #Since model_name defined as global variable

    # Evaluation
    begin=time.time()

    evaluation_results=evaluate(
        predict, #Function that call our LLM and returns its output
        data=dataset_langsmith.name, #Just using dataset_langsmith doesn't work 
        evaluators=[factor_evaluator], #Evaluators to use
        # metadata={"revision_id": "the version of your pipeline you are testing"},
        experiment_prefix=str(judge_model)+'_judge_with_'+str(model_id) # A prefix for your experiment names to easily identify them
    )

    end=time.time()
    print("Total time taken:",end-begin)

    try: #Sometimes some errors with 1+ Q&A missing

        #The results are traversed several times below, so materialize them once
        evaluated_rows=list(evaluation_results)

        #Extract metrics and save to df
        #Initialize empty df to be filled with results
        results_df=pd.DataFrame()

        #https://docs.smith.langchain.com/tutorials/Developers/evaluation
        list_of_questions=[x['example'].inputs['question'] for x in evaluated_rows]
        list_of_answers=[x['example'].outputs['answer'] for x in evaluated_rows]
        #A run without outputs counts as "no answer" (None) so that it is dropped below instead of raising here
        list_of_predicted_answers=[(x['run'].outputs or {}).get('output') for x in evaluated_rows]

        #Fill the df with the results
        results_df['questions']=list_of_questions
        results_df['answers']=list_of_answers
        results_df['predicted_answers']=list_of_predicted_answers

        #Get indices for which list_of_predicted_answers is None (Correct for errors in model response)
        indices_to_drop = [i for i, answer in enumerate(list_of_predicted_answers) if answer is None]
        if len(indices_to_drop) > 0:
            print(colored("ERROR:"+str(len(indices_to_drop))+" rows out of "+str(len(results_df))+ " had to be dropped due to NaN (no model response)",'red'))
            print(colored("These were:"+str(indices_to_drop),'green'))

        #Drop indices from results_df
        results_df=results_df.drop(indices_to_drop)

        all_runs_metrics=[x['evaluation_results']['results'] for ind,x in enumerate(evaluated_rows) if ind not in indices_to_drop] #list of lists with metric names
        all_metric_values=[[{x[i].key: x[i].score} for x in all_runs_metrics] for i in range(len(list_of_metrics))] #List of all completeness scores

        #Check if all keys have the same metric name. 
        same_metric= [all(list(d.keys())[0] == list(sublist[0].keys())[0] for d in sublist) for sublist in all_metric_values]
        assert [True]*len(same_metric)==same_metric

        metric_names=[key for i in range(len(all_metric_values)) for key in all_metric_values[i][0].keys()]
        metric_values=[[list(x.values())[0] for x in all_metric_values[i]] for i in range(len(all_metric_values))]
        
        all_metric_prompts=[[{x[i].key:x[i].value} for x in all_runs_metrics] for i in range(len(list_of_metrics))]
        metric_prompts=[[list(x.values())[0] for x in all_metric_prompts[i]] for i in range(len(all_metric_prompts))]

        # Adding columns to DataFrame
        for col_name, col_values,col_prompts in zip(metric_names, metric_values,metric_prompts):
            results_df[col_name] = col_values
            results_df[col_name+'_prompt'] = col_prompts

        #Save results to Excel
        results_df.to_excel("results_"+str(judge_model.split('/')[1])+'_judge_with_'+str(model_id).replace("/","_")+".xlsx",index=False)

        #Plot figures with distributions of metrics
        plot_figures_metrics(metric_names, metric_values, model_id, judge_model)

    except Exception as error:
        #Show what went wrong instead of hiding it; the loop then continues with the next model
        print("An error occur in plotting metrics:", repr(error))

    # Clear VRAM at the end of each iteration
    del model, tokenizer, pipeline
    torch.cuda.empty_cache()

    print('-'*100)

Notes: Results are not fully reproducible, even when a seed is set (https://platform.openai.com/docs/api-reference/chat/create#chat-create-seed) and temperature=0. top_p should not be changed when the temperature is changed; smaller values result in a more constrained and focused response (https://medium.com/@rasithbm/chatopenai-parameters-83bef49f6384).