# Code for Evaluating the LLM Models

In [1]:
#Imports
import json
import re
import time
from typing import Optional

import pandas as pd
from langchain.evaluation import load_evaluator
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma

from app import load_documents_into_database
from llm import getChatChain

In [2]:
def extract_rating(evaluation: str) -> Optional[int]:
    """
    Extract the numeric rating from an evaluation string.

    Searches for the first ``Rating: `` marker followed by digits. The digits
    may be bare or wrapped in one or two square brackets, so ``Rating: 7``,
    ``Rating: [7]`` and ``Rating: [[7]]`` are all accepted.

    Args:
        evaluation: Text to search for a rating marker.

    Returns:
        The rating as an int, or None when no rating marker is found. A
        diagnostic message is printed in the None case.
    """
    match = re.search(r'Rating: \[?\[?(\d+)\]?\]?', evaluation)
    if match is None:
        print(f"Invalid format for evaluation output: {evaluation}")
        return None
    return int(match.group(1))

def evaluate_mistral(llm_model_name: str, db: Chroma) -> None:
    """
    Grade a model's answers to the questions in ``evaluate.csv``.

    For every row, the question is sent to the chat chain returned by
    ``getChatChain``, the answer is graded for correctness by a
    ``labeled_score_string`` evaluator backed by the same Ollama model, and a
    ``model,score,seconds`` line is appended to ``stats.csv``. Rows for which
    no rating can be extracted are skipped.

    Args:
        llm_model_name: Ollama model name, used both to answer and to grade.
        db: Chroma vector store passed to ``getChatChain``.
    """
    evaluator = load_evaluator("labeled_score_string", criteria="correctness", llm=Ollama(model=llm_model_name))
    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")

    print("\n[INFO] Evaluating model: ", llm_model_name)

    # The context manager closes stats.csv even when the loop is interrupted
    # (e.g. KeyboardInterrupt) or a model call raises.
    with open("stats.csv", "a") as f:
        for _, row in df.iterrows():
            # Only the chat call is timed; grading is excluded.
            inicio = time.time()

            question = row['question']
            reference_answer = row['answer']
            model_answer = chat(question=question)

            fim = time.time()
            tempo = round(float(fim - inicio), 2)

            try:
                evaluation = evaluator.evaluate_strings(
                    prediction=model_answer,
                    reference=reference_answer,
                    input=question
                )
            except ValueError as e:
                print(f"ValueError: {e}")
                # Keep the exception message as the evaluation text so that a
                # rating embedded in it can still be extracted below.
                evaluation = str(e)

            # Debugging print statement to inspect the evaluation output
            print(f"Evaluation output: {evaluation}")

            # extract_rating works on text, so serialize dict results first.
            if isinstance(evaluation, dict):
                evaluation = json.dumps(evaluation)

            score = extract_rating(evaluation)
            if score is None:
                # No parsable rating: skip this row instead of writing it.
                continue

            print(f'\n[QUESTION] {question}')
            print(f'[SCORE] {score}')
            print(f'Writing to CSV: {llm_model_name},{score},{tempo}')
            f.write(f"{llm_model_name},{score},{tempo}\n")
            f.flush()  # Persist each row immediately so partial runs keep their data

    print("Finished writing to CSV.")

In [3]:
def extract_rating(evaluation: str) -> Optional[int]:
    """
    Pull the rating number out of an evaluation string.

    The rating is the run of digits following the first ``Rating: `` marker,
    optionally enclosed in single or double square brackets.

    Args:
        evaluation: Text expected to contain a ``Rating: `` marker.

    Returns:
        The rating as an int. If the marker is absent, a diagnostic message
        is printed and None is returned.
    """
    found = re.search(r'Rating: \[?\[?(\d+)\]?\]?', evaluation)
    if found is None:
        print(f"Invalid format for evaluation output: {evaluation}")
        return None
    return int(found.group(1))

def evaluate_llama2(llm_model_name: str, db: Chroma) -> None:
    """
    Grade a model's answers to the questions in ``evaluate.csv``.

    Each question is answered through the chat chain returned by
    ``getChatChain`` and graded for correctness by a ``labeled_score_string``
    evaluator backed by the same Ollama model. One ``model,score,seconds``
    line per graded question is appended to ``stats.csv``; questions for
    which no rating can be extracted are skipped.

    Args:
        llm_model_name: Ollama model name, used both to answer and to grade.
        db: Chroma vector store passed to ``getChatChain``.
    """
    evaluator = load_evaluator("labeled_score_string", criteria="correctness", llm=Ollama(model=llm_model_name))
    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")

    print("\n[INFO] Evaluating model: ", llm_model_name)

    # Using `with` guarantees the file is closed if the run is interrupted
    # or any call inside the loop raises.
    with open("stats.csv", "a") as f:
        for _, row in df.iterrows():
            # Time only the answer generation, not the grading step.
            inicio = time.time()

            question = row['question']
            reference_answer = row['answer']
            model_answer = chat(question=question)

            fim = time.time()
            tempo = round(float(fim - inicio), 2)

            try:
                evaluation = evaluator.evaluate_strings(
                    prediction=model_answer,
                    reference=reference_answer,
                    input=question
                )
            except ValueError as e:
                print(f"ValueError: {e}")
                # The error text is searched for a rating just like a normal
                # evaluation result.
                evaluation = str(e)

            # Debugging print statement to inspect the evaluation output
            print(f"Evaluation output: {evaluation}")

            # Serialize dict results so the text-based extractor can scan them.
            if isinstance(evaluation, dict):
                evaluation = json.dumps(evaluation)

            score = extract_rating(evaluation)
            if score is None:
                # Nothing to record for this question.
                continue

            print(f'\n[QUESTION] {question}')
            print(f'[SCORE] {score}')
            print(f'Writing to CSV: {llm_model_name},{score},{tempo}')
            f.write(f"{llm_model_name},{score},{tempo}\n")
            f.flush()  # Ensure data is written to disk

    print("Finished writing to CSV.")

In [4]:
def extract_rating(evaluation: str) -> Optional[int]:
    """
    Return the rating found in an evaluation string, if any.

    Recognizes the first occurrence of ``Rating: `` followed by digits, with
    the digits either bare or inside single or double square brackets.

    Args:
        evaluation: Text to scan.

    Returns:
        The rating converted to int, or None (after printing a diagnostic
        message) when the text contains no rating marker.
    """
    hit = re.search(r'Rating: \[?\[?(\d+)\]?\]?', evaluation)
    if hit is None:
        print(f"Invalid format for evaluation output: {evaluation}")
        return None
    return int(hit.group(1))

def evaluate_zephyr(llm_model_name: str, db: Chroma) -> None:
    """
    Grade a model's answers to the questions in ``evaluate.csv``.

    Each question is answered through the chat chain returned by
    ``getChatChain`` and graded for correctness by a ``labeled_score_string``
    evaluator backed by the same Ollama model. One ``model,score,seconds``
    line per graded question is appended to ``stats.csv``; questions for
    which no rating can be extracted are skipped.

    File errors are reported on stdout rather than raised: a failure to open
    the file aborts the evaluation, a failed write skips that row, and a
    failed close is only reported.

    Args:
        llm_model_name: Ollama model name, used both to answer and to grade.
        db: Chroma vector store passed to ``getChatChain``.
    """
    evaluator = load_evaluator("labeled_score_string", criteria="correctness", llm=Ollama(model=llm_model_name))
    chat = getChatChain(Ollama(model=llm_model_name), db)
    df = pd.read_csv("evaluate.csv")

    print("\n[INFO] Evaluating model: ", llm_model_name)

    try:
        f = open("stats.csv", "a")
    except OSError as e:
        print(f"Failed to open file stats.csv: {e}")
        return

    # try/finally ensures the handle is closed even if the loop is
    # interrupted or a model call raises.
    try:
        for _, row in df.iterrows():
            # Only the chat call is timed; grading is excluded.
            inicio = time.time()

            question = row['question']
            reference_answer = row['answer']
            model_answer = chat(question=question)

            fim = time.time()
            tempo = round(float(fim - inicio), 2)

            try:
                evaluation = evaluator.evaluate_strings(
                    prediction=model_answer,
                    reference=reference_answer,
                    input=question
                )
            except ValueError as e:
                print(f"ValueError: {e}")
                # Keep the exception message as the evaluation text so that a
                # rating embedded in it can still be extracted below.
                evaluation = str(e)

            # Debugging print statement to inspect the evaluation output
            print(f"Evaluation output: {evaluation}")

            # extract_rating works on text, so serialize dict results first.
            if isinstance(evaluation, dict):
                evaluation = json.dumps(evaluation)

            score = extract_rating(evaluation)
            if score is None:
                # No parsable rating: skip this row instead of writing it.
                continue

            print(f'\n[QUESTION] {question}')
            print(f'[SCORE] {score}')
            print(f'Writing to CSV: {llm_model_name},{score},{tempo}')

            try:
                f.write(f"{llm_model_name},{score},{tempo}\n")
                f.flush()  # Ensure data is written to disk immediately
            except OSError as e:
                # Best effort: report the failure and continue with the next row.
                print(f"Failed to write to file: {e}")
    finally:
        try:
            f.close()
        except OSError as e:
            print(f"Failed to close file: {e}")

    print("Finished writing to CSV.")

# Mistral

In [5]:
# Evaluate Mistral: build the document database, run the evaluation, and
# report the total wall-clock time (database loading included).
inicio = time.time()
db = load_documents_into_database(
    "mistral",
    "nomic-embed-text",
    "../Final PDF Files",
    True,
)
evaluate_mistral("mistral", db)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 46/46 [00:44<00:00,  1.04it/s]


Loading .md files


0it [00:00, ?it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  mistral
 Article 1 of the Portuguese Penal Code establishes the principle of legality, which means that only acts described and declared punishable by law prior to their commission can be criminally punished. Additionally, security measures can only be applied to dangerous states whose assumptions are fixed in law prior to their completion. (Sources: ../Final PDF Files/Codigo_Penal_Divided_Parte_3.pdf, Page 0; ../Final PDF Files/Codigo_Penal_Divided_Parte_3.pdf, Page 0)O artigo 1 do Código Penal português estabelece o princípio da legalidade, o que significa que apenas atos descritos e declarados puníveis por lei antes de sua comissão podem ser punidos criminalmente. Além disso, as medidas de segurança só podem ser aplicadas a estados perigosos cujas suposições são fixadas na lei antes da conclusão. (Fontes: ../Final PDF Files/codigo_penal_divided_parte_3.pdf, página 0;
ValueError: Invalid output:  Evaluat

# Llama2

In [6]:
# Evaluate Llama2: build the document database, run the evaluation, and
# report the total wall-clock time (database loading included).
inicio = time.time()
db = load_documents_into_database(
    "llama2",
    "nomic-embed-text",
    "../Final PDF Files",
    True,
)
evaluate_llama2("llama2", db)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 46/46 [00:42<00:00,  1.09it/s]


Loading .md files


0it [00:00, ?it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  llama2
According to Article 1 of the Portuguese Penal Code, crimes are considered criminal if they violate the law. Examples of crimes punishable in Portugal include homicide, lesions, robbery, burglary, theft, fraud, corruption, abuse of power or authority, illegal association, terrorism, prevaricação (fiscal fraud), and environmental damage. The Portuguese system of criminal justice is divided into two main branches: the investigatory phase and the judicial phase. During the investigatory phase, authorities investigate the crime and gather evidence to determine who committed it and how. Once the investigation is complete, the case is transferred to the judicial phase, where it is tried in front of a judge or a panel of judges. The accused has the right to a fair trial, including the right to be represented by an attorney and to present a defense. If convicted, the defendant can appeal the verdict to a hi

# Zephyr

In [5]:
# Evaluate Zephyr: build the document database, run the evaluation, and
# report the total wall-clock time (database loading included).
inicio = time.time()
db = load_documents_into_database(
    "zephyr",
    "nomic-embed-text",
    "../Final PDF Files",
    True,
)
evaluate_zephyr("zephyr", db)
fim = time.time()
print(f"O Modelo demorou {round(fim - inicio, 2)} segundos a gerar as respostas.")

Loading documents
Loading .pdf files


100%|██████████| 46/46 [00:43<00:00,  1.06it/s]


Loading .md files


0it [00:00, ?it/s]


Creating embeddings and loading documents into Chroma

[INFO] Evaluating model:  zephyr
Article 1 of the Portuguese Penal Code, titled "Principle of Legality," establishes the fundamental principle that only acts defined and declared as punishable by law prior to their commission can be subjected to criminal punishment. This means that individuals cannot be held liable for actions that were not explicitly prohibited or penalized in advance. Additionally, Article 1 also applies this principle to security measures, which can only be applied to dangerous situations that have been previously defined and established by law. Therefore, the Portuguese Penal Code ensures that criminal liability is clearly established and enforced, preventing arbitrary or retroactive punishment for actions.O artigo 1 do Código Penal Português, intitulado "Princípio da Legalidade", estabelece o princípio fundamental que apenas atos definidos e declarados como puníveis por lei antes da sua comissão pode ser subme

KeyboardInterrupt: 