# Evaluation: LLM as a Judge Work

This notebook uses an LLM as a judge to provide a clear, consistent, and concise evaluation of each fine-tuned model's results.

In [1]:
# First, install the required packages that might not already be installed
%pip install langchain langchain-ollama
%pip install pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Set up the system prompt
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage

import pandas as pd
from tqdm import tqdm
import os


# System instructions for the judge model: the scoring task, the use case being
# judged, the single criterion, the 0-5 rubric, and the layout of the human
# message that carries the question, ground truth answer and test answer.
# NOTE(review): the rubric refers to "tone" and "age range" and alternates
# between "ground truth" and "gold standard" -- confirm this wording is intended
# for the SEC-filing use case. The text is left untouched here because any edit
# would change what the judge sees.
JUDGE_SYSTEM_TEXT = """
            You are an expert judge in an LLM-as-a-judge system. Given a question and an answer,
            rank the answer from a score of 0 to 5 given the following criteria and score interpretation.
            Be as unbiased as possible and output ONLY the integer value 0, 1, 2, 3, 4, or 5. Output nothing else
            except for this integer.

            The use case you are judging is whether an LLM fine-tuned on SEC filing data is capable of producing helpful,
            truthful, and insightful analysis about a particular stock or company. Good answers are ones that would provide any information
            necessary to both answer the original question and spread to any details that would potentially affect investment.

            Here is the information you will need to judge each output. You will be provided a question, a ground truth answer, and a 
            test answer (this is the one you're supposed to judge). Whenever you assign a score to the test answer, it should always be
            based on its quality RELATIVE to the ground truth answer. Use the original question for context to make this distinction.

            Criteria: 

            Does the answer answer the question correctly, in the appropriate level of detail, and at the appropriate degree of complexity/age range?

            Score Rubric:

            0: The output is inappropriate, harmful, or hateful OR the output is irrelevant, unhelpful, speaks to a different stock/company, etc.
            1: The answer does not accurately summarize, ignores all instructions, or is incomplete/otherwise incorrect
            2: The answer completes the required task, but ignores most/some of intended parameters
            3: The answer Completes task and considers all instructions, but output does not fully reflect intended tone
            4: The answer is complete, correct, and reflective of intended scope and difficulty, but is somewhat less detailed than gold standard answer
            5: The answer is generally equal to gold standard in all aspects

            The message will be formatted like this:

            Question:

            <<Question Text Goes Here>>

            Ground Truth Answer:

            <<Answer Text Goes Here>>

            Test Answer:

            <<Answer Text Goes Here>>
            """

# Two-part chat prompt: the fixed system instructions above, followed by
# whatever messages the caller supplies under the "input" key.
judge_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", JUDGE_SYSTEM_TEXT),
        MessagesPlaceholder(variable_name="input"),
    ]
)

# Load the fine-tuning results; the scoring loop reads the "Question" and
# "Ground_truth" columns plus one answer column per entry in active_results.
data = pd.read_csv('./fine_tuning_results.csv')
active_results = [
    'llama-3.2-1B',
    'llama-3.2-3B',
    'phi3',
    'mistral',
    'gemini'
] # Add more as we finish

# Fail fast on a missing column: otherwise the KeyError only appears inside the
# long-running scoring loop.
required_columns = ["Question", "Ground_truth", *active_results]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"fine_tuning_results.csv is missing expected columns: {missing_columns}"
    )

# temperature=0 makes the judge as deterministic as the backend allows, so the
# same answer is not scored differently from one run to the next.
llm = ChatOllama(
    model='llama3',
    temperature=0,
)

# Runnable that fills the judge prompt and sends it to the local model.
judge = judge_prompt | llm

In [8]:
# Output column label for each answer column in `data`. Entries added to
# active_results without a label here fall back to "<column> Score".
SCORE_LABELS = {
    'llama-3.2-1B': "LLaMa-1B Score",
    'llama-3.2-3B': "LLaMa-3B Score",
    'phi3': "Phi 3 Score",
    'mistral': "Mistral Score",
    'gemini': "Gemini Score",
}


def build_judge_input(question, ground_truth, test_answer):
    """Format one judging request in the layout the system prompt describes.

    Args:
        question: The original question text.
        ground_truth: The reference answer the test answer is compared against.
        test_answer: The model answer to be scored.

    Returns:
        The human-message text containing the three labelled sections.
    """
    return f"""

    Question:

    {question}

    Ground Truth Answer:

    {ground_truth}

    Test Answer:

    {test_answer}

    """


full_results = []

for k in tqdm(range(len(data.index))):
    row = data.iloc[k]

    ground_truth = row["Ground_truth"]
    question = row["Question"]

    # One judge call per active model, keyed by column name rather than by a
    # hard-coded position in active_results, so labels cannot drift out of sync
    # with the answers when the list is extended or reordered.
    result = {"Question No": k}
    for column in active_results:
        message = build_judge_input(question, ground_truth, row[column])
        reply = judge.invoke({"input": [HumanMessage(content=message)]})
        # The raw reply text is stored as-is; the prompt asks for a bare integer.
        result[SCORE_LABELS.get(column, f"{column} Score")] = reply.content

    full_results.append(result)

100%|██████████| 1210/1210 [55:29<00:00,  2.75s/it] 


In [10]:
# Persist the collected judge scores: one row per question, one column per model.
# The DataFrame's row index is written as well, since `index` is left at its default.
results_csv_path = './LLM_judge_results.csv'
df = pd.DataFrame(data=full_results)
df.to_csv(path_or_buf=results_csv_path)