# Notebook to generate synthetic QA over the data

In [392]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import pickle
import pathlib
import random
import os
import sys
from transformers import pipeline
from llama_cpp import Llama
from pprint import pprint


# Add the project root directory to the system path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)
PROJECT_ROOT = pathlib.Path(project_root)

from StructuredRag.utils import mistral_conversation
from StructuredRag.algorithms.inquirer import StructRAGInquirer

pd.set_option("display.max_colwidth", None)

### Load docs and instantiate our RAG retriever agent

In [359]:
# Load the pre-computed embedded document index stored under results/v0/2024-07-18.
# Later cells read `.id_` and `.text` from the items of this collection.
# NOTE(review): pickle.load can execute arbitrary code, so only load index files produced by this project.
with open(PROJECT_ROOT / "results" / 'v0' / '2024-07-18' / 'embedded_index.pickle', "rb") as f:
    document_index = pickle.load(f)

In [360]:
# Instantiated so we can use it to retrieve the appropriate context
rag_agent = StructRAGInquirer(
    str((PROJECT_ROOT / "results" / 'v0' / '2024-07-18').resolve()),
    llm_name='google/flan-t5-large',
    llm_max_tokens=512,
    use_anchor_document=False,
)

# result = rag_agent.run_inquirer(
#     query = outputs[0]['question'].replace('Factoid question: ', ''),
#     source_document_name=None,
#     k_context = 3,
# )

Loading item: embedded_index
Loading item: notes
Loading item: edge_thresh
Loading item: adj_matrix


In [361]:
def build_context_for_QA_gen(doc, rag_agent, k_context: int = 3, documents: Optional[list] = None):
    """
    Builds the context string for generating synthetic question-answering pairs.

    The nodes most similar to ``doc`` are retrieved from the RAG agent's graph and
    their whitespace-normalised text is concatenated, one "Context N: ..." line per node.

    Args:
        doc: The document for which the context is being built (its ``id_`` is used for the lookup).
        rag_agent: The RAG agent used for retrieving similar nodes; must expose
            ``_graph_similar_nodes(node_id, k)`` returning ``(node_id, score)`` pairs.
        k_context (int): The number of similar nodes to consider for building the context. Default is 3.
        documents: Optional collection of indexed documents (objects with ``id_`` and ``text``)
            in which the retrieved node ids are resolved. Defaults to the notebook-level
            ``document_index``.

    Returns:
        context_string (str): The generated context string containing the clean text of similar nodes.
        Node ids that cannot be found in ``documents`` are skipped, and the remaining
        contexts are numbered consecutively from 0.
    """
    if documents is None:
        # Fall back to the index loaded at the top of the notebook.
        documents = document_index

    # Build the id -> document lookup once instead of rescanning the index for every neighbour.
    # setdefault keeps the first document per id, matching the previous first-match behaviour.
    # Assumes ids are hashable (e.g. strings) -- TODO confirm.
    nodes_by_id = {}
    for candidate in documents:
        nodes_by_id.setdefault(candidate.id_, candidate)

    similar_nodes = rag_agent._graph_similar_nodes(doc.id_, k_context)

    context_lines = []
    for node_id, _ in similar_nodes:
        node = nodes_by_id.get(node_id)
        if node is None:
            # The graph returned an id that is not in the index; skip it rather than
            # crash on `None.text`.
            continue

        # split()/join collapses every run of whitespace (newlines, tabs, repeated
        # spaces) to a single space; a single replace("  ", " ") pass left runs behind.
        clean_text = " ".join(node.text.split())
        context_lines.append(f"Context {len(context_lines)}: {clean_text} \n")

    return " " + "".join(context_lines)

context_bundle = build_context_for_QA_gen(document_index[10], rag_agent, k_context=3)

print(context_bundle)

Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 61641.80it/s]

 Context 0: Committee continues to judge that the risks to its modal inflation projection are skewed to the upside. Second-round effects in domestic prices and wages are expected to take longer to unwind than they did to emerge. There are also upside risks to inflation from energy prices given events in the Middle East. Taking account of this skew, the mean projection for CPI inflation is 2.2% and 1.9% at the two and three-year horizons respectively. Conditioned on the alternative assumption of constant interest rates at 5.25%, which is a higher profile than the market curve beyond the second half of 2024, mean CPI inflation returns to target in two years’ time and falls to 1.6% at the three-year horizon. The MPC’s remit is clear that the inflation target applies at all times, reflecting the primacy of price stability in the UK monetary policy framework. The framework recognises that there will be occasions when inflation will depart from the target as a result of shocks and disturbanc




### 1. Setup a text-generation pipeline for the QA task

In [376]:
# Load the QA-generation model; `from_pretrained` fetches the GGUF file from the
# Hugging Face Hub repo below and keeps it in PROJECT_ROOT, where later cells
# reference it by path.

llm = Llama.from_pretrained(
    repo_id="TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
    filename="capybarahermes-2.5-mistral-7b.Q5_K_M.gguf",
    local_dir=PROJECT_ROOT,
    verbose=False,
    n_gpu_layers=-1, # GPU offload is ENABLED here: -1 offloads all layers; set to 0 for CPU-only
    n_ctx=1400,  # context window size; prompt plus completion must fit within it
)

In [377]:
# Completion-style prompt used by the legacy (pre-ChatML) generation loop.
# `{context}` is filled in with str.format; the prompt deliberately ends on
# "Output:::" so the model continues with the question/answer pair.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

# Shortened system prompt for the ChatML chat format (the context is sent as the user message).
# NOTE: this must be a plain string. A trailing comma after the closing quotes would
# silently turn it into a 1-tuple, which would then be passed as the system message content.
QA_SYSTEM_PROMPT = """ 
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)
"""

In [378]:
def create_chatML_QA_prompt(document_text: str) -> List[dict]:
    """
    Assemble the chat messages for one QA-generation request.

    Args:
        document_text (str): The context text the model should write a question about.

    Returns:
        A two-item list: the system message carrying ``QA_SYSTEM_PROMPT``,
        followed by a user message carrying ``document_text``.
    """
    system_message = {"role": "system", "content": QA_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": document_text}
    return [system_message, user_message]

In [365]:
doc_text = document_index[100].text.replace("\n", " ")

test = llm.create_chat_completion(
    messages = create_chatML_QA_prompt(doc_text),
)

test

In [366]:
# Generate synthetic QA pairs from a random sample of the indexed documents.
QA_SAMPLE_SIZE = 10
QA_SAMPLE_SEED = 42  # fixed so that a re-run draws the same documents


def _parse_qa_pair(content: str) -> Optional[Tuple[str, str]]:
    """
    Extract the question and answer from a model completion.

    Args:
        content (str): Raw completion text, expected to contain a
            "Factoid question:" marker followed later by an "Answer:" marker.

    Returns:
        ``(question, answer)`` with surrounding whitespace removed, or ``None`` when
        either marker is missing or either part is empty. The question is the first
        line after its marker; the answer is everything after the "Answer:" marker.
    """
    _, question_marker, after_question = content.partition('Factoid question:')
    if not question_marker:
        return None

    question_part, answer_marker, answer_part = after_question.partition('Answer:')
    if not answer_marker:
        return None

    question = question_part.strip().split('\n')[0].strip()
    answer = answer_part.strip()
    if not question or not answer:
        return None
    return question, answer


raw_outputs = []
outputs = []

# A dedicated, seeded RNG keeps the sample reproducible; min() avoids a ValueError
# when the index holds fewer documents than the requested sample size.
sampled_docs = random.Random(QA_SAMPLE_SEED).sample(
    document_index, min(QA_SAMPLE_SIZE, len(document_index))
)

for sample_doc in tqdm(sampled_docs, desc="Generating QA pairs"):

    context_text = build_context_for_QA_gen(sample_doc, rag_agent, k_context=3)

    output = llm.create_chat_completion(messages = create_chatML_QA_prompt(context_text))

    # Every raw completion is kept for inspection, including those that fail parsing,
    # so raw_outputs and outputs are not guaranteed to line up index-for-index.
    raw_outputs.append(output)

    qa_pair = _parse_qa_pair(output['choices'][0]['message']['content'])
    if qa_pair is None:
        # The model did not follow the requested format; skip this sample
        # instead of aborting the whole run with an IndexError.
        continue

    question, answer = qa_pair
    outputs.append(
        {
            "context": context_text,
            "question": question,
            'answer': answer,
        }
    )

Generating QA pairs:   0%|          | 0/10 [00:00<?, ?it/s]

Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 45680.54it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 107917.30it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 100896.27it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 25262.40it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 132340.89it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 116185.71it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 46048.70it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 112447.83it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 97018.91it/s]
Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 117330.65it/s]


In [375]:
raw_outputs[5]

{'id': 'chatcmpl-5a57d5e6-e941-4562-a65d-eebf037c2c42',
 'object': 'chat.completion',
 'created': 1721336023,
 'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/capybarahermes-2.5-mistral-7b.Q5_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'Factoid question: What is the four-quarter inflation rate in Q4 excluding fuel and the impact of MTIC fraud?\n\nAnswer: The four-quarter inflation rate in Q4 excluding fuel and the impact of MTIC fraud is given in Context 0 (aa).'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 906, 'completion_tokens': 61, 'total_tokens': 967}}

In [57]:
### OLD PRE FORMATTED PROMPT
# Legacy generation loop that uses the completion-style QA_generation_prompt
# instead of chat messages.
# NOTE(review): this rebinds `raw_outputs` and `outputs`, so running it after the
# ChatML loop discards that loop's results.
raw_outputs = []
outputs = []
for sample_doc in tqdm(random.sample(document_index, 10)):

    # Flatten the document text onto one line (single pass over double spaces).
    context_text = sample_doc.text
    context_text = context_text.replace("\n", " ")
    context_text = context_text.replace("  ", " ")

    # echo=True is requested, so the returned text presumably contains the prompt
    # followed by the completion -- which is why the parsing below splits on the
    # LAST "Output:::" marker.
    output = llm(
        prompt=QA_generation_prompt.format(context=context_text),
        max_tokens=None,
        echo=True
    )
    raw_outputs.append(output)
    outputs.append(
        {
            "context": context_text,
            # Second line after the final "Output:::" is taken as the question and
            # the very last line as the answer; this relies on the model following
            # the requested layout exactly.
            "question": output['choices'][0]['text'].split("Output:::")[-1].split('\n')[1],
            "answer": output['choices'][0]['text'].split("Output:::")[-1].split('\n')[-1],
        }
    )
    
    

  0%|          | 0/10 [00:00<?, ?it/s]

### 2. Assess the quality of the QA pairs

The critique prompts below are based on the approach described in the paper arXiv:2312.10003: [https://arxiv.org/abs/2312.10003](https://arxiv.org/abs/2312.10003)

In [379]:
# System prompt for the groundedness critique: can the question be answered
# unambiguously from the supplied context? The context and question are sent
# as the user message. Uses the same "Provide your answer as follows:" lead-in
# and answer layout as the relevance and standalone prompts.
QA_GROUNDEDNESS_PROMPT = """
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the context and question.
"""

# System prompt for the relevance critique: how useful is the question to the
# target audience? Only the question is sent as the user message.
QA_RELEVANCE_PROMPT = """
Your task is to provide a 'total rating' representing how useful this question can be to macro-economists looking for information whilst working at the Bank of England.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.
"""

# System prompt for the standalone critique: does the question make sense without
# access to the source context? Only the question is sent as the user message.
QA_STANDALONE_PROMPT = """
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like MPC, CPI or YBUS and still be a 5.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.
"""

In [380]:
def create_chatML_quality_prompt(prompt: str, question: str, context: str = None) -> List[dict]:
    """
    Assemble the chat messages for one quality-critique request.

    Args:
        prompt (str): The critique instructions, sent as the system message.
        question (str): The generated question to be rated.
        context (str, optional): The context the question was generated from.
            When it is ``None`` the user message contains only the question;
            any other value (including an empty string) is prepended as "Context: ...".

    Returns:
        A two-item list: the system message, then the user message.
    """
    if context is None:
        user_content = f"Factoid question: {question}"
    else:
        user_content = f"Context: {context}\nFactoid question: {question}"

    system_message = {"role": "system", "content": prompt}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]

In [261]:
output_bundle = outputs[4]

groundedness_eval = llm.create_chat_completion(
        messages = create_chatML_quality_prompt(QA_GROUNDEDNESS_PROMPT, output_bundle['question'], output_bundle['context']),
    )

In [268]:
groundedness_eval

{'id': 'chatcmpl-34005a1d-fc96-43e5-a7c5-9304ccd6cb63',
 'object': 'chat.completion',
 'created': 1721317634,
 'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/capybarahermes-2.5-mistral-7b.Q5_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'Evaluation: The context clearly states that "private sector regular AWE growth falls to around 3% by the end of the forecast period, in the MPC’s modal projection." This information directly answers the question about the private sector regular AWE growth rate at the end of the forecast period.\n\nTotal rating: 5'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 469, 'completion_tokens': 68, 'total_tokens': 537}}

In [381]:
def _parse_quality_eval(eval_output: dict) -> Tuple[str, str]:
    """
    Extract the score and rationale from one critique completion.

    Args:
        eval_output (dict): A chat-completion response whose first choice holds
            ``finish_reason`` and ``message.content``.

    Returns:
        ``(score, rationale)`` as strings. The score is the leading number found
        after "Total rating:" (so "5/5" or "4 out of 5" yield "5" and "4"), which
        keeps it safe to pass to ``float()``. The rationale is the first line after
        "Evaluation:". Falls back to ``"0"`` / ``""`` when the completion was cut
        off or a marker is missing.
    """
    choice = eval_output['choices'][0]
    # Only trust completions the model ended itself, rather than max tokens, etc.
    if choice["finish_reason"] != 'stop':
        return "0", ""

    content = choice['message']['content']

    score = ""
    _, rating_marker, after_rating = content.partition('Total rating:')
    if rating_marker:
        # Collect the leading digits (and at most one decimal point) of the rating.
        for char in after_rating.strip():
            if char in "0123456789" or (char == "." and "." not in score):
                score += char
            else:
                break
        score = score.rstrip(".")

    rationale = ""
    _, evaluation_marker, after_evaluation = content.partition('Evaluation:')
    if evaluation_marker:
        rationale = after_evaluation.strip().split('\n')[0].strip()

    return (score or "0"), rationale


raw_evals = []
for output_bundle in tqdm(outputs, desc="Evaluating QA pairs"):

    groundedness_eval = llm.create_chat_completion(
        messages = create_chatML_quality_prompt(QA_GROUNDEDNESS_PROMPT, output_bundle['question'], output_bundle['context']),
    )

    relevance_eval = llm.create_chat_completion(
        messages = create_chatML_quality_prompt(QA_RELEVANCE_PROMPT, output_bundle['question']),
    )

    standalone_eval = llm.create_chat_completion(
        messages = create_chatML_quality_prompt(QA_STANDALONE_PROMPT, output_bundle['question']),
    )

    evals_by_type = {
        "groundedness": groundedness_eval,
        "relevance": relevance_eval,
        "standalone": standalone_eval
    }

    # Extract the scores and write them onto the QA bundle in place, since the
    # filtering cell reads the "<type>_score" keys from `outputs`.
    for eval_type, eval_output in evals_by_type.items():
        score, rationale = _parse_quality_eval(eval_output)
        output_bundle.update(
            {
                f"{eval_type}_score": score,
                f"{eval_type}_rationale": rationale,
            }
        )

    raw_evals.append(evals_by_type)

Evaluating QA pairs:   0%|          | 0/10 [00:00<?, ?it/s]

In [271]:
outputs

[{'context': 'its projection. Global GDP growth remains subdued… Chart 2.2: Global GDP growth continues to be subdued Four-quarter UK-weighted world GDP growth ( a) Sources: Refinitiv Eikon from LSEG and Bank calculations. (a) See footnote (c) of Table 1.D for definition. Figures for 2023 Q4 to 2024 Q3 are Bank staff projections. These projections do not include the advance estimate of US GDP in 2023 Q4 or the preliminary flash estimate of euro-area GDP for the same quarter, which were released after the data cut-off. …but with significant regional differences. Page 31 Bank of England',
  'question': 'What is the source of the data used in Chart 2.2 showing global GDP growth in the context provided?',
  'answer': 'The source of the data used in Chart 2.2 is Refinitiv Eikon from LSEG.',
  'groundedness_score': '5',
  'groundedness_rationale': 'The context clearly states that "Sources: Refinitiv Eikon from LSEG and Bank calculations" are used for the data in Chart 2.2 showing global GDP 

In [272]:
# Keep only the QA pairs rated at least 2 on every quality criterion.
filtered_outputs = [
    qa_bundle
    for qa_bundle in outputs
    if all(
        float(qa_bundle[score_key]) >= 2
        for score_key in ('groundedness_score', 'relevance_score', 'standalone_score')
    )
]

In [275]:
len(outputs), len(filtered_outputs)

(10, 7)

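Old: completion-style critique prompts and evaluation loop, kept for reference (superseded by the ChatML versions above)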

In [62]:
# Legacy completion-style groundedness prompt; `{question}` and `{context}` are
# filled in with str.format and the model continues after "Answer:::".
question_groundedness_critique_prompt = """
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

In [132]:
# Legacy completion-style relevance prompt; `{question}` is filled in with
# str.format and the model continues after "Answer:::".
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to macro-economists looking for information at the Bank of England.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [139]:
# Legacy completion-style standalone prompt; `{question}` is filled in with
# str.format and the model continues after "Answer:::".
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like MPC, Committee, CPI or YBUS and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [180]:
# Legacy evaluation loop: rates every QA pair with the three completion-style
# critique prompts and writes the parsed scores back onto the bundles in `outputs`.
# NOTE(review): rebinds `raw_evals`, so running it replaces the ChatML loop's results.
raw_evals = []
for output_bundle in tqdm(outputs, desc="Evaluating QA pairs"):

    # echo=True is requested, so the returned text presumably includes the prompt
    # as well as the completion -- hence the [-1] splits further down.
    groundedness_eval = llm(
        prompt=question_groundedness_critique_prompt.format(
            question=output_bundle['question'],
            context=output_bundle['context']
        ),
        max_tokens=None,
        echo=True
    )

    relevance_eval = llm(
        prompt=question_relevance_critique_prompt.format(
            question=output_bundle['question']
        ),
        max_tokens=None,
        echo=True
    )

    standalone_eval = llm(
        prompt=question_standalone_critique_prompt.format(
            question=output_bundle['question']
        ),
        max_tokens=None,
        echo=True
    )

    # Extract the scores and write them
    for eval_type, eval_output in zip(["groundedness", "relevance", "standalone"], [groundedness_eval, relevance_eval, standalone_eval]):
        # If the model has stopped generating text due to stopping itself, rather than max tokens, etc.
        if eval_output['choices'][0]["finish_reason"] == 'stop':
            output_bundle.update(
                {
                    # Text after the LAST "Total rating:" marker; the prompt itself
                    # contains the marker, so earlier occurrences are skipped.
                    f"{eval_type}_score": eval_output['choices'][0]['text'].split("Total rating:")[-1].strip(),
                    # Text after the LAST "Evaluation:" marker. NOTE(review): when the
                    # rating follows the evaluation, this also captures the trailing
                    # "Total rating: N" text.
                    f"{eval_type}_rationale": eval_output['choices'][0]['text'].split("Evaluation:")[-1].strip()
                }
            )
        else:
            # Incomplete generation: record a sentinel score of "0" and an empty rationale.
            output_bundle.update(
                {
                    f"{eval_type}_score": "0",
                    f"{eval_type}_rationale": ""
                }
            )

    # output_bundle.update(
    #     {
    #         "groundedness_score": groundedness_eval['choices'][0]['text'].split("Total rating:")[-1].strip(),
    #         "relevance_score": relevance_eval['choices'][0]['text'].split("Total rating:")[-1].strip(),
    #         "standalone_score": standalone_eval['choices'][0]['text'].split("Total rating:")[-1].strip(),
    #     }
    # )

    # Keep the three raw completions for this QA pair for later inspection.
    raw_evals.append(
        {
            "groundedness": groundedness_eval,
            "relevance": relevance_eval,
            "standalone": standalone_eval
        }
    )

  0%|          | 0/10 [00:00<?, ?it/s]

In [385]:
# pprint(outputs[5])
# raw_evals[0]['groundedness']['choices'][0]['text']
raw_evals[0]

{'groundedness': {'id': 'chatcmpl-38f54963-e6e6-43c0-b489-237e3b4e3e4f',
  'object': 'chat.completion',
  'created': 1721336490,
  'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/capybarahermes-2.5-mistral-7b.Q5_K_M.gguf',
  'choices': [{'index': 0,
    'message': {'role': 'assistant',
     'content': 'Evaluation: The context provides information about the expected GDP growth in Q4 for the UK, specifically stating that "On balance, Bank staff expect GDP to grow by 0.1% in the fourth quarter." This statement directly answers the question.\n\nTotal rating: 5'},
    'logprobs': None,
    'finish_reason': 'stop'}],
  'usage': {'prompt_tokens': 810,
   'completion_tokens': 59,
   'total_tokens': 869}},
 'relevance': {'id': 'chatcmpl-89b43625-a79b-4f64-bcf7-3163aef6f22b',
  'object': 'chat.completion',
  'created': 1721336500,
  'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/capybarahermes-2.5-

In [176]:
# Filter out any bad outputs using the scores - simple lambda.
# Keeps the bundles whose three scores are all >= 2; float() raises ValueError
# if a stored score string is not purely numeric.
filtered_outputs = list(filter(lambda x: float(x['groundedness_score']) >= 2 and float(x['relevance_score']) >= 2 and float(x['standalone_score']) >= 2, outputs))

In [390]:
filtered_outputs[1]

{'context': 'A measure of medium-term inflation compensation in financial markets has risen over the course of the year and stands well above its average level over the previous decade, though still below its peak in the first half of 2022 (Chart 2.21). Interpreting these data is challenging because they can move for reasons unrelated to inflation expectations, for example due to illiquidity in markets and the use of these instruments in hedging pension liabilities. As this is a measure of RPI inflation compensation, any changes in the outlook for the wedge between RPI and CPI can also affect these data. The median respondent in the November Market Participants Survey expected CPI inflation of 2.1% three years ahead, down slightly from 2.2% in August. The distribution of survey responses remained skewed to the upside.Chart 2.20: Firms’ CPI inflation expectations have fallen back Firm inflation expectations (a) Source: DMP Survey. (a) Data are based on responses to the question: ‘What d

In [179]:
outputs


[{'context': 'outcomes for GDP  growth. It has been conditioned on Bank Rate\nfollowing a path implied by market yields, but allows the Committee’ s judgement on the risks around the other\nconditioning assumptions set out in Section 1.1, including wholesale energy prices, to af fect the calibration of the fan\nchart skew. To the left of the shaded area, the distribution reflects uncertainty around revisions to the data over the past.\nTo the right of the shaded area, the distribution reflects uncertainty over the evolution of GDP growth in the future. If\neconomic circumstances identical to today’s were to prevail on 100 occasions, the MPC’s best collective judgement is\nthat the mature estimate of GDP growth would lie within the darkest central band on only 30 of those occasions. The fan\nchart is constructed so that outturns are also expected to lie within each pair of the lighter aqua areas on 30 occasions.\nIn any particular quarter of the forecast period, GDP growth is therefore 

### 3. Generate answers to the Qs with the RAG system

In [389]:
for qa_bundle in filtered_outputs:
    print(qa_bundle['context'])
    break

estimates of the participation and employment rates in mid-2022 Indicative staff estimates of the impact of updating the LFS population weights (a) Sources: ONS and Bank calculations. (a) Indicative staff estimates are based on the ONS’s January 2023 population projections. Bars represent the change between 2019 Q4 and the three months to July 2022 in the current LFS estimates versus the indicative post-revision estimates calculated by Bank staff. Bank of England  Page 68


In [393]:
# Second inquirer instance, intended to answer the generated questions with the
# local CapybaraHermes GGUF model stored in PROJECT_ROOT.
# NOTE(review): the recorded run of this cell fails with
#   TypeError: StructRAGInquirer.__init__() got an unexpected keyword argument 'llm_type'
# so the keyword arguments below do not match the installed constructor. Confirm the
# current signature (the retriever instance earlier in this notebook is built with
# `llm_name=`) before re-running; the following cell depends on `rag_agent_llm`.
rag_agent_llm = StructRAGInquirer(
    str((PROJECT_ROOT / "results" / 'v0' / '2024-07-18').resolve()),
    llm_type="llamacpp",
    model_path = str(PROJECT_ROOT / "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"),
    use_anchor_document=False,
    llm_max_tokens=1400,
)

TypeError: StructRAGInquirer.__init__() got an unexpected keyword argument 'llm_type'

In [None]:
response = rag_agent_llm.run_inquirer(
    query = filtered_outputs[0]['question'],
    source_document_name=None,
    k_context = 3,
)

### 4. Use the filtered QA pairs and RAG answers to assess the RAG system, with a 'judge' agent.

There are different types of [evaluation metrics in RAG](https://docs.ragas.io/en/latest/concepts/metrics/index.html)

We will focus on:
1. Answer relevancy
2. Faithfulness

We will use https://huggingface.co/prometheus-eval/prometheus-13b-v1.0 as our evaluation model, or something similar. Found [a GGUF vsevolodl/prometheus-7b-v2.0-GGUF](https://huggingface.co/vsevolodl/prometheus-7b-v2.0-GGUF)

In [383]:
# Load the judge model (Prometheus 7B v2.0, GGUF build) used to score the RAG answers.
judge_llm = Llama.from_pretrained(
    repo_id="vsevolodl/prometheus-7b-v2.0-GGUF",
    filename="prometheus-7b-v2.0.Q6_K.gguf",
    local_dir=PROJECT_ROOT,
    verbose=False,
    n_gpu_layers=-1, # GPU offload is ENABLED here: -1 offloads all layers; set to 0 for CPU-only
    n_ctx=1400,  # context window size; the full judge prompt plus feedback must fit within it
)

In [382]:
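# Instruction template for the judge model; the {query}, {context}, {answer_RAG_system} and {reference_answer} placeholders are filled in with str.format when the judge is called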
judge_prompt = """
###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)\"
4. Please do not generate any other opening, closing, and explanations.

###The instruction to evaluate:
{query} Use the following context to answer the question: 
Context: {context}

###Response to evaluate:
{answer_RAG_system}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback: 
"""

conv = mistral_conversation.get_conv_template("llama-2")
conv.set_system_message("You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance.")
conv.append_message(conv.roles[0], judge_prompt)
conv.append_message(conv.roles[1], None)

judge_prompt = conv.get_prompt()

In [323]:
# Smoke-test the judge on the first filtered QA pair. The "RAG answer" here is a
# fixed placeholder sentence rather than real system output.
# With echo=True the returned text starts with the full prompt (see the recorded
# output below), so the feedback must be read from the text that follows the prompt.
evaluation = judge_llm(
    prompt=judge_prompt.format(
        query=filtered_outputs[0]['question'],
        context=filtered_outputs[0]['context'],
        reference_answer=filtered_outputs[0]['answer'],
        answer_RAG_system="The quick brown fox jumps over the lazy dog",
    ),
    max_tokens=None,
    echo=True
)

{'id': 'cmpl-7d7203d7-62b2-4251-8a71-ec75de68b023',
 'object': 'text_completion',
 'created': 1721320091,
 'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/prometheus-7b-v2.0.Q6_K.gguf',
 'choices': [{'text': '[INST] <<SYS>>\nYou are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance.\n<</SYS>>\n\n\n###Task Description:\nAn instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.\n1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.\n2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.\n3. The output format should look as follows: "Feedback: (write 