# Notebook to generate synthetic QA over the data

In [37]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import pickle
import pathlib
import random
from transformers import pipeline
from llama_cpp import Llama

# Never truncate cell contents when pandas renders a column (None = no width limit).
pd.options.display.max_colwidth = None

### 0. Load the documents

In [6]:
# Pickled, pre-embedded document index (stored under results/v0/2024-06-14).
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_to_data = pathlib.Path(
    "/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/results/v0/2024-06-14/embedded_index.pickle"
)

# SECURITY: unpickling can execute arbitrary code; only load index files you created yourself.
with path_to_data.open("rb") as index_file:
    document_index = pickle.load(index_file)

  _torch_pytree._register_pytree_node(


In [15]:
# Look at the field names exposed by the first entry of the index.
first_node_fields = document_index[0].dict()
print(first_node_fields.keys())

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator', 'class_name'])


### 1. Set up a text-generation pipeline for the QA task

In [28]:
# Install / activate the model: the GGUF file is fetched from the Hugging Face repo
# below, with `path_to_save` passed as the local directory.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
path_to_save = pathlib.Path("/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag")

model_repo_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF"
model_filename = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"

llm = Llama.from_pretrained(
    repo_id=model_repo_id,
    filename=model_filename,
    local_dir=path_to_save,
    verbose=False,
    # GPU acceleration is currently switched ON by this argument; comment it out
    # to run without it. (-1 presumably means "all layers" - confirm against the
    # installed llama-cpp-python version.)
    n_gpu_layers=-1,
)

In [29]:
# Smoke test: one short completion to confirm that the model is loaded and responding.
smoke_test_prompt = "Q: Name the planets in the solar system? A: "

output = llm(
    smoke_test_prompt,
    max_tokens=32,      # at most 32 new tokens (None would generate up to the end of the context window)
    stop=["Q:", "\n"],  # stop just before the model would begin a new question
    echo=True,          # return the prompt together with the completion
)  # calling the model object generates a completion; create_completion is the alternative
print(output)

{'id': 'cmpl-cd7f6d62-b261-495e-b8a8-b66ce8dba19a', 'object': 'text_completion', 'created': 1721082073, 'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/capybarahermes-2.5-mistral-7b.Q5_K_M.gguf', 'choices': [{'text': 'Q: Name the planets in the solar system? A: 8 planets. Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus and Neptune.', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 14, 'completion_tokens': 28, 'total_tokens': 42}}


In [36]:
# Prompt template that asks the model for one factoid question and its answer
# about a single document chunk.
# - The only placeholder is `{context}`, filled in with `str.format`.
# - The template deliberately ends with "Output:::" so the completion starts at
#   the requested "Factoid question: ..." / "Answer: ..." lines; the generation
#   loop relies on that marker when it extracts the question and answer.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

In [57]:
def _parse_qa_completion(generated_text: str) -> Tuple[str, str]:
    """Split a model completion into ``(question, answer)``.

    The completion is expected to look like::

        Factoid question: <question>
        Answer: <answer>

    The "Factoid question:" and "Answer:" labels are removed and both parts are
    stripped of surrounding whitespace. If the model keeps going and starts a
    new "Output:::" / "Factoid question:" block, only the first pair is used.

    Args:
        generated_text: Text produced by the model, without the prompt.

    Returns:
        ``(question, answer)``. ``answer`` is ``""`` when no "Answer:" label is
        found; nothing is raised for malformed completions.
    """
    question_marker = "Factoid question:"
    answer_marker = "Answer:"

    # Skip anything that precedes the question label, when the label is present.
    _, question_found, after_question = generated_text.partition(question_marker)
    remainder = after_question if question_found else generated_text

    question_part, answer_found, answer_part = remainder.partition(answer_marker)
    if not answer_found:
        return question_part.strip(), ""

    # Cut the answer off where the model starts repeating the output format.
    for restart_marker in ("Output:::", question_marker):
        answer_part = answer_part.split(restart_marker)[0]
    return question_part.strip(), answer_part.strip()


# Generate one synthetic QA pair for each of 10 randomly sampled documents.
# NOTE(review): the sample is not seeded, so every run picks different documents.
raw_outputs = []  # untouched llama-cpp responses, kept for debugging
outputs = []      # one {"context", "question", "answer"} record per sampled document
for sample_doc in tqdm(random.sample(document_index, 10)):

    # Collapse newlines and repeated whitespace so the prompt contains one clean
    # paragraph. (Previously the cleaned text was computed but never used.)
    context_text = " ".join(sample_doc.text.split())
    prompt = QA_generation_prompt.format(context=context_text)

    output = llm(
        prompt=prompt,
        max_tokens=None,
        echo=True
    )
    raw_outputs.append(output)

    # echo=True returns prompt + completion; keep only the generated part.
    full_text = output['choices'][0]['text']
    if full_text.startswith(prompt):
        generated_text = full_text[len(prompt):]
    else:
        # Fallback: the prompt ends with "Output:::", so take what follows the last marker.
        generated_text = full_text.split("Output:::")[-1]

    question, answer = _parse_qa_completion(generated_text)
    outputs.append(
        {
            # The original (unmodified) document text is stored with the record.
            "context": sample_doc.text,
            "question": question,
            "answer": answer,
        }
    )
    
    

  0%|          | 0/10 [00:00<?, ?it/s]

In [56]:
# Display every generated record (context, question, answer) for a manual sanity check.
outputs

[{'context': 'Some contacts anticipate a pickup in transactional activity from 2024 Q2 onwards as funding\ncosts stabilise or reduce. Others, especially those serving the construction and property\nsectors, are more cautious and expect some recovery towards the end of 2024 or early 2025.\nDomestic demand has softened for construction products and consumer goods. Food and\ndrink output remains stable, but consumers continue to switch to cheaper brands and\nproducts. High-tech sectors such as aerospace, defence, specialised capital and sustainable\nequipment report activity continuing to pick up. Some contacts’ output was also supported by\nbetter-performing export markets such as the US and Asia. Vehicle output improved as supply\nchain disruption eased and demand was resilient.\nFood, drink and consumer goods producers expect some recovery in demand later in the year .\nHouse building, which remains the weakest sector, has slowed markedly over the past year\ndue to weak demand and risi

### 2. Assess the quality of the QA pairs

Based on the logic in the paper arXiv:2312.10003: [https://arxiv.org/abs/2312.10003](https://arxiv.org/abs/2312.10003)

In [62]:
# Critique prompt 1/3 - groundedness: asks the model to rate (1-5) how well the
# question can be answered unambiguously from the given context.
# Placeholders: `{question}` and `{context}` (filled with `str.format`).
# The requested reply format ("Evaluation: ..." then "Total rating: ...") is what
# the evaluation loop splits on to extract the rationale and the score.
question_groundedness_critique_prompt = """
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the question and context.

Question: {question}\n
Context: {context}\n
Answer::: """

In [110]:
# Critique prompt 2/3 - relevance: asks the model to rate (1-5) how useful the
# question would be to macroeconomists working at the Bank of England.
# Placeholder: `{question}` only (filled with `str.format`).
# The requested reply format ("Evaluation: ..." then "Total rating: ...") is what
# the evaluation loop splits on to extract the rationale and the score.
question_relevance_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how useful this question can be to macroeconomists working at the Bank of England.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [60]:
# Critique prompt 3/3 - standalone: asks the model to rate (1-5) whether the
# question makes sense on its own, without access to the source document.
# Placeholder: `{question}` only (filled with `str.format`).
# The requested reply format ("Evaluation: ..." then "Total rating: ...") is what
# the evaluation loop splits on to extract the rationale and the score.
# NOTE(review): the worked examples inside the prompt (Gradio, Hub, Hugging Face,
# Space, the ViT checkpoint question) are unrelated to the Bank of England
# material used elsewhere in this notebook - presumably inherited from a generic
# template; consider swapping in domain examples. The prompt also spells
# "independant"; it is left untouched here because any edit to the prompt text
# changes what the model sees.
question_standalone_critique_prompt = """
You will be given a question.
Your task is to provide a 'total rating' representing how context-independant this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like Gradio, Hub, Hugging Face or Space and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.

Question: {question}\n
Answer::: """

In [105]:
# One critique prompt per criterion. Dict insertion order fixes the order of the
# LLM calls: groundedness, relevance, standalone.
critique_prompts = {
    "groundedness": question_groundedness_critique_prompt,
    "relevance": question_relevance_critique_prompt,
    "standalone": question_standalone_critique_prompt,
}


def _parse_critique(completion: str) -> Tuple[int, str]:
    """Extract ``(score, rationale)`` from one critique completion.

    The completion is expected to follow the format requested by the prompts::

        Evaluation: <rationale>
        Total rating: <number between 1 and 5>

    Args:
        completion: Text produced by the model, without the prompt. The prompt
            itself contains "Evaluation:" / "Total rating:" placeholders, so
            passing the echoed prompt would risk parsing the template.

    Returns:
        ``score`` is an ``int`` in 1..5, or ``0`` when no usable rating is
        found. ``rationale`` is the text after the last "Evaluation:" label and
        before the rating (possibly ``""``).
    """
    rating_marker = "Total rating:"

    # Rationale: text after the last "Evaluation:" label, without the rating line.
    after_evaluation = completion.rpartition("Evaluation:")[2]
    rationale = after_evaluation.partition(rating_marker)[0].strip()

    # Rating: first ASCII digit on the line that follows the last rating label.
    _, rating_found, rating_tail = completion.rpartition(rating_marker)
    if not rating_found:
        return 0, rationale
    rating_lines = rating_tail.strip().splitlines()
    rating_line = rating_lines[0] if rating_lines else ""
    first_digit = next((ch for ch in rating_line if ch in "0123456789"), None)
    score = int(first_digit) if first_digit is not None else 0
    if not 1 <= score <= 5:
        score = 0  # outside the requested scale: treat as "no rating"
    return score, rationale


# Rate every generated QA pair on the three criteria and attach
# "<criterion>_score" (int) and "<criterion>_rationale" (str) to its record.
raw_evals = []  # untouched llama-cpp responses per record, keyed by criterion
for output_bundle in tqdm(outputs):

    bundle_evals = {}
    for eval_type, prompt_template in critique_prompts.items():
        # str.format ignores unused keyword arguments, so passing `context` is
        # harmless for the templates that only contain `{question}`.
        prompt = prompt_template.format(
            question=output_bundle['question'],
            context=output_bundle['context']
        )
        eval_output = llm(
            prompt=prompt,
            max_tokens=None,
            echo=True
        )
        bundle_evals[eval_type] = eval_output

        choice = eval_output['choices'][0]
        # Default for completions that did not finish on their own.
        score, rationale = 0, ""
        # 'stop' means the model ended the completion itself. Any other finish
        # reason (presumably hitting the token/context limit) is treated as a
        # truncated, unusable critique and keeps the score of 0.
        if choice["finish_reason"] == 'stop':
            full_text = choice['text']
            # echo=True returns prompt + completion; parse the completion only.
            if full_text.startswith(prompt):
                completion = full_text[len(prompt):]
            else:
                # Fallback: every critique prompt ends with "Answer::: ".
                completion = full_text.split("Answer:::")[-1]
            score, rationale = _parse_critique(completion)

        output_bundle.update(
            {
                f"{eval_type}_score": score,
                f"{eval_type}_rationale": rationale,
            }
        )

    raw_evals.append(bundle_evals)

  0%|          | 0/10 [00:00<?, ?it/s]

In [120]:
# Manual check: rationale text of the groundedness critique for record 7
# (everything after the last 'Evaluation:' label in the echoed response).
sample_groundedness_text = raw_evals[7]['groundedness']['choices'][0]['text']
sample_groundedness_text.rpartition('Evaluation:')[2].strip()

'The context clearly provides the answer to the question by stating "The CPI measure'

In [None]:
# Filter out any bad outputs using the scores: a QA pair is kept only when it
# reaches the minimum rating on all three critique criteria.
MIN_CRITIQUE_SCORE = 3
critique_score_keys = ("groundedness_score", "relevance_score", "standalone_score")


def _score_as_int(raw_score) -> int:
    """Coerce a stored critique score to ``int``.

    Scores may be stored as text taken from the model reply (e.g. ``"4"`` or
    ``"4\\n\\nsome trailing text"``); comparing such a string with an integer
    raises ``TypeError``. Numeric values are passed through ``int``; for text,
    the first ASCII digit on the first line is used, and ``0`` is returned when
    there is none (so the record is filtered out).
    """
    if isinstance(raw_score, (int, float)):
        return int(raw_score)
    stripped = str(raw_score).strip()
    first_line = stripped.splitlines()[0] if stripped else ""
    first_digit = next((ch for ch in first_line if ch in "0123456789"), None)
    return int(first_digit) if first_digit is not None else 0


filtered_outputs = [
    qa_record
    for qa_record in outputs
    # A missing score key still raises KeyError: it means the critique cell was not run.
    if all(_score_as_int(qa_record[key]) >= MIN_CRITIQUE_SCORE for key in critique_score_keys)
]


In [112]:
# Spot-check a single record from `outputs` (index 3 picked for manual inspection).
outputs[3]

{'context': 'Despite the headwinds to income growth, households are relatively optimistic about their\nfuture finances. In the Bank’s NMG survey, the measure of households’ expectations for\ntheir own financial situation over the next year has improved substantially since 2022 and\nis now in line with results prior to the pandemic. Survey responses also suggest that\nhouseholds’ perceived risk of job loss has been falling and is now at its lowest level since\n2015, although expectations for the level of economy-wide unemployment have increased\nslightly over the past six months. The NMG’s measure of household income expectations\nhas also risen, although this largely reflects the expectation that nominal incomes will\ngrow given high inflation.\nDuring the pandemic, household consumption fell by more than income as households\nwere less able to spend on services, which meant that in aggregate households built up\nadditional savings. Much of these additional savings took the form of ban