# Notebook to generate synthetic QA over the data

In [34]:
%load_ext autoreload
%autoreload 2

from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
import pickle
import pathlib
import random
import os
import sys
from transformers import pipeline
from llama_cpp import Llama
from pprint import pprint
from importlib import reload

# Add the project root directory to the system path.
# The root is taken to be the parent of the current working directory, so this
# only resolves correctly when the kernel is started from a direct sub-folder
# of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Must run before the project package is imported below.
sys.path.append(project_root)
# pathlib view of the same directory, used to build every path in this notebook.
PROJECT_ROOT = pathlib.Path(project_root)

%aimport StructuredRag
from StructuredRag.utils import mistral_conversation
from StructuredRag.algorithms.inquirer import StructRAGInquirer

pd.set_option("display.max_colwidth", None)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Force a fresh import of the inquirer module so source edits made since the last import are picked up.
# NOTE(review): names bound earlier via `from ... import StructRAGInquirer` are not rebound by reload();
# access the class through the module path after this cell.
reload(StructuredRag.algorithms.inquirer)

<module 'StructuredRag.algorithms.inquirer' from '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/src/StructuredRag/algorithms/inquirer.py'>

### Load docs and instantiate our RAG and model agent

In [135]:
# Experiment sub-directory, relative to PROJECT_ROOT / "results"; every load and save in this notebook uses it.
experiment_path = 'v0/2024-07-18'

In [5]:
# Load the pickled document index stored with the selected experiment.
# NOTE(review): pickle.load can execute arbitrary code - only open artefacts you produced yourself.
embedded_index_file = PROJECT_ROOT / "results" / experiment_path / 'embedded_index.pickle'
with embedded_index_file.open("rb") as index_handle:
    document_index = pickle.load(index_handle)

In [10]:
# Build the RAG inquirer for the selected experiment, backed by a local GGUF model.
experiment_dir = PROJECT_ROOT / "results" / experiment_path
gguf_model_file = PROJECT_ROOT / "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"

inquirer_settings = dict(
    path_to_experiment=str(experiment_dir),
    llm_type='llamacpp',
    model_path=str(gguf_model_file),
    llm_max_tokens=1400,
    n_gpu_layers=-1,  # All layers
    use_anchor_document=False,
)
# The class is reached through the module path rather than a previously imported name.
inquirer = StructuredRag.algorithms.inquirer.StructRAGInquirer(**inquirer_settings)

# Use inquirer.llm for the llamacpp object
# Otherwise use inquirer.run_inquirer for the RAG pipeline

Loading item: embedded_index
Loading item: notes
Loading item: edge_thresh
Loading item: adj_matrix


Adding edges to graph: 100%|██████████| 440/440 [00:00<00:00, 97909.37it/s]


In [None]:
# # instantiated so we can use it to retrieve the appropriate context
# rag_agent = StructRAGInquirer(
#     str((PROJECT_ROOT / "results" / 'v0' / '2024-07-18').resolve()),
#     llm_name='google/flan-t5-large',
#     llm_max_tokens=512,
#     use_anchor_document=False,
# )

# result = rag_agent.run_inquirer(
#     query = outputs[0]['question'].replace('Factoid question: ', ''),
#     source_document_name=None,
#     k_context = 3,
# )

In [11]:
def build_context_for_QA_gen(doc, rag_agent, k_context: int = 3, documents=None):
    """
    Builds the context string for generating synthetic question-answering pairs.

    Args:
        doc: The document for which the context is being built. Only its ``id_`` is read.
        rag_agent: The RAG agent used for retrieving similar nodes. Must expose
            ``_graph_similar_nodes(node_id, k)`` returning ``(node_id, score)`` pairs.
        k_context (int): The number of similar nodes to consider for building the context. Default is 3.
        documents: Optional iterable of documents (objects with ``id_`` and ``text``) in which
            the retrieved node ids are looked up. Defaults to the notebook-level ``document_index``.

    Returns:
        context_string (str): One line per retrieved node, formatted as
            ``"Context item <i>: <text> \\n"``, with all whitespace runs in the text collapsed
            to single spaces. The string starts with a single leading space.

    Raises:
        LookupError: If a retrieved node id has no matching document.
    """
    if documents is None:
        # Fall back to the index loaded earlier in the notebook.
        documents = document_index

    # Index the documents by id once instead of scanning the whole list for every node.
    # setdefault keeps the first document seen for an id.
    docs_by_id = {}
    for candidate in documents:
        docs_by_id.setdefault(candidate.id_, candidate)

    similar_nodes = rag_agent._graph_similar_nodes(doc.id_, k_context)

    context_string = " "
    for i, (node_id, _) in enumerate(similar_nodes):
        node = docs_by_id.get(node_id)
        if node is None:
            # Fail with an explicit message rather than an AttributeError on None.
            raise LookupError(f"No document with id {node_id!r} found while building the QA context")

        # split()/join collapses newlines, tabs and repeated blanks of any length in one pass.
        clean_text = " ".join(node.text.split())
        context_string += f"Context item {i}: {clean_text} \n"

    return context_string

# Smoke test: build the context around one arbitrary document (index 10) and inspect the result.
context_bundle = build_context_for_QA_gen(document_index[10], inquirer, k_context=3)

print(context_bundle)

 Context item 0: Committee continues to judge that the risks to its modal inflation projection are skewed to the upside. Second-round effects in domestic prices and wages are expected to take longer to unwind than they did to emerge. There are also upside risks to inflation from energy prices given events in the Middle East. Taking account of this skew, the mean projection for CPI inflation is 2.2% and 1.9% at the two and three-year horizons respectively. Conditioned on the alternative assumption of constant interest rates at 5.25%, which is a higher profile than the market curve beyond the second half of 2024, mean CPI inflation returns to target in two years’ time and falls to 1.6% at the three-year horizon. The MPC’s remit is clear that the inflation target applies at all times, reflecting the primacy of price stability in the UK monetary policy framework. The framework recognises that there will be occasions when inflation will depart from the target as a result of shocks and distu

### 1. Set up a text-generation pipeline for the QA task

In [None]:
# First install / activate the model
# path_to_save = pathlib.Path("/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag")

# llm = Llama.from_pretrained(
#     repo_id="TheBloke/CapybaraHermes-2.5-Mistral-7B-GGUF",
#     filename="capybarahermes-2.5-mistral-7b.Q5_K_M.gguf",
#     local_dir=PROJECT_ROOT,
#     verbose=False,
#     n_gpu_layers=-1, # Uncomment to use GPU acceleration
#     n_ctx=1400,
# )

In [12]:
# Legacy completion-style template: a single string with a `{context}` placeholder
# to be filled with str.format. In the visible part of this notebook it is only
# referenced by a commented-out loop.
QA_generation_prompt = """
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)

Now here is the context.

Context: {context}\n
Output:::"""

# Shortened and fitting into the CHATML format: used as the system message,
# with the context supplied separately as the user message.
# Must be a plain string: a trailing comma after the closing quotes would turn
# this into a 1-tuple and the chat message content would no longer be text.
QA_SYSTEM_PROMPT = """ 
Your task is to write a factoid question and an answer given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.

Provide your answer as follows:

Output:::
Factoid question: (your factoid question)
Answer: (your answer to the factoid question)
"""

In [13]:
def create_chatML_QA_prompt(document_text: str) -> List[dict]:
    """
    Assemble the chat messages for QA generation.

    Args:
        document_text (str): The text sent as the user turn.

    Returns:
        A two-item list: the system message carrying QA_SYSTEM_PROMPT, followed by
        the user message carrying `document_text`.
    """
    system_message = {"role": "system", "content": QA_SYSTEM_PROMPT}
    user_message = {"role": "user", "content": document_text}
    return [system_message, user_message]

In [None]:
# doc_text = document_index[100].text.replace("\n", " ")

# test = inquirer.llm.create_chat_completion(
#     messages = create_chatML_QA_prompt(doc_text),
# )

# test

In [14]:
# Run the loop
N_QA_SAMPLES = 10     # number of documents to draw (capped at the size of the index)
QA_SAMPLE_SEED = 42   # fixed so a re-run draws the same documents; set to None for a fresh draw


def _parse_QA_completion(content: str) -> Optional[Tuple[str, str]]:
    """
    Extract (question, answer) from a completion that follows the requested format.

    The question is the remainder of the line after the first 'Factoid question: ';
    the answer is everything after the first 'Answer: '. Returns None when either
    marker is missing or either field is empty, so the caller can skip the sample
    instead of crashing on an off-format completion.
    """
    _, question_marker, after_question = content.partition('Factoid question: ')
    _, answer_marker, after_answer = content.partition('Answer: ')
    if not question_marker or not answer_marker:
        return None
    question = after_question.split('\n')[0].strip()
    answer = after_answer.strip()
    if not question or not answer:
        return None
    return question, answer


raw_outputs = []   # every completion, including the ones that could not be parsed
outputs = []       # one dict per successfully parsed QA pair
n_unparsed = 0

# A dedicated Random instance keeps the draw reproducible without touching the global RNG state.
sampled_docs = random.Random(QA_SAMPLE_SEED).sample(
    document_index, min(N_QA_SAMPLES, len(document_index))
)
for sample_doc in tqdm(sampled_docs, desc="Generating QA pairs"):

    context_text = build_context_for_QA_gen(sample_doc, inquirer, k_context=3)

    output = inquirer.llm.create_chat_completion(messages = create_chatML_QA_prompt(context_text))
    raw_outputs.append(output)

    parsed = _parse_QA_completion(output['choices'][0]['message']['content'])
    if parsed is None:
        # Keep going: one malformed completion should not discard the whole run.
        n_unparsed += 1
        continue

    question, answer = parsed
    outputs.append(
        {
            "context": context_text,
            "question": question,
            'answer': answer,
        }
    )

if n_unparsed:
    print(f"Skipped {n_unparsed} completion(s) that did not follow the expected output format.")

Generating QA pairs:   0%|          | 0/10 [00:00<?, ?it/s]

In [15]:
# Inspect one raw completion (index 5) as returned by the model.
raw_outputs[5]

{'id': 'chatcmpl-edab8a0d-89b9-4944-8cb8-932cf58f40ec',
 'object': 'chat.completion',
 'created': 1721393338,
 'model': '/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/capybarahermes-2.5-mistral-7b.Q5_K_M.gguf',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'Factoid question: How quickly has core goods inflation fallen compared to services inflation across advanced economies?\n\nAnswer: Core goods inflation has fallen more quickly than services inflation.'},
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 812, 'completion_tokens': 36, 'total_tokens': 848}}

In [None]:
# ### OLD PRE FORMATTED PROMPT
# raw_outputs = []
# outputs = []
# for sample_doc in tqdm(random.sample(document_index, 10)):
    
#     context_text = sample_doc.text
#     context_text = context_text.replace("\n", " ")
#     context_text = context_text.replace("  ", " ")
    
#     output = llm(
#         prompt=QA_generation_prompt.format(context=context_text),
#         max_tokens=None,
#         echo=True
#     )
#     raw_outputs.append(output)
#     outputs.append(
#         {
#             "context": context_text,
#             "question": output['choices'][0]['text'].split("Output:::")[-1].split('\n')[1],
#             "answer": output['choices'][0]['text'].split("Output:::")[-1].split('\n')[-1],
#         }
#     )
    
    

### 2. Assess the quality of the QA pairs

Based on the logic in the paper [arXiv:2312.10003](https://arxiv.org/abs/2312.10003).

In [16]:
# System prompt for the groundedness critique: can the question be answered from
# the supplied context? The context and question are supplied in the user message.
# The "Provide your answer as follows:" lead-in matches the relevance and
# standalone prompts so all three critiques request the same output layout.
QA_GROUNDEDNESS_PROMPT = """
Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here are the context and question.
"""

# System prompt for the relevance critique: how useful is the question to the
# target audience named in the prompt? Only the question is rated; no context is mentioned.
QA_RELEVANCE_PROMPT = """
Your task is to provide a 'total rating' representing how useful this question can be to macro-economists looking for information whilst working at the Bank of England.
Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.
"""

# System prompt for the standalone critique: does the question make sense
# without any accompanying context? Only the question is rated.
# NOTE(review): the ViT checkpoint example looks carried over from a generic
# template rather than this notebook's domain - confirm it is intended.
QA_STANDALONE_PROMPT = """
Your task is to provide a 'total rating' representing how context-independent this question is.
Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
The questions can contain obscure technical nouns or acronyms like MPC, CPI or YBUS and still be a 5.

For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

Provide your answer as follows:

Answer:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 5)

You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

Now here is the question.
"""

In [17]:
def create_chatML_quality_prompt(prompt: str, question: str, context: str = None) -> List[dict]:
    """
    Assemble the chat messages for one quality critique.

    Args:
        prompt (str): The critique instructions, sent as the system message.
        question (str): The generated question to be rated.
        context (str, optional): The context the question was generated from. When it is
            None the user message contains the question only; any other value (including
            an empty string) is placed on a "Context:" line before the question.

    Returns:
        A two-item list: the system message followed by the user message.
    """
    user_content = f"Factoid question: {question}"
    if context is not None:
        user_content = f"Context: {context}\n" + user_content

    return [
        {"role": "system", "content": prompt},
        {"role": "user", "content": user_content},
    ]

In [None]:
# output_bundle = outputs[4]

# groundedness_eval = inquirer.llm.create_chat_completion(
#         messages = create_chatML_quality_prompt(QA_GROUNDEDNESS_PROMPT, output_bundle['question'], output_bundle['context']),
#     )

# groundedness_eval

In [18]:
# Critiques to run for every QA pair: name -> (system prompt, whether the context is sent too).
QUALITY_CRITIQUES = {
    "groundedness": (QA_GROUNDEDNESS_PROMPT, True),
    "relevance": (QA_RELEVANCE_PROMPT, False),
    "standalone": (QA_STANDALONE_PROMPT, False),
}
UNPARSED_SCORE = "0"  # sentinel outside the 1-5 scale for truncated or unparsable critiques


def _first_line_after(content: str, marker: str) -> Optional[str]:
    """Return the stripped first line of text following the first `marker`, or None if the marker is absent."""
    _, found, tail = content.partition(marker)
    if not found:
        return None
    return tail.strip().split('\n')[0].strip()


def _parse_rating(content: str) -> str:
    """
    Extract the numeric rating that follows 'Total rating:' as a string.

    Tolerates common decorations such as '4/5', '4 out of 5', '4.' or '**4**'.
    Returns UNPARSED_SCORE when no number can be recovered, so that a later
    float() conversion of the stored score never fails.
    """
    rating_line = _first_line_after(content, 'Total rating:')
    if not rating_line:
        return UNPARSED_SCORE
    tokens = rating_line.split('/')[0].split()
    candidate = tokens[0].strip('*.()[]') if tokens else ""
    try:
        float(candidate)
    except ValueError:
        return UNPARSED_SCORE
    return candidate


raw_evals = []
for output_bundle in tqdm(outputs, desc="Evaluating QA pairs"):

    # Run the three critiques in a fixed order: groundedness, relevance, standalone.
    bundle_evals = {}
    for eval_type, (critique_prompt, needs_context) in QUALITY_CRITIQUES.items():
        critique_context = output_bundle['context'] if needs_context else None
        bundle_evals[eval_type] = inquirer.llm.create_chat_completion(
            messages = create_chatML_quality_prompt(critique_prompt, output_bundle['question'], critique_context),
        )

    # Extract the scores and write them onto the QA bundle (mutated in place).
    for eval_type, eval_output in bundle_evals.items():
        choice = eval_output['choices'][0]
        # If the model has stopped generating text due to stopping itself, rather than max tokens, etc.
        if choice["finish_reason"] == 'stop':
            content = choice['message']['content']
            output_bundle.update(
                {
                    f"{eval_type}_score": _parse_rating(content),
                    f"{eval_type}_rationale": _first_line_after(content, 'Evaluation:') or "",
                }
            )
        else:
            output_bundle.update(
                {
                    f"{eval_type}_score": UNPARSED_SCORE,
                    f"{eval_type}_rationale": ""
                }
            )

    raw_evals.append(bundle_evals)

Evaluating QA pairs:   0%|          | 0/10 [00:00<?, ?it/s]

In [19]:
# Inspect one QA bundle (index 1) after the critique loop has written its scores and rationales onto it.
outputs[1]

{'context': ' Context item 0: to 7.4% in July but declined to 6.8% in August, 0.3 percentage points lower than expected in the August Report. Some of those movements were linked to services such as airfares and accommodation that tend to be volatile over the summer holiday period. Excluding these travel-related components, services inflation had been more stable at continued high rates, albeit slightly weaker than expected. CPI inflation was expected to fall significantly further in the near term, reflecting lower annual energy inflation, despite the renewed upward pressure from oil prices, and further declines in food and core goods price inflation. Services price inflation, however, was projected to remain elevated in the near term, with some potential month-to-month volatility. Developments in key indicators of inflation persistence had been mixed, with the acceleration in the AWE not apparent in other measures of wages and with some downside news on services inflation. There were i

In [20]:
# Filter out any bad outputs using the scores.
MIN_QUALITY_SCORE = 2  # a QA pair is kept only if every critique score reaches this value
QUALITY_SCORE_KEYS = ('groundedness_score', 'relevance_score', 'standalone_score')


def _score_as_float(raw_score) -> float:
    """Convert a stored score to float; a missing or non-numeric score counts as 0.0 (rejected)."""
    try:
        return float(raw_score)
    except (TypeError, ValueError):
        return 0.0


# The kept dicts are the same objects as in `outputs`, so later cells that
# enrich `filtered_outputs` also modify those entries of `outputs`.
filtered_outputs = [
    bundle
    for bundle in outputs
    if all(_score_as_float(bundle.get(score_key)) >= MIN_QUALITY_SCORE for score_key in QUALITY_SCORE_KEYS)
]

In [21]:
# Number of QA pairs before and after the quality filter.
len(outputs), len(filtered_outputs)

(10, 8)

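Old: the previous completion-style critique prompts and evaluation loop, kept commented out for reference.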

In [None]:
# question_groundedness_critique_prompt = """
# Your task is to provide a 'total rating' scoring how well one can answer the given question unambiguously with the given context.
# Give your answer on a scale of 1 to 5, where 1 means that the question is not answerable at all given the context, and 5 means that the question is clearly and unambiguously answerable with the context.

# Provide your answer as follows:

# Answer:::
# Evaluation: (your rationale for the rating, as a text)
# Total rating: (your rating, as a number between 1 and 5)

# You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

# Now here are the question and context.

# Question: {question}\n
# Context: {context}\n
# Answer::: """

# question_relevance_critique_prompt = """
# You will be given a question.
# Your task is to provide a 'total rating' representing how useful this question can be to macro-economists looking for information at the Bank of England.
# Give your answer on a scale of 1 to 5, where 1 means that the question is not useful at all, and 5 means that the question is extremely useful.

# Provide your answer as follows:

# Answer:::
# Evaluation: (your rationale for the rating, as a text)
# Total rating: (your rating, as a number between 1 and 5)

# You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

# Now here is the question.

# Question: {question}\n
# Answer::: """

# question_standalone_critique_prompt = """
# You will be given a question.
# Your task is to provide a 'total rating' representing how context-independent this question is.
# Give your answer on a scale of 1 to 5, where 1 means that the question depends on additional information to be understood, and 5 means that the question makes sense by itself.
# For instance, if the question refers to a particular setting, like 'in the context' or 'in the document', the rating must be 1.
# The questions can contain obscure technical nouns or acronyms like MPC, Committee, CPI or YBUS and still be a 5: it must simply be clear to an operator with access to documentation what the question is about.

# For instance, "What is the name of the checkpoint from which the ViT model is imported?" should receive a 1, since there is an implicit mention of a context, thus the question is not independant from the context.

# Provide your answer as follows:

# Answer:::
# Evaluation: (your rationale for the rating, as a text)
# Total rating: (your rating, as a number between 1 and 5)

# You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.

# Now here is the question.

# Question: {question}\n
# Answer::: """

In [None]:
# raw_evals = []
# for output_bundle in tqdm(outputs, desc="Evaluating QA pairs"):
    
#     groundedness_eval = llm(
#         prompt=question_groundedness_critique_prompt.format(
#             question=output_bundle['question'],
#             context=output_bundle['context']
#         ),
#         max_tokens=None,
#         echo=True
#     )
    
#     relevance_eval = llm(
#         prompt=question_relevance_critique_prompt.format(
#             question=output_bundle['question']
#         ),
#         max_tokens=None,
#         echo=True
#     )
    
#     standalone_eval = llm(
#         prompt=question_standalone_critique_prompt.format(
#             question=output_bundle['question']
#         ),
#         max_tokens=None,
#         echo=True
#     )

#     # Extract the scores and write them
#     for eval_type, eval_output in zip(["groundedness", "relevance", "standalone"], [groundedness_eval, relevance_eval, standalone_eval]):
#         # If the model has stopped generating text due to stopping itself, rather than max tokens, etc.
#         if eval_output['choices'][0]["finish_reason"] == 'stop':
#             output_bundle.update(
#                 {
#                     f"{eval_type}_score": eval_output['choices'][0]['text'].split("Total rating:")[-1].strip(),
#                     f"{eval_type}_rationale": eval_output['choices'][0]['text'].split("Evaluation:")[-1].strip()
#                 }
#             )
#         else:
#             output_bundle.update(
#                 {
#                     f"{eval_type}_score": "0",
#                     f"{eval_type}_rationale": ""
#                 }
#             )

#     # output_bundle.update(
#     #     {
#     #         "groundedness_score": groundedness_eval['choices'][0]['text'].split("Total rating:")[-1].strip(),
#     #         "relevance_score": relevance_eval['choices'][0]['text'].split("Total rating:")[-1].strip(),
#     #         "standalone_score": standalone_eval['choices'][0]['text'].split("Total rating:")[-1].strip(),
#     #     }
#     # )    
    
#     raw_evals.append(
#         {
#             "groundedness": groundedness_eval,
#             "relevance": relevance_eval,
#             "standalone": standalone_eval
#         }
#     )

### 3. Generate answers to the Qs with the RAG system

In [22]:
# Inspect the first QA bundle that passed the quality filter.
filtered_outputs[0]

{'context': ' Context item 0: Interest rate rises will also reduce non-housing asset prices. Indeed, net financial wealth relative to household incomes fell materially in 2022 despite no reduction in nominal household deposits (Broadbent (2022) ). However, changes in household financial wealth tend to have less overall impact on demand because most households do not own significant non-housing and non-pension wealth. The effect can be important for those households at the top of the wealth distribution where, prior to the pandemic, the richest tenth of households had 17% of their total net wealth in financial assets. The consumption effect of reductions in the value of financial wealth are captured within the purple bars in Chart 3.7. Although not directly affected by rising interest rates, households in the rental sector may also face increased housing costs, leading to further reductions in consumer demand. Specifically, rising interest rates increase costs for buy-to-let (BTL) landl

In [27]:
# Answer every retained question with the RAG pipeline and attach both the raw
# completion and its message text to the bundle (bundles are mutated in place).
for qa_bundle in tqdm(filtered_outputs, desc="Running inquirer"):
    rag_completion = inquirer.run_inquirer(
        query=qa_bundle['question'],
        source_document_name=None,
        k_context=3,
    )
    # Read the text first so that a malformed completion leaves the bundle untouched.
    rag_text = rag_completion['choices'][0]['message']['content']

    qa_bundle["RAG_response"] = rag_completion
    qa_bundle["RAG_response_text"] = rag_text
    

Running inquirer:   0%|          | 0/8 [00:00<?, ?it/s]

'The expected growth rate of private sector regular AWE by the end of the MPC\'s modal projection is around 3%. This information can be found in Context 0, which states that "private sector regular AWE growth falls to around 3% by the end of the forecast period."'

### 4. Use the filtered QA pairs and RAG answers to assess the RAG system, with a 'judge' agent.

There are different types of [evaluation metrics in RAG](https://docs.ragas.io/en/latest/concepts/metrics/index.html)

We will focus on:
1. Answer relevancy
2. Faithfulness

We will use a Prometheus evaluation model such as https://huggingface.co/prometheus-eval/prometheus-13b-v1.0, or something similar, as the judge. A GGUF build is available: [vsevolodl/prometheus-7b-v2.0-GGUF](https://huggingface.co/vsevolodl/prometheus-7b-v2.0-GGUF)

In [87]:
# Load the judge model through llama.cpp from the Hugging Face repo below,
# using the project root as the local directory for the model file.
JUDGE_REPO_ID = "vsevolodl/prometheus-7b-v2.0-GGUF"
JUDGE_GGUF_FILE = "prometheus-7b-v2.0.Q6_K.gguf"

judge_llm = Llama.from_pretrained(
    repo_id=JUDGE_REPO_ID,
    filename=JUDGE_GGUF_FILE,
    local_dir=PROJECT_ROOT,
    verbose=False,
    n_gpu_layers=-1,  # All layers
    n_ctx=1600,
)

In [133]:
# Grading template for the judge model. The placeholders {query}, {context},
# {answer_RAG_system} and {reference_answer} are filled per QA pair with
# str.format once the template has been wrapped in the conversation format.
judge_task_template = """
###Task Description:
An instruction (including the context) a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)\"
4. Please do not generate any other opening, closing, and explanations.

###The instruction to evaluate:
You are an AI assistant with a focus on helping to answer economists' search questions over particular documents. 
Respond only to the question asked, the response should be concise and relevant, and use the context provided to give a comprehensive answer.
It is important to maintain impartiality and non-partisanship. If you are unable to answer a question based on the given instructions and context, please indicate so.
Your responses should be well-structured and professional, using British English.

{query} Use the following context to answer the question:
Context: {context}

###Response to evaluate:
{answer_RAG_system}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback: 
"""

# Wrap the template in the "mistral" conversation format: a system message, the
# template as the first-role turn, and an empty second-role turn for the judge to complete.
judge_conversation = mistral_conversation.get_conv_template("mistral")
judge_conversation.set_system_message("You are a fair judge assistant tasked with providing clear, objective feedback based on specific criteria, ensuring each assessment reflects the absolute standards set for performance.")
first_role = judge_conversation.roles[0]
second_role = judge_conversation.roles[1]
judge_conversation.append_message(first_role, judge_task_template)
judge_conversation.append_message(second_role, None)

# Final prompt string, still containing the four placeholders.
judge_prompt = judge_conversation.get_prompt()

In [109]:
UNPARSED_JUDGE_SCORE = "0"  # sentinel outside the 1-5 scale, same convention as the quality critiques


def _parse_judge_score(completion_text: str) -> str:
    """
    Extract the score that follows the last '[RESULT]' in the judge's own output.

    The completion is requested with echo=True, so `completion_text` starts with
    the prompt, and the prompt itself contains '[RESULT]' in its format
    instructions. Everything up to the last '[/INST]' is therefore discarded
    before searching, so prompt text can never be mistaken for a score.
    Returns UNPARSED_JUDGE_SCORE when the judge did not emit a '[RESULT]' marker.
    """
    generated = completion_text.rpartition('[/INST]')[2]
    _, marker, after_marker = generated.rpartition('[RESULT]')
    if not marker:
        return UNPARSED_JUDGE_SCORE
    return after_marker.strip()


for qa_bundle in tqdm(filtered_outputs, desc="Running judgement llm"):
    evaluation = judge_llm.create_completion(
        prompt=judge_prompt.format(
            query=qa_bundle['question'],
            context=qa_bundle['context'],
            reference_answer=qa_bundle['answer'],
            answer_RAG_system=qa_bundle['RAG_response']['choices'][0]['message']['content'],
        ),
        # echo is kept on: the results cell splits the stored text on '[/INST]'.
        echo=True,
        max_tokens=None
    )

    qa_bundle.update(
        {
            "judge_evaluation": evaluation,
            "judge_score": _parse_judge_score(evaluation['choices'][0]['text'])
        }
    )

Running judgement llm:   0%|          | 0/8 [00:00<?, ?it/s]

### 5. View and save the results

In [132]:
# Show one judged example (index 1): question, RAG answer, judge score and the
# judge's feedback (the text between '[/INST]' and '[RESULT]').
example_bundle = filtered_outputs[1]

pprint(example_bundle['question'])
pprint(example_bundle['RAG_response']['choices'][0]['message']['content'])
pprint(example_bundle['judge_score'])
judge_text = example_bundle['judge_evaluation']['choices'][0]['text']
pprint(judge_text.split('[/INST]')[1].split('[RESULT]')[0].strip())

'What was the change in CPI inflation between July and August?'
('Based on the provided context, we can find the change in CPI inflation '
 'between July and August in Context 0:\n'
 '\n'
 'Context 0 states that "Core goods CPI inflation had fallen from 6.4% in June '
 'to 5.2% in August." This indicates a decrease of 1.2 percentage points in '
 'core goods CPI inflation between June and August. Additionally, it mentions '
 'that Services CPI inflation rose from 7.2% in June to an unspecified value '
 'in August.\n'
 '\n'
 'However, the question asks for the change in overall CPI inflation between '
 'July and August, which is not explicitly mentioned in any of the contexts '
 'provided. If you could provide more specific information about the overall '
 'CPI inflation rates for July and August, I would be able to calculate the '
 'change accurately.')
'1'
('The response provided fails to directly answer the question about the change '
 'in overall CPI inflation between July and August

In [None]:
# Persist the filtered (and by now answered and judged) QA bundles in the experiment's results folder.
qa_outputs_file = PROJECT_ROOT / "results" / experiment_path / 'QA_outputs.pickle'
with qa_outputs_file.open("wb") as output_handle:
    pickle.dump(filtered_outputs, output_handle)