# Scaffold the user-based application

In [1]:
import os
import sys
import pickle
import networkx as nx
import matplotlib as mpl
import numpy as np
import random
import matplotlib.pyplot as plt
import re
from pprint import pprint

from datetime import date
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

# Add the project root directory to the system path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

from StructuredRag.evaluation import graph_scoring
from StructuredRag.processing import graph_construction
from StructuredRag.algorithms import v0, v1
from StructuredRag.processing import distance_metrics
from StructuredRag.etl import embedding_funcs, etl_funcs

from llama_index.core.node_parser import TokenTextSplitter

from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.docstore.document import Document
from langchain.prompts.prompt import PromptTemplate
from langchain.output_parsers import PydanticOutputParser


  from tqdm.autonotebook import tqdm, trange


# Testing LM load

In [2]:
# model = SentenceTransformer(r"C:\Users\335257\.cache\huggingface\hub\models--sentence-transformers--all-MiniLM-L6-v2\snapshots\cbce8a8c7380b8bc926ac6d6425442c393b66d10")
model = SentenceTransformer("sentence-transformers/multi-qa-mpnet-base-dot-v1")

You try to use a model that was created with version 3.0.0.dev0, however, your version is 3.0.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [18]:
q1 = "Homer is a Simpsons character"
q2 = "Shakespeare is a great poet"


emb1 = model.encode(q1)
emb2 = model.encode(q2)

float(util.dot_score(emb1, emb2))

11.704995155334473

# New

In [2]:
from StructuredRag.algorithms.inquirer import StructRAGInquirer

inquirer = StructRAGInquirer(
    path_to_experiment='/Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/results/v0/2024-05-27',
    llm_name='google/flan-t5-large',
    llm_max_tokens=512,
)

Loading item: embedded_index
Loading item: edge_thresh
Loading item: adj_matrix


  _torch_pytree._register_pytree_node(


In [3]:
res = inquirer.run_inquirer(
    query='How will climate change affect the economy?',
    source_document_name='MPR November 2023.pdf',
    k_context=3,
)

  _torch_pytree._register_pytree_node(
Token indices sequence length is longer than the specified maximum sequence length for this model (1103 > 512). Running this sequence through the model will result in indexing errors


In [4]:
for doc in res['input_documents']:
    print(doc.metadata["doc_difference"])

0
0.24579640986078943
0.25843880294410515


In [5]:
res

{'output_text': 'Physical impacts, for example extreme weather events and rising temperatures, can lead to disruptions in both output and inflation. Likewise, the transition to a low-carbon economy can impact activity through changes in policies, preferences, and technology . These channels will have resulting impacts for trends in labour productivity.',
 'input_documents': [Document(page_content='Climate change will affect the macroeconomy through a number of channels ( Angeli et al (2022)).\nPhysical impacts, for example extreme weather events and rising temperatures, can lead to\ndisruptions in both output and inflation. Likewise, the transition to a low-carbon economy can impact\nactivity through changes in policies, preferences, and technology . These channels will have resulting\nimpacts for trends in labour productivity.\nAround a third of DMP Survey respondents reported that climate change has resulted in an increase\nin their expenditure on capital over the past three years. T

# Legacy:

### Load some required data

In [2]:
# Select the runs you want to load
for experiment in sorted(os.listdir('../results')):
    print('Experiment:', experiment)
    for run in sorted(os.listdir('../results/' + experiment)):
        print("     || Run:", run)

Experiment: v0
     || Run: 2024-05-10
     || Run: 2024-05-14
     || Run: 2024-05-25
Experiment: v1
     || Run: 2024-05-10
     || Run: 2024-05-14
Experiment: v3
     || Run: 2024-05-10
     || Run: 2024-05-14
Experiment: v4
     || Run: 2024-05-10
     || Run: 2024-05-14
Experiment: v5
     || Run: 2024-05-14


In [3]:
run_path = 'v0/2024-05-25'

data = {}
# Read the data for the specified experiment
for item in os.listdir('../results/' + run_path):
    print('Loading item:', item.split('.')[0])
    
    with open('../results/' + run_path + '/' + item, 'rb') as f:
        data[item.split('.')[0]] = pickle.load(f)

Loading item: embedded_index
Loading item: edge_thresh
Loading item: adj_matrix


## Get the query and context

In [4]:
# User selects one of the documents
for doc in os.listdir("../data/01_raw/"):
    pprint(doc)

selected_doc = 'monetary policy report february 2024.pdf'

'monetary policy report february 2024.pdf'
'.gitkeep'
'MPR November 2023.pdf'


In [5]:
# User's query
# query = 'What is the relationship between unemployment and inflation?'
query = 'What are the key risks to the economy'

# Embed the query
embedded_query = embedding_funcs.embed_query(query)

  _torch_pytree._register_pytree_node(


In [6]:
# Find the most similar chunk of the document
sim_scores = {}
for doc in data['embedded_index']:
    if doc.metadata["file_name"].split("/")[-1] == selected_doc:
        sim_scores[doc.id_] = float(util.dot_score(embedded_query, doc.embedding))
        # sim_scores[doc.id_] = cosine_similarity(embedded_query.reshape(1, -1), np.array(doc.embedding).reshape(1, -1))[0][0]

# Sort the chunks
doc_similarity = dict(sorted(sim_scores.items(), key=lambda x: x[1], reverse=True))

most_similar_doc_id = list(doc_similarity.keys())[0]

In [89]:
# # Inspect the performance of the embedding search
# for doc in data['embedded_index']:
#     if doc.id_ == most_similar_doc_id:
#         print(doc.text)

In [32]:
# Search through the graph to find the most similar nodes
graph = graph_construction.construct_graph_from_adj_dict(data['adj_matrix'], data['edge_thresh'], data['embedded_index'])

node_paths = nx.single_source_dijkstra(G=graph, source=most_similar_doc_id, weight='weight')

k = 5
nearest_node_ids = list(node_paths[0].items())[:k]
# nearest_nodes

In [33]:
nearest_node_ids

[('28366e53-34b9-4492-a96f-c780adaa8911', 0),
 ('b3b0668a-275d-425d-b5ee-a7f3921e1fa8', 0.24459935597241614),
 ('f6245d2a-e66c-4346-a7d3-5a3fc098c453', 0.28790442749836603),
 ('6607baa7-b597-41c1-81c9-8c229f0cb397', 0.2928786271924973),
 ('6e008375-ac01-4a8d-b71e-30a17715d2b2', 0.3375807858294453)]

In [21]:
# Extract the info from the nodes
nearest_docs = []
for doc in data['embedded_index']:
    for node in nearest_node_ids:
        if node[0] == doc.id_:
            nearest_docs.append((doc, node[1]))

In [26]:
nearest_docs[3][1]

0

### Get the LLM

In [92]:
generative_model_name = "google/flan-t5-large"
llm_temperature = 0.0
llm_max_tokens = 512


llm = HuggingFacePipeline.from_model_id(
    model_id=generative_model_name, 
    task="text2text-generation", 
    model_kwargs={
        # "temperature": llm_temperature, 
        "max_length": llm_max_tokens,
    },
)


In [93]:
# Reshape the documents into a langchain document class
top_matches = [
    Document(
        page_content=doc.text,
        metadata={
            'doc_num': i + 1,
            # **doc.metadata,
        }
    )
    for i, doc in enumerate(nearest_docs)
]

In [94]:
# from pydantic import BaseModel, Field
# from typing import List, Optional


# class LlmResponse(BaseModel):
#     answer_provided: bool = Field(
#         description="""True if enough information is provided in the context to answer
#         the question, False otherwise."""
#     )
#     most_likely_answer: Optional[str] = Field(
#         description="""Answer to the question, quoting or only minimally rephrasing
#         the provided text. Empty if answer_provided=False."""
#     )
#     highlighting1: List[str] = Field(
#         description="""List of short exact subphrases from the first context document,
#         that are most relevant to the question and should therefore be highlighted
#         within the context."""
#     )
#     highlighting2: List[str] = Field(
#         description="""List of short exact subphrases from the second context document,
#         that are most relevant to the question and should therefore be highlighted
#         within the context."""
#     )
#     highlighting3: List[str] = Field(
#         description="""List of short exact subphrases from the third and any further
#         context document, that are most relevant to the question and should therefore
#         be highlighted within the context.
#         Empty if the number of context documents is smaller."""
#     )
#     reasoning: Optional[str] = Field(
#         description="""Step by step reasoning why an answer has been selected or could
#         not be provided. Reasoning how highlighted keywords relate to the question."""
#     )


In [95]:
_core_prompt = """
==Background==
You are an AI assistant with a focus on helping to answer economists' search questions
over particular documents. Your responses should be based only
on information provided within the query. It is important to maintain impartiality
and non-partisanship. If you are unable to answer a question based on the given
instructions, please indicate so. Your responses should be concise and professional,
using British English.
Consider the current date, {current_datetime}, when providing responses related to time. 
"""

_extractive_prompt = """
==TASK==
Your task is to extract and write an answer for the question based on the provided
contexts. Make sure to quote a part of the provided context closely. If the question
cannot be answered from the information in the context, please do not provide an answer.
If the context is not related to the question, please do not provide an answer.
Most importantly, even if no answer is provided, find one to three short phrases
or keywords in each context that are most relevant to the question, and return them
separately as exact quotes (using the exact verbatim text and punctuation).
Explain your reasoning.

Question: {question}
Contexts: {summaries}
"""

# parser = PydanticOutputParser(pydantic_object=LlmResponse)

EXTRACTIVE_PROMPT_PYDANTIC = PromptTemplate.from_template(
    template=_core_prompt
    + _extractive_prompt,
    # + "\n\n ==RESPONSE FORMAT==\n{format_instructions}"
    # + "\n\n ==JSON RESPONSE ==\n",
    partial_variables={
        "current_datetime": str(date.today()),
        # "format_instructions": parser.get_format_instructions(),
    },
)

_stuff_document_template = (
    "<Doc{doc_num} >{page_content}</Doc{doc_num}>"
)

STUFF_DOCUMENT_PROMPT = PromptTemplate.from_template(_stuff_document_template)

In [96]:
# Stuff the documents into the model
chain = load_qa_with_sources_chain(
    llm,
    chain_type='stuff',
    prompt=EXTRACTIVE_PROMPT_PYDANTIC,
    document_prompt=STUFF_DOCUMENT_PROMPT,
)

In [97]:
response = chain.invoke(
    {"input_documents": top_matches, "question": query},
    return_only_outputs=True,
)

Token indices sequence length is longer than the specified maximum sequence length for this model (2698 > 512). Running this sequence through the model will result in indexing errors


In [99]:
response

{'output_text': 'There are risks in both directions around the central projections for domestic spending and GDP , including those related to the transmission of monetary policy. In particular, there is uncertainty around the collateral and precautionary savings channels through which house prices af fect consumer spending, and around the extent to which the full effects of interest rates on business.'}