# Loading the data from the notes

In [34]:
from llama_index.core import Settings, VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.duckdb import DuckDBVectorStore
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from fuzzywuzzy import process

import pathlib
import pandas as pd
import glob
import PyPDF2
from collections import Counter

### Metadata extraction & data loading

In [58]:
# Load the Excel sheet where the metadata for each note was logged by hand.
# index_col=0 turns the first column into the row index; the cells below use
# that index as the lookup key for a document's metadata.
metadata_df = pd.read_excel(
    r"N:\CECD\10. Personal\Lukas Alemu\Study Repository\99. Capstone\dissertation_rag\config\data_organisation.xlsx",
    index_col=0
)

In [32]:
# Take one raw PDF as a test case and pull the file name off the end of its
# path by splitting on backslashes and keeping the last piece.
test_file_path = r'N:\CECD\10. Personal\Lukas Alemu\Study Repository\99. Capstone\dissertation_rag\data\01_raw\Agenda for the January 2020 Benchmark and Key Issues meetings.pdf'
file_name = test_file_path.split('\\')[-1]

# Exact lookup by file name, left disabled: not every file name is an exact
# copy of an index key in the spreadsheet.
# metadata = metadata_df.loc[file_name]
# print(metadata)

Problem: not all of our file names match the index keys in the spreadsheet, because I made some errors when copying them across. Maybe a simple fuzzy match will do? Performance is not a concern here...

In [42]:
def match_name(name, df, min_score=0):
    """Return the index label of ``df`` that best fuzzy-matches ``name``.

    Args:
        name (str): the name to look up.
        df (pd.DataFrame): dataframe whose index holds the candidate names.
        min_score (int): a candidate only counts as a match when its score
            is strictly greater than this value.

    Returns:
        tuple: ``(best_name, best_score)``. When the index is empty or no
            candidate scores above ``min_score`` the result is ``("", -1)``.
    """
    no_match = ("", -1)

    candidates = df.index.tolist()
    # Guard the empty case so the scorer is never asked to rank nothing.
    if not candidates:
        return no_match

    # Score all candidates in one call instead of one call per candidate;
    # on ties the first candidate in index order is kept.
    best = process.extractOne(name, candidates)
    # Defensive: treat a missing result the same as "nothing matched".
    if best is None:
        return no_match

    best_name, best_score = best[0], best[1]
    if best_score > min_score:
        return (best_name, best_score)
    return no_match

# Try the matcher on a name that has no file extension and a trailing space,
# then show the matched spreadsheet row as a dictionary.
file_name = 'Agencies Pay and Labour Market Survey (MPC Note - January 2020) '
nm, _ = match_name(file_name, metadata_df)
metadata_df.loc[nm].to_dict()

{'Description': 'For 2020, survey respondents expect pay settlements to remain flat at the 2.9% reported for 2019. \nIn contrast, respondents expect growth in total labour costs per employee to increase somewhat.  While the difference could partly reflect the impact of non-pay benefits and changes in the composition of the workforce, we would put more weight on the steer from the settlements responses for the outlook for pay growth. \nAs in 2019, the “Ability to recruit and retain staff” and “the National Living Wage” are driving up the growth rate of total labour costs per employee in 2020.  “Brexit Uncertainty”, “Changes in profitability” and “Economic Uncertainty” continue to pull down on the change in the growth rate of total labour costs per employee. ',
 'Type': 'Recommended reading',
 'Date': Timestamp('2020-01-15 00:00:00'),
 'Authors': 'Florence Hubert, Frances Hill, Louise Parreira, Alexis Tessier, Iain Duff',
 'Topics': 'Inflation > Inflation Expectations, Business Condition

This works - let's wrap it in a function that fits the LlamaIndex boilerplate.

In [68]:


def match_notes_metadata(file_path: str, metadata_df: pd.DataFrame):
    """Match the metadata using the file name and the manual extracts
    I pasted into the 'data organisation' spreadsheet.

    The file name is taken from the end of ``file_path`` and fuzzy-matched
    against the index of ``metadata_df``; the best-matching row is returned.

    Args:
        file_path (str): absolute file path to the pdf to match; both
            backslash- and forward-slash-separated paths are accepted
        metadata_df (pd.DataFrame): dataframe of the data organisation spreadsheet

    Returns:
        dict: dictionary containing the matched metadata

    Raises:
        KeyError: if no index entry of ``metadata_df`` matches the file name
    """
    # Normalise separators first so the file name is extracted correctly
    # whether the path came from Windows or from a POSIX system.
    file_name = file_path.replace('\\', '/').rsplit('/', 1)[-1]

    idx_nm, score = match_name(file_name, metadata_df)
    # match_name reports "nothing matched" with a negative score; fail with a
    # readable message instead of looking up an empty label.
    if score < 0:
        raise KeyError(f"No metadata entry matches file name {file_name!r}")

    matched_metadata = metadata_df.loc[idx_nm].to_dict()

    return matched_metadata

def get_random_metadata(file_path: str):
    """Placeholder extractor that derives a few simple text statistics
    from a pdf, to show how extra metadata could be pulled from the text.
    It is expected to be replaced by something more sophisticated.

    Args:
        file_path (str): absolute file path to the pdf

    Returns:
        dict: character count, word count and the five most common
            whitespace-separated words (with their counts)
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # Concatenate the extracted text of every page, in page order
        full_text = ''.join(page.extract_text() for page in reader.pages)

    # Words are whatever str.split() yields on whitespace
    tokens = full_text.split()

    return {
        'num_characters': len(full_text),
        'num_words': len(tokens),
        'most_common_words': dict(Counter(tokens).most_common(5)),
    }

Let's create a function that links the pieces together and gathers all the metadata for a file

In [67]:
def get_metadata(file_path: str):
    """Collect every piece of metadata we have for one pdf.

    Combines the spreadsheet metadata matched on the file name with the
    statistics derived from the pdf text.

    Args:
        file_path (str): absolute file path to pdf

    Returns:
        dict: merged metadata; on a key clash the text-derived value wins
    """
    # Start from the spreadsheet row matched against the module-level metadata_df
    combined = dict(match_notes_metadata(file_path, metadata_df))
    # Applied second, so these entries override any duplicate keys
    combined.update(get_random_metadata(file_path))
    return combined

In [61]:
# Load the documents
# Use a concrete Path rather than PurePosixPath: a POSIX path does not treat
# backslashes as separators, so the Windows drive path would be kept as one
# opaque component instead of a real directory path.
path_to_docs = pathlib.Path(r"N:\CECD\10. Personal\Lukas Alemu\Study Repository\99. Capstone\dissertation_rag\data\01_raw")
# get_metadata is passed as the per-file metadata callback for the reader
documents = SimpleDirectoryReader(path_to_docs, file_metadata=get_metadata).load_data()

In [65]:
# Inspect the metadata attached to the first loaded document
documents[0].metadata

{'page_label': '1',
 'file_name': 'Agencies Pay and Labour Market Survey (MPC Note - January 2020).pdf',
 'Description': 'For 2020, survey respondents expect pay settlements to remain flat at the 2.9% reported for 2019. \nIn contrast, respondents expect growth in total labour costs per employee to increase somewhat.  While the difference could partly reflect the impact of non-pay benefits and changes in the composition of the workforce, we would put more weight on the steer from the settlements responses for the outlook for pay growth. \nAs in 2019, the “Ability to recruit and retain staff” and “the National Living Wage” are driving up the growth rate of total labour costs per employee in 2020.  “Brexit Uncertainty”, “Changes in profitability” and “Economic Uncertainty” continue to pull down on the change in the growth rate of total labour costs per employee. ',
 'Type': 'Recommended reading',
 'Date': Timestamp('2020-01-15 00:00:00'),
 'Authors': 'Florence Hubert, Frances Hill, Louise

# OTHER STUFF

### Persist the data into a db

In [None]:
# Instantiate the vector store, persisting the "pg.duckdb" database under path_to_db
# NOTE(review): this path starts with a double slash and looks like a macOS
# location, while the documents above are read from a Windows drive — confirm
# which machine this cell is meant to run on.
path_to_db = pathlib.PurePosixPath(r"//Users/lukasalemu/Documents/00. Bank of England/00. Degree/Dissertation/structured-rag/data/02_processed")
vector_store = DuckDBVectorStore("pg.duckdb", persist_dir=str(path_to_db))

### Configure the embedding model

In [None]:
# Instantiate the embedding model (Hugging Face model "BAAI/bge-small-en-v1.5")
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

### Configure the boilerplate

In [None]:
# Configure things to point in the right place:
# register the embedding model created above and set the chunk size to 512
Settings.embed_model = embed_model
Settings.chunk_size = 512

# Storage context that points at the DuckDB vector store created above
storage_settings = StorageContext.from_defaults(
    vector_store=vector_store,
)

### Construct the index and save

In [None]:
# Construct the index from the loaded documents, passing the storage context
# so that the vector store configured earlier is used
index = VectorStoreIndex.from_documents(documents, storage_context=storage_settings)

### Load the index from the vector db

In [None]:
# Load the vector store from the local "pg.duckdb" file under path_to_db,
# then rebuild the index on top of it (rebinds vector_store and index)
vector_store = DuckDBVectorStore.from_local(str(path_to_db/"pg.duckdb"))
index = VectorStoreIndex.from_vector_store(vector_store)

### Query

In [None]:
# Now we can retrieve similar documents to a given query
query_text = "What is the forecast"

# Retrieve with the index's default retriever configuration
results = index.as_retriever().retrieve(query_text)

# Show how many results came back, then the text and metadata of the first one
print(len(results))
print(results[0].text)
print(results[0].metadata)