# RAG

This work will look at the implementation of RAG within NHS England. This notebook contains a simple RAG pipeline which 

In [None]:
import glob
import os

import toml
from dotenv import load_dotenv


import src.models as models

from tqdm import tqdm

config = toml.load("config.toml")
load_dotenv(".secrets")
os.environ["ANTHROPIC_API_KEY"] = os.getenv("anthropic_key")

if config['DEV_MODE']:
    config['PERSIST_DIRECTORY'] += "/dev"


In [None]:
rag_pipeline = models.RagPipeline(config['EMBEDDING_MODEL'], config['PERSIST_DIRECTORY'])

In [None]:
if (not config['DEV_MODE']):  # won't populate the database if in dev mode - we can just use what was already loaded.
    rag_pipeline.load_documents()

In [None]:
question = "Explain the main benefits of Reproducible Analytical Pipelines (RAP)"

result = rag_pipeline.answer_question(question, rag=False)

print(result)

In [None]:
rag_pipeline.retriever.get_relevant_documents("What is analytical best practice?")

In [47]:
extract_text_from_html_json_chunk([{"mainEntityOfPage": [{"name": "markdown", "text": "some text"}]}]).getvalue()

[{'name': 'markdown', 'text': 'some text'}]
dict_keys(['mainEntityOfPage'])

dict_keys(['name', 'text'])


'some text'

In [51]:
import jsonlines
import json
import io
import src.data_ingestion.simple_nhs_conditions_scrape as simple_nhs_conditions_scrape


def extract_text_from_html_json_chunk(ent: dict, text: str=None, key: str='mainEntityOfPage') -> io.StringIO:
    """Extracts text from the json object that contains the html for the nhs conditions pages
    Args:
    ent (dict): the json object
    text (str, optional): the text to append to. Defaults to None.
    key (str, optional): the key to look for in the json object. Defaults to "mainEntityOfPage".
    Returns:
    io.StringIO: the text extracted from the json object
    
    Example:
    >>> extract_text_from_html_json_chunk([{"mainEntityOfPage": [{"name": "markdown", "text": "some text"}]}]).getvalue()
    'some text'
    """   
    if text is None:
        text = io.StringIO()
    for elt in ent:
        nested = elt.get(key,"")
        if key == "mainEntityOfPage":
            if elt.get("name") == "markdown":
                text.write(elt.get('text'))
        elif key == "hasPart":
            headline_field = elt.get("headline","")
            text_field = elt.get("text","")
            description_field = elt.get("description","")
            text.write(headline_field + " " + description_field + " " + text_field + " ")

        if isinstance(nested, list):
            extract_text_from_html_json_chunk(nested, text, key = key)
    return text


def process_nhs_conditions_json(json_string: str) -> str:
    """Extracts the text from the json object that contains the html for the nhs conditions pages
    Args:
        json_string (str): the MHS Conditions json object (for a page), from the API.
    Returns:
        str: the text extracted from the json object
    Example:
    >>> process_nhs_conditions_json({"html": '{"mainEntityOfPage": [{"name": "markdown", "text": "some text"}]}'}).getvalue()
    'some text'
    """
    try:
        content = json.loads(obj['html'])

        # there are currently two types of html structure in the nhs conditions data - one where the mainEntityOfPage holds the text, and one where the hasPart holds the text
        main_entity_text = extract_text_from_html_json_chunk(content['mainEntityOfPage'], key='mainEntityOfPage').getvalue()
        has_part_text = extract_text_from_html_json_chunk(content['hasPart'], key='hasPart').getvalue()
        all_text = main_entity_text + has_part_text
    except json.JSONDecodeError as e:
        # some of the pages are just in HTML, not as a json object
        print("Error decoding json for ", obj['source_url'])
        all_text = ""
    
    return all_text

import doctest
doctest.testmod()


# with jsonlines.open('nhsconditions.jsonl') as reader:
#     for index, obj in enumerate(reader):
#         extracted_text = process_nhs_conditions_json(obj)

#         print(index, ": ", obj['source_url'] , " === ", extracted_text)


**********************************************************************
File "__main__", line 45, in __main__.process_nhs_conditions_json
Failed example:
    process_nhs_conditions_json({"html": '{"mainEntityOfPage": [{"name": "markdown", "text": "some text"}]}'}).getvalue()
Exception raised:
    Traceback (most recent call last):
      File "D:\python3115\Lib\doctest.py", line 1351, in __run
        exec(compile(example.source, filename, "single",
      File "<doctest __main__.process_nhs_conditions_json[0]>", line 1, in <module>
        process_nhs_conditions_json({"html": '{"mainEntityOfPage": [{"name": "markdown", "text": "some text"}]}'}).getvalue()
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    AttributeError: 'str' object has no attribute 'getvalue'
**********************************************************************
1 items had failures:
   1 of   1 in __main__.process_nhs_conditions_json
***Test 

TestResults(failed=1, attempted=2)