In [6]:
def pretty_print_docs(docs):
    """Print every document's ``page_content`` under a numbered heading.

    Documents are numbered from 1 and separated by a 100-character dashed
    rule on its own line.

    Args:
    docs: An iterable of objects exposing a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [5]:
import getpass
import os

# SECURITY: credentials must never be hardcoded in a notebook -- they end up in
# version control and in every shared copy. Any key that was previously
# committed here should be treated as leaked and revoked with the provider.
#
# Each secret is taken from the environment when already set (e.g. exported in
# the shell that launched Jupyter); otherwise it is requested interactively
# without echoing and stored only in this kernel's environment.
for _secret_name in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass.getpass(f"Enter {_secret_name}: ")

## Process PDFs


In [9]:
import urllib.request


def download_pdf(url: str, output_path: str) -> None:
    """
    Fetch the resource at ``url`` and write it to ``output_path``.

    The transfer is delegated entirely to ``urllib.request.urlretrieve``;
    any exception it raises propagates to the caller.

    Args:
    url (str): Location of the PDF to fetch.
    output_path (str): Destination file path for the downloaded bytes.

    Returns:
    None
    """
    urllib.request.urlretrieve(url, filename=output_path)

In [25]:
import re
from typing import List
from langchain.schema import Document


def preprocess(data: List[Document] | str) -> List[Document] | str:
    """
    preprocess(data: List[Document] | str) -> List[Document] | str

    This function preprocesses the input data, either a string or a list of Document objects, by
    removing excess whitespace and replacing newline characters with spaces.

    Args:
    data (List[Document] | str): The input data to preprocess, either a string or a list of Document objects.

    Returns:
    List[Document] | str: The preprocessed input data, either a string or a list of Document objects.
    """

    if not data:
        return data
    if isinstance(data, str):
        data = re.sub("\s+", " ", data)
        data = data.replace("\n", " ")
        return data
    if isinstance(data[0], str):
        data = re.sub("\s+", " ", data)
        data = data.replace("\n", " ")
        return data
    else:
        for d in data:
            d.page_content = re.sub("\s+", " ", d.page_content)
            d.page_content = d.page_content.replace("\n", " ")
        return data

In [14]:
import fitz


def pdf_to_text(path: str, start_page: int = 1, end_page: int = 0) -> List[str]:
    """
    pdf_to_text(path: str, start_page: int = 1, end_page: int = 0) -> List[str]

    This function extracts text from a PDF file, given a file path, and returns a list of strings, where each
    string corresponds to the extracted (and whitespace-normalised) text of a page within the specified range.

    Args:
    path (str): The file path of the PDF file to extract text from.
    start_page (int, optional): The 1-based starting page number. Values below 1 are treated as 1. Defaults to 1.
    end_page (int, optional): The 1-based, inclusive ending page number. If 0, extracts text until the last page.
        Values beyond the last page are clamped to the last page. Defaults to 0.

    Returns:
    List[str]: One string per successfully extracted page in the range. An empty list is returned when the
        file cannot be opened; pages whose extraction fails are reported on stdout and skipped.
    """
    try:
        doc = fitz.open(path)
    except (OSError, RuntimeError):
        # PyMuPDF signals missing/unreadable files with RuntimeError-derived
        # exceptions (version dependent), so OSError alone is not sufficient.
        print(f"Error: could not open file {path}")
        return []

    text_list: List[str] = []
    try:
        total_pages: int = doc.page_count

        # Clamp to the real page range. Without the lower clamp a start_page of
        # 0 (or less) becomes a negative index, which load_page interprets as
        # counting from the end of the document and returns the wrong page.
        first_index = max(start_page - 1, 0)
        stop_index = total_pages if end_page == 0 else min(end_page, total_pages)

        for i in range(first_index, stop_index):
            try:
                text = doc.load_page(i).get_text("text")
                text_list.append(preprocess(text))
            except Exception:
                # Best effort: one unreadable page must not discard the rest.
                print(
                    f"Error: could not extract text from page {i+1} in file {path}")
    finally:
        doc.close()
    return text_list

In [30]:
def text_to_documents(
    texts: List[str], path: str, word_length: int = 80, start_page: int = 1
) -> List[Document]:
    """
    Split page texts into fixed-size word chunks and wrap each chunk in a Document.

    Every text is split on single spaces. Chunks hold ``word_length`` words; a
    shorter leftover at the end of a page is not emitted but prepended to the
    next page's words, so it is attributed to that next page. Only the final
    page may emit a chunk shorter than ``word_length``.

    :param texts: Page texts, one string per page
    :type texts: List[str]

    :param path: Source identifier embedded in each Document's metadata
    :type path: str

    :param word_length: Number of words per chunk. Default is 80.
    :type word_length: int

    :param start_page: Page number reported for the first text. Default is 1.
    :type start_page: int

    :return: Documents whose metadata is ``{"Source": "Page <n> from <path>"}``
    :rtype: List[Document]
    """
    pages = [page_text.split(" ") for page_text in texts]
    final_page = len(pages) - 1
    docs: List[Document] = []

    for page_idx in range(len(pages)):
        # Read from `pages` at iteration time: the previous page may have
        # pushed its leftover words onto the front of this one.
        tokens = pages[page_idx]
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset: offset + word_length]
            if len(piece) < word_length and page_idx != final_page:
                # Short tail of a non-final page: carry it over instead.
                pages[page_idx + 1] = piece + pages[page_idx + 1]
                continue
            docs.append(
                Document(
                    page_content=" ".join(piece).strip(),
                    metadata={"Source": f"Page {page_idx+start_page} from {path}"},
                )
            )
    return docs

In [35]:
def normal_process(path: str) -> List[Document]:
    """
    normal_process(path: str) -> List[Document]

    Runs the two-step pipeline for a single PDF: extract the page texts with
    ``pdf_to_text`` and chunk them with ``text_to_documents``. The number of
    resulting documents is printed.

    Args:
    path (str): The file path of the PDF file to process.

    Returns:
    List[Document]: The documents produced by ``text_to_documents`` for this file.
    """
    page_texts = pdf_to_text(path)
    chunks: List[Document] = text_to_documents(page_texts, path)
    print(f"Processed {len(chunks)} documents from {path}")
    return chunks

In [17]:
def flatten_array(arr):
    """
    flatten_array(arr)

    Recursively expands every ``list`` element of ``arr`` and returns all
    remaining elements in their original order. Only ``list`` instances are
    expanded; tuples and other containers are kept as single elements.

    Args:
    arr: A (possibly nested) list to be flattened.

    Returns:
    list: A new flat list containing all non-list elements of the input.
    """
    flat = []
    for item in arr:
        flat += flatten_array(item) if isinstance(item, list) else [item]
    return flat

In [18]:
import glob


def embed_directory(path: str) -> List[Document]:
    """
    embed_directory(path: str) -> List[Document]

    This function processes all PDF files directly inside a directory (non-recursive, ``*.pdf``) with
    ``normal_process`` and returns the combined list of resulting Document objects.

    Args:
    path (str): The path of the directory containing the PDF files to process.

    Returns:
    List[Document]: The documents of every processed PDF, concatenated in sorted file-name order.
        Empty when the directory contains no PDF files.
    """
    # Escape the directory so glob metacharacters in `path` are taken literally.
    pattern = os.path.join(glob.escape(path), "*.pdf")

    documents: List[Document] = []
    # Sorted for a reproducible order; glob's own ordering is unspecified.
    for pdf_file in sorted(glob.glob(pattern)):
        documents.extend(normal_process(pdf_file))

    print(f"Processed {len(documents)} documents from {path}")
    return documents

In [None]:
# `docs` keeps one list of chunks per PDF; `d` is the flat list of all chunks
# that the later cells save and index. Each chunk appears exactly once in `d`.
pdf_files = glob.glob("pdfs/*.pdf")
docs = [normal_process(pdf_file) for pdf_file in pdf_files]
d = flatten_array(docs)
# Show the second chunk as a sample; fall back to the whole (short) list.
d[1] if len(d) > 1 else d

### Save docs to CSV for longevity


In [20]:
import csv


def save_docs_to_csv(save_file: str, docs: List[Document]) -> None:
    """
    save_docs_to_csv(save_file: str, docs: List[Document])

    This function saves a list of Document objects to a UTF-8 encoded CSV file, given a file path.
    The file gets a header row ("Page Content", "Metadata") followed by one row per document;
    the metadata value is written as its string representation. An existing file is overwritten.

    Args:
    save_file (str): The file path of the CSV file to save the Document objects to.
    docs (List[Document]): The list of Document objects to save to the CSV file.
    """
    # Explicit UTF-8: the platform default encoding may be unable to represent
    # characters found in extracted PDF text.
    with open(save_file, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Page Content", "Metadata"])
        writer.writerows((doc.page_content, doc.metadata) for doc in docs)

In [None]:
# Persist the flattened chunk list `d` so the PDFs need not be re-parsed.
save_docs_to_csv(save_file="pdfs.csv", docs=d)

### Create embeddings and save index to disk


#### Embedding model


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model used for every chunk and for queries.
# Alternative checkpoint: "sentence-transformers/all-mpnet-base-v2"
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

#### Vectorstore


In [None]:
from langchain.vectorstores import FAISS

# Embed every chunk in `d` into a FAISS index, then write the index to the
# "dbs" folder under the name "faiss_index".
db = FAISS.from_documents(d, embeddings)
db.save_local(index_name="faiss_index", folder_path="dbs")

## Load index


In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Reload the index saved under dbs/faiss_index using the same embedding model.
db = FAISS.load_local(
    embeddings=embeddings, folder_path="dbs", index_name="faiss_index"
)
# The compressor runs an LLM extraction chain over the documents returned by
# the base retriever before they are handed back to the caller.
compressor = LLMChainExtractor.from_llm(llm=OpenAI(temperature=0))
retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=db.as_retriever(),
)

## Prep models


### Prompts


In [20]:
from langchain.prompts import PromptTemplate

# Adjacent string literals are concatenated verbatim, so every sentence carries
# its own trailing space; without it the instructions run together
# ("...below.Do not leave...") in the text sent to the model.
answer_prompt = PromptTemplate(
    template=(
        "Context:\n{context}\n\n"
        "Instruction: Using the context above, answer the question below. "
        "Do not leave any information out. "
        "Do not add any information that is not in the context. "
        "Answer step-by-step.\n\nQuery: {query}\nAnswer: "
    ),
    input_variables=["query", "context"],
)

summary_prompt = PromptTemplate(
    template=(
        "Context:\n{context}\n\n"
        "Instruction: Using the context above, write a concise summary of the text as it relates to the query below. "
        "Answer step-by-step.\nQuery:\n{query}\n\nSummary: "
    ),
    input_variables=["context", "query"],
)

### Models


In [19]:
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# GPT-4 chat model: deterministic (temperature 0) and streaming, with each
# token echoed to stdout through the callback handler.
gpt4 = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    callbacks=[StreamingStdOutCallbackHandler()],
    streaming=True,
)
# GPT-3.5 chat model: deterministic, non-streaming.
gpt3turbo = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

### Chains


In [21]:
from langchain.chains import LLMChain

answer_chain = LLMChain(
    llm=ChatOpenAI(temperature=0, max_tokens=512), prompt=answer_prompt
)
summary_chain = LLMChain(
    llm=OpenAI(temperature=0, max_tokens=1048), prompt=summary_prompt
)
# Backward-compatible alias for the original, misspelled name.
summmary_chain = summary_chain


def qa(query: str) -> str:
    """
    qa(query: str) -> str

    Retrieves the documents relevant to the query, prints them with
    ``pretty_print_docs``, and returns the output of ``summary_chain`` for the
    query and the retrieved documents.

    Args:
    query (str): The question to look up and summarise.

    Returns:
    str: The value returned by ``summary_chain.run``.
    """
    context = retriever.get_relevant_documents(query=query)
    pretty_print_docs(context)
    # NOTE(review): the retrieved Document list is passed as-is, so the prompt
    # presumably receives its string representation (content plus metadata) --
    # confirm this is intended before changing it.
    return summary_chain.run({"query": query, "context": context})

NameError: name 'OpenAI' is not defined

## QA


In [None]:
# Example query; the retrieved passages are printed and the summary is the cell's output.
qa(query="What is edema")

Document 1:

"Edema is easily recognized grossly; microscopically, it is appreciated as clearing and separation of the extracellular matrix (ECM) and subtle cell swelling. Edema is most commonly seen in subcutaneous tissues, the lungs, and the brain. Subcutaneous edema can be diffuse or more conspicuous in regions with high hydrostatic pressures. Its distribution is often influenced by gravity (e.g., it appears in the legs when standing and the sacrum when recumbent), a feature termed dependent edema. Finger pressure over markedly edematous subcutaneous tissue displaces the interstitial fluid and leaves a depression, a sign called pitting edema. Edema resulting from renal dysfunction often appears initially in parts of the body containing loose connective tissue, such as the eyelids; periorbital edema is thus a characteristic finding in severe renal disease. With pulmonary edema, the lungs are often two to three times their normal weight, and sectioning yields frothy, blood-tinged flui

"\nEdema is a condition characterized by swelling due to an accumulation of fluid in the body's tissues. It is most commonly seen in subcutaneous tissues, the lungs, and the brain. It is easily recognized by its gross appearance and microscopically, it is appreciated as clearing and separation of the extracellular matrix (ECM) and subtle cell swelling. Edema can be diffuse or more conspicuous in regions with high hydrostatic pressures and its distribution is often influenced by gravity. It can also be caused by renal dysfunction and appears initially in parts of the body containing loose connective tissue, such as the eyelids. With pulmonary edema, the lungs are often two to three times their normal weight and sectioning yields frothy, blood-tinged fluid. Brain edema can be localized or generalized depending on the nature and extent of the pathologic process or injury. [Source: Page 1 from Edema.pdf]."