In [1]:
from typing import List
import urllib.request
import os
import glob
import fitz
import re
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.llms import OpenAI, LlamaCpp
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
import openai
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
# Helper function for printing docs
def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )

In [3]:
os.environ["OPENAI_API_KEY"] = "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"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [4]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
llm = OpenAI(temperature=0, model="text-davinci-003", max_tokens=2000)


def generate(
    # TODO - add a way to use this instead of langchain wrapper bc langchain wrapper is 3x slower
    prompt: str,
    model: str = "text-davinci-003",
    temperature: float = 0.0,
) -> str:
    return openai.Completion.create(model=model, prompt=prompt, temperature=temperature)

  from .autonotebook import tqdm as notebook_tqdm
                    model was transfered to model_kwargs.
                    Please confirm that model is what you intended.


In [5]:
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
llm = LlamaCpp(
    model_path="/Users/home/llama.cpp/models/wizardLM-7B.ggml.q4_2.bin",
    callback_manager=callback_manager,
    verbose=True,
    max_tokens=2000,
)

llama.cpp: loading model from /Users/home/llama.cpp/models/wizardLM-7B.ggml.q4_2.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 5 (mostly Q4_2)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =  59.11 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 1026.00 MB per state)
llama_init_from_file: kv self size  =  256.00 MB
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 


In [6]:
def download_pdf(url: str, output_path: str) -> None:
    urllib.request.urlretrieve(url, output_path)


def preprocess(data: List[Document] | str) -> List[Document] | str:
    if not data:
        return data
    if isinstance(data, str):
        data = re.sub("\s+", " ", data)
        data = data.replace("\n", " ")
        return data
    if isinstance(data[0], str):
        data = re.sub("\s+", " ", data)
        data = data.replace("\n", " ")
        return data
    else:
        for d in data:
            d.page_content = re.sub("\s+", " ", d.page_content)
            d.page_content = d.page_content.replace("\n", " ")
        return data


def pdf_to_text(path: str, start_page: int = 1, end_page: int = 0) -> List[str]:
    try:
        doc = fitz.open(path)
    except OSError:
        print(f"Error: could not open file {path}")
        return []
    total_pages: int = doc.page_count

    if end_page == 0:
        end_page = total_pages

    text_list = []

    for i in range(start_page - 1, end_page):
        try:
            text = doc.load_page(i).get_text("text")
            text = preprocess(text)
            text_list.append(text)
        except Exception:
            print(f"Error: could not extract text from page {i+1} in file {path}")

    doc.close()
    return text_list


def text_to_chunks(
    texts: List[str], path: str, word_length: int = 300, start_page: int = 1
) -> List[str]:
    text_toks = [t.split(" ") for t in texts]
    page_nums = []
    chunks = []

    for idx, words in enumerate(text_toks):
        for i in range(0, len(words), word_length):
            chunk = words[i : i + word_length]
            if (
                (i + word_length) > len(words)
                and (len(chunk) < word_length)
                and (len(text_toks) != (idx + 1))
            ):
                text_toks[idx + 1] = chunk + text_toks[idx + 1]
                continue
            chunk = " ".join(chunk).strip()
            chunk = f"[Page: {idx+start_page} from {path}]" + " " + '"' + chunk + '"'
            chunks.append(chunk)
    return chunks


def normal_process(path: str) -> List[Document]:
    documents: List[Document] = []
    texts = pdf_to_text(path)
    data = text_to_chunks(texts, path)
    for text in data:
        documents.append(Document(page_content=text))
    print(f"Processed {len(documents)} documents from {path}")
    return documents


def flatten_array(arr):
    result = []
    for i in arr:
        if isinstance(i, list):
            result.extend(flatten_array(i))
        else:
            result.append(i)
    return result


def embed_directory(path: str) -> List[Document]:
    documents: List[Document] = []
    pdf_files = glob.glob("pdfs/*.pdf")
    for pdf_file in pdf_files:
        docs.append(normal_process(pdf_file))

    docs = flatten_array([normal_process(pdf_file) for pdf_file in pdf_files])

    print(f"Processed {len(documents)} documents from {path}")
    return documents

In [7]:
docs = []
pdf_files = glob.glob("pdfs/*.pdf")
docs = [flatten_array(normal_process(pdf_file)) for pdf_file in pdf_files]

Processed 8 documents from pdfs/Joseph Sambrook, David W. Russel - Molecular Cloning_ A Laboratory Manual. Volume 1, 2, & 3 (in one file).pdf
Processed 1531 documents from pdfs/Diagnostic%20and%20statistical%20manual%20of%20mental%20disorders%20_%20DSM-5%20%28%20PDFDrive.com%20%29.pdf
Processed 4 documents from pdfs/Asthma-1.pdf
Processed 6 documents from pdfs/Neoplasia contd.pdf
Processed 2992 documents from pdfs/Robbins and Cotran Pathologic Basis of Disease, 10th Edition (VetBooks.ir).pdf
Processed 7 documents from pdfs/Nutritional Diseases.pdf
Processed 4 documents from pdfs/Kevin M. G. Taylor, Michael E. Aulton - Aulton's Pharmaceutics_ The Design and Manufacture of Medicines-Elsevier (2021).pdf
Processed 2 documents from pdfs/Micriobiology mod5.pdf
Processed 4 documents from pdfs/Systemic Effects of Inflammation & Tissue Repair.pdf
Processed 2222 documents from pdfs/Joanne Willey, Kathleen Sandman, Dorothy Wood - Prescott's Microbiology 11th Edition-McGraw-Hill Education (2019).p

In [8]:
d = []
for doc in docs:
    d.append([flatten_array(doc) for doc in docs])
d = flatten_array(d)

Document(page_content='[Page: 300 from pdfs/Joseph Sambrook, David W. Russel - Molecular Cloning_ A Laboratory Manual. Volume 1, 2, & 3 (in one file).pdf] ""', metadata={})

In [None]:
d[0]

In [9]:
db = FAISS.from_documents(d, embeddings)
db.save_local(folder_path="dbs", index_name="manual_db")

In [10]:
db = FAISS.load_local(folder_path="dbs", index_name="manual_db", embeddings=embeddings)
compressor = LLMChainExtractor.from_llm(llm=OpenAI(temperature=0, max_tokens=512))
retriever = db.as_retriever()
retriever = ContextualCompressionRetriever(
    base_retriever=retriever, base_compressor=compressor
)

In [11]:
answer_prompt = PromptTemplate(
    template="Context:\n{context}\n\n"
    "Instruction: Using the context above, answer the question below."
    "Do not leave any information out."
    "Do not add any information that is not in the context."
    "Answer step-by-step. \n\nQuery: {query}\nAnswer: ",
    input_variables=["query", "context"],
)

summary_prompt = PromptTemplate(
    template="Context:\n{context}\n\n"
    "Instruction: Using the context above, write a concise summary of the text as it relates to the query below."
    "Make sure to include the source of the information. The source is indicated by [Page: X from Y.pdf]."
    "Answer step-by-step.\nQuery:\n{query}\n\nSummary: ",
    input_variables=["context", "query"],
)

In [12]:
answer_chain = LLMChain(llm=OpenAI(temperature=0, max_tokens=512), prompt=answer_prompt)
summmary_chain = LLMChain(
    llm=OpenAI(temperature=0, max_tokens=1048), prompt=summary_prompt
)


def qa(query: str) -> str:
    context = retriever.get_relevant_documents(query=query)
    pretty_print_docs(context)
    summary = summmary_chain.run({"query": query, "context": context})
    return summary

In [15]:
qa("What is edema")

Document 1:

"Edema is easily recognized grossly; microscopically, it is appreciated as clearing and separation of the extracellular matrix (ECM) and subtle cell swelling. Edema is most commonly seen in subcutaneous tissues, the lungs, and the brain. Subcutaneous edema can be diffuse or more conspicuous in regions with high hydrostatic pressures. Its distribution is often influenced by gravity (e.g., it appears in the legs when standing and the sacrum when recumbent), a feature termed dependent edema. Finger pressure over markedly edematous subcutaneous tissue displaces the interstitial fluid and leaves a depression, a sign called pitting edema. Edema resulting from renal dysfunction often appears initially in parts of the body containing loose connective tissue, such as the eyelids; periorbital edema is thus a characteristic finding in severe renal disease. With pulmonary edema, the lungs are often two to three times their normal weight, and sectioning yields frothy, blood-tinged flui

"\nEdema is a condition characterized by swelling due to an accumulation of fluid in the body's tissues. It is most commonly seen in subcutaneous tissues, the lungs, and the brain. It is easily recognized by its gross appearance and microscopically, it is appreciated as clearing and separation of the extracellular matrix (ECM) and subtle cell swelling. Edema can be diffuse or more conspicuous in regions with high hydrostatic pressures and its distribution is often influenced by gravity. It can also be caused by renal dysfunction and appears initially in parts of the body containing loose connective tissue, such as the eyelids. With pulmonary edema, the lungs are often two to three times their normal weight and sectioning yields frothy, blood-tinged fluid. Brain edema can be localized or generalized depending on the nature and extent of the pathologic process or injury. [Source: Page 1 from Edema.pdf]."