In [21]:
from typing import List
import urllib.request
import os
import glob
import fitz
import re
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.llms import OpenAI, LlamaCpp
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
import openai
from langchain.callbacks.base import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler


In [46]:
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Consecutive documents are separated by a line of 100 dashes. Every item in
    ``docs`` must expose a ``page_content`` string.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [3]:
# Credentials are read from the process environment instead of being written
# into the notebook. Export OPENAI_API_KEY and HUGGINGFACEHUB_API_TOKEN before
# starting the kernel. Any key that was ever saved in this cell is readable by
# everyone with access to the file and should be revoked and reissued.
for _secret_name in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(_secret_name):
        print(f"Warning: {_secret_name} is not set; calls that need it are expected to fail.")

In [4]:
# Embedding model shared by the cells that build and reload the FAISS index.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# `model_name` is the keyword this wrapper declares. Passing `model=` was only
# accepted by being moved into model_kwargs, which is what produced the
# "model was transfered to model_kwargs" warning in this cell's output.
llm = OpenAI(temperature=0, model_name="text-davinci-003", max_tokens=2000)


def generate(
    prompt: str,
    model: str = "text-davinci-003",
    temperature: float = 0.0,
) -> str:
    """Complete ``prompt`` by calling the OpenAI completion API directly.

    Args:
        prompt: Text sent to the model.
        model: Name of the completion model to use.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The generated text of the first completion choice.
    """
    # TODO - add a way to use this instead of langchain wrapper bc langchain wrapper is 3x slower
    # NOTE(review): no max_tokens is passed, so the API's default length limit
    # applies - confirm that is intended.
    response = openai.Completion.create(
        model=model, prompt=prompt, temperature=temperature
    )
    # The API call returns a response object rather than a string; the
    # generated text lives in the first entry of "choices".
    return response["choices"][0]["text"]

  from .autonotebook import tqdm as notebook_tqdm
                    model was transfered to model_kwargs.
                    Please confirm that model is what you intended.


In [34]:
# Attach the stdout streaming callback handler to the local model.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# The weights location can be overridden with the LLAMA_MODEL_PATH environment
# variable so the notebook is not tied to one machine's directory layout; the
# fallback is the path this cell has always used.
# Note that this rebinds `llm`, replacing the OpenAI instance created earlier.
# NOTE(review): the load log for this model reports n_ctx = 512 while
# max_tokens is 2000 - confirm the context window is large enough.
llm = LlamaCpp(
    model_path=os.environ.get(
        "LLAMA_MODEL_PATH",
        "/Users/home/llama.cpp/models/wizardLM-7B.ggml.q4_2.bin",
    ),
    callback_manager=callback_manager,
    verbose=True,
    max_tokens=2000,
)


llama.cpp: loading model from /Users/home/llama.cpp/models/wizardLM-7B.ggml.q4_2.bin
llama_model_load_internal: format     = ggjt v1 (latest)
llama_model_load_internal: n_vocab    = 32001
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: ftype      = 5 (mostly Q4_2)
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: n_parts    = 1
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =  59.11 KB
llama_model_load_internal: mem required  = 5809.33 MB (+ 1026.00 MB per state)
AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | 
llama_init_from_file: kv self size  =  256.00 MB


In [160]:
def download_pdf(url: str, output_path: str) -> None:
    """Fetch the resource at ``url`` and store it on disk at ``output_path``."""
    urllib.request.urlretrieve(url, filename=output_path)


def preprocess(
    data: "List[Document] | List[str] | str",
) -> "List[Document] | List[str] | str":
    """Collapse every run of whitespace (newlines included) into one space.

    Args:
        data: A string, a list of strings, or a list of objects exposing a
            ``page_content`` string attribute. The type of the first list item
            decides how the whole list is handled.

    Returns:
        ``data`` itself when it is empty; a cleaned string for string input; a
        new list of cleaned strings for a list of strings; otherwise the same
        list object, whose items had ``page_content`` rewritten in place.
    """

    def collapse(text: str) -> str:
        # The pattern already matches newlines, so one substitution is enough.
        return re.sub(r"\s+", " ", text)

    if not data:
        return data
    if isinstance(data, str):
        return collapse(data)
    if isinstance(data[0], str):
        # re.sub only accepts a single string, so clean the list item by item.
        return [collapse(item) for item in data]
    for doc in data:
        doc.page_content = collapse(doc.page_content)
    return data


def pdf_to_text(path: str, start_page: int = 1, end_page: int = 0) -> List[str]:
    """Extract whitespace-normalised text from a range of pages of a PDF.

    Args:
        path: Location of the PDF file.
        start_page: First page to read, 1-based. Values below 1 are treated
            as 1.
        end_page: Last page to read, 1-based and inclusive. ``0`` means
            "through the last page"; values past the end of the document are
            clamped to the last page.

    Returns:
        One string per page in the requested range, in page order. A page whose
        text cannot be extracted contributes an empty string, so list position
        ``k`` always corresponds to page ``max(start_page, 1) + k``. An empty
        list is returned when the file cannot be opened.
    """
    try:
        doc = fitz.open(path)
    except (OSError, RuntimeError):
        # NOTE(review): PyMuPDF is understood to raise RuntimeError subclasses
        # (not OSError) for missing or corrupt files in some versions, so both
        # families are caught here - confirm against the installed version.
        print(f"Error: could not open file {path}")
        return []

    text_list = []
    try:
        total_pages: int = doc.page_count
        # Clamp the range to pages that exist: a negative index would otherwise
        # count from the end, and an index past the end would fail per page.
        first_index = max(start_page, 1) - 1
        last_index = total_pages if end_page == 0 else min(end_page, total_pages)

        for i in range(first_index, last_index):
            try:
                text = preprocess(doc.load_page(i).get_text("text"))
            except Exception:
                print(
                    f"Error: could not extract text from page {i+1} in file {path}")
                # Keep a placeholder so later pages keep their page numbers.
                text = ""
            text_list.append(text)
    finally:
        # Release the file handle even if something unexpected escapes the loop.
        doc.close()
    return text_list


def text_to_chunks(
    texts: List[str], path: str, word_length: int = 300, start_page: int = 1
) -> List[str]:
    """Split page texts into labelled chunks of at most ``word_length`` words.

    Each page is split on single spaces. A trailing piece shorter than
    ``word_length`` is carried over to the front of the next page instead of
    being emitted, except on the final page, where it is emitted as is. Every
    chunk is rendered as ``[Page: N from path] "words"``, where ``N`` is the
    index of the page being processed plus ``start_page``.
    """
    pages = [page_text.split(" ") for page_text in texts]
    final_index = len(pages) - 1
    labelled = []

    for page_index, tokens in enumerate(pages):
        for offset in range(0, len(tokens), word_length):
            piece = tokens[offset: offset + word_length]
            # Only the last slice of a page can come up short.
            if len(piece) < word_length and page_index != final_index:
                pages[page_index + 1] = piece + pages[page_index + 1]
                continue
            body = " ".join(piece).strip()
            labelled.append(f'[Page: {page_index + start_page} from {path}] "{body}"')
    return labelled


def normal_process(path: str) -> List[Document]:
    """Turn the PDF at ``path`` into one ``Document`` per text chunk.

    The page texts come from ``pdf_to_text`` and are chunked by
    ``text_to_chunks`` with their default settings. The number of documents
    produced is printed before they are returned.
    """
    page_texts = pdf_to_text(path)
    documents: List[Document] = [
        Document(page_content=chunk) for chunk in text_to_chunks(page_texts, path)
    ]
    print(f"Processed {len(documents)} documents from {path}")
    return documents


def flatten_array(arr):
    """Return a flat list of the items in ``arr``, expanding nested lists.

    Only ``list`` instances are expanded (recursively); every other value,
    tuples included, is kept as a single item.
    """
    flat = []
    for item in arr:
        flat += flatten_array(item) if isinstance(item, list) else [item]
    return flat


def embed_directory(path: str) -> "List[Document]":
    """Chunk every PDF directly inside ``path`` into documents.

    Args:
        path: Directory to scan for files matching ``*.pdf`` (not recursive).

    Returns:
        One flat list holding the documents produced by ``normal_process`` for
        each PDF, with files visited in sorted path order.
    """
    # glob.escape keeps characters such as "[" in the directory name from being
    # read as wildcard syntax.
    pattern = os.path.join(glob.escape(path), "*.pdf")
    documents = []
    for pdf_file in sorted(glob.glob(pattern)):
        # Each file is processed exactly once and its documents are added to
        # the same list that is returned.
        documents.extend(normal_process(pdf_file))

    print(f"Processed {len(documents)} documents from {path}")
    return documents

In [None]:
# Chunk each PDF exactly once and keep a single flat list of Document objects,
# which is the shape handed to FAISS.from_documents when the index is built.
# Sorting makes the document order independent of the filesystem's listing order.
pdf_files = sorted(glob.glob("pdfs/*.pdf"))
docs = flatten_array([normal_process(pdf_file) for pdf_file in pdf_files])


In [167]:
# Embed everything in `docs` and write the resulting index to the "dbs" folder
# under the index name "manual_db".
db = FAISS.from_documents(docs, embeddings)
db.save_local(index_name="manual_db", folder_path="dbs")


In [49]:
# Reload the saved index, passing the same `embeddings` object used to build it.
# NOTE(review): load_local presumably unpickles the stored docstore - only point
# it at index folders you created yourself; confirm for the installed version.
db = FAISS.load_local(
    folder_path="dbs",
    index_name="manual_db",
    embeddings=embeddings,
)
# The extractor gets its own OpenAI model rather than the notebook-level `llm`.
compressor = LLMChainExtractor.from_llm(llm=OpenAI(temperature=0, max_tokens=512))
# Wrap the plain vector-store retriever with the extractor-based compressor.
retriever = ContextualCompressionRetriever(
    base_retriever=db.as_retriever(),
    base_compressor=compressor,
)

In [100]:
# Prompt for answering a query strictly from the retrieved context.
# Adjacent string literals are concatenated verbatim, so each sentence ends
# with an explicit space to keep the instructions from running together.
answer_prompt = PromptTemplate(
    template="Context:\n{context}\n\n"
    "Instruction: Using the context above, answer the question below. "
    "Do not leave any information out. "
    "Do not add any information that is not in the context. "
    "Answer step-by-step. \n\nQuery: {query}\nAnswer: ",
    input_variables=["query", "context"],
)

# Prompt for a query-focused summary that cites the "[Page: X from Y.pdf]"
# markers embedded in each chunk.
# Adjacent string literals are concatenated verbatim, so each sentence ends
# with an explicit space to keep the instructions from running together.
summary_prompt = PromptTemplate(
    template="Context:\n{context}\n\n"
    "Instruction: Using the context above, write a concise summary of the text as it relates to the query below. "
    "Make sure to include the source of the information. The source is indicated by [Page: X from Y.pdf]. "
    "Answer step-by-step.\nQuery:\n{query}\n\nSummary: ",
    input_variables=["context", "query"],
)

In [103]:
# Chain that fills `answer_prompt` and sends it to an OpenAI completion model.
answer_chain = LLMChain(
    prompt=answer_prompt,
    llm=OpenAI(temperature=0, max_tokens=512),
)
# Chain that fills `summary_prompt`. The triple-m spelling is the name `qa`
# looks up, so any rename has to change both places together.
summmary_chain = LLMChain(
    prompt=summary_prompt,
    llm=OpenAI(temperature=0, max_tokens=1048),
)


def qa(query: str) -> str:
    """Retrieve passages relevant to ``query`` and summarise them.

    The retrieved documents are printed with ``pretty_print_docs``, then their
    text is passed to ``summmary_chain`` together with the query.

    Args:
        query: The user's question.

    Returns:
        The text produced by the summary chain.
    """
    documents = retriever.get_relevant_documents(query=query)
    pretty_print_docs(documents)
    # Hand the prompt the passage text itself; formatting the list of Document
    # objects would insert their Python repr (escaped quotes, metadata={}, ...).
    context = "\n\n".join(doc.page_content for doc in documents)
    return summmary_chain.run({"query": query, "context": context})

Document 1:

Centriacinar (centrilobular) emphysema is the most common form, constituting more than 95% of clinically significant cases. It occurs predominantly in heavy smokers with COPD. In this type of emphysema the central or proximal parts of the acini, formed by respiratory bronchioles, are affected, whereas distal alveoli are spared. [Page: 694 from pdfs/Robbins and Cotran Pathologic Basis of Disease, 10th Edition (VetBooks.ir).pdf] Panacinar emphysema with initial distention of the alveolus and alveolar duct. [Page: 694 from pdfs/Robbins and Cotran Pathologic Basis of Disease, 10th Edition (VetBooks.ir).pdf] 

NO_OUTPUT
----------------------------------------------------------------------------------------------------
Document 2:

"Emphysema  Irreversible enlargement of airspaces  Distal to the terminal bronchiole  Destruction of walls  No obvious fibrosis  Types of Emphysema  Centriacinar  Most common (95%)  The respiratory bronchiole are affected, alveoli spared  Wal

In [99]:
# Ad-hoc check of what the compression retriever returns for a sample question.
retriever.get_relevant_documents("what is neoplasia")

[Document(page_content='"a neoplasm is defined as a genetic disorder of cell growth that is triggered by acquired or less commonly inherited mutations affecting a single cell and its clonal progeny" [Page: 283 from pdfs/Robbins and Cotran Pathologic Basis of Disease, 10th Edition (VetBooks.ir).pdf]', metadata={}),
 Document(page_content='Neoplasia means “new growth”, Uncontrolled and excessive growth leads to formation of neoplasm, Malignant tumours are referred to as cancers, from the Latin for crab, Malignant tumours named for origin of tissue type, Differentiation, Proliferation, Relationship with other tissues, Benign tumours grows by expansion, localised to site of origin, often associated with a fibrous capsule, Relatively slow growth, Malignant tumours growth by expansion and invasion, Relatively rapid growth, Infiltrates surrounding tissues. [Page: 12 from pdfs/Neoplasia.pdf]', metadata={}),
 Document(page_content='"Cells do not always grow in an ordered, controlled manner. Cel

Document 1:

"a neoplasm is defined as a genetic disorder of cell growth that is triggered by acquired or less commonly inherited mutations affecting a single cell and its clonal progeny. As discussed later, these causative mutations alter the function of particular genes and give the neoplastic cells a survival and growth advantage, resulting in excessive proliferation that is independent of physiological growth signals and controls." [Page: 283 from pdfs/Robbins and Cotran Pathologic Basis of Disease, 10th Edition (VetBooks.ir).pdf]
----------------------------------------------------------------------------------------------------
Document 2:

Neoplasia means “new growth”, Uncontrolled and excessive growth leads to formation of neoplasm, Malignant tumours are referred to as cancers, from the Latin for crab, Malignant tumours named for origin of tissue type, Differentiation, Proliferation, Relationship with other tissues, Benign tumours grows by expansion, localised to site of origin

In [95]:
import gradio as gr

In [104]:
# Minimal text-in / text-out UI around `qa`.
demo = gr.Interface(qa, "text", "text")

# share=True also publishes a temporary public URL (see this cell's output), so
# anyone holding the link can trigger calls made with this notebook's API key.
demo.launch(share=True)


Running on local URL:  http://127.0.0.1:7862
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Running on public URL: https://78e47436f347afcd30.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




Document 1:

Obstructive lung diseases include chronic obstructive pulmonary disease (COPD), asthma, and bronchiectasis. COPD has two major clinicopathologic manifestations, emphysema and chronic bronchitis. Asthma is distinguished from chronic bronchitis and emphysema by the presence of reversible bronchospasm. Chronic Obstructive Pulmonary Disease (COPD) is defined by the World Health Organization (WHO) as “a common, preventable and treatable disease that is characterized by persistent respiratory symptoms and airflow limitation that is due to airway and/or alveolar abnormalities caused by exposure to noxious particles or gases.” Centriacinar emphysema and panacinar (panlobular) emphysema are two types of emphysema.
----------------------------------------------------------------------------------------------------
Document 2:

Chronic bronchitis: Mucous gland hyperplasia, hypersecretion [Page: 692 from pdfs/Robbins and Cotran Pathologic Basis of Disease, 10th Edition (VetBooks.ir).p