In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each one read with ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader).load()

In [3]:
# Read the source PDFs from the ``../data`` directory (resolved against the
# notebook's working directory).
docs = load_pdf_files("../data")

In [5]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults to 20.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [6]:
# Split the loaded documents and report how many chunks were produced.
chunks = text_split(docs)
print(f'Length = {len(chunks)}')

Length = 5860


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM
import torch

# Upper bound on the number of tokens the LLM may generate per answer.
MAX_TOKENS = 2048

# Ollama's option for capping generated tokens is `num_predict`; `max_tokens`
# is not an OllamaLLM parameter, so passing it would not apply the cap.
llm = OllamaLLM(model="llama3.2", num_predict=MAX_TOKENS, temperature=0.5)

# Run the embedding model on the GPU when one is available and fall back to
# the CPU otherwise, so the notebook also runs on machines without CUDA.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
)

In [8]:
# Sanity check: embed a short string and print the length of the resulting
# vector.
query = embedding.embed_query("Hello World")
print(f'length = {len(query)}')

length = 384


In [10]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Pull PINECONE_API_KEY from a local .env file (if any) into the environment.
load_dotenv()
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

pc = Pinecone(api_key=PINECONE_API_KEY)

# Creating an index that already exists is an error, so only create it when
# it is missing. This keeps the cell safe to re-run.
if "med" not in pc.list_indexes().names():
    pc.create_index(
        name="med",
        dimension=384,  # must equal the embedding vector length
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [11]:
from langchain_pinecone import PineconeVectorStore

# `from_documents` embeds and upserts every chunk. Re-running it against an
# index that already holds the corpus would store the same chunks again, so
# ingest only when the index is empty and otherwise just attach to it.
# NOTE(review): index stats can lag behind recent writes, and a partially
# ingested index counts as non-empty; clear the index to force a re-ingest.
if pc.Index("med").describe_index_stats().total_vector_count == 0:
    doc_search = PineconeVectorStore.from_documents(
        documents=chunks,
        index_name="med",
        embedding=embedding
    )
else:
    doc_search = PineconeVectorStore.from_existing_index(
        index_name="med",
        embedding=embedding
    )

In [12]:
from langchain_pinecone import PineconeVectorStore

# Attach to the existing "med" index without uploading any documents; this
# rebinds `doc_search` to a store backed by that index.
doc_search = PineconeVectorStore.from_existing_index(index_name="med", embedding=embedding)

In [13]:
# Similarity-search retriever configured with k=3.
retriever = doc_search.as_retriever(
    search_type='similarity',
    search_kwargs={"k": 3},
)

In [14]:
# Smoke-test retrieval with a sample question. The bare name on the last line
# makes the notebook display the returned documents.
retriever_docs = retriever.invoke("What is acne?")
retriever_docs

[Document(id='1b375bb8-ab84-4037-9642-22b86f8d736c', metadata={'page': 39.0, 'source': '..\\data\\med.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='79500998-7a74-4c95-94df-a2ddbddfad86', metadata={'page': 38.0, 'source': '..\\data\\med.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='5cf7fd6e-575a-4d54-8012-1413800e03ad', metadata={'page': 37.0, 'source': '..\\data\\med.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of 

In [16]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Adjacent string literals are joined with nothing in between, so every
# fragment that continues a sentence carries its own trailing space.
# `{context}` is left as a literal placeholder for the prompt template.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you do not know the answer, just say that "
    "you don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system message carries `system_prompt`, the
# human message carries the `{input}` placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [17]:
# Build the answering chain from the LLM and the prompt, then combine it with
# the retriever. The resulting chain is invoked with {"input": <question>} and
# the generated text is read from the "answer" key of its result (see the
# cells below).
ques_ans_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, ques_ans_chain)

In [20]:
# Run a sample question through the full chain and print only the answer text.
res = rag_chain.invoke({"input": "What is acne?"})
print(res["answer"])

I can answer that. Acne is a common skin disease characterized by pimples on the face, chest, and back, caused by clogged pores with oil, dead skin cells, and bacteria. It affects nearly 17 million people in the United States, according to the medical term "acne vulgaris".


In [21]:
# Second sample question. The question text is what the retriever searches
# with, so the condition name is spelled correctly ("Acromegaly").
res = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(res["answer"])

Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue, as well as various other bodily disturbances. Acromegaly typically occurs in adults after normal growth has stopped, while gigantism is more common in children before growth has completed. Both conditions can cause symptoms such as excessive grinding of teeth (bruxism), poisoning from carbon monoxide, and hallucinations.
