In [8]:
import os

In [9]:
# Step up one directory so that relative paths used later (e.g. "data") resolve.
# NOTE(review): a relative chdir is not idempotent -- re-running this cell climbs
# one more level each time; restart the kernel before re-executing.
os.chdir("../")
%pwd

'f:\\ProjectAI\\Medical-Chatbot'

In [10]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
def load_pdf_files(folder_path):
    """Load the PDF files matching ``*.pdf`` in ``folder_path``.

    Args:
        folder_path: Directory handed to ``DirectoryLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, with each
        matched file parsed through ``PyPDFLoader``.
    """
    return DirectoryLoader(
        folder_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    ).load()

In [12]:
extracted_documents = load_pdf_files("data")

In [13]:
extracted_documents

[Document(metadata={'producer': '3-Heights™ PDF Optimization Shell 6.3.1.5 (http://www.pdf-tools.com)', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2013-08-29T10:34:34-04:00', 'trapped': '/False', 'moddate': '2024-05-10T15:00:52+00:00', 'source': 'data\\Medical-book.pdf', 'total_pages': 128, 'page': 0, 'page_label': '1'}, page_content='Medical Abortion\nStudy Guide\nSecond Edition\nDisclaimer: The regularly updated Clinical Updates in Reproductive Health \n(www.ipas.org/clinicalupdates) provides Ipas’s most up-to-date clinical \nguidance, which supersedes any guidance that may differ in Ipas curricula \nor other materials.'),
 Document(metadata={'producer': '3-Heights™ PDF Optimization Shell 6.3.1.5 (http://www.pdf-tools.com)', 'creator': 'Adobe InDesign CS6 (Macintosh)', 'creationdate': '2013-08-29T10:34:34-04:00', 'trapped': '/False', 'moddate': '2024-05-10T15:00:52+00:00', 'source': 'data\\Medical-book.pdf', 'total_pages': 128, 'page': 1, 'page_label': '2'}, page_c

In [14]:
len(extracted_documents)

128

In [15]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents that keep only the ``source`` metadata entry.

    Args:
        docs: Documents whose ``page_content`` and ``metadata`` are read.

    Returns:
        A new list, in the same order, of ``Document`` objects carrying the
        original ``page_content`` and a metadata dict with the single key
        ``"source"`` (``None`` when the original metadata has no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [16]:
minimal_docs = filter_to_minimal_docs(extracted_documents)

In [17]:
minimal_docs

[Document(metadata={'source': 'data\\Medical-book.pdf'}, page_content='Medical Abortion\nStudy Guide\nSecond Edition\nDisclaimer: The regularly updated Clinical Updates in Reproductive Health \n(www.ipas.org/clinicalupdates) provides Ipas’s most up-to-date clinical \nguidance, which supersedes any guidance that may differ in Ipas curricula \nor other materials.'),
 Document(metadata={'source': 'data\\Medical-book.pdf'}, page_content='ISBN: 1-933095-46-6\n© 2009, 2013 Ipas.  \nProduced in the United States of America.\nSuggested citation: Ipas. (2013). Medical abortion study guide (second ed.) K. L. Turner (Ed.), Chapel Hill, NC: Ipas.  \nIpas is a nonprofit organization that works around the world to increase women’s ability to exercise their sexual and \nreproductive rights, especially the right to safe abortion. We seek to eliminate unsafe abortion and the resulting \ndeaths and injuries and to expand women’s access to comprehensive abortion care, including contraception and \nrelate

In [18]:
def text_split(minimal_docs, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks.

    Args:
        minimal_docs: Documents passed to ``split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 1000, the previously hard-coded setting.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 200, the previously hard-coded setting.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        ``minimal_docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [19]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 342


In [20]:
texts_chunk

[Document(metadata={'source': 'data\\Medical-book.pdf'}, page_content='Medical Abortion\nStudy Guide\nSecond Edition\nDisclaimer: The regularly updated Clinical Updates in Reproductive Health \n(www.ipas.org/clinicalupdates) provides Ipas’s most up-to-date clinical \nguidance, which supersedes any guidance that may differ in Ipas curricula \nor other materials.'),
 Document(metadata={'source': 'data\\Medical-book.pdf'}, page_content='ISBN: 1-933095-46-6\n© 2009, 2013 Ipas.  \nProduced in the United States of America.\nSuggested citation: Ipas. (2013). Medical abortion study guide (second ed.) K. L. Turner (Ed.), Chapel Hill, NC: Ipas.  \nIpas is a nonprofit organization that works around the world to increase women’s ability to exercise their sexual and \nreproductive rights, especially the right to safe abortion. We seek to eliminate unsafe abortion and the resulting \ndeaths and injuries and to expand women’s access to comprehensive abortion care, including contraception and \nrelate

In [21]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Create the embedding wrapper for the MiniLM sentence-transformer model.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


In [22]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [23]:
vector = embedding.embed_query("Hello world")
len(vector)

384

In [24]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [25]:
# Read the Pinecone key from the environment (populated by load_dotenv()).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# os.environ only accepts str values, so a missing key would otherwise surface
# as an opaque TypeError on the assignment below. Fail early with a clear hint.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or environment."
    )

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [26]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [27]:
pc

<pinecone.pinecone.Pinecone at 0x1bb17029e70>

In [33]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Create the index only when it does not exist yet; later runs reuse it.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # length of the vectors returned by embed_query earlier in this notebook
        metric="cosine",
        # The deployment configuration is passed via `spec`; `serverless_spec`
        # is not a keyword accepted by create_index and fails on first creation.
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

In [34]:
from langchain_pinecone import PineconeVectorStore

# Embed every chunk and store it in the Pinecone index named by `index_name`.
# NOTE(review): re-running this cell presumably uploads the same chunks again
# under new IDs -- confirm before re-executing against a populated index.
# NOTE(review): `doc_search` is not referenced again in the visible notebook;
# the following cells use `docsearch` instead.
doc_search = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [35]:
# Attach to the index by name (no documents are passed here). This handle is
# the one used for add_documents and as_retriever in the following cells.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [36]:
dswith = Document(
    page_content="Nhut Nam is a good boy and he loves programming.",
    metadata={"source": "test.pdf"}
)

In [37]:
docsearch.add_documents(documents=[dswith])

['806093cb-8ef9-4de2-aefa-5c02c3f95fca']

In [38]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [40]:
retriever_docs = retriever.invoke("Who is Nhut Nam?")
retriever_docs

[Document(id='806093cb-8ef9-4de2-aefa-5c02c3f95fca', metadata={'source': 'test.pdf'}, page_content='Nhut Nam is a good boy and he loves programming.'),
 Document(id='9bb26de7-b0e1-4f59-b271-6ab6b291bc2a', metadata={'source': 'data\\Medical-book.pdf'}, page_content='Takele Geressu, Ethiopia\nTiemoko Ouattara, Burkina \nFaso\nTraci Baird, USA\nWe give our appreciation to the Swedish International \nDevelopment Agency for contributing funding to the development \nof the first edition of this training package through a grant to \nIpas.'),
 Document(id='6f3e77e3-f07c-48e1-8687-030b246fdb24', metadata={'source': 'data\\Medical-book.pdf'}, page_content='allowed by law. \nCover photo credits: © Richard Lord\nIllustrations: Stephen C. Edgerton\nThe illustrations and photographs used in this publication are for illustrative purposes only. No similarity to any actual \nperson, living or dead, is intended.\nFor more information or to donate to Ipas:\nIpas \nP .O. Box 9990 \nChapel Hill, NC 27515 U

In [41]:
from langchain_ollama import ChatOllama

llm = ChatOllama(model="mistral", temperature=0.7)

In [42]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [44]:
# System message template. It ends with a `{context}` placeholder.
# NOTE(review): presumably filled with the retrieved documents by the
# stuff-documents chain built from this prompt -- confirm.
system_prompt = (
    "You are a helpful medical assistant. Use the following context to answer the question.\n"
    "If you don't know the answer, just say you don't know. Do not try to make up an answer.\n"
    "{context}"
)

# Two-message chat prompt: the system instructions above, then the user's
# question, which is supplied under the "input" key when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("user", "{input}"),
    ]
)

In [45]:
# Build the answering chain from the LLM and prompt, then wrap it with the
# retriever. The resulting chain is invoked with {"input": ...} and its result
# is read via the "answer" key in the next cell.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [47]:
response = rag_chain.invoke({"input": "Who is Nhut Nam?"})
print(response["answer"])

 In the provided context, Nhut Nam is not explicitly mentioned as a person with any specific role or background. However, it's clear from the sentence "Nhut Nam is a good boy and he loves programming" that Nhut Nam is likely a child who enjoys programming, but there isn't additional information about him in this text.
