In [1]:
%pwd

'd:\\Codes\\Chatbots\\MedBot\\research'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'd:\\Codes\\Chatbots\\MedBot'

In [16]:
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


In [5]:
# Extract Data From the PDF File
def load_pdf_file(data):
    loader = DirectoryLoader(data,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    documents=loader.load()

    return documents

In [6]:
extracted_data = load_pdf_file(data='data/')

In [None]:
type(extracted_data)
# extracted_data

In [10]:
# Split the Data into Text Chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks


In [11]:
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 6983


In [9]:
def download_hf_embeddings():
    # sentence-transformers/all-MiniLM-L6-v2  dimension of vectors generated by this is 384
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [10]:
embeddings = download_hf_embeddings()

In [23]:
query_result = embeddings.embed_query("Hello world")
print("Length",len(query_result))

Length 384


In [5]:
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [12]:
index_name = "medbot"

In [28]:
# i have not blindly provided 384 instead it is the maximum dimension that sentence-transformers/all-MiniLM-L6-v2 give
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

pc.create_index(
    name=index_name,
    dimension=384, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [52]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

TypeError: str expected, not NoneType

In [32]:
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert the embeddings into your Pinecone index

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [13]:
# Load Existing index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [34]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2294a452c20>

In [21]:
retriever = docsearch.as_retriever(search_type="similarity",search_kwargs={"k":3})

In [40]:
retrieved_docs = retriever.invoke("What is  Diabetes mellitus?")
retrieved_docs

[Document(id='f3896693-2b1d-4c35-abc8-b6c39aa16478', metadata={'page': 435.0, 'source': 'data\\THE_GALE_ENCYCLOPEDIA_MEDICINE.pdf'}, page_content='glucose in the blood cannot be absorbed into the cells of\nthe body. Symptoms include frequent urination, lethargy,\nexcessive thirst, and hunger. The treatment includes\nchanges in diet, oral medications, and in some cases,\ndaily injections of insulin.\nDescription\nDiabetes mellitus is a chronic disease that causes\nserious health complications including renal (kidney)\nfailure, heart disease, stroke , and blindness. Approxi-\nmately 14 million Americans (about 5% of the popula-'),
 Document(id='b7b25c46-65cb-49c6-9c1e-0acb500b58e0', metadata={'page': 434.0, 'source': 'data\\THE_GALE_ENCYCLOPEDIA_MEDICINE.pdf'}, page_content='York: McGraw-Hill, 1997.\nPERIODICALS\nAdam, Patricia. “Evaluation and Management of Diabetes\nInsipidus.” American Family Physician 55, no. 6 (1 May\n1997): 2146+.\nSinger, Irwin, et al. “The Management of Diabetes 

In [14]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4,max_tokens=500,api_key=OPENAI_API_KEY)

In [19]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [22]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [26]:
response = rag_chain.invoke({"input": "what is Diabetes insipidus?"})
print(response["answer"])



I don't know.
