In [15]:
%pwd

'd:\\Projects\\DocEase\\research'

In [16]:
import os
# Step up from research/ to the project root exactly once. The guard keeps a
# re-run of this cell from climbing a further directory level each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [17]:
%pwd

'd:\\Projects\\DocEase'

In [18]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [19]:
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files
            matching the glob ``*.pdf`` are read with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [20]:
# Read every PDF under Data/ (path is relative to the current working directory).
extracted_data = load_pdf_file('Data/')

In [20]:
#extracted_data

In [21]:
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of ``load_pdf_file``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value this notebook used before
            it became configurable.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [22]:
# Chunk the loaded documents and report how many pieces were produced.
text_chunks = text_split(extracted_data)
num_chunks = len(text_chunks)
print("Length of Text Chunks", num_chunks)

Length of Text Chunks 5860


In [24]:
#text_chunks

In [23]:
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer

In [24]:
def download_hugging_face_embeddings():
    """Create the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)

In [25]:
# Instantiate the embedding model once; the same object is reused for the
# probe query and for both PineconeVectorStore calls in later cells.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [26]:
# Smoke-test the model: embed a short string and report the vector length.
query_result = embeddings.embed_query("Hello world")
vector_length = len(query_result)
print("Length", vector_length)

Length 384


In [44]:
#query_result

In [27]:
from dotenv import load_dotenv
# Called before PINECONE_API_KEY is read from os.environ in the next cell;
# presumably this populates the environment from a local .env file.
load_dotenv()
import os

In [28]:
# Read the Pinecone key from the environment and fail fast when it is absent.
# Otherwise the None value only surfaces later as an opaque TypeError when it
# is written back into os.environ.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; define it in the environment or in a .env file."
    )

In [29]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Connect to Pinecone with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "docease"

# Create the serverless index only when it is missing: create_index fails for
# a name that already exists, which would break every re-run of this cell.
# dimension=384 matches the vector length printed by the embedding probe cell.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [30]:
import os 
# Write the key back into the process environment. The PineconeVectorStore
# calls in the following cells are not given the key explicitly, so they
# presumably pick it up from here — TODO confirm against langchain_pinecone.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [31]:
from langchain_pinecone import PineconeVectorStore

# Embed each chunk and upsert the embeddings into the Pinecone index.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)

In [32]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Attach to the index that already exists. No documents are passed in this
# call, so it only yields a handle for querying.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [33]:
# Bare expression so the notebook echoes the vector store object's repr.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1dcfa9c64d0>

In [181]:
# Build a retriever that returns at most k=3 chunks and applies a relevance
# score threshold of 0.8 (raise the threshold if needed).
retriever = docsearch.as_retriever(
    search_kwargs=dict(score_threshold=0.8, k=3),
    search_type="similarity_score_threshold",
)

# Spot-check the retriever with a medical question and display what comes back.
retrieved_docs = retriever.invoke("What are the symptoms of diabetes?")
retrieved_docs

[Document(id='3ec2cf55-d67e-421a-a67b-6e3193f5248d', metadata={'page': 437.0, 'page_label': '438', 'source': 'Data\\Medical_book.pdf'}, page_content='• Type I diabetes mellitus. Characterized by fatigue and\nan abnormally high level of glucose in the blood\n(hyperglycemia).\n• Amyotrophic lateral schlerosis. First signs are stum-\nbling and difficulty climbing stairs. Later, muscle\ncramps and twitching may be observed as well as\nweakness in the hands making fastening buttons or\nturning a key difficult. Speech may become slowed or\nslurred. There may also be difficluty swallowing. As\nrespiratory muscles atrophy, there is increased danger')]

In [166]:
from langchain_community.llms import Ollama

# DeepSeek model served through Ollama; settings are kept together here so
# they are easy to tune.
llm = Ollama(
    model="deepseek-r1:1.5b",
    num_predict=750,
    temperature=0.4,
)

In [167]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the general question-answering chain. The
# {context} placeholder is left in the text for the prompt template to fill.
system_prompt = " ".join(
    [
        "You are an assistant for question-answering tasks.",
        "Use the following pieces of retrieved context to answer the question.",
        "If you don't know the answer, say that you don't know.",
        "Use three sentences maximum and keep the answer concise.",
    ]
) + "\n\n" + "{context}"


# Two-message chat prompt: the system instructions, then the user's question
# supplied through the {input} variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

In [168]:
# Document chain: feeds the prompt (with documents as context) to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# Retrieval chain: combines the retriever with the document chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [170]:
# In-domain probe: ask a medical question through the RAG chain.
response = rag_chain.invoke({"input": "What is Acne?"})

# Keep only the text after the last "</think>" marker; when the marker is
# absent the whole answer is kept.
cleaned_answer = response["answer"].rpartition("</think>")[-1].strip()

print(cleaned_answer)


Acne is a common skin disease characterized by pimples on the face, chest, and back, caused by clogged pores containing oil, dead skin cells, and bacteria. It affects nearly 17 million people in the U.S., making it the most prevalent skin condition.


In [171]:
# Out-of-domain probe: a programming request sent through the same RAG chain.
response = rag_chain.invoke(
    {"input": "Write an Python Code to print hello world"}
)

# Drop everything up to and including the last "</think>" marker, if present.
cleaned_answer = response["answer"].rpartition("</think>")[-1].strip()

print(cleaned_answer)


No relevant docs were retrieved using the relevance score threshold 0.7


To print "hello world" in Python, you can use the `print()` function:

```python
print("hello world")
```

This will output the string "hello world".


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Stricter system instructions: answer only from the retrieved context and
# otherwise reply with a fixed refusal sentence. {context} is left in the text
# for the prompt template to fill.
system_prompt = " ".join(
    [
        "You are an assistant for medical-related questions.",
        "Use only the following pieces of retrieved context to answer the question.",
        "If the context does not provide an answer, say:",
        "'I'm sorry, but I can only answer medical-related questions.'",
        "Use three sentences maximum and keep the answer concise.",
    ]
) + "\n\n" + "{context}"


# Chat prompt for the strict assistant: system instructions first, then the
# user's question supplied through the {input} variable.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Rebuild both chains so they use the stricter prompt defined in this cell.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)




def strict_medical_rag(query):
    """Answer ``query`` from retrieved context, refusing when nothing is retrieved.

    The retriever is queried exactly once. With no hits, the fixed refusal
    message is returned without calling the LLM. Otherwise the documents that
    were just fetched are handed straight to ``question_answer_chain``.

    Args:
        query: The user's question.

    Returns:
        dict: Always contains ``"answer"``. When documents were retrieved it
        also carries ``"input"`` and ``"context"``, the same keys that
        ``rag_chain.invoke`` produces.
    """
    retrieved_docs = retriever.invoke(query)  # Single retrieval pass
    print("\n🔎 Retrieved Documents:\n", retrieved_docs)  # Debugging Step

    # If no relevant medical docs found, return strict response
    if not retrieved_docs:
        return {"answer": "I'm sorry, but I can only answer medical-related questions."}

    # rag_chain would discard a caller-supplied "context" and run the retriever
    # a second time, so invoke the document chain directly with these docs.
    answer = question_answer_chain.invoke({"input": query, "context": retrieved_docs})
    return {"input": query, "context": retrieved_docs, "answer": answer}







In [180]:
# Test case 1: an in-domain (medical) question.
response1 = strict_medical_rag("What are the symptoms of diabetes?")

# Keep only the text after the last "</think>" marker, if there is one.
cleaned_answer = response1["answer"].rpartition("</think>")[-1].strip()

print(cleaned_answer)  # Expect an answer drawn from the retrieved context, not the refusal message.

Type I Diabetes Mellitus is characterized by fatigue, weight gain, and hyperglycemia (high blood glucose levels). Symptoms may include muscle wasting, weakness, and increased risk of cardiovascular events. Amyotrophic Lateral Schlerosis presents with early signs like stumbles and difficulty climbing stairs, followed by muscle cramps, weakness in button turning, and possible swallowing difficulties as the respiratory muscles atrophy.


In [178]:
# Test case 2: an out-of-domain (programming) request.
response2 = strict_medical_rag("Write a Python Code to print hello world")

# Keep only the text after the last "</think>" marker, if there is one.
cleaned_answer = response2["answer"].rpartition("</think>")[-1].strip()

print(cleaned_answer)  # Should return "I'm sorry, but I can only answer medical-related questions."

No relevant docs were retrieved using the relevance score threshold 0.7


I'm sorry, but I can only answer medical-related questions.
