In [1]:
# IPython line magic: display the kernel's current working directory.
%pwd

'd:\\Projects\\Gen AI\\medical-assistant\\research'

In [2]:
import os 
os.chdir("../")
%pwd

'd:\\Projects\\Gen AI\\medical-assistant'

In [3]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv; as the cell's last expression,
# the call's return value is what the notebook displays.
load_dotenv()

True

In [4]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} is not set; add it to your .env file.")
    return value


# Keys that this notebook hands to a client must be present: os.getenv()
# returns None for a missing variable, which would otherwise only surface
# later as a confusing client-side error.
PINECONE_API_KEY = _require_env("PINECONE_API_KEY")
OPENROUTER_API_KEY = _require_env("OPENROUTER_API_KEY")
# Not passed to any client in the visible cells, so it stays optional
# (None when unset).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [5]:
from pinecone import Pinecone
# Keep the key under a lowercase alias and build the Pinecone client from it.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)
# Bare last expression: the notebook displays the client's repr.
pc

<pinecone.pinecone.Pinecone at 0x17cfa6d6010>

In [6]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

index_name = "medical-experiment"

# Settings for an index with an integrated embedding model: the record field
# "chunk_text" is mapped to the model's "text" input.
index_settings = {
    "name": index_name,
    "cloud": "aws",
    "region": "us-east-1",
    "embed": {
        "model": "llama-text-embed-v2",
        "field_map": {"text": "chunk_text"},
    },
}

# Create the index only on the first run; later runs reuse the existing one.
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    pc.create_index_for_model(**index_settings)
    print(index_name, "created")

# Handle used by the later upsert and search cells.
index = pc.Index(index_name)

medical-experiment created


In [7]:
# Fetch the index description and print it to confirm the configuration
# (dimension, metric, embed model, readiness).
info = pc.describe_index(index_name)
print(info)

{'deletion_protection': 'disabled',
 'dimension': 1024,
 'embed': {'dimension': 1024,
           'field_map': {'text': 'chunk_text'},
           'metric': 'cosine',
           'model': 'llama-text-embed-v2',
           'read_parameters': {'dimension': 1024.0,
                               'input_type': 'query',
                               'truncate': 'END'},
           'vector_type': 'dense',
           'write_parameters': {'dimension': 1024.0,
                                'input_type': 'passage',
                                'truncate': 'END'}},
 'host': 'medical-experiment-zs4v7ft.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medical-experiment',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': None,
 'vector_type': 'dense'}


In [10]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Pick one file for testing.
pdf_path = "data/patient_data/Patient_P0004.pdf"
# Derive the ID from the file stem. splitext() strips only the trailing
# extension, whereas str.replace(".pdf", "") would also delete a ".pdf"
# occurring in the middle of a file name.
patient_id = os.path.splitext(os.path.basename(pdf_path))[0]

# Load the PDF through LangChain's PyPDFLoader.
# NOTE(review): `docs` is not read again before it is reassigned in the FAISS
# cell; the text that is actually chunked is re-extracted with PyMuPDF below.
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Disabled experiment: generic recursive splitting. Chunking is done by
# visit_aware_chunker_with_merge instead; kept commented out for reference.
#splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#chunks = splitter.split_documents(docs)

#print(f"✅ Loaded {len(chunks)} chunks for {patient_id}")
#print(chunks[0].page_content[:1000])   # show preview of first chunk


In [11]:
import re

def visit_aware_chunker_with_merge(text, chunk_size=1000, chunk_overlap=200, min_merge_size=500):
    """Split text at "visit" markers, window oversized parts, then merge small neighbours.

    The text is first cut immediately before every case-insensitive whole-word
    occurrence of "visit". Parts longer than ``chunk_size`` are sliced into
    windows of at most ``chunk_size`` characters, where consecutive windows
    share ``chunk_overlap`` characters. Finally, consecutive chunks are joined
    with a single space for as long as the combined length stays below
    ``min_merge_size``.

    Args:
        text: Raw document text. ``None`` or an empty string yields ``[]``.
        chunk_size: Window length used for parts longer than this; must be
            positive.
        chunk_overlap: Characters shared by consecutive windows; must satisfy
            ``0 <= chunk_overlap < chunk_size``.
        min_merge_size: Neighbouring chunks are merged while
            ``len(buffer) + len(chunk)`` is below this value.

    Returns:
        list[str]: Chunks in document order.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            outside ``[0, chunk_size)``. Without this check the windowing loop
            would never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")
    if not text:
        return []

    # Step 1: zero-width split right before each whole-word "visit", so the
    # marker stays at the start of the part it introduces.
    parts = re.split(r'(?=\bvisit\b)', text, flags=re.IGNORECASE)

    # Step 2: window each part, respecting chunk_size and chunk_overlap.
    step = chunk_size - chunk_overlap
    temp_chunks = []
    for part in parts:
        part = part.strip()
        if not part:
            continue

        if len(part) <= chunk_size:
            temp_chunks.append(part)
            continue

        for start in range(0, len(part), step):
            end = start + chunk_size
            temp_chunks.append(part[start:end])
            # Stop once a window reaches the end of the part: any further
            # window would lie entirely inside this one and only duplicate it.
            if end >= len(part):
                break

    # Step 3: merge small consecutive chunks while the combined length stays
    # below min_merge_size.
    final_chunks = []
    buffer = ""

    for chunk in temp_chunks:
        if len(buffer) + len(chunk) < min_merge_size:
            buffer = (buffer + " " + chunk).strip()
        else:
            if buffer:
                final_chunks.append(buffer)
            buffer = chunk

    if buffer:
        final_chunks.append(buffer)

    return final_chunks





In [13]:
import fitz  # PyMuPDF

# Read every page's plain text, appending a newline after each page. The
# context manager closes the PDF handle even if extraction raises, and join()
# avoids rebuilding the string on every page.
with fitz.open(pdf_path) as doc:
    text = "".join(page.get_text("text") + "\n" for page in doc)


chunks = visit_aware_chunker_with_merge(text, chunk_size=1000, chunk_overlap=200, min_merge_size=800)
print(chunks[0])

Patient: Arjun Kumar 
Patient ID: HK-728-455 
DOB: 15/08/1975 (Age 48 at first visit) 
Gender: Male 
Occupation: Software Project Manager 
Known Allergies (Established prior to Visit 1): 
Sulfa Drugs (Trimethoprim/Sulfamethoxazole) - Causes generalized urticaria (hives) and 
pruritus (itching). 
No known food allergies. 
Past Medical History (PMH): 
Hypertension (HTN), diagnosed 5 years ago, well-controlled on medication. 
Dyslipidemia, diagnosed 5 years ago. 
Medications on file (Pre-


In [14]:
# One record per chunk: the ID is "<patient_id>_<position>", the text goes in
# "chunk_text" (the field mapped to the embedding model when the index was
# created), and the patient ID is stored alongside it.
records = [
    {
        "_id": f"{patient_id}_{position}",
        "chunk_text": chunk_text,
        "patient_id": patient_id,
    }
    for position, chunk_text in enumerate(chunks)
]

print(records[0])   # preview to confirm structure



{'_id': 'Patient_P0004_0', 'chunk_text': 'Patient: Arjun Kumar \nPatient ID: HK-728-455 \nDOB: 15/08/1975 (Age 48 at first visit) \nGender: Male \nOccupation: Software Project Manager \nKnown Allergies (Established prior to Visit 1): \nSulfa Drugs (Trimethoprim/Sulfamethoxazole) - Causes generalized urticaria (hives) and \npruritus (itching). \nNo known food allergies. \nPast Medical History (PMH): \nHypertension (HTN), diagnosed 5 years ago, well-controlled on medication. \nDyslipidemia, diagnosed 5 years ago. \nMedications on file (Pre-', 'patient_id': 'Patient_P0004'}


In [15]:
# Upsert into a per-patient namespace. Pinecone documents a per-request cap
# for text records on integrated-embedding indexes (96 records at the time of
# writing), so send the records in batches rather than in one call that would
# fail for long documents.
UPSERT_BATCH_SIZE = 96
for batch_start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert_records(patient_id, records[batch_start:batch_start + UPSERT_BATCH_SIZE])

In [16]:
print(patient_id)
query = "patient visit 1"

# Text query against the patient's own namespace; ask for the five best hits.
search_request = {"top_k": 5, "inputs": {'text': query}}
results = index.search(namespace=patient_id, query=search_request)

print(results)

Patient_P0004
{'result': {'hits': [{'_id': 'Patient_P0004_1',
                      '_score': 0.470281720161438,
                      'fields': {'chunk_text': 'Visit 1): \n'
                                               'Tab. Telmisartan 40 mg - once '
                                               'daily \n'
                                               'Tab. Atorvastatin 10 mg - once '
                                               'at night \n'
                                               'Social History: \n'
                                               'Smokes occasionally (5-10 '
                                               'cigarettes/week, social '
                                               'smoker). Denies alcohol use. '
                                               'Sedentary job. \n'
                                               'Surgical History: \n'
                                               'Appendectomy (2001) \n'
                                       

In [17]:
# `results` is the response object returned by index.search above.
hits = results.result['hits']

# Collect the stored chunk text of every hit, in the order the hits came back.
all_chunks = []
for hit in hits:
    all_chunks.append(hit['fields']['chunk_text'])

# Join the chunks into one newline-separated string.
merged_chunks = "\n".join(all_chunks)

print(merged_chunks)

Visit 1): 
Tab. Telmisartan 40 mg - once daily 
Tab. Atorvastatin 10 mg - once at night 
Social History: 
Smokes occasionally (5-10 cigarettes/week, social smoker). Denies alcohol use. Sedentary job. 
Surgical History: 
Appendectomy (2001) 
Family History: 
Father: History of Coronary Artery Disease (CAD), died of MI at 65. 
Mother: Alive, has Type 2 Diabetes and HTN. Visit 1: Initial Consultation for New Symptoms 

Date: October 26, 2023 
Reason for
Patient: Arjun Kumar 
Patient ID: HK-728-455 
DOB: 15/08/1975 (Age 48 at first visit) 
Gender: Male 
Occupation: Software Project Manager 
Known Allergies (Established prior to Visit 1): 
Sulfa Drugs (Trimethoprim/Sulfamethoxazole) - Causes generalized urticaria (hives) and 
pruritus (itching). 
No known food allergies. 
Past Medical History (PMH): 
Hypertension (HTN), diagnosed 5 years ago, well-controlled on medication. 
Dyslipidemia, diagnosed 5 years ago. 
Medications on file (Pre-
Visit: "Routine follow-up for HTN and GERD. Now has a 

In [18]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

load_dotenv()

# Extra headers attached to every request sent through this client.
openrouter_headers = {
    "HTTP-Referer": "http://localhost:8501",
    "X-Title": "Medical Chatbot 1",
}

# OpenAI-compatible client pointed at the OpenRouter endpoint.
chat_settings = dict(
    model="deepseek/deepseek-chat",
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
    timeout=30,      # fail fast instead of hanging forever
    max_retries=1,   # no endless retries
    default_headers=openrouter_headers,
)
chatModel = ChatOpenAI(**chat_settings)

# Smoke test: one round trip to confirm the key, endpoint and model work.
smoke_test_prompt = "Quick test: say hello if you're working! And tell me are you coming from OpenRouter, if no then from where?"
print("Sending request...")
resp = chatModel.invoke(smoke_test_prompt)
print("Response:", resp.content)


Sending request...
Response: Hello! I'm working! 😊  

I'm not coming from OpenRouter—I'm powered by **DeepSeek Chat**, developed by **DeepSeek**. Let me know how I can help you today! 🚀


In [19]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [20]:
# System prompt for the clinical co-pilot, built from adjacent string literals
# that Python concatenates into one string. It ends with the "{context}"
# template placeholder.
# NOTE(review): presumably "{context}" is filled with the retrieved documents
# by the chain built later — confirm against the chain construction.
system_prompt = (
    "You are a diligent and cautious Clinical AI Co-Pilot. Your primary role is to assist physicians by providing accurate, context-aware information and critical safety advisories based solely on the provided patient data and medical knowledge context."
    " Core Rules: Your knowledge is strictly limited to the provided Patient Context and Medical Knowledge Base. Never fabricate information. Your most critical function is to prevent medical errors. You must proactively cross-reference any mentioned medications or procedures against the patient's profile (especially allergies, current conditions, and medications) and flag any potential issues with high severity. Provide direct, concise answers first. Elaborate only if the physician asks for detail or if a safety issue requires a detailed explanation. If the context does not contain information needed to answer a question safely or completely, you must state 'The patient's records do not contain information on that.' Do not speculate."
    " How to Respond: For Summaries and Questions, analyze the provided patient context and give a clear, factual answer. For Prescription Advice (e.g., 'Can I prescribe X for Y?'): First, check the patient's allergy list and state a clear warning if there is a conflict. Second, consult the Medical Knowledge Base to see if 'X' is indicated for condition 'Y'. Third, if the patient's active medications are known, check for potential drug-drug interactions. Finally, combine these checks into a structured advisory."
    " Always conclude with the reminder: 'This is a clinical decision support tool. Please use your professional judgment and verify critical information.'"
    " Use the following retrieved context to inform your response: "
    "\n\n {context}"
)

# Two-message chat template: the system prompt above, followed by the user's
# question supplied through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [22]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Wrap the merged search hits in a single Document and re-split it.
merged_document = Document(page_content=merged_chunks)
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
docs = splitter.split_documents([merged_document])

# Embed the pieces into an in-memory FAISS store and expose it as a retriever.
# NOTE(review): this retriever only indexes `merged_chunks`, i.e. the hits of
# the earlier fixed search query, so later questions can only be answered from
# those chunks — confirm this is intended.
embeddings = HuggingFaceEmbeddings()
vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever()




  retriever = FAISS.from_documents(docs, HuggingFaceEmbeddings()).as_retriever()


In [23]:
# Combine the chat model and the prompt template into a question-answering
# chain, then wrap that chain together with the retriever.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [24]:
# Ask about the first visit; the question is passed under the "input" key and
# the generated text is read from the "answer" key of the response.
response = rag_chain.invoke({"input": "information about patient's visit 1"})
print(response["answer"])

Here is the key information from **Visit 1** for **Arjun Kumar (Patient ID: HK-728-455)**:

### **Visit 1 Details (Initial Consultation for New Symptoms)**  
**Date:** October 26, 2023  
**Reason for Visit:** Routine follow-up for **hypertension (HTN)** and **GERD**, plus a new complaint of a **painful blistering rash** on the sole of his right foot.  

### **Subjective (Patient Reports)**  
- **GERD:** Excellent response to **Pantoprazole 40 mg**—chest burning symptoms resolved.  
- **Hypertension:** BP well-controlled on increased **Telmisartan 80 mg** (home BP log averages: **128/82 mmHg**). Less fatigued.  
- **New Issue:**  
  - **Painful blistering rash** on the sole of the **right foot** for **3 days**.  
  - Described as **"cluster of tiny blisters on a red base"**, extremely tender, making it hard to wear shoes.  
  - **No recent trauma, new soaps, or detergents.**  
  - **History of childhood chickenpox.**  

### **Objective (Exam Findings)**  
- **Vitals:** BP **126/80 mmHg*

In [25]:
# Ask about allergies; the question is passed under the "input" key and the
# generated text is read from the "answer" key of the response.
response = rag_chain.invoke({"input": "any allergies"})
print(response["answer"])

The patient, Arjun Kumar, has a known allergy to **Sulfa Drugs (Trimethoprim/Sulfamethoxazole)**, which causes generalized urticaria (hives) and pruritus (itching).  

**Critical Safety Reminder:**  
- Avoid prescribing sulfonamide antibiotics (e.g., Bactrim, Septra) or other sulfa-containing medications.  
- Cross-check any new medication for potential sulfa components before administration.  

*No other allergies (e.g., food, environmental) are documented in the patient's records.*  

**This is a clinical decision support tool. Please use your professional judgment and verify critical information.**
