In [None]:
# %pip install langchain
# %pip install langchain-community
# %pip install sentence-transformers
# %pip install langchain-huggingface
# %pip install langchain-chroma
# %pip install langchain-google-genai

In [1]:
# ClinRAG: Clinical Knowledge Assistant, a RAG-Based Medical Question-Answering System


from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# ==============================
# STEP 1: Load Documents
# ==============================
# Collect every *.txt file found directly under wiki_documents/ and load each
# one through TextLoader.
# NOTE(review): no explicit encoding is passed to TextLoader, so decoding
# presumably follows the platform default - confirm the files load cleanly on
# every machine this notebook runs on.
loader = DirectoryLoader(
    "wiki_documents",
    glob="*.txt",
    loader_cls=TextLoader,
)
kb_docs = loader.load()

# ==============================
# STEP 2: Chunking
# ==============================

# Split the loaded documents into chunks of at most 500 characters, with a
# 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(kb_docs)

# ==============================
# STEP 3: Embedding Model
# ==============================
# Embedding model shared by indexing and querying: the vector store below
# receives this object as its embedding function.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# ==============================
# STEP 4: Vector Database
# ==============================
# Open the persistent Chroma collection stored under ./chroma_db_.
# If the collection already exists it is reused; otherwise a new, empty one is
# created.
# NOTE(review): the saved cell output echoes this constructor line, which looks
# like a warning raised for the langchain_community Chroma class - confirm, and
# consider the langchain-chroma package listed in the install cell.
db = Chroma(
    collection_name="vector_database",
    embedding_function=embedding_model,
    persist_directory="./chroma_db_",
)

# Index the chunks only when the collection holds nothing yet, so re-running
# the notebook does not insert the same chunks a second time. The flip side is
# that new or edited source files are not added while the collection is
# non-empty. `_collection` is a private attribute of the wrapper.
collection_is_empty = db._collection.count() == 0
if collection_is_empty:
    db.add_documents(documents=chunks)

# ==============================
# STEP 5 & 6: Retriever
# ==============================
# Retriever over the vector store: similarity search returning the 3 closest
# chunks for each query.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# ==============================
# STEP 7: Prompt Template
# ==============================
# Prompt text sent to the chat model. {context} is filled with the retrieved
# chunks and {question} with the user's query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:
{context}
Answer the question based on the above context: {question}.
Provide a detailed answer.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""

# Build a chat prompt consisting of a single human message made from the
# template above.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)


# ==============================
# Chat Model/LLM Configuration
# ==============================
# Load the Google API key from a local file kept outside the notebook.
# The context manager closes the file handle as soon as the key has been read,
# and strip() drops surrounding whitespace (such as a trailing newline) so it
# never becomes part of the key handed to the chat model.
with open("keys/.google_api_key.txt") as f:
    GOOGLE_API_KEY = f.read().strip()

# Gemini chat model used for answer generation, authenticated with the key
# loaded above.
chat_model = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=GOOGLE_API_KEY)

# Output parser placed after the chat model in the generator chain.
parser = StrOutputParser()

# Helper that turns retrieved documents into one context string.
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents in input order, separated by two
        newlines; an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(contents)

# ==============================
# RAG Chain
# ==============================
# Generation stage: fill the prompt, call the chat model, parse the reply.
generator_chain = prompt_template | chat_model | parser

# Retrieval stage: the incoming query goes to the retriever, whose hits are
# joined by format_docs to become "context"; the same query is forwarded
# unchanged as "question".
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Full RAG pipeline: the retrieval inputs feed the generator chain.
rag_chain = rag_inputs | generator_chain

# Run one example question through the pipeline. The answer is left as the
# last expression so the notebook displays it as the cell's output.
query = 'Who first described Alzheimer’s disease?'
answer = rag_chain.invoke(query)
answer



Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mMPNetModel LOAD REPORT[0m from: sentence-transformers/all-mpnet-base-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
  db = Chroma(collection_name="vector_database",


"German psychiatrist and pathologist Alois Alzheimer first described Alzheimer's disease in 1906."