# rag_knowledgebase.ipynb

## Load & chunk documents (TXT + PDF)

In [1]:
import os, re, glob
from typing import List

from langchain_community.document_loaders import PyPDFLoader

ROOT = r"D:\\MedicalAI-Assistant"
BASE_DIR = os.path.join(ROOT, 'data')

TEXTS: List[str] = []

# single guidelines.txt
guidelines_txt = os.path.join(BASE_DIR, 'guidelines.txt')
if os.path.exists(guidelines_txt):
    with open(guidelines_txt, 'r', encoding='utf-8', errors='ignore') as f:
        TEXTS.append(f.read())

# multiple .txt inside data/guidelines/
G_DIR = os.path.join(BASE_DIR, 'guidelines')
if os.path.isdir(G_DIR):
    for path in glob.glob(os.path.join(G_DIR, '**', '*.txt'), recursive=True):
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            TEXTS.append(f.read())

#  Load PDF guidelines
pdf_path = os.path.join(BASE_DIR, 'guidelines.pdf')
if os.path.exists(pdf_path):
    loader = PyPDFLoader(pdf_path)
    pdf_docs = loader.load()
    TEXTS.extend([d.page_content for d in pdf_docs])

print("Docs loaded:", len(TEXTS))


# Smart cleaning function
def clean_text(text: str) -> str:
    # normalize spaces/newlines
    text = re.sub(r'\s+', ' ', text)

    # preserve medical abbreviations (all-caps words), numbers, %, °C, subscript digits
    # remove only strange symbols
    text = re.sub(r'[^A-Za-z0-9.,;:%°()\-–\/\[\] ]+', ' ', text)


    text = re.sub(r'\s+', ' ', text)

    return text.strip()


# Chunk documents 
CHUNKS = []
for doc in TEXTS:
    doc = clean_text(doc)
    for i in range(0, len(doc), 800):  # 800 char chunks
        CHUNKS.append(doc[i:i+800])

print("Chunks after cleaning:", len(CHUNKS))


#  Build FAISS store 
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

emb = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

docs = [Document(page_content=ch) for ch in CHUNKS]
vectorstore = FAISS.from_documents(docs, emb)

SAVE_DIR = os.path.join(ROOT, 'artifacts', 'faiss_index')

# clear old index and save new one
import shutil
shutil.rmtree(SAVE_DIR, ignore_errors=True)
vectorstore.save_local(SAVE_DIR)

print("FAISS index saved to", SAVE_DIR)


#  Quick QA test (local LLM) 
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as hf_pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Load retriever
retriever = FAISS.load_local(
    SAVE_DIR, emb, allow_dangerous_deserialization=True
).as_retriever(search_kwargs={"k":3})

# Local model for QA
llm_tok = AutoTokenizer.from_pretrained('google/flan-t5-base')
llm_mod = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-base')

llm_pipe = hf_pipeline(
    'text2text-generation',
    model=llm_mod,
    tokenizer=llm_tok,
    max_length=256
)

llm = HuggingFacePipeline(pipeline=llm_pipe)

qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type='stuff')

# Example test
print(qa.run("What is the first-line treatment for ventilator-associated pneumonia?"))


Docs loaded: 179
Chunks after cleaning: 660


  emb = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm
W0824 14:16:19.418000 9292 torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


FAISS index saved to D:\\MedicalAI-Assistant\artifacts\faiss_index


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=llm_pipe)
  print(qa.run("What is the first-line treatment for ventilator-associated pneumonia?"))
Token indices sequence length is longer than the specified maximum sequence length for this model (699 > 512). Running this sequence through the model will result in indexing errors


Chest physiotherapy
