In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

file_path = '/content/drive/MyDrive/filtered_NOTEEVENTS.csv'  # update this if needed
df = pd.read_csv(file_path)

# Preview the data
df.head()


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,length,clean_text
0,42102,58526,100001.0,2117-09-17,,,Discharge summary,Report,,,Admission Date: [**2117-9-11**] ...,8285,admission date: discharge date: date of birth:...
1,19215,54610,100003.0,2150-04-21,,,Discharge summary,Report,,,Admission Date: [**2150-4-17**] ...,9981,admission date: discharge date: date of birth:...
2,8772,9895,100006.0,2108-04-17,,,Discharge summary,Report,,,Admission Date: [**2108-4-6**] Discharg...,5943,admission date: discharge date: date of birth:...
3,50238,23018,100007.0,2145-04-07,,,Discharge summary,Report,,,Admission Date: [**2145-3-31**] ...,7857,admission date: discharge date: date of birth:...
4,21119,533,100009.0,2162-05-21,,,Discharge summary,Report,,,Admission Date: [**2162-5-16**] ...,11276,admission date: discharge date: date of birth:...


In [4]:
from textwrap import wrap

chunks = []
max_chunk_size = 1000  # character-based chunks (safe for LLMs)

for note in df["TEXT"]:
    if pd.notna(note):
        cleaned = note.replace("\n", " ").strip()
        note_chunks = wrap(cleaned, width=max_chunk_size)
        chunks.extend(note_chunks)


In [5]:
import pickle

with open('/content/drive/MyDrive/chunks.pkl', 'wb') as f:
    pickle.dump(chunks, f)

print(f"✅ Saved {len(chunks)} chunks!")

✅ Saved 580083 chunks!


In [6]:
!pip install chromadb langchain

Collecting chromadb
  Downloading chromadb-1.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.1-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.25.0-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [7]:
import pandas as pd

embeddings_df = pd.read_csv('/content/drive/MyDrive/clinical_note_embeddings.csv')
embeddings = embeddings_df.to_numpy().tolist()

print(f"✅ Loaded {len(embeddings)} embeddings with {len(embeddings[0])} dimensions each")

✅ Loaded 52726 embeddings with 769 dimensions each


In [8]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.0.0-py3-no

In [9]:
!pip install faiss-cpu langchain


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [10]:
# Trim chunks to match number of embeddings
chunks = chunks[:len(embeddings)]
print(f"🧠 Matched chunks: {len(chunks)}, embeddings: {len(embeddings)}")


🧠 Matched chunks: 52726, embeddings: 52726


In [11]:
from langchain.docstore.document import Document

documents = [
    Document(page_content=chunks[i], metadata={"source": f"chunk_{i}"})
    for i in range(len(chunks))
]


In [12]:
from langchain.embeddings.base import Embeddings
import numpy as np

class PrecomputedEmbeddings(Embeddings):
    def __init__(self, vectors):
        self.vectors = vectors

    def embed_documents(self, texts):
        return self.vectors

    def embed_query(self, text):
        raise NotImplementedError("Only storing vectors for now")


In [13]:
from langchain.vectorstores import FAISS

embedding_vectors = np.array(embeddings)
embedder = PrecomputedEmbeddings(vectors=embedding_vectors)

faiss_db = FAISS.from_documents(
    documents=documents,
    embedding=embedder
)

# Save locally
faiss_db.save_local("/content/faiss_index")
print("✅ FAISS vector store saved locally.")


✅ FAISS vector store saved locally.


In [14]:
import shutil

shutil.copytree("/content/faiss_index", "/content/drive/MyDrive/MedRAG/faiss_index")
print("✅ FAISS DB backed up to Drive.")


FileExistsError: [Errno 17] File exists: '/content/drive/MyDrive/MedRAG/faiss_index'

In [None]:
!pip install faiss-cpu langchain transformers sentence-transformers


In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

embedding_model = HuggingFaceEmbeddings(model_name="emilyalsentzer/Bio_ClinicalBERT")


In [None]:
from langchain.vectorstores import FAISS

faiss_db = FAISS.load_local(
    folder_path="/content/drive/MyDrive/MedRAG/faiss_index",
    embeddings=embedding_model,
    index_name="index",
    allow_dangerous_deserialization=True
)

retriever = faiss_db.as_retriever(search_kwargs={"k": 5})


In [None]:
# ✅ Load FREE HuggingFace LLM: google/flan-t5-base
hf_pipeline = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    max_length=512
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)


In [None]:
# ✅ Create the RAG chain
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"
)


In [None]:
# ✅ (Optional) Ask a test question before UI
query = "Summarize the discharge instructions of the patient."
response = rag_chain.run(query)
print("🧠 Response:", response)

In [None]:
import numpy as np

vec = embedding_model.embed_query("test")
print("Embedding dimension:", np.array(vec).shape)
