In [None]:
# Environment setup: remove the LangChain packages already present in the
# runtime, then install the pinned 0.2.0 releases together with the other
# libraries this notebook imports (embeddings, Chroma, pandas).
!pip uninstall -y langchain langchain-community langchain-core langchain-text-splitters
!pip install -U langchain==0.2.0 langchain-community==0.2.0 langchain-core==0.2.0 langchain-text-splitters==0.2.0 langchain-huggingface chromadb pandas sentence-transformers
# Backend library for the CTransformers LLM wrapper imported below.
# NOTE(review): the config further down sets gpu_layers; the plain package is
# presumably a CPU-only build -- confirm whether a CUDA extra is required.
!pip install ctransformers
import pandas as pd
import os
import shutil
import time
import chromadb

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.llms import CTransformers
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


# Location of the transcription CSV and of the persisted Chroma database.
DATA_PATH = "/kaggle/input/medicaltranscriptions/mtsamples.csv"
DB_PATH = "/kaggle/working/chroma_medical_db"

# Remove any database directory left on disk before a new one is built.
if os.path.exists(DB_PATH):
    shutil.rmtree(DB_PATH)

print("Loading Data...")
# Keep only rows that have a transcription, then cap the sample at 600 rows.
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["transcription"])
df = df.head(600)

# One Document per row: specialty, sample name and transcription are folded
# into the text; the sample name is also kept as the "source" metadata.
documents = [
    Document(
        page_content=(
            f"Medical Specialty: {record['medical_specialty']}\n"
            f"Sample: {record['sample_name']}\n"
            f"Content: {record['transcription']}"
        ),
        metadata={"source": str(record["sample_name"])},
    )
    for _, record in df.iterrows()
]

print(f"Splitting {len(documents)} documents...")
# Chunks of up to 512 characters with a 50-character overlap between them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
)
docs = text_splitter.split_documents(documents)

print("Creating Embeddings & Vector Store...")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Embed every chunk and persist the resulting index under DB_PATH.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=DB_PATH,
)
# Each query retrieves the three closest chunks.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

print("Loading Mistral-7B...")
# Generation settings handed to the CTransformers wrapper.
# NOTE(review): gpu_layers presumably only takes effect with a GPU-enabled
# ctransformers build -- confirm against the installed package.
config = {
    "max_new_tokens": 512,
    "temperature": 0.1,
    "context_length": 2048,
    "gpu_layers": 40,
}
llm = CTransformers(
    config=config,
    model_type="mistral",
    model="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
    model_file="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
)
# System instruction: answer only from the retrieved context, otherwise
# say "I do not know". The {context} slot is filled with the retrieved chunks.
system_prompt = (
    "You are a medical assistant. "
    "Answer based ONLY on the context provided. "
    "If the answer is missing, say 'I do not know'."
    "\n\nContext:\n{context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# Stuff the retrieved documents into the prompt, then wire the retriever in
# front of that chain.
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, combine_docs_chain)

print("Pipeline Ready!")
# Evaluation questions, grouped by theme and flattened (in this order) into
# the single `queries` list used by the test loop.

# Conditions, symptoms and medications
condition_queries = [
    "What are the symptoms of allergic rhinitis?",
    "What is chronic kidney disease?",
    "Medications for hypertension?",
    "Managing Type 2 Diabetes?",
    "Symptoms of multiple sclerosis?",
    "Treating urinary tract infection?",
    "What is atrial fibrillation?",
    "What is GERD (Gastroesophageal Reflux Disease)?",
    "Asthma treatments?",
    "What are the symptoms of pneumonia?",
    "How are migraines treated?",
    "Treatment for anxiety?",
]

# Surgical procedures
surgical_queries = [
    "Surgery for carpal tunnel?",
    "Treatment for hip fracture?",
    "Procedure for colonoscopy?",
    "Treatment for acute appendicitis?",
    "Procedure for cataract surgery?",
    "Laparoscopic cholecystectomy details?",
    "Knee arthroscopy procedure?",
    "What options are available for hernia treatment?",
    "What is a vasectomy?",
    "What are the risks of a C-section?",
    "Describe the surgery for a Rotator Cuff Tear.",
    "Post-operative care for tonsillectomy?",
]

# Diagnostics
diagnostic_queries = [
    "What is degenerative disc disease?",
    "Signs of sleep apnea?",
    "Diagnosing breast cancer?",
    "What is a biopsy?",
    "What is an echocardiogram?",
]

# Negative controls: non-medical questions
negative_control_queries = [
    "What is the capital of France?",
    "How do I bake a chocolate cake?",
    "Write a Python script to sort a list.",
]

queries = [
    *condition_queries,
    *surgical_queries,
    *diagnostic_queries,
    *negative_control_queries,
]

# Run every test query through the RAG chain and collect one row per query.
results = []
total = len(queries)
print(f"Running {total} Test Queries...")

for position, question in enumerate(queries, start=1):
    print(f"[{position}/{total}] Asking: {question}...")
    try:
        # The chain's output is a mapping; only its "answer" entry is kept.
        response = rag_chain.invoke({"input": question})
        answer = response["answer"]
    except Exception as exc:
        # Best-effort run: report the failure and record a placeholder so the
        # remaining queries still execute.
        print(f"Error on query '{question}': {exc}")
        answer = "Error"
    results.append({"Query": question, "Answer": answer})

# Write the collected question/answer pairs to a CSV file.
df_results = pd.DataFrame(results)
df_results.to_csv("rag_results.csv", index=False)
print("Done! Results saved to 'rag_results.csv'")

Found existing installation: langchain 0.3.27
Uninstalling langchain-0.3.27:
  Successfully uninstalled langchain-0.3.27
Found existing installation: langchain-core 0.3.72
Uninstalling langchain-core-0.3.72:
  Successfully uninstalled langchain-core-0.3.72
Found existing installation: langchain-text-splitters 0.3.9
Uninstalling langchain-text-splitters-0.3.9:
  Successfully uninstalled langchain-text-splitters-0.3.9
Collecting langchain==0.2.0
  Downloading langchain-0.2.0-py3-none-any.whl.metadata (13 kB)
Collecting langchain-community==0.2.0
  Downloading langchain_community-0.2.0-py3-none-any.whl.metadata (8.8 kB)
Collecting langchain-core==0.2.0
  Downloading langchain_core-0.2.0-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain-text-splitters==0.2.0
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl.metadata (2.2 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.1.0-py3-none-any.whl.metadata (2.8 kB)
Collecting chromadb
  Downloading c