In [1]:
!pip install langchain langchain-community langchain-google-genai
!pip install chromadb sentence-transformers PyMuPDF
!pip install gradio python-dotenv psutil




In [2]:
import os
import fitz  # PyMuPDF
import chromadb
from chromadb.config import Settings
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import gradio as gr
import numpy as np
from typing import List, Dict, Any
import json
from datetime import datetime
import psutil
import socket
import requests
import signal
import time
import gc

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [3]:
class Config:
    # Embedding model (free, runs in Colab)
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"

    # Chunk settings
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200

    # ChromaDB settings
    CHROMA_PERSIST_DIR = "./chroma_db"
    COLLECTION_NAME = "medical_encyclopedia"

    # Gemini settings
    GEMINI_MODEL = "gemini-1.5-flash"

    # Retrieval settings
    TOP_K_RESULTS = 5

    # Network timeout settings
    API_TIMEOUT = 30
    MAX_RETRIES = 3

print("✅ Configuration set up!")
print(f"📊 Chunk size: {Config.CHUNK_SIZE} characters")
print(f"🔄 Chunk overlap: {Config.CHUNK_OVERLAP} characters")
print(f"🎯 Will retrieve top {Config.TOP_K_RESULTS} results")

✅ Configuration set up!
📊 Chunk size: 1000 characters
🔄 Chunk overlap: 200 characters
🎯 Will retrieve top 5 results


In [4]:
# Clear any cached network connections
try:
    requests.Session().close()
    print("🧹 Network cache cleared")
except:
    pass

# Get Gemini API key from Colab secrets
try:
    from google.colab import userdata
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')

    # Configure with better network settings
    genai.configure(
        api_key=GEMINI_API_KEY,
        transport='rest'  # Force REST instead of gRPC for better stability
    )
    print("✅ Gemini API key loaded and configured")
    print(f"🔑 Key status: {GEMINI_API_KEY[:10]}...{GEMINI_API_KEY[-5:]}")

except Exception as e:
    print("⚠️ Could not load GEMINI_API_KEY from secrets")
    print("Make sure you've added 'GEMINI_API_KEY' to your Colab secrets")
    print("Go to the key icon on the left sidebar > Add new secret")
    GEMINI_API_KEY = None

🧹 Network cache cleared
✅ Gemini API key loaded and configured
🔑 Key status: AIzaSyCJ87...KEqqo


In [5]:
class PDFProcessor:
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        print("📄 PDF processor initialized")

    def extract_text_from_pdf(self, pdf_path: str) -> List[Document]:
        """Extract text from PDF and create Document objects"""
        documents = []

        print(f"📄 Processing PDF: {pdf_path}")
        doc = fitz.open(pdf_path)

        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text()

            if text.strip():  # Only add non-empty pages
                documents.append(Document(
                    page_content=text,
                    metadata={
                        "source": pdf_path,
                        "page": page_num + 1,
                        "total_pages": len(doc)
                    }
                ))

            # Show progress every 50 pages
            if (page_num + 1) % 50 == 0:
                print(f"📖 Processed {page_num + 1}/{len(doc)} pages...")

        doc.close()
        print(f"✅ Extracted text from {len(documents)} pages")
        return documents

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into smaller chunks"""
        print(f"🔪 Chunking {len(documents)} documents...")
        chunks = self.text_splitter.split_documents(documents)
        print(f"✅ Created {len(chunks)} chunks")

        # Show chunk statistics
        chunk_lengths = [len(chunk.page_content) for chunk in chunks]
        avg_length = sum(chunk_lengths) / len(chunk_lengths)
        print(f"📊 Average chunk length: {avg_length:.0f} characters")
        print(f"📊 Min/Max chunk length: {min(chunk_lengths)}/{max(chunk_lengths)} characters")

        return chunks

# Initialize PDF processor
pdf_processor = PDFProcessor()

📄 PDF processor initialized


In [6]:
class EmbeddingManager:
    def __init__(self):
        print("🤖 Loading embedding model...")
        self.model = SentenceTransformer(Config.EMBEDDING_MODEL)
        print("✅ Embedding model loaded")
        print(f"📐 Embedding dimension: {self.model.get_sentence_embedding_dimension()}")

        # Initialize ChromaDB
        self.client = chromadb.PersistentClient(path=Config.CHROMA_PERSIST_DIR)
        print(f"💾 ChromaDB initialized at: {Config.CHROMA_PERSIST_DIR}")

    def create_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Create embeddings for a list of texts"""
        print(f"🔄 Creating embeddings for {len(texts)} texts...")
        embeddings = self.model.encode(texts, show_progress_bar=True)
        print(f"✅ Created embeddings with shape: {embeddings.shape}")
        return embeddings.tolist()

    def setup_vector_store(self, chunks: List[Document]) -> chromadb.Collection:
        """Create or get ChromaDB collection and add documents"""

        # Try to get existing collection
        try:
            collection = self.client.get_collection(Config.COLLECTION_NAME)
            existing_count = collection.count()
            print(f"📚 Found existing collection with {existing_count} documents")

            # Ask if user wants to recreate
            if existing_count > 0:
                print("⚠️ Collection already exists with data.")
                print("Options: (1) Use existing, (2) Add to existing, (3) Recreate")
                choice = input("Enter choice (1/2/3): ").strip()

                if choice == "3":
                    self.client.delete_collection(Config.COLLECTION_NAME)
                    collection = self.client.create_collection(Config.COLLECTION_NAME)
                    print("🗑️ Deleted old collection, created new one")
                elif choice == "1":
                    return collection
                # choice == "2" continues to add documents

        except Exception:
            collection = self.client.create_collection(Config.COLLECTION_NAME)
            print("📚 Created new collection")

        # Prepare data for ChromaDB
        texts = [chunk.page_content for chunk in chunks]
        metadatas = [chunk.metadata for chunk in chunks]

        # Generate unique IDs based on existing count
        existing_count = collection.count()
        ids = [f"chunk_{existing_count + i}" for i in range(len(chunks))]

        print(f"🔢 Generating {len(ids)} unique document IDs...")

        # Create embeddings
        embeddings = self.create_embeddings(texts)

        # Add to collection in batches (ChromaDB has limits)
        batch_size = 100
        total_batches = (len(texts) - 1) // batch_size + 1

        for i in range(0, len(texts), batch_size):
            batch_end = min(i + batch_size, len(texts))
            batch_num = i // batch_size + 1

            print(f"📥 Adding batch {batch_num}/{total_batches} ({batch_end - i} documents)...")

            collection.add(
                documents=texts[i:batch_end],
                embeddings=embeddings[i:batch_end],
                metadatas=metadatas[i:batch_end],
                ids=ids[i:batch_end]
            )

        final_count = collection.count()
        print(f"✅ Vector store ready with {final_count} total documents")
        return collection

# Initialize embedding manager
embedding_manager = EmbeddingManager()

🤖 Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


✅ Embedding model loaded
📐 Embedding dimension: 384
💾 ChromaDB initialized at: ./chroma_db


In [7]:
# Upload your medical encyclopedia PDF
from google.colab import files
import os

print("📁 Please upload your medical encyclopedia PDF:")
uploaded = files.upload()

# Get the uploaded file path
pdf_path = list(uploaded.keys())[0]
print(f"📄 Uploaded file: {pdf_path}")

# Check file size
file_size = os.path.getsize(pdf_path) / (1024 * 1024)  # MB
print(f"📊 File size: {file_size:.1f} MB")

# Extract text from PDF
documents = pdf_processor.extract_text_from_pdf(pdf_path)

# Create chunks
chunks = pdf_processor.chunk_documents(documents)

print(f"📋 Processing Summary:")
print(f"   📄 Total pages: {len(documents)}")
print(f"   🔪 Total chunks: {len(chunks)}")
print(f"   💽 Estimated storage: ~{len(chunks) * 384 * 4 / 1024 / 1024:.1f} MB")



📁 Please upload your medical encyclopedia PDF:


Saving Medical_book.pdf to Medical_book (1).pdf
📄 Uploaded file: Medical_book (1).pdf
📊 File size: 15.4 MB
📄 Processing PDF: Medical_book (1).pdf
📖 Processed 50/637 pages...
📖 Processed 100/637 pages...
📖 Processed 150/637 pages...
📖 Processed 200/637 pages...
📖 Processed 250/637 pages...
📖 Processed 300/637 pages...
📖 Processed 350/637 pages...
📖 Processed 400/637 pages...
📖 Processed 450/637 pages...
📖 Processed 500/637 pages...
📖 Processed 550/637 pages...
📖 Processed 600/637 pages...
✅ Extracted text from 636 pages
🔪 Chunking 636 documents...
✅ Created 3391 chunks
📊 Average chunk length: 905 characters
📊 Min/Max chunk length: 48/1000 characters
📋 Processing Summary:
   📄 Total pages: 636
   🔪 Total chunks: 3391
   💽 Estimated storage: ~5.0 MB


In [8]:
# Create Vector Store

print("🚀 Creating vector store...")
print("This may take a few minutes depending on document size...")

# Create the vector store
collection = embedding_manager.setup_vector_store(chunks)

print("✅ Vector store creation complete!")
print(f"📊 Final statistics:")
print(f"   💾 Total documents in database: {collection.count()}")
print(f"   🔍 Ready for similarity search!")

🚀 Creating vector store...
This may take a few minutes depending on document size...
📚 Found existing collection with 3391 documents
⚠️ Collection already exists with data.
Options: (1) Use existing, (2) Add to existing, (3) Recreate
Enter choice (1/2/3): 1
✅ Vector store creation complete!
📊 Final statistics:
   💾 Total documents in database: 3391
   🔍 Ready for similarity search!


In [9]:
# Enhanced RAG System Class

class MedicalRAG:
    def __init__(self, collection: chromadb.Collection, embedding_manager: EmbeddingManager):
        self.collection = collection
        self.embedding_manager = embedding_manager
        self.gemini_available = False

        if GEMINI_API_KEY:
            try:
                self.llm = genai.GenerativeModel(Config.GEMINI_MODEL)
                # Test if Gemini is working
                self._test_gemini()
            except:
                self.llm = None
                print("⚠️ Gemini initialization failed - using retrieval-only mode")
        else:
            self.llm = None
            print("⚠️ No Gemini API key - retrieval-only mode")

    def _test_gemini(self):
        """Test if Gemini API is working"""
        try:
            def timeout_handler(signum, frame):
                raise TimeoutError("Gemini test timed out")

            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(5)  # 5 second timeout for test

            response = self.llm.generate_content(
                "Hi",
                generation_config=genai.types.GenerationConfig(max_output_tokens=5)
            )
            signal.alarm(0)  # Cancel timeout

            self.gemini_available = True
            print("✅ Gemini LLM tested and working")

        except Exception as e:
            signal.alarm(0)  # Cancel timeout
            self.gemini_available = False
            print(f"⚠️ Gemini test failed: {str(e)[:100]}...")
            print("📚 Using retrieval-only mode (still very useful!)")

    def retrieve_documents(self, query: str, top_k: int = Config.TOP_K_RESULTS) -> List[Dict]:
        """Retrieve relevant documents for a query"""
        print(f"🔍 Searching for: '{query}'")

        # Create query embedding
        query_embedding = self.embedding_manager.create_embeddings([query])[0]

        # Search in ChromaDB
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            include=['documents', 'metadatas', 'distances']
        )

        # Format results
        retrieved_docs = []
        for i in range(len(results['documents'][0])):
            similarity_score = 1 - results['distances'][0][i]  # Convert distance to similarity
            retrieved_docs.append({
                'content': results['documents'][0][i],
                'metadata': results['metadatas'][0][i],
                'score': similarity_score
            })

        print(f"✅ Retrieved {len(retrieved_docs)} relevant documents")
        for i, doc in enumerate(retrieved_docs, 1):
            page = doc['metadata'].get('page', '?')
            score = doc['score']
            print(f"   {i}. Page {page} (similarity: {score:.3f})")

        return retrieved_docs

    def generate_answer(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Generate answer using Gemini and retrieved documents"""
        if not self.gemini_available:
            return self._format_retrieval_answer(query, retrieved_docs)

        # Prepare context from retrieved documents - limit size to prevent timeouts
        context_parts = []
        total_chars = 0
        max_context_chars = 3000  # Limit context size

        for i, doc in enumerate(retrieved_docs, 1):
            page_info = f"Page {doc['metadata'].get('page', '?')}"
            content = doc['content']

            # Limit individual document size
            if len(content) > 800:
                content = content[:800] + "..."

            doc_text = f"[Source {i} - {page_info}]\n{content}\n"

            if total_chars + len(doc_text) > max_context_chars:
                break

            context_parts.append(doc_text)
            total_chars += len(doc_text)

        context = "\n".join(context_parts)

        # Create medical-focused prompt
        prompt = f"""You are a medical AI assistant. Answer based ONLY on the provided medical encyclopedia context.

GUIDELINES:
- Use only information from the provided context
- Include source references (e.g., "According to Source 1, Page X...")
- Be precise and factual
- Always remind users to consult healthcare professionals

CONTEXT:
{context}

QUESTION: {query}

ANSWER:"""

        try:
            def timeout_handler(signum, frame):
                raise TimeoutError("Generation timed out")

            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(Config.API_TIMEOUT)  # 30 second timeout

            response = self.llm.generate_content(
                prompt,
                generation_config=genai.types.GenerationConfig(
                    max_output_tokens=500,  # Limit output size
                    temperature=0.1
                )
            )
            signal.alarm(0)  # Cancel timeout
            return response.text

        except Exception as e:
            signal.alarm(0)  # Cancel timeout
            print(f"❌ Gemini generation failed: {str(e)[:100]}...")
            print("📚 Falling back to retrieval-only answer")
            return self._format_retrieval_answer(query, retrieved_docs)

    def _format_retrieval_answer(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Format a professional answer using only retrieval results"""
        answer_parts = []
        answer_parts.append(f"📚 **Medical Encyclopedia Results for: {query}**\n")

        for i, doc in enumerate(retrieved_docs, 1):
            page = doc['metadata'].get('page', '?')
            score = doc['score']
            content = doc['content']

            answer_parts.append(f"**📄 Source {i} - Page {page} (Relevance: {score:.1%})**")

            # Clean and format content
            paragraphs = content.split('\n\n')
            for para in paragraphs[:2]:  # Show first 2 paragraphs
                if para.strip():
                    answer_parts.append(para.strip())

            if len(paragraphs) > 2:
                answer_parts.append("*[Additional content available...]*")

            answer_parts.append("")  # Empty line between sources

        answer_parts.append("⚠️ **Medical Disclaimer:** This information is for educational purposes only. Always consult qualified healthcare professionals for medical advice, diagnosis, or treatment.")

        return "\n".join(answer_parts)

    def ask(self, query: str) -> Dict[str, Any]:
        """Main RAG function: retrieve + generate"""
        start_time = datetime.now()

        # Retrieve relevant documents
        retrieved_docs = self.retrieve_documents(query)

        # Generate answer
        answer = self.generate_answer(query, retrieved_docs)

        end_time = datetime.now()
        processing_time = (end_time - start_time).total_seconds()

        return {
            'query': query,
            'answer': answer,
            'sources': retrieved_docs,
            'timestamp': start_time.isoformat(),
            'processing_time': processing_time,
            'gemini_used': self.gemini_available
        }

# Initialize RAG system
rag_system = MedicalRAG(collection, embedding_manager)

✅ Gemini LLM tested and working


In [10]:
def system_diagnostic():
    """Run comprehensive system diagnostic"""
    print("🔧 MEDICAL RAG SYSTEM DIAGNOSTIC")
    print("="*50)

    # Basic system check
    try:
        print(f"⏰ Current time: {datetime.now()}")
        print(f"💾 Memory usage: {psutil.virtual_memory().percent:.1f}%")
        print(f"🔢 Documents in database: {collection.count():,}")
        print(f"✅ RAG system type: {type(rag_system)}")
        print(f"🤖 Gemini available: {'Yes' if rag_system.gemini_available else 'No'}")
        print(f"🔑 API key exists: {'Yes' if GEMINI_API_KEY else 'No'}")
    except Exception as e:
        print(f"❌ Basic diagnostic failed: {e}")
        return False

    # Test retrieval
    try:
        print(f"\n🔍 Testing document retrieval...")
        test_docs = rag_system.retrieve_documents("test", 1)
        print(f"✅ Retrieval works - found page {test_docs[0]['metadata']['page']}")
    except Exception as e:
        print(f"❌ Retrieval test failed: {e}")
        return False

    # Test network connectivity
    try:
        print(f"\n🌐 Testing network connectivity...")
        socket.create_connection(("8.8.8.8", 53), timeout=5)
        print("✅ Network connectivity good")
    except Exception as e:
        print(f"⚠️ Network issue: {e}")

    print(f"\n🎉 System diagnostic complete!")
    return True

# Run diagnostic
system_diagnostic()



🔧 MEDICAL RAG SYSTEM DIAGNOSTIC
⏰ Current time: 2025-08-20 18:51:19.731279
💾 Memory usage: 19.0%
🔢 Documents in database: 3,391
✅ RAG system type: <class '__main__.MedicalRAG'>
🤖 Gemini available: Yes
🔑 API key exists: Yes

🔍 Testing document retrieval...
🔍 Searching for: 'test'
🔄 Creating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings with shape: (1, 384)
✅ Retrieved 1 relevant documents
   1. Page 479 (similarity: -0.200)
✅ Retrieval works - found page 479

🌐 Testing network connectivity...
✅ Network connectivity good

🎉 System diagnostic complete!


True

In [11]:
# Medical Search Functions

def medical_search(question: str, num_results: int = 5):
    """Professional medical search function"""
    print("🏥 MEDICAL ENCYCLOPEDIA SEARCH")
    print("="*60)
    print(f"Question: {question}")
    print("="*60)

    # Get search results
    docs = rag_system.retrieve_documents(question, num_results)

    print(f"\n📚 EVIDENCE FROM MEDICAL ENCYCLOPEDIA:")
    print(f"Found {len(docs)} relevant sections from your 637-page encyclopedia\n")

    for i, doc in enumerate(docs, 1):
        page = doc['metadata'].get('page', '?')
        score = doc['score']

        print(f"📄 **SOURCE {i}: PAGE {page}**")
        print(f"🎯 Relevance: {score:.1%}")
        print("─" * 50)

        # Format content nicely
        content = doc['content'].strip()
        paragraphs = content.split('\n\n')

        for j, para in enumerate(paragraphs):
            if para.strip():
                print(para.strip())
                if j < len(paragraphs) - 1:  # Add spacing between paragraphs
                    print()

        print("─" * 50)
        print()

    print("⚠️ **Medical Disclaimer:** This information is for educational purposes only.")
    print("Always consult qualified healthcare professionals for medical advice.\n")

    return docs

def quick_query(question: str):
    """Quick query with both search results and AI answer"""
    print(f"❓ **Question:** {question}")
    print("=" * 60)

    result = rag_system.ask(question)

    print("🤖 **ANSWER:**")
    print(result['answer'])
    print(f"\n⏱️ Processing time: {result['processing_time']:.2f} seconds")
    print(f"🔧 Method: {'AI + Retrieval' if result['gemini_used'] else 'Retrieval Only'}")

    return result

# Example searches you can run:
print("💡 **Available Functions:**")
print("• medical_search('your question') - Detailed search results")
print("• quick_query('your question') - Quick answer with AI")
  # print("\n🧪 **Try these examples:**")
  # print("• medical_search('diabetes symptoms')")
  # print("• quick_query('How is hypertension treated?')")
  # print("• medical_search('heart disease causes')")

💡 **Available Functions:**
• medical_search('your question') - Detailed search results
• quick_query('your question') - Quick answer with AI


In [12]:
# Cardiovascular
medical_search('heart attack symptoms')

# Endocrine
medical_search('thyroid disorders')

# Respiratory
medical_search('fatty liver treatment')



🏥 MEDICAL ENCYCLOPEDIA SEARCH
Question: heart attack symptoms
🔍 Searching for: 'heart attack symptoms'
🔄 Creating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings with shape: (1, 384)
✅ Retrieved 5 relevant documents
   1. Page 209 (similarity: 0.200)
   2. Page 208 (similarity: 0.123)
   3. Page 342 (similarity: 0.044)
   4. Page 343 (similarity: 0.034)
   5. Page 421 (similarity: 0.015)

📚 EVIDENCE FROM MEDICAL ENCYCLOPEDIA:
Found 5 relevant sections from your 637-page encyclopedia

📄 **SOURCE 1: PAGE 209**
🎯 Relevance: 20.0%
──────────────────────────────────────────────────
nary artery muscle spasm of insufficient duration or
intensity to cause an actual heart attack.
Causes and symptoms
Angina causes a pressing pain or sensation of heavi-
ness, usually in the chest area under the breast bone (ster-
num). It occasionally is experienced in the shoulder, arm,
neck, or jaw regions. Because episodes of angina occur
when the heart’s need for oxygen increases beyond the
oxygen available from the blood nourishing the heart, the
condition is often precipitated by physical exertion. In
most cases, the symptoms are relieved within

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings with shape: (1, 384)
✅ Retrieved 5 relevant documents
   1. Page 436 (similarity: 0.085)
   2. Page 437 (similarity: -0.030)
   3. Page 337 (similarity: -0.032)
   4. Page 75 (similarity: -0.034)
   5. Page 402 (similarity: -0.062)

📚 EVIDENCE FROM MEDICAL ENCYCLOPEDIA:
Found 5 relevant sections from your 637-page encyclopedia

📄 **SOURCE 1: PAGE 436**
🎯 Relevance: 8.5%
──────────────────────────────────────────────────
tem attacks and destroys the tissues that line bone joints
and cartilage. The disease occurs throughout the body,
although some joints may be more affected than others.
• Goodpasture’s syndrome. Occurs when antibodies are
deposited in the membranes of both the lung and kid-
neys, causing both inflammation of kidney glomerulus
(glomerulonephritis) and lung bleeding. It is typically
a disease of young males.
• Grave’s disease. Caused by an antibody that binds to
specific cells in the thyroid gland, causing them to
make excessive amounts of thyroid hor

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings with shape: (1, 384)
✅ Retrieved 5 relevant documents
   1. Page 495 (similarity: -0.042)
   2. Page 624 (similarity: -0.051)
   3. Page 410 (similarity: -0.065)
   4. Page 112 (similarity: -0.092)
   5. Page 410 (similarity: -0.092)

📚 EVIDENCE FROM MEDICAL ENCYCLOPEDIA:
Found 5 relevant sections from your 637-page encyclopedia

📄 **SOURCE 1: PAGE 495**
🎯 Relevance: -4.2%
──────────────────────────────────────────────────
Resources
BOOKS
Balistreri, William F. “Cholestasis.” In Nelson Textbook of
Pediatrics, ed. Richard E. Behrman. Philadelphia: W. B.
Saunders Co., 1996.
Feldman, Mark, et al. “Diseases of the Bile Ducts.” Sleisenger
& Fordtran’s Gastrointestinal and Liver Disease.
Philadelphia: W. B. Saunders Co., 1998.
PERIODICALS
Ryckman, F., R. Fisher, and S. Pedersen, et al. “Improved Sur-
vival in Biliary Atresia Patients in the Present Era of Liver
Transplantation.” Journal of Pediatric Surgery 28 (1993):
382.
J. Ricker Polsdorfer, MD
Biliary duct cancer see

[{'content': 'Resources\nBOOKS\nBalistreri, William F. “Cholestasis.” In Nelson Textbook of\nPediatrics, ed. Richard E. Behrman. Philadelphia: W. B.\nSaunders Co., 1996.\nFeldman, Mark, et al. “Diseases of the Bile Ducts.” Sleisenger\n& Fordtran’s Gastrointestinal and Liver Disease.\nPhiladelphia: W. B. Saunders Co., 1998.\nPERIODICALS\nRyckman, F., R. Fisher, and S. Pedersen, et al. “Improved Sur-\nvival in Biliary Atresia Patients in the Present Era of Liver\nTransplantation.” Journal of Pediatric Surgery 28 (1993):\n382.\nJ. Ricker Polsdorfer, MD\nBiliary duct cancer see Gallbladder cancer\nBiliary tract cancer see Bile duct cancer\nBilirubin test see Liver function tests\nBinge-eating disorder\nDefinition\nBinge eating disorder (BED) is characterized by a\nloss of control over eating behaviors. The binge eater\nconsumes unnaturally large amounts of food in a short\ntime period, but unlike a bulimic, does not regularly\nengage in any inappropriate weight-reducing behaviors',
  'meta

In [13]:
# import gradio as gr
# import socket
# import random

# def find_free_port(start_port=7860, max_attempts=50):
#     """Find a free port starting from start_port"""
#     for port in range(start_port, start_port + max_attempts):
#         try:
#             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
#                 s.bind(('localhost', port))
#                 return port
#         except OSError:
#             continue
#     # If no port found in range, try random ports
#     for _ in range(20):
#         port = random.randint(8000, 9999)
#         try:
#             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
#                 s.bind(('localhost', port))
#                 return port
#         except OSError:
#             continue
#     raise OSError("Could not find a free port")

# def create_gradio_interface():
#     """Create enhanced Gradio interface with fallback support"""

#     def process_query(query: str, show_sources: bool = True, num_sources: int = 5, use_ai: bool = True):
#         if not query.strip():
#             return "Please enter a medical question.", "", ""

#         try:
#             # Force retrieval-only mode if requested or if Gemini unavailable
#             if not use_ai or not rag_system.gemini_available:
#                 # Use medical_search function for retrieval-only
#                 docs = rag_system.retrieve_documents(query, num_sources)
#                 answer = rag_system._format_retrieval_answer(query, docs)
#                 processing_time = 0.5  # Estimated time for retrieval
#                 gemini_used = False
#             else:
#                 # Try full RAG with AI
#                 result = rag_system.ask(query)
#                 answer = result['answer']
#                 docs = result['sources']
#                 processing_time = result['processing_time']
#                 gemini_used = result['gemini_used']

#             # Format sources
#             sources_text = ""
#             stats_text = f"⏱️ Time: {processing_time:.2f}s | 📊 Sources: {len(docs)} | 🤖 AI: {'Yes' if gemini_used else 'No'}"

#             if show_sources:
#                 sources_text = "\n\n📚 **SOURCE REFERENCES:**\n" + "="*50 + "\n"
#                 for i, doc in enumerate(docs, 1):
#                     page = doc['metadata'].get('page', '?')
#                     score = doc['score']
#                     content_preview = doc['content'][:300] + "..." if len(doc['content']) > 300 else doc['content']

#                     sources_text += f"\n**[{i}] 📄 Page {page} | 🎯 Relevance: {score:.3f}**\n"
#                     sources_text += f"{content_preview}\n"
#                     sources_text += "-" * 50 + "\n"

#             return answer, sources_text, stats_text

#         except Exception as e:
#             error_msg = f"❌ Error: {str(e)}"
#             return error_msg, "", error_msg

#     # Create interface
#     with gr.Blocks(
#         title="Medical Encyclopedia RAG Chatbot",
#         theme=gr.themes.Soft(),
#         css="""
#         .medical-header {
#             background: linear-gradient(90deg, #4CAF50, #2196F3);
#             color: white;
#             padding: 20px;
#             border-radius: 10px;
#             text-align: center;
#             margin-bottom: 20px;
#         }
#         """
#     ) as demo:

#         # Header with status
#         status_indicator = "🤖 AI Enabled" if rag_system.gemini_available else "📚 Retrieval Mode"
#         gr.HTML(f"""
#         <div class="medical-header">
#             <h1>🏥 Medical Encyclopedia RAG Chatbot</h1>
#             <p>Ask evidence-based medical questions • {status_indicator} • 637-page Medical Encyclopedia</p>
#         </div>
#         """)

#         with gr.Row():
#             with gr.Column(scale=2):
#                 gr.Markdown("### 🔍 Ask Your Medical Question")

#                 query_input = gr.Textbox(
#                     label="Medical Question",
#                     placeholder="e.g., What are the symptoms and treatment options for diabetes?",
#                     lines=3
#                 )

#                 with gr.Row():
#                     submit_btn = gr.Button("🔍 Search & Answer", variant="primary", size="lg")
#                     clear_btn = gr.Button("🗑️ Clear", variant="secondary")

#                 with gr.Accordion("⚙️ Search Options", open=False):
#                     show_sources = gr.Checkbox(label="Show source references", value=True)
#                     num_sources = gr.Slider(1, 10, value=5, step=1, label="Number of sources")
#                     use_ai = gr.Checkbox(
#                         label="Use AI generation (if available)",
#                         value=rag_system.gemini_available,
#                         interactive=rag_system.gemini_available
#                     )

#             with gr.Column(scale=3):
#                 gr.Markdown("### 🤖 Medical Information Response")

#                 answer_output = gr.Textbox(
#                     label="Medical Information",
#                     lines=12,
#                     max_lines=20,
#                     show_copy_button=True
#                 )

#                 stats_output = gr.Textbox(label="Query Statistics", lines=1)
#                 sources_output = gr.Textbox(label="Source References", lines=8, show_copy_button=True)

#         # Event handlers
#         submit_btn.click(
#             process_query,
#             inputs=[query_input, show_sources, num_sources, use_ai],
#             outputs=[answer_output, sources_output, stats_output]
#         )

#         query_input.submit(
#             process_query,
#             inputs=[query_input, show_sources, num_sources, use_ai],
#             outputs=[answer_output, sources_output, stats_output]
#         )

#         clear_btn.click(lambda: ("", "", "", ""), outputs=[query_input, answer_output, sources_output, stats_output])

#         # Example queries
#         gr.Markdown("### 💡 Example Medical Questions")
#         examples = [
#             "What are the symptoms of diabetes?",
#             "How is hypertension treated?",
#             "What causes heart disease?",
#             "What are the side effects of aspirin?",
#             "How do antibiotics work?",
#             "What is the anatomy of the heart?",
#             "Symptoms of heart attack vs stroke",
#             "Treatment options for cancer"
#         ]

#         with gr.Row():
#             for example in examples[:4]:
#                 gr.Button(example, size="sm").click(lambda x=example: x, outputs=query_input)

#         with gr.Row():
#             for example in examples[4:]:
#                 gr.Button(example, size="sm").click(lambda x=example: x, outputs=query_input)

#         # Disclaimer
#         gr.Markdown("""
#         ---
#         ### ⚠️ Important Medical Disclaimer
#         **This AI assistant is for educational and informational purposes only:**
#         - 🩺 Always consult qualified healthcare professionals for medical advice
#         - 🚨 Not a substitute for professional medical diagnosis or treatment
#         - 📚 Information based on medical encyclopedia content only
#         - ⚕️ For emergencies, contact emergency services immediately
#         """)

#     return demo

# # Launch interface with dynamic port finding
# print("🌐 Creating enhanced Gradio interface...")
# demo = create_gradio_interface()

# print("✅ Interface ready with fallback support!")
# print("🔍 Finding available port...")

# try:
#     free_port = find_free_port()
#     print(f"🚀 Launching on port {free_port}...")

#     demo.launch(
#         share=True,
#         server_name="0.0.0.0",
#         server_port=free_port,
#         show_error=True,
#         quiet=False
#     )
# except Exception as e:
#     print(f"❌ Error launching Gradio: {e}")
#     print("💡 Try manually specifying a different port:")
#     print("demo.launch(server_port=8080)  # or any other available port")

In [14]:
print("🧪 **QUICK TESTS** - Try these functions:")

# Test the working retrieval system
print("\n1. Testing medical search...")
medical_search("diabetes symptoms", 3)

print("\n" + "="*60)
print("2. Testing quick query...")
quick_query("What causes hypertension?")

print("\n" + "="*60)
print("🎉 **YOUR MEDICAL RAG SYSTEM IS WORKING!**")
print("\n💡 **Available Functions:**")
print("• medical_search('question', num_results) - Detailed search")
print("• quick_query('question') - Fast answer")
print("• system_diagnostic() - Check system status")

print(f"\n📊 **System Status:**")
print(f"• Database: {collection.count():,} medical text chunks ready")
print(f"• Retrieval: ✅ Working perfectly")
print(f"• AI Generation: {'✅ Available' if rag_system.gemini_available else '📚 Retrieval-only mode'}")
#print(f"• Web Interface: ✅ Running at your Gradio URL")

🧪 **QUICK TESTS** - Try these functions:

1. Testing medical search...
🏥 MEDICAL ENCYCLOPEDIA SEARCH
Question: diabetes symptoms
🔍 Searching for: 'diabetes symptoms'
🔄 Creating embeddings for 1 texts...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings with shape: (1, 384)
✅ Retrieved 3 relevant documents
   1. Page 543 (similarity: 0.057)
   2. Page 277 (similarity: 0.030)
   3. Page 544 (similarity: -0.040)

📚 EVIDENCE FROM MEDICAL ENCYCLOPEDIA:
Found 3 relevant sections from your 637-page encyclopedia

📄 **SOURCE 1: PAGE 543**
🎯 Relevance: 5.7%
──────────────────────────────────────────────────
begin to fall. A person with diabetes mellitus either does
not make enough insulin, or makes insulin that does not
work properly. The result is blood sugar that remains
high, a condition called hyperglycemia.
Diabetes must be diagnosed as early as possible. If
left untreated, it can damage or cause failure of the eyes,
kidneys, nerves, heart, blood vessels, and other body
organs. Hypoglycemia, or low blood sugar, may also be
discovered through blood sugar testing. Hypoglycemia is
caused by various hormone disorders and liver disease,
as well as by too much insulin.
Description
There are a variety of ways to measure a pe

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Created embeddings with shape: (1, 384)
✅ Retrieved 5 relevant documents
   1. Page 219 (similarity: 0.165)
   2. Page 115 (similarity: -0.018)
   3. Page 143 (similarity: -0.030)
   4. Page 249 (similarity: -0.033)
   5. Page 408 (similarity: -0.042)
🤖 **ANSWER:**
According to Source 1, page 219,  angiotensin II, produced by the conversion of angiotensin I by ACE (angiotensin-converting enzyme), causes narrowing of blood vessels, increasing blood pressure.  Additionally, angiotensin II stimulates aldosterone, further increasing blood pressure.  Certain kidney disorders can also increase angiotensin II production, contributing to hypertension.  Source 3, page 143, states that high blood pressure (hypertension) puts strain on the heart and arteries, and over time can cause damage leading to stroke, heart failure, or kidney failure.  Source 2, page 115 mentions elevated aldosterone levels are seen in secondary aldosteronism, stress, and malignant hypertension.  Please consult a healthc

In [15]:
# def export_vector_db():
#     """Export ChromaDB for local use"""
#     import zipfile
#     from google.colab import files

#     if os.path.exists(Config.CHROMA_PERSIST_DIR):
#         print("📦 Creating vector database export...")

#         zip_path = "medical_rag_vectordb.zip"
#         with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
#             for root, dirs, file_list in os.walk(Config.CHROMA_PERSIST_DIR):
#                 for file in file_list:
#                     file_path = os.path.join(root, file)
#                     arc_path = os.path.relpath(file_path, Config.CHROMA_PERSIST_DIR)
#                     zipf.write(file_path, f"chroma_db/{arc_path}")

#         zip_size = os.path.getsize(zip_path) / (1024 * 1024)
#         print(f"✅ Export ready: {zip_path} ({zip_size:.1f} MB)")

#         files.download(zip_path)
#         print("📥 Download started! Use this with local RAG setups.")

#     else:
#         print("❌ No vector database found to export")

# def show_database_stats():
#     """Show detailed database statistics"""
#     print("📊 MEDICAL RAG DATABASE STATISTICS")
#     print("="*50)
#     print(f"📚 Collection: {Config.COLLECTION_NAME}")
#     print(f"💾 Storage: {Config.CHROMA_PERSIST_DIR}")
#     print(f"🔢 Total documents: {collection.count():,}")
#     print(f"🔍 Embedding model: {Config.EMBEDDING_MODEL}")
#     print(f"📐 Embedding dimension: {embedding_manager.model.get_sentence_embedding_dimension()}")
#     print(f"⚙️ Chunk size: {Config.CHUNK_SIZE:,} characters")
#     print(f"🔄 Chunk overlap: {Config.CHUNK_OVERLAP} characters")
#     print(f"🎯 Default retrieval: {Config.TOP_K_RESULTS} documents")

#     # Sample some documents to show variety
#     sample_docs = rag_system.retrieve_documents("medical", 3)
#     print(f"\n📋 Sample document pages: ", end="")
#     pages = [str(doc['metadata']['page']) for doc in sample_docs]
#     print(", ".join(pages))

# def search_by_page(page_number: int):
#     """Find documents from a specific page"""
#     print(f"📄 Searching for content from page {page_number}...")

#     # Query ChromaDB for specific page
#     results = collection.get(
#         where={"page": page_number},
#         include=['documents', 'metadatas']
#     )

#     if results['documents']:
#         print(f"✅ Found {len(results['documents'])} chunks from page {page_number}")
#         for i, doc in enumerate(results['documents']):
#             print(f"\nChunk {i+1}:")
#             print("-" * 30)
#             preview = doc[:300] + "..." if len(doc) > 300 else doc
#             print(preview)
#     else:
#         print(f"❌ No content found for page {page_number}")

# def find_pages_about(topic: str, threshold: float = 0.1):
#     """Find all pages that mention a specific medical topic"""
#     print(f"🔍 Finding pages about '{topic}'...")

#     docs = rag_system.retrieve_documents(topic, 20)  # Get more results
#     relevant_pages = set()

#     for doc in docs:
#         if doc['score'] > threshold:
#             relevant_pages.add(doc['metadata']['page'])

#     sorted_pages = sorted(list(relevant_pages))
#     print(f"📚 Found {len(sorted_pages)} pages about '{topic}': {sorted_pages}")
#     return sorted_pages

# def compare_treatments(condition1: str, condition2: str):
#     """Compare treatments for two medical conditions"""
#     print(f"⚖️ COMPARING TREATMENTS: {condition1} vs {condition2}")
#     print("="*60)

#     # Search for each condition
#     docs1 = rag_system.retrieve_documents(f"{condition1} treatment", 3)
#     docs2 = rag_system.retrieve_documents(f"{condition2} treatment", 3)

#     print(f"📋 **{condition1.upper()} TREATMENT:**")
#     for doc in docs1:
#         page = doc['metadata']['page']
#         score = doc['score']
#         preview = doc['content'][:200] + "..."
#         print(f"Page {page} ({score:.1%}): {preview}")
#         print()

#     print(f"📋 **{condition2.upper()} TREATMENT:**")
#     for doc in docs2:
#         page = doc['metadata']['page']
#         score = doc['score']
#         preview = doc['content'][:200] + "..."
#         print(f"Page {page} ({score:.1%}): {preview}")
#         print()

# def medical_terminology_search(term: str):
#     """Search for medical terminology definitions"""
#     print(f"🔬 MEDICAL TERMINOLOGY: '{term}'")
#     print("="*40)

#     # Search for the term
#     docs = rag_system.retrieve_documents(f"what is {term} definition", 5)

#     best_match = docs[0] if docs else None
#     if best_match and best_match['score'] > 0.1:
#         page = best_match['metadata']['page']
#         content = best_match['content']

#         print(f"📖 **Definition found on Page {page}:**")
#         print(content[:500] + ("..." if len(content) > 500 else ""))
#     else:
#         print(f"❌ No clear definition found for '{term}'")
#         print("📋 Related content:")
#         for doc in docs[:3]:
#             page = doc['metadata']['page']
#             preview = doc['content'][:150] + "..."
#             print(f"Page {page}: {preview}")

# def symptom_checker(symptoms: List[str]):
#     """Check what conditions might be associated with given symptoms"""
#     print(f"🩺 SYMPTOM ANALYSIS")
#     print("="*30)
#     print(f"Symptoms: {', '.join(symptoms)}")
#     print("="*30)

#     all_docs = []
#     for symptom in symptoms:
#         docs = rag_system.retrieve_documents(f"{symptom} symptoms disease", 3)
#         all_docs.extend(docs)

#     # Group by page to find common conditions
#     page_scores = {}
#     for doc in all_docs:
#         page = doc['metadata']['page']
#         if page in page_scores:
#             page_scores[page] = max(page_scores[page], doc['score'])
#         else:
#             page_scores[page] = doc['score']

#     # Sort by relevance
#     sorted_pages = sorted(page_scores.items(), key=lambda x: x[1], reverse=True)

#     print("📋 **Possible conditions to investigate:**")
#     for page, score in sorted_pages[:5]:
#         if score > 0.05:  # Only show relevant matches
#             # Get content for this page
#             page_docs = [doc for doc in all_docs if doc['metadata']['page'] == page]
#             if page_docs:
#                 preview = page_docs[0]['content'][:200] + "..."
#                 print(f"Page {page} ({score:.1%}): {preview}")
#                 print()

In [16]:
# def drug_interaction_search(drug1: str, drug2: str = None):
#     """Search for drug information and interactions"""
#     if drug2:
#         query = f"{drug1} {drug2} interaction side effects"
#         print(f"💊 DRUG INTERACTION SEARCH: {drug1} + {drug2}")
#     else:
#         query = f"{drug1} side effects contraindications"
#         print(f"💊 DRUG INFORMATION: {drug1}")

#     print("="*50)

#     docs = rag_system.retrieve_documents(query, 5)

#     for i, doc in enumerate(docs, 1):
#         if doc['score'] > 0.05:  # Only show relevant results
#             page = doc['metadata']['page']
#             score = doc['score']
#             content = doc['content']

#             print(f"📄 **Source {i} - Page {page} ({score:.1%})**")

#             # Look for key drug information
#             sentences = content.split('. ')
#             relevant_sentences = []

#             search_terms = [drug1.lower()]
#             if drug2:
#                 search_terms.append(drug2.lower())

#             for sentence in sentences:
#                 if any(term in sentence.lower() for term in search_terms):
#                     relevant_sentences.append(sentence.strip())

#             if relevant_sentences:
#                 for sentence in relevant_sentences[:3]:  # Show top 3 relevant sentences
#                     print(f"• {sentence}")
#             else:
#                 # Fallback to content preview
#                 print(doc['content'][:300] + "...")

#             print()

# def anatomy_explorer(body_part: str):
#     """Explore anatomy of a specific body part"""
#     print(f"🫀 ANATOMY EXPLORER: {body_part}")
#     print("="*40)

#     queries = [
#         f"{body_part} anatomy structure",
#         f"{body_part} function physiology",
#         f"{body_part} location"
#     ]

#     all_docs = []
#     for query in queries:
#         docs = rag_system.retrieve_documents(query, 3)
#         all_docs.extend(docs)

#     # Remove duplicates and sort by relevance
#     seen_pages = set()
#     unique_docs = []
#     for doc in all_docs:
#         page = doc['metadata']['page']
#         if page not in seen_pages:
#             seen_pages.add(page)
#             unique_docs.append(doc)

#     unique_docs.sort(key=lambda x: x['score'], reverse=True)

#     print(f"📖 **Anatomical information about {body_part}:**")
#     for doc in unique_docs[:5]:
#         if doc['score'] > 0.08:
#             page = doc['metadata']['page']
#             score = doc['score']
#             print(f"\n📄 Page {page} ({score:.1%}):")
#             print(doc['content'][:400] + "...")

# def disease_progression_search(disease: str):
#     """Search for information about disease progression and stages"""
#     print(f"📈 DISEASE PROGRESSION: {disease}")
#     print("="*40)

#     queries = [
#         f"{disease} stages progression",
#         f"{disease} early late symptoms",
#         f"{disease} prognosis outcome"
#     ]

#     for i, query in enumerate(queries, 1):
#         print(f"\n**{i}. {query.title()}:**")
#         docs = rag_system.retrieve_documents(query, 2)

#         for doc in docs:
#             if doc['score'] > 0.1:
#                 page = doc['metadata']['page']
#                 preview = doc['content'][:250] + "..."
#                 print(f"Page {page}: {preview}")

In [17]:
# def batch_medical_search(questions: List[str]):
#     """Process multiple medical questions at once"""
#     print("🔄 BATCH MEDICAL SEARCH")
#     print("="*40)

#     results = []
#     for i, question in enumerate(questions, 1):
#         print(f"\n{i}. {question}")
#         print("-" * 30)

#         result = rag_system.ask(question)
#         results.append(result)

#         print(f"⏱️ Time: {result['processing_time']:.2f}s")
#         print(f"📊 Sources: {len(result['sources'])}")

#         # Show brief answer
#         answer_preview = result['answer'][:200] + "..." if len(result['answer']) > 200 else result['answer']
#         print(f"📝 Answer: {answer_preview}")
#         print()

#     return results

# def create_medical_report(topic: str):
#     """Create a comprehensive medical report on a topic"""
#     print(f"📋 COMPREHENSIVE MEDICAL REPORT: {topic}")
#     print("="*60)

#     # Different aspects to research
#     aspects = [
#         f"{topic} definition causes",
#         f"{topic} symptoms signs",
#         f"{topic} diagnosis tests",
#         f"{topic} treatment management",
#         f"{topic} prevention complications"
#     ]

#     report_sections = []

#     for aspect in aspects:
#         section_name = aspect.split()[1]  # Extract the aspect name
#         docs = rag_system.retrieve_documents(aspect, 3)

#         if docs and docs[0]['score'] > 0.05:
#             report_sections.append({
#                 'title': section_name.title(),
#                 'content': docs[0]['content'][:500] + "...",
#                 'page': docs[0]['metadata']['page'],
#                 'score': docs[0]['score']
#             })

#     # Generate report
#     print(f"**MEDICAL REPORT: {topic.upper()}**")
#     print(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")
#     print("="*60)

#     for section in report_sections:
#         print(f"\n**{section['title']} (Page {section['page']})**")
#         print(section['content'])
#         print()

#     print("⚠️ This report is for educational purposes only. Consult healthcare professionals for medical advice.")

#     return report_sections

# def trending_medical_topics():
#     """Find the most referenced medical topics in your encyclopedia"""
#     print("📊 TRENDING MEDICAL TOPICS ANALYSIS")
#     print("="*40)

#     common_terms = [
#         "diabetes", "hypertension", "cancer", "heart disease",
#         "infection", "inflammation", "treatment", "medication",
#         "symptoms", "diagnosis", "therapy", "surgery"
#     ]

#     topic_scores = {}

#     for term in common_terms:
#         docs = rag_system.retrieve_documents(term, 10)
#         if docs:
#             # Calculate average relevance score
#             avg_score = sum(doc['score'] for doc in docs) / len(docs)
#             topic_scores[term] = {
#                 'avg_score': avg_score,
#                 'pages': len(set(doc['metadata']['page'] for doc in docs)),
#                 'top_score': docs[0]['score'] if docs else 0
#             }

#     # Sort by relevance
#     sorted_topics = sorted(topic_scores.items(), key=lambda x: x[1]['avg_score'], reverse=True)

#     print("📈 **Most Covered Medical Topics:**")
#     for topic, stats in sorted_topics[:8]:
#         print(f"• {topic.title()}: {stats['pages']} pages, avg relevance {stats['avg_score']:.1%}")

In [21]:
# Install required package if not available
!pip install nbformat

# Clear widget metadata to fix upload error
import json
import nbformat
from google.colab import files

# Your notebook name (keeping the same name)
notebook_name = 'medibot_prototype.ipynb'

try:
    # Read current notebook
    with open(notebook_name, 'r') as f:
        nb = nbformat.read(f, as_version=4)

    # Remove problematic widgets metadata
    if 'widgets' in nb.metadata:
        del nb.metadata['widgets']
        print("✅ Removed widgets metadata")

    # Also clear any cell-level widget state
    widgets_removed = 0
    for cell in nb.cells:
        if hasattr(cell, 'metadata') and 'widgets' in cell.metadata:
            del cell.metadata['widgets']
            widgets_removed += 1

    if widgets_removed > 0:
        print(f"✅ Removed widgets from {widgets_removed} cells")

    # Save back to the same file name
    with open(notebook_name, 'w') as f:
        nbformat.write(nb, f)

    print(f"✅ Fixed! Download the same file: {notebook_name}")

    # Download the fixed version (same name)
    files.download(notebook_name)

except Exception as e:
    print(f"❌ Error: {e}")
    print("File might not exist in current directory")
    # List files to check
    !ls *.ipynb

❌ Error: [Errno 2] No such file or directory: 'medibot_prototype.ipynb'
File might not exist in current directory
ls: cannot access '*.ipynb': No such file or directory


In [22]:
# Check what files are in the current directory
import os
print("Current directory files:")
for file in os.listdir('.'):
    print(f"  {file}")

print("\n" + "="*50)

# Search for any .ipynb files in the system
print("Searching for .ipynb files...")
!find /content -name "*.ipynb" 2>/dev/null | head -10

Current directory files:
  .config
  Medical_book.pdf
  chroma_db
  Medical_book (1).pdf
  .gradio
  sample_data

Searching for .ipynb files...
