<a href="https://colab.research.google.com/github/melaku-tilahun/polio/blob/main/polio_virus_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install chromadb
!pip install PyPDF2
!pip install python-docx
!pip install google-generativeai

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [13]:
import chromadb
import google.generativeai as genai
import os
import PyPDF2
from docx import Document
from chromadb.utils import embedding_functions
import uuid

# Configuration
GEMINI_API_KEY = "AIzaSyAGO5xxrt5DrQi6dl9PhyzJ6zkaOv1BZtI"  # Set your Gemini API key in environment variables
MODEL_NAME = "gemini-2.0-flash"  # Adjust based on available Gemini models
CHROMA_PATH = "chroma_db"
DOCS_DIRECTORY = "./documents"  # Directory containing your PDF and Word files

# Initialize Gemini API
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(MODEL_NAME)

# Initialize Chroma client
client = chromadb.PersistentClient(path=CHROMA_PATH)
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
collection = client.get_or_create_collection(name="documents", embedding_function=embedding_function)

def extract_text_from_pdf(file_path):
    """Extract text from a PDF file."""
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""
            return text.strip()
    except Exception as e:
        print(f"Error reading PDF {file_path}: {e}")
        return ""

def extract_text_from_docx(file_path):
    """Extract text from a Word document."""
    try:
        doc = Document(file_path)
        text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
        return text.strip()
    except Exception as e:
        print(f"Error reading Word document {file_path}: {e}")
        return ""

def load_documents_from_directory(directory):
    """Load and extract text from all PDF and Word files in the directory."""
    sample_documents = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.lower().endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
            if text:
                sample_documents.append(text)
                print(f"Loaded PDF: {filename}")
        elif filename.lower().endswith((".docx", ".doc")):
            text = extract_text_from_docx(file_path)
            if text:
                sample_documents.append(text)
                print(f"Loaded Word: {filename}")
    return sample_documents

def index_documents(documents):
    """Index documents into Chroma vector store."""
    doc_ids = [str(uuid.uuid4()) for _ in documents]
    collection.add(
        documents=documents,
        ids=doc_ids
    )
    print(f"Indexed {len(documents)} documents.")

def retrieve_documents(query, n_results=2):
    """Retrieve relevant documents from Chroma based on query."""
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    return results['documents'][0]

def generate_response(query, context_docs):
    """Generate a response using Gemini API with retrieved context."""
    context = "\n".join(context_docs)
    prompt = f"""
    You are a helpful assistant. Use the following context to answer the query.
    Context:
    {context}

    Query: {query}
    Answer:
    """
    response = model.generate_content(prompt)
    return response.text

def rag_query(query):
    """Main RAG pipeline: retrieve and generate."""
    # Retrieve relevant documents
    retrieved_docs = retrieve_documents(query)
    # Generate response with context
    response = generate_response(query, retrieved_docs)
    return {
        "query": query,
        "retrieved_documents": retrieved_docs,
        "response": response
    }

if __name__ == "__main__":
    # Load documents from directory
    sample_documents = load_documents_from_directory(DOCS_DIRECTORY)

    if not sample_documents:
        print("No documents loaded. Please check the directory and file formats.")
    else:
        # Index documents
        index_documents(sample_documents)

        # Example query
        query = "can you explain in detail types of polio virus"
        result = rag_query(query)

        print(f"Query: {result['query']}")
        print("Retrieved Documents:")
        for doc in result['retrieved_documents']:
            print(f"- {doc[:100]}...")  # Truncate for readability
        print(f"Response: {result['response']}")

Loaded PDF: polio-the-beginning-of-the-end.pdf
Indexed 1 documents.
Query: can you explain in detail types of polio virus
Retrieved Documents:
- POLIO
The beginning of the end
 • World Health Organization •
Geneva 1997
POLIO
The
beginning
of the...
- POLIO
The beginning of the end
 • World Health Organization •
Geneva 1997
POLIO
The
beginning
of the...
Response: There are three related enteroviruses that cause polio: poliovirus type 1, type 2, or type 3. Polio infection provides lifelong immunity to the disease, but the protection is limited to the particular type of poliovirus involved and fails to provide cross-protection against the other two types of poliovirus.
