# Part 1: Data Selection & Processing
## Medical Knowledge RAG System - Data Preprocessing Notebook

In [None]:
import os
import pandas as pd
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import matplotlib.pyplot as plt
import numpy as np

## 1. Data Loading and Exploration

In [None]:
# Load the medical PDF. Each element of `docs` is treated as one page by the
# rest of the notebook (page counts, per-page length statistics).
loader = PyMuPDFLoader("../data/Medical_book.pdf")
docs = loader.load()

# Fail early with a clear message: the preview below indexes docs[0], and the
# later statistics/chunking cells are meaningless without any content.
if not docs:
    raise ValueError("No pages were loaded from ../data/Medical_book.pdf")

print(f"Loaded {len(docs)} pages from medical book")
print(f"First page preview: {docs[0].page_content[:500]}...")

## 2. Text Preprocessing and Analysis

In [None]:
# Character count of every page, plus overall totals
page_lengths = [len(page.page_content) for page in docs]
total_chars = sum(page_lengths)
avg_page_length = np.mean(page_lengths)

# Report the summary figures one per line
print(f"Total characters: {total_chars:,}")
print(f"Average page length: {avg_page_length:.0f} characters")
print(f"Min page length: {min(page_lengths)}")
print(f"Max page length: {max(page_lengths)}")

# Histogram of page lengths, drawn through the object-oriented Matplotlib API
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(page_lengths, bins=20, alpha=0.7)
ax.set_xlabel('Page Length (characters)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Page Lengths')
plt.show()

## 3. Text Chunking Strategy

In [None]:
# Grid of splitter settings to compare: every chunk size is paired with every
# overlap ratio (the overlap is a fraction of the chunk size).
chunk_sizes = [300, 500, 800, 1000]
overlap_ratios = [0.1, 0.2, 0.3]


def _evaluate_chunking(size, ratio):
    """Split ``docs`` with one size/overlap setting and summarize the outcome.

    Returns a dict holding the settings used, the number of chunks produced
    and the mean chunk length in characters.
    """
    overlap_chars = int(size * ratio)  # truncate to whole characters
    pieces = RecursiveCharacterTextSplitter(
        chunk_size=size,
        chunk_overlap=overlap_chars,
    ).split_documents(docs)
    return {
        'chunk_size': size,
        'overlap_ratio': ratio,
        'overlap': overlap_chars,
        'num_chunks': len(pieces),
        'avg_chunk_length': np.mean([len(piece.page_content) for piece in pieces]),
    }


# One result row per (chunk size, overlap ratio) pair, sizes varying slowest
chunking_results = [
    _evaluate_chunking(size, ratio)
    for size in chunk_sizes
    for ratio in overlap_ratios
]

# Tabulate the comparison
chunking_df = pd.DataFrame(chunking_results)
print("Chunking Strategy Analysis:")
print(chunking_df)

## 4. Applying the Selected Chunking Strategy

In [None]:
# Apply the chosen configuration: 500-character chunks with a 50-character
# (10%) overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)
print(f"Created {len(chunks)} chunks")

# Preview the first three chunks: their length and first 200 characters
print("\nSample chunks:")
for position, sample in enumerate(chunks[:3], start=1):
    text = sample.page_content
    print(f"Chunk {position} (length: {len(text)}):")
    print(f"{text[:200]}...\n")

## 5. Embedding Generation and Vector Store Creation

In [None]:
# Load the sentence-embedding model through the HuggingFaceEmbeddings wrapper
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
print("Embedding model loaded successfully")

# Embed a throwaway query purely to read off the vector length
probe_vector = embeddings.embed_query('test')
print(f"Embedding dimension: {len(probe_vector)}")

In [None]:
# Embed every chunk and store the vectors in a Chroma database on disk
print("Creating vector database...")

persist_dir = "../chroma_db"
# NOTE(review): if persist_dir already holds a collection from an earlier run,
# this presumably adds to it rather than replacing it — confirm before re-running.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=persist_dir,
)

# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call — confirm against the installed version.
db.persist()
print(f"Vector database created with {len(chunks)} document chunks")

## 6. Vector Database Validation

In [None]:
# Smoke-test retrieval: run a few sample medical questions against the vector
# store and print the start of each returned chunk for a manual sanity check.
test_queries = [
    "What are the symptoms of diabetes?",
    "How is blood pressure measured?",
    "What causes heart disease?"
]

# Return the 3 best-matching chunks per query
retriever = db.as_retriever(search_kwargs={"k": 3})

for query in test_queries:
    print(f"\nQuery: {query}")
    # invoke() is the retriever entry point that supersedes the deprecated
    # get_relevant_documents(); it returns the same list of documents.
    results = retriever.invoke(query)

    for i, doc in enumerate(results):
        print(f"Result {i+1}: {doc.page_content[:150]}...")
    print("-" * 50)

## Summary

This notebook demonstrates:
1. **Data Loading**: Successfully loaded medical PDF content
2. **Preprocessing**: Analyzed document structure and content
3. **Chunking Strategy**: Compared multiple chunk sizes and overlap ratios, then applied 500-character chunks with 10% overlap
4. **Embeddings**: Generated vector representations using the `sentence-transformers/all-MiniLM-L6-v2` model
5. **Vector Storage**: Created persistent ChromaDB for efficient retrieval
6. **Validation**: Tested retrieval functionality with sample queries

The processed embeddings are now ready for the RAG pipeline!