In [15]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API key for the LM configured in
# the next cell -- confirm which variables are expected.
from dotenv import load_dotenv
load_dotenv()

True

In [16]:
import dspy
from pydantic import BaseModel, Field
import logging

# Configure root logging at INFO and create the logger used by the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Raise the threshold of the LLM/HTTP client loggers to WARNING so their
# lower-level records do not drown out this notebook's own messages.
logging.getLogger("litellm").setLevel(logging.WARNING)
logging.getLogger("LiteLLM").setLevel(logging.WARNING)  # logger names are case-sensitive, hence both spellings
logging.getLogger("httpx").setLevel(logging.WARNING)

# Register the language model used by the DSPy modules created later on.
# NOTE(review): cache=False presumably disables response caching, so every run
# re-queries the model -- confirm against the DSPy documentation.
dspy.configure(lm=dspy.LM("gemini/gemini-2.5-flash", max_tokens=20000, cache=False))

In [17]:
import dspy
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from pathlib import Path
import json
import re
import frontmatter
from enum import Enum

# =============================================================================
# Data Models (Simplified)
# =============================================================================

# Categories a document can be classified into. The class mixes in `str`, so
# each member is also a plain string equal to its value. Used as the output
# type of DocumentClassifier.doc_type and as a field of DocumentAnalysis.
class DocumentType(str, Enum):
    REFERENCE = "reference"
    GUIDE = "guide"  # also the fallback when classification fails (see DocumentAnalyzer)
    API = "api"
    EXAMPLE = "example"
    OVERVIEW = "overview"
    CONFIG = "configuration"  # note: member name and value differ
    TROUBLESHOOTING = "troubleshooting"
    CHANGELOG = "changelog"

# Difficulty rating assigned to a document. String-valued like DocumentType;
# used as the output type of DocumentClassifier.complexity_level.
class ComplexityLevel(str, Enum):
    BEGINNER = "beginner"
    INTERMEDIATE = "intermediate"  # also the fallback when classification fails
    ADVANCED = "advanced"

# Result record returned by DocumentAnalyzer.analyze_document; every field is
# required (no defaults are declared).
class DocumentAnalysis(BaseModel):
    """Simplified document analysis result"""
    file_path: str                  # path exactly as passed to analyze_document
    title: str                      # front-matter / H1 / filename title, or the LLM-provided one
    doc_type: DocumentType          # classifier output, GUIDE on failure
    complexity_level: ComplexityLevel  # classifier output, INTERMEDIATE on failure
    key_concepts: List[str]         # parsed from ConceptExtractor output; [] on failure
    learning_objectives: List[str]  # parsed from ConceptExtractor output; [] on failure
    semantic_summary: str           # SemanticAnalyzer output, or a generic sentence on failure
    code_languages: List[str]       # fence languages found in the document (or LLM-provided)
    headings: List[str]             # headings formatted as "<hashes> <text>" (or LLM-provided)
    prerequisites: List[str]        # parsed from SemanticAnalyzer output; [] on failure
    related_topics: List[str]       # parsed from SemanticAnalyzer output; [] on failure

In [18]:
# =============================================================================
# Agent Signatures (Simplified)
# =============================================================================

# DSPy signature wrapped by DocumentAnalyzer.metadata_extractor. It is only
# invoked when the regex/front-matter pass leaves a title, headings or code
# languages empty.
# NOTE(review): DSPy appears to use the class docstring and the `desc` strings
# as prompt text, so treat them as runtime behaviour when editing -- confirm.
class BasicMetadataExtractor(dspy.Signature):
    """Extract basic metadata from document content"""
    # Inputs
    content: str = dspy.InputField(desc="Raw document content")
    filename: str = dspy.InputField(desc="Document filename")
    
    # Outputs -- the two list outputs are declared as `str` and decoded by the
    # caller with safe_json_parse.
    title: str = dspy.OutputField(desc="Document title")
    headings: str = dspy.OutputField(desc="JSON list of document headings")
    code_languages: str = dspy.OutputField(desc="JSON list of programming languages found")

# DSPy signature wrapped by DocumentAnalyzer.classifier (step 2 of the
# pipeline). Both outputs are typed with the enums defined above.
# NOTE(review): DSPy appears to use the class docstring and the `desc` strings
# as prompt text, so treat them as runtime behaviour when editing -- confirm.
class DocumentClassifier(dspy.Signature):
    """Classify document type and complexity level"""
    # Inputs -- the caller truncates `content` and `overview_context` with
    # get_n_words before passing them in.
    content: str = dspy.InputField(desc="Document content")
    title: str = dspy.InputField(desc="Document title")
    overview_context: str = dspy.InputField(desc="Overview context from the repository. This should provide a high-level overview of the project.")
    
    # Outputs
    doc_type: DocumentType = dspy.OutputField(desc="Document type classification")
    complexity_level: ComplexityLevel = dspy.OutputField(desc="Complexity level assessment")

# DSPy signature wrapped by DocumentAnalyzer.concept_extractor (step 3).
# NOTE(review): DSPy appears to use the class docstring and the `desc` strings
# as prompt text, so treat them as runtime behaviour when editing -- confirm.
class ConceptExtractor(dspy.Signature):
    """Extract key concepts and learning objectives from document"""
    # Inputs -- `doc_type` is passed as the enum's string value by the caller.
    content: str = dspy.InputField(desc="Document content")
    doc_type: str = dspy.InputField(desc="Document type")
    title: str = dspy.InputField(desc="Document title")
    
    # Outputs -- declared as `str`; the caller decodes them with safe_json_parse.
    key_concepts: str = dspy.OutputField(desc="JSON list of 3-5 key concepts")
    learning_objectives: str = dspy.OutputField(desc="JSON list of learning objectives")

# DSPy signature wrapped by DocumentAnalyzer.semantic_analyzer (step 4).
# NOTE(review): DSPy appears to use the class docstring and the `desc` strings
# as prompt text, so treat them as runtime behaviour when editing -- confirm.
class SemanticAnalyzer(dspy.Signature):
    """Generate semantic summary and analyze relationships"""
    # Inputs -- the caller passes `key_concepts` as a JSON-encoded list
    # (json.dumps) and `doc_type` as the enum's string value.
    content: str = dspy.InputField(desc="Document content")
    key_concepts: str = dspy.InputField(desc="Key concepts found")
    doc_type: str = dspy.InputField(desc="Document type")
    
    # Outputs -- the two list outputs are declared as `str` and decoded by the
    # caller with safe_json_parse; the summary is used verbatim.
    semantic_summary: str = dspy.OutputField(desc="5-7 sentence semantic summary")
    prerequisites: str = dspy.OutputField(desc="JSON list of prerequisites")
    related_topics: str = dspy.OutputField(desc="JSON list of related topics")

In [19]:
# =============================================================================
# Helper Functions
# =============================================================================

def extract_basic_metadata(content: str, filepath: Path) -> Dict[str, Any]:
    """Extract title, headings and code languages without calling an LLM.

    Args:
        content: Raw document text, optionally starting with front matter.
        filepath: Path of the document; only ``filepath.name`` is used, as the
            last-resort source for the title.

    Returns:
        A dict with the keys ``title``, ``headings``, ``code_languages``
        (unique fence languages in order of first appearance, excluding
        ``text``/``txt``/empty), ``frontmatter`` (parsed metadata, ``{}`` when
        parsing fails) and ``clean_content`` (the body without front matter).
    """
    try:
        post = frontmatter.loads(content)
        frontmatter_data = post.metadata
        clean_content = post.content
    except Exception:
        # Best effort: malformed front matter must not abort the analysis, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        frontmatter_data = {}
        clean_content = content

    title = extract_title(clean_content, frontmatter_data, filepath.name)
    headings = extract_headings(clean_content)
    code_blocks = extract_code_blocks(clean_content)

    # Collect every distinct fence language. dict.fromkeys de-duplicates while
    # keeping first-seen order, so the result is stable between runs (a set's
    # iteration order is not).
    non_languages = {'text', 'txt', ''}
    code_languages = list(dict.fromkeys(
        block['language']
        for block in code_blocks
        if block['language'] not in non_languages
    ))

    return {
        'title': title,
        'headings': headings,
        'code_languages': code_languages,
        'frontmatter': frontmatter_data,
        'clean_content': clean_content
    }

def extract_title(content: str, frontmatter_data: dict, filename: str) -> str:
    """Pick the document title from front matter, the first H1, or the filename.

    Args:
        content: Document body (front matter already removed).
        frontmatter_data: Parsed front-matter mapping; may be empty.
        filename: File name used as the last-resort title source.

    Returns:
        The first non-empty candidate, in this order: the front-matter
        ``title`` (converted to ``str`` and stripped), the text of the first
        ``# `` heading, or the filename with a trailing ``.md``/``.mdx``
        removed, ``_``/``-`` turned into spaces, and title-cased.
    """
    # YAML can yield None or a non-string (e.g. `title: 2024`); coerce instead
    # of crashing on .strip(), and fall through when the value is blank.
    fm_title = frontmatter_data.get('title')
    if fm_title is not None:
        fm_title = str(fm_title).strip()
        if fm_title:
            return fm_title

    h1_match = re.search(r'^# (.+)$', content, re.MULTILINE)
    if h1_match:
        return h1_match.group(1).strip()

    # Strip the extension as a suffix. Chained str.replace('.md', '') followed
    # by replace('.mdx', '') turned "prompts.mdx" into "promptsx".
    stem = re.sub(r'\.mdx?$', '', filename, flags=re.IGNORECASE)
    return stem.replace('_', ' ').replace('-', ' ').title().strip()

def extract_headings(content: str) -> List[str]:
    """Extract all ATX headings (``#`` to ``######``) from markdown content.

    Lines inside fenced code blocks are ignored, so shell or Python comments
    such as ``# install deps`` are not reported as headings. The hashes and
    the heading text must be on the same line.

    Args:
        content: Markdown text.

    Returns:
        Headings in document order, each formatted as ``"<hashes> <text>"``
        with the text stripped of surrounding whitespace.
    """
    headings = []
    open_fence = None  # marker (e.g. "```") of the code fence we are inside, if any

    for line in content.splitlines():
        fence_match = re.match(r'^[ \t]*(`{3,}|~{3,})', line)
        if fence_match:
            marker = fence_match.group(1)
            if open_fence is None:
                open_fence = marker
            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
                # Only a fence of the same character, at least as long as the
                # opener, closes the block.
                open_fence = None
            continue

        if open_fence is not None:
            continue

        # [ \t]+ rather than \s+: the separator must not span a newline, or a
        # bare "#" line would capture the next paragraph as its heading text.
        heading_match = re.match(r'^(#{1,6})[ \t]+(.+)$', line)
        if heading_match:
            hashes = heading_match.group(1)
            text = heading_match.group(2).strip()
            headings.append(f"{hashes} {text}")

    return headings

def extract_code_blocks(content: str) -> List[Dict[str, str]]:
    """Extract backtick-fenced code blocks with language information.

    Compared with a plain ``(\\w+)?\\n`` opener this accepts language tags
    containing ``+ # . -`` (``c++``, ``c#``, ``objective-c``), ignores the
    rest of the info string (``ts title="a.ts"``), and matches empty blocks.
    An opener that fails to match would otherwise be skipped and its closing
    fence paired with the next block.

    Args:
        content: Markdown text.

    Returns:
        One dict per block, in document order, with the keys ``language``
        (the first token of the info string, or ``'text'`` when absent) and
        ``content`` (the block body, stripped).
    """
    pattern = (
        r'^[ \t]*```[ \t]*([\w+#.-]+)?[^\n]*\n'  # opening fence + optional info string
        r'(.*?)'                                 # body (lazy, may be empty)
        r'^[ \t]*```[ \t]*$'                     # closing fence on its own line
    )
    code_blocks = []
    for match in re.finditer(pattern, content, re.DOTALL | re.MULTILINE):
        language = match.group(1) or 'text'
        code_content = match.group(2).strip()
        code_blocks.append({
            'language': language,
            'content': code_content
        })
    return code_blocks

def safe_json_parse(json_str: str, fallback: list = None) -> list:
    """Safely turn an LLM answer into a list, with a fallback.

    Decoding is attempted in this order:
      1. an input that is already a list is returned unchanged;
      2. the text (after removing a surrounding ``` fence) is parsed as JSON;
      3. the outermost ``[...]`` span inside the text is parsed as JSON;
      4. the text is split on commas.
    Valid JSON that is not a list yields ``fallback``.

    Args:
        json_str: Text expected to contain a JSON list (a list is also accepted).
        fallback: Returned when nothing usable is found; defaults to ``[]``.

    Returns:
        The decoded list, the comma-separated items, or ``fallback``.
    """
    if fallback is None:
        fallback = []
    if isinstance(json_str, list):
        return json_str
    if not isinstance(json_str, str):
        return fallback

    text = json_str.strip()
    # Unwrap ```json ... ``` so the fence markers never end up as list items.
    fenced = re.match(r'^```[\w-]*[ \t]*\n(.*?)\n?```$', text, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()
    if not text:
        return fallback

    candidates = [text]
    bracketed = re.search(r'\[.*\]', text, re.DOTALL)
    if bracketed and bracketed.group(0) != text:
        # Handles answers such as: Here are the concepts: ["a", "b"]
        candidates.append(bracketed.group(0))

    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except (TypeError, ValueError):  # json.JSONDecodeError subclasses ValueError
            continue
        return result if isinstance(result, list) else fallback

    # Last resort: treat the text as a comma-separated string.
    return [item.strip() for item in text.split(',') if item.strip()]


def get_n_words(text: str, n: int) -> str:
    """Return the leading ``n`` whitespace-separated words of ``text``.

    The words are re-joined with single spaces, so any run of whitespace in
    the input (including newlines) comes back as one space.
    """
    words = text.split()
    leading = words[:n]
    return ' '.join(leading)

In [20]:
# =============================================================================
# Main Document Analyzer
# =============================================================================

class DocumentAnalyzer(dspy.Module):
    """Simplified document analyzer with focused multi-agent approach.

    Each document goes through a regex/front-matter pass followed by up to
    four LLM steps (metadata, classification, concepts, semantics). Every LLM
    step is wrapped in its own try/except and degrades to a fallback value, so
    one failing step does not abort the analysis.
    """

    def __init__(self):
        """Create one ChainOfThought predictor per signature and set the word limits."""
        super().__init__()
        self.metadata_extractor = dspy.ChainOfThought(BasicMetadataExtractor)
        self.classifier = dspy.ChainOfThought(DocumentClassifier)
        self.concept_extractor = dspy.ChainOfThought(ConceptExtractor)
        self.semantic_analyzer = dspy.ChainOfThought(SemanticAnalyzer)

        # configs: limits are counted in words (see get_n_words), not characters
        self.max_overview_words = 10000
        self.max_content_words = 20000

    @staticmethod
    def _coerce_enum(value, enum_cls, default):
        """Map a predictor output onto a member of ``enum_cls``, or return ``default``."""
        if isinstance(value, enum_cls):
            return value
        raw = str(value).strip()
        try:
            return enum_cls(raw.lower())       # by value, e.g. "guide"
        except ValueError:
            pass
        try:
            return enum_cls[raw.upper()]       # by member name, e.g. "CONFIG"
        except KeyError:
            return default

    def analyze_document(self, file_path: str, overview_context: str = "") -> DocumentAnalysis:
        """Analyze a single document with multi-agent approach.

        Args:
            file_path: Path of the markdown/MDX file; read as UTF-8.
            overview_context: Project overview text handed to the classifier;
                truncated to ``max_overview_words`` words.

        Returns:
            A populated DocumentAnalysis. Steps that fail are logged and
            replaced by fallbacks (GUIDE / INTERMEDIATE, empty lists, a
            generic summary).

        Raises:
            OSError / UnicodeDecodeError: if the file cannot be read.
        """
        # Read document
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        filepath = Path(file_path)

        # Deterministic extraction first (front matter + regexes)
        basic_data = extract_basic_metadata(content, filepath)

        # Truncate once; every LLM step receives the same capped text.
        # `or ""` keeps a None overview from forcing the fallback classification.
        llm_content = get_n_words(content, self.max_content_words)
        llm_overview = get_n_words(overview_context or "", self.max_overview_words)

        # Start from the basic results; they stay in place unless the LLM
        # metadata step runs and succeeds.
        title = basic_data['title']
        headings = basic_data['headings']
        code_languages = basic_data['code_languages']

        # Step 1: Enhanced metadata extraction (only if basic data is incomplete)
        needs_enhanced_extraction = not (title and headings and code_languages)

        if needs_enhanced_extraction:
            try:
                metadata_result = self.metadata_extractor(
                    content=llm_content,
                    filename=filepath.name
                )

                # Merge with basic extraction
                merged_title = metadata_result.title or title
                merged_headings = safe_json_parse(metadata_result.headings, headings)
                merged_languages = safe_json_parse(metadata_result.code_languages, code_languages)
            except Exception as e:
                # Fallback to basic extraction (values assigned above)
                logger.error(f"Error in metadata extraction: {e}")
            else:
                title = merged_title
                headings = merged_headings
                code_languages = merged_languages

        # Step 2: Classification
        try:
            classification_result = self.classifier(
                content=llm_content,
                title=title,
                overview_context=llm_overview
            )
            # Coerce to the enums: a plain-string output would otherwise make
            # `.value` raise in steps 3 and 4 and silently empty their results.
            doc_type = self._coerce_enum(
                classification_result.doc_type, DocumentType, DocumentType.GUIDE
            )
            complexity_level = self._coerce_enum(
                classification_result.complexity_level, ComplexityLevel, ComplexityLevel.INTERMEDIATE
            )
        except Exception as e:
            logger.error(f"Error in classification: {e}")
            # Fallback classification
            doc_type = DocumentType.GUIDE
            complexity_level = ComplexityLevel.INTERMEDIATE

        # Step 3: Concept extraction
        try:
            concept_result = self.concept_extractor(
                content=llm_content,
                doc_type=doc_type.value,
                title=title
            )
            key_concepts = safe_json_parse(concept_result.key_concepts)
            learning_objectives = safe_json_parse(concept_result.learning_objectives)
        except Exception as e:
            logger.error(f"Error in concept extraction: {e}")
            key_concepts = []
            learning_objectives = []

        # Step 4: Semantic analysis
        try:
            semantic_result = self.semantic_analyzer(
                content=llm_content,
                key_concepts=json.dumps(key_concepts),
                doc_type=doc_type.value
            )
            semantic_summary = semantic_result.semantic_summary
            prerequisites = safe_json_parse(semantic_result.prerequisites)
            related_topics = safe_json_parse(semantic_result.related_topics)
        except Exception as e:
            logger.error(f"Error in semantic analysis: {e}")
            semantic_summary = f"Documentation about {title}"
            prerequisites = []
            related_topics = []

        # Create analysis result
        return DocumentAnalysis(
            file_path=str(file_path),  # tolerate a Path argument
            title=title,
            doc_type=doc_type,
            complexity_level=complexity_level,
            key_concepts=key_concepts,
            learning_objectives=learning_objectives,
            semantic_summary=semantic_summary,
            code_languages=code_languages,
            headings=headings,
            prerequisites=prerequisites,
            related_topics=related_topics
        )

    def analyze_batch(self, file_paths: List[str], overview_context: str = "") -> List[DocumentAnalysis]:
        """Analyze multiple documents, skipping (and logging) those that fail.

        Args:
            file_paths: Paths of the documents to analyze, processed in order.
            overview_context: Passed unchanged to every analyze_document call.

        Returns:
            The analyses of the documents that succeeded, in input order.
        """
        results = []

        for file_path in file_paths:
            try:
                analysis = self.analyze_document(file_path, overview_context)
                results.append(analysis)
                logger.info(f"✓ Analyzed: {Path(file_path).name}")
            except Exception as e:
                logger.error(f"✗ Failed to analyze {Path(file_path).name}: {e}")

        return results

In [21]:
# Load the architecture page to use as project-level overview context.
# Read explicitly as UTF-8 (matching analyze_document) so the result does not
# depend on the platform's default encoding.
# NOTE(review): absolute, machine-specific path -- consider a configurable docs directory.
with open('/Users/arshath/play/naptha/tutor/.cache/modelcontextprotocol_docs_9b06b34c6341a02b233055dc593dd641/docs/concepts/architecture.mdx', 'r', encoding='utf-8') as f:
    overview_content = f.read()

In [22]:
# Instantiate the analyzer; __init__ builds the four ChainOfThought predictors.
document_analyzer = DocumentAnalyzer()

In [23]:
# Analyze a single page, giving the classifier the architecture overview that
# was loaded into `overview_content` in an earlier cell.
test_file = '/Users/arshath/play/naptha/tutor/.cache/modelcontextprotocol_docs_9b06b34c6341a02b233055dc593dd641/docs/concepts/prompts.mdx'
doc_analysis = document_analyzer.analyze_document(
    file_path=test_file,
    # Pass the variable, not the literal string "overview_content"; the
    # literal gave the classifier a meaningless 16-character context.
    overview_context=overview_content
)

In [24]:
# Show the analysis as a plain dict so every field can be inspected.
doc_analysis.model_dump()

{'file_path': '/Users/arshath/play/naptha/tutor/.cache/modelcontextprotocol_docs_9b06b34c6341a02b233055dc593dd641/docs/concepts/prompts.mdx',
 'title': 'Prompts',
 'doc_type': <DocumentType.GUIDE: 'guide'>,
 'complexity_level': <ComplexityLevel.INTERMEDIATE: 'intermediate'>,
 'key_concepts': ['Prompt Templates',
  'Prompt Structure and Arguments',
  'Client-Server Interaction for Prompts',
  'Dynamic Prompt Capabilities',
  'Prompt Security and Best Practices'],
 'learning_objectives': ['Define what prompts are within the Model Context Protocol (MCP) and explain their primary purpose in standardizing LLM interactions.',
  "Describe the essential components of a prompt's structure, including its name, description, and the role of dynamic arguments.",
  'Explain the client-server interaction mechanisms for discovering available prompts (`prompts/list`) and executing specific prompts (`prompts/get`).',
  'Illustrate how prompts can be dynamic by incorporating embedded resource context and