# File Structure Functionality Demonstration

In [1]:
import sys

# sys.path.append("/Users/sravanj/project_work/yellhorn-mcp/yellhorn-mcp")

In [2]:
print("\n".join(sys.path))

/opt/anaconda3/envs/sravan-yellhorn/lib/python311.zip
/opt/anaconda3/envs/sravan-yellhorn/lib/python3.11
/opt/anaconda3/envs/sravan-yellhorn/lib/python3.11/lib-dynload

/opt/anaconda3/envs/sravan-yellhorn/lib/python3.11/site-packages
/Users/sravanj/project_work/yellhorn-mcp


In [8]:
from yellhorn_mcp.server import get_codebase_snapshot
from pathlib import Path

In [28]:
repo_path = Path("/Users/sravanj/project_work/yellhorn-mcp")

user_task = "Debug MCP"

# Get file paths from codebase snapshot
# The get_codebase_snapshot already respects .gitignore patterns by default
# This will give us only tracked and untracked files that aren't ignored by git
file_paths, _ = await get_codebase_snapshot(repo_path, _mode="paths")

if not file_paths:
    raise YellhornMCPError("No files found in repository to analyze")

Found .yellhornignore file, using it for filtering
Filtering codebase with 14 blacklist and 2 whitelist patterns from .yellhornignore
Filtered from 44 to 9 files


In [None]:
import os

# Assume dir_chunk is your list of directory paths (e.g. ['.', 'src', 'tests', …])
# and file_paths is the full list of files (e.g. ['app.py', 'src/main.py', 'tests/test_main.py', …])



# Now you can inject `directory_tree` into your prompt
print(directory_tree)

top_directory
	.yellhornignore
	pyproject.toml
.claude
	settings.local.json
notebooks
	file_structure.ipynb
yellhorn_mcp
	__init__.py
	cli.py
	lsp_utils.py
	server.py
	tree_utils.py


In [29]:
# Extract and analyze directories from filtered files
all_dirs = set()
for file_path in file_paths:
    # Get all parent directories of this file
    parts = file_path.split('/')
    for i in range(1, len(parts)):
        dir_path = '/'.join(parts[:i])
        if dir_path:  # Skip empty strings
            all_dirs.add(dir_path)

# Add root directory ('.') if there are files at the root level
if any('/' not in f for f in file_paths):
    all_dirs.add('.')
    
# Sort directories for consistent output
sorted_dirs = sorted(list(all_dirs))

# Set chunk size based on reasoning mode
chunk_size = 3000  # Process more files per chunk for file structure mode
    
# Calculate number of chunks needed
total_chunks = (len(sorted_dirs) + chunk_size - 1) // chunk_size  # Ceiling division

# Create chunks of directories
dir_chunks = []
for i in range(0, len(sorted_dirs), chunk_size):
    dir_chunks.append(sorted_dirs[i:i + chunk_size])


In [None]:
# Track important directories
all_important_dirs = set()

# Helper function to process a single chunk
async def process_chunk(chunk_idx, dir_chunk):    
    lines = []
    for dir_path in dir_chunk:
        # Choose a nicer label for the root directory
        dir_label = 'top_directory' if dir_path == '.' else dir_path
        lines.append(dir_label)

        # Gather up to 5 direct children files of this directory
        if dir_path == '.':
            dir_files = [f for f in file_paths if '/' not in f]
        else:
            prefix = dir_path.rstrip('/') + '/'
            dir_files = [
                f for f in file_paths 
                if f.startswith(prefix) and '/' not in f[len(prefix):]
            ]
        samples = dir_files[:5]

        # Append each sample file under the directory, indented with a tab
        for f in samples:
            lines.append(f"\t{os.path.basename(f)}")

    # Final single representation
    directory_tree = "\n".join(lines)
    
    # Construct the prompt for this chunk
    prompt = f"""You are an expert software developer tasked with analyzing a codebase structure to identify important directories for AI context.

<user_task>
{user_task}
</user_task>

Your goal is to identify the most important directories that should be included when an AI assistant analyzes this codebase for the user's task.

Below is a list of directories from the codebase (chunk {chunk_idx + 1} of {total_chunks}):

<directories>
{directory_tree}
</directories>

Analyze these directories and identify the ones that:
1. Contain core application code relevant to the user's task
2. Likely contain important business logic
3. Would be essential for understanding the codebase architecture
4. Are needed to implement the requested task

Ignore directories that:
1. Contain only build artifacts or generated code
2. Store dependencies or vendor code
3. Contain temporary or cache files
4. Probably aren't relevant to the user's specific task

Return your analysis as a list of important directories, one per line, in this format:

```context
dir1
dir2
dir3
```

Don't include explanations for your choices, just return the list in the specified format.
"""
    print(prompt)
    
    # Call the appropriate AI model based on type
    # is_openai_model = model.startswith("gpt-") or model.startswith("o")
    
    
#     chunk_important_dirs = set()
    
#     try:
#         if is_openai_model:
#             if not openai_client:
#                 raise YellhornMCPError("OpenAI client not initialized. Is OPENAI_API_KEY set?")
                
#             # Convert the prompt to OpenAI messages format
#             messages = [{"role": "user", "content": prompt}]
            
#             # Call OpenAI API
#             response = await openai_client.chat.completions.create(
#                 model=model,
#                 messages=messages,
#             )
            
#             # Extract content
#             chunk_result = response.choices[0].message.content
#         else:
#             if gemini_client is None:
#                 raise YellhornMCPError("Gemini client not initialized. Is GEMINI_API_KEY set?")
            
#             # Call Gemini API
#             response = await gemini_client.aio.models.generate_content(model=model, contents=prompt)
#             chunk_result = response.text
        
#         # Extract directory paths from the result
#         in_context_block = False
#         for line in chunk_result.split('\n'):
#             line = line.strip()
            
#             if line == "```context":
#                 in_context_block = True
#                 continue
#             elif line == "```" and in_context_block:
#                 in_context_block = False
#                 continue
            
#             if in_context_block and line and not line.startswith('#'):
#                 chunk_important_dirs.add(line)
        
#         # If we didn't find a context block, try to extract directories directly
#         if not chunk_important_dirs and not in_context_block:
#             for line in chunk_result.split('\n'):
#                 line = line.strip()
#                 # Only add if it looks like a directory path (no spaces, existing in our list)
#                 if line and ' ' not in line and line in dir_chunk:
#                     chunk_important_dirs.add(line)
        
#         # Log the directories found
#         dirs_str = ", ".join(sorted(list(chunk_important_dirs))[:5])
#         if len(chunk_important_dirs) > 5:
#             dirs_str += f", ... ({len(chunk_important_dirs) - 5} more)"
        
#         await ctx.log(
#             level="info",
#             message=f"Chunk {chunk_idx + 1} processed, found {len(chunk_important_dirs)} important directories: {dirs_str}"
#         )
        
#     except Exception as chunk_error:
#         await ctx.log(
#             level="error", 
#             message=f"Error processing chunk {chunk_idx + 1}: {str(chunk_error)} ({type(chunk_error).__name__})"
#         )
#         # Continue with next chunk despite errors
    
#     # Return results from this chunk
#     return chunk_important_dirs

# # Use semaphore to limit concurrency to 5 parallel calls
# semaphore = asyncio.Semaphore(5)

# async def bounded_process_chunk(chunk_idx, dir_chunk):
#     async with semaphore:
#         return await process_chunk(chunk_idx, dir_chunk)

# # If we only have one chunk, process it directly
# if len(dir_chunks) == 1:
#     important_dirs = await process_chunk(0, dir_chunks[0])
#     all_important_dirs.update(important_dirs)
# else:
#     # Create tasks for all chunks
#     tasks = []
#     for chunk_idx, dir_chunk in enumerate(dir_chunks):
#         task = asyncio.create_task(bounded_process_chunk(chunk_idx, dir_chunk))
#         tasks.append(task)
    
#     # Wait for all tasks to complete and collect results
#     await ctx.log(level="info", message=f"Waiting for {len(tasks)} parallel LLM tasks to complete")
#     completed_tasks = await asyncio.gather(*tasks, return_exceptions=True)
    
#     # Process results
#     for result in completed_tasks:
#         if isinstance(result, Exception):
#             # Log the exception but continue
#             await ctx.log(level="error", message=f"Parallel task failed: {str(result)}")
#             continue
            
#         # Update our important directories collection
#         all_important_dirs.update(result)

# # If we didn't get any important directories, include all directories
# if not all_important_dirs:
#     await ctx.log(
#         level="warning",
#         message="No important directories identified, including all directories"
#     )
#     all_important_dirs = set(sorted_dirs)

# await ctx.log(
#     level="info", 
#     message=f"Processing complete, identified {len(all_important_dirs)} important directories"
# )
        
# # Generate the final .yellhorncontext file content with comments
# final_content = "# Yellhorn Context File - AI context optimization\n"
# final_content += f"# Generated by yellhorn-mcp curate_context tool\n"
# final_content += f"# Based on task: {user_task}\n\n"

# # If we have a .yellhornignore file, include its patterns first
# if has_ignore_file and (ignore_patterns or whitelist_patterns):
#     final_content += "# Patterns from .yellhornignore file\n"
    
#     # Include blacklist patterns from .yellhornignore
#     if ignore_patterns:
#         final_content += "# Files and directories to exclude (blacklist)\n"
#         final_content += "\n".join(sorted(ignore_patterns)) + "\n\n"
        
#     # Include whitelist patterns from .yellhornignore  
#     if whitelist_patterns:
#         final_content += "# Explicitly included patterns (whitelist)\n"
#         final_content += "\n".join("!" + pattern for pattern in sorted(whitelist_patterns)) + "\n\n"

# # Sort directories for consistent output
# sorted_important_dirs = sorted(list(all_important_dirs))

# # Add section for task-specific directory context
# final_content += "# Task-specific directories for AI context\n"

# # Convert important directories to explicit include patterns (with trailing slash for directories)
# if sorted_important_dirs:
#     final_content += "# Important directories to specifically include\n"
#     dir_includes = []
#     for dir_path in sorted_important_dirs:
#         # Add trailing slash for clarity that it's a directory pattern
#         if dir_path == '.':
#             # Root directory is a special case
#             dir_includes.append("!./")
#         else:
#             dir_includes.append(f"!{dir_path}/")
    
#     final_content += "\n".join(dir_includes) + "\n\n"

# # Add a section recommending to blacklist everything else except the important directories
# final_content += "# Recommended: blacklist everything else (uncomment to enable)\n"
# final_content += "# **/*\n"

In [38]:
await process_chunk(0, dir_chunks[0])

You are an expert software developer tasked with analyzing a codebase structure to identify important directories for AI context.

<user_task>
Debug MCP
</user_task>

Your goal is to identify the most important directories that should be included when an AI assistant analyzes this codebase for the user's task.

Below is a list of directories from the codebase (chunk 1 of 1):

<directories>
top_directory
	.yellhornignore
	pyproject.toml
.claude
	settings.local.json
notebooks
	file_structure.ipynb
yellhorn_mcp
	__init__.py
	cli.py
	lsp_utils.py
	server.py
	tree_utils.py
</directories>

Analyze these directories and identify the ones that:
1. Contain core application code relevant to the user's task
2. Likely contain important business logic
3. Would be essential for understanding the codebase architecture
4. Are needed to implement the requested task

Ignore directories that:
1. Contain only build artifacts or generated code
2. Store dependencies or vendor code
3. Contain temporary or ca

NameError: name 'model' is not defined

# LSP Prompt Formatting

In [42]:
repo_path = Path("/Users/sravanj/project_work/yellhorn-mcp")

from yellhorn_mcp.lsp_utils import (
    get_lsp_snapshot,
)
from yellhorn_mcp.server import format_codebase_for_prompt

# Get LSP snapshot (signatures only)
file_paths, file_contents = await get_lsp_snapshot(repo_path)

Found .yellhornignore file, using it for filtering
Filtering codebase with 14 blacklist and 2 whitelist patterns from .yellhornignore
Filtered from 45 to 9 files


In [43]:
file_contents

{'yellhorn_mcp/cli.py': 'def main()  # Run the Yellhorn MCP server as a standalone command.',
 'yellhorn_mcp/lsp_utils.py': 'def extract_python_api(file_path: Path) -> list[str]  # Extract Python API (function and class signatures with docstrings) from a file.\ndef extract_go_api(file_path: Path) -> list[str]  # Extract Go API (function, type, interface signatures, struct fields) from a file.\nasync def get_lsp_snapshot(repo_path: Path) -> tuple[list[str], dict[str, str]]  # Get an LSP-style snapshot of the codebase, extracting API information.\nasync def update_snapshot_with_full_diff_files(repo_path: Path, base_ref: str, head_ref: str, file_paths: list[str], file_contents: dict[str, str]) -> tuple[list[str], dict[str, str]]  # Update an LSP snapshot with full contents of files included in a diff.',
 'yellhorn_mcp/server.py': 'def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float | None  # Calculates the estimated cost for a model API call.\ndef format_metrics

In [113]:
import os
from pathlib import Path
from typing import List, Dict

async def format_codebase_for_prompt(
    file_paths: List[str],
    file_contents: Dict[str, str]
) -> str:
    """
    Format the codebase information for inclusion in the prompt.

    Args:
        file_paths: List of file paths.
        file_contents: Dictionary mapping file paths to contents.

    Returns:
        Formatted string with codebase tree and inlined file contents.
    """
    # 1. Gather unique directories (including root as '.')
    dirs = set(Path(fp).parent.as_posix() for fp in file_paths)
    # ensure root appears
    dirs.add('.')
    # sort so root comes first, then lexicographically
    dir_list = sorted(dirs, key=lambda d: (d != '.', d))

    lines: List[str] = []
    for dir_path in dir_list:
        # pretty label
        label = 'top_directory' if dir_path == '.' else dir_path
        lines.append(label)

        # find files directly in this directory
        if dir_path == '.':
            dir_files = [f for f in file_paths if '/' not in f]
        else:
            prefix = dir_path.rstrip('/') + '/'
            dir_files = [
                f for f in file_paths
                if f.startswith(prefix) and '/' not in f[len(prefix):]
            ]

        for fp in sorted(dir_files):
            name = os.path.basename(fp)
            lines.append(f"\t{name}")
            # inline the file’s contents right after its name
            content = file_contents.get(fp, "").rstrip()
            if content:
                # decide syntax highlighting by extension
                ext = Path(fp).suffix.lstrip('.')
                lang = ext or 'text'
                # indent each line of content by one more tab
                indented = "\n".join("\t\t" + l for l in content.splitlines())
                lines.append(f"\t\t```{lang}\n{indented}\n\t\t```")

    codebase_contents = "\n".join(lines)
    return f"""<codebase_tree>
{codebase_contents}
</codebase_tree>"""

In [52]:
codebase_info = await format_codebase_for_prompt(file_paths, file_contents)
print(codebase_info)


<codebase_tree>
top_directory
	.yellhornignore
	pyproject.toml
.claude
	settings.local.json
notebooks
	file_structure.ipynb
yellhorn_mcp
	__init__.py
	cli.py
		```py
		def main()  # Run the Yellhorn MCP server as a standalone command.
		```
	lsp_utils.py
		```py
		def extract_python_api(file_path: Path) -> list[str]  # Extract Python API (function and class signatures with docstrings) from a file.
		def extract_go_api(file_path: Path) -> list[str]  # Extract Go API (function, type, interface signatures, struct fields) from a file.
		async def get_lsp_snapshot(repo_path: Path) -> tuple[list[str], dict[str, str]]  # Get an LSP-style snapshot of the codebase, extracting API information.
		async def update_snapshot_with_full_diff_files(repo_path: Path, base_ref: str, head_ref: str, file_paths: list[str], file_contents: dict[str, str]) -> tuple[list[str], dict[str, str]]  # Update an LSP snapshot with full contents of files included in a diff.
		```
	server.py
		```py
		def calculate_cost(m

In [1]:
from yellhorn_mcp.lsp_utils import get_lsp_snapshot
from yellhorn_mcp.server import format_codebase_for_prompt
from pathlib import Path

repo_path = Path("/Users/sravanj/project_work/yellhorn-mcp")

file_paths, file_contents = await get_lsp_snapshot(repo_path)
# For lsp mode, format with tree and LSP file contents
codebase_info = await format_codebase_for_prompt(file_paths, file_contents)

Found .yellhornignore file, using it for filtering
Filtering codebase with 14 blacklist and 2 whitelist patterns from .yellhornignore
Filtered from 42 to 8 files


In [2]:
print(codebase_info)

<codebase_tree>
top_directory
	.yellhornignore
	pyproject.toml
.claude
	settings.local.json
notebooks
	file_structure.ipynb
yellhorn_mcp
	__init__.py
	cli.py
		```py
		def main()  # Run the Yellhorn MCP server as a standalone command.
		```
	lsp_utils.py
		```py
		def extract_python_api(file_path: Path) -> list[str]  # Extract Python API (function and class signatures with docstrings) from a file.
		def extract_go_api(file_path: Path) -> list[str]  # Extract Go API (function, type, interface signatures, struct fields) from a file.
		async def get_lsp_snapshot(repo_path: Path) -> tuple[list[str], dict[str, str]]  # Get an LSP-style snapshot of the codebase, extracting API information.
		async def update_snapshot_with_full_diff_files(repo_path: Path, base_ref: str, head_ref: str, file_paths: list[str], file_contents: dict[str, str]) -> tuple[list[str], dict[str, str]]  # Update an LSP snapshot with full contents of files included in a diff.
		```
	server.py
		```py
		def calculate_cost(m

# Inspect the File Filtering

In [None]:
repo_path = Path("/Users/sravanj/project_work/yellhorn-mcp")

In [78]:
from yellhorn_mcp.server import run_git_command
# import fnmatch
from fnmatch import fnmatch
from pathlib import Path

In [63]:
async def get_codebase_snapshot(repo_path: Path, _mode: str = "full", log_function = print) -> tuple[list[str], dict[str, str]]:
    # Get list of all tracked and untracked files (respects .gitignore by default)
    files_output = await run_git_command(repo_path, ["ls-files", "-c", "-o", "--exclude-standard"])
    file_paths = [f for f in files_output.split("\n") if f]

    # Priority order: .yellhorncontext overrides .yellhornignore
    yellhorncontext_path = repo_path / ".yellhorncontext"
    context_exists = yellhorncontext_path.exists() and yellhorncontext_path.is_file()

    yellhornignore_path = repo_path / ".yellhornignore"
    ignore_exists = yellhornignore_path.exists() and yellhornignore_path.is_file()

    # Initialize pattern lists
    ignore_patterns = []
    whitelist_patterns = []

    # First try .yellhorncontext, then fall back to .yellhornignore
    if context_exists:
        # Read patterns from .yellhorncontext
        with open(yellhorncontext_path, "r") as f:
            for line in f:
                pattern = line.strip()
                if pattern:
                    if pattern.startswith("!"):
                        whitelist_patterns.append(pattern[1:])
                    else:
                        ignore_patterns.append(pattern)
    elif ignore_exists:
        # Read patterns from .yellhornignore
        with open(yellhornignore_path, "r") as f:
            for line in f:
                pattern = line.strip()
                if pattern:
                    ignore_patterns.append(pattern)

    print(ignore_patterns)
    # Apply filtering with fnmatch
    if ignore_patterns or whitelist_patterns:
        def is_ignored(file_path: str) -> bool:
            # Check whitelist patterns first (take precedence)
            for pattern in whitelist_patterns:
                if fnmatch.fnmatch(file_path, pattern):
                    return False  # Don't ignore whitelisted files

            # Then check blacklist patterns
            for pattern in ignore_patterns:
                if fnmatch.fnmatch(file_path, pattern):
                    return True  # Ignore matching files
            return False

        # Filter files
        filtered_paths = [f for f in file_paths if not is_ignored(f)]
        file_paths = filtered_paths
    
    return file_paths

In [64]:
file_paths = await get_codebase_snapshot(repo_path)
print(file_paths)

['# Yellhorn Ignore File - AI-optimized patterns', '# Generated by yellhorn-mcp curate_ignore_file tool', '# Files and directories to exclude from AI context', '.github/', '.gitignore', '.mcp.json', '.python-version', '.vscode/', 'CHANGELOG.md', 'CLAUDE.md', 'README.md', 'assets/', 'coverage_stats.txt', 'docs/', 'examples/', 'tests/', 'tmp_repo/', '# Important files to explicitly include despite matching blacklist patterns', '!pyproject.toml', '!yellhorn_mcp/']
['.claude/settings.local.json', '.yellhornignore', 'examples/lsp_workplan_example.py', 'notebooks/file_structure.ipynb', '.github/workflows/publish.yml', '.github/workflows/tests.yml', '.vscode/mcp.json', 'assets/yellhorn.png', 'docs/USAGE.md', 'docs/coverage_baseline.md', 'examples/__init__.py', 'examples/client_example.py', 'pyproject.toml', 'tests/__init__.py', 'tests/conftest.py', 'tests/helpers.py', 'tests/test_async_flows_openai.py', 'tests/test_cli.py', 'tests/test_cli_errors.py', 'tests/test_cost_metrics.py', 'tests/test

In [None]:
path = 'hello/.github/workflows/publish.yml'
pat = '*.github/'
fnmatch.fnmatch(path, pat.rstrip("/") + "/*")

True

In [102]:
import fnmatch

whitelist_patterns = ["python/**/*.py", "*.py", "*.ipynb", "*.md"]
ignore_patterns = [".gitignore", ".yellhornignore", "hello/*"]

# Use the same is_ignored function that get_codebase_snapshot uses
def is_ignored(file_path: str) -> bool:
    # First check if the file is whitelisted
    for pattern in whitelist_patterns:
        # Regular pattern matching (e.g., "*.py")
        if fnmatch.fnmatch(file_path, pattern) or fnmatch.fnmatch(file_path, pattern.rstrip("/") + "/*"):
            return False  # Whitelisted, don't ignore
    
    # Then check if it matches any ignore patterns
    for pattern in ignore_patterns:
        # Regular pattern matching (e.g., "*.log")
        if fnmatch.fnmatch(file_path, pattern) or fnmatch.fnmatch(file_path, pattern.rstrip("/") + "/*"):
            return True

    return False

In [109]:
fnmatch.fnmatch("hello/hello/poetry.lock", "poetry.lock")

False

In [104]:
is_ignored("hello/hello/hello.js")

True