# Code Splitter Testing Notebook

This notebook exercises LlamaIndex's `CodeSplitter` on several Python code samples and compares how different splitter configurations chunk them.

In [20]:
# Install the dependencies listed in requirements.txt.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -r requirements.txt

Collecting tree_sitter_language_pack (from -r requirements.txt (line 6))
  Downloading tree_sitter_language_pack-0.9.0-cp39-abi3-manylinux2014_x86_64.whl.metadata (17 kB)
[0mCollecting tree-sitter-c-sharp>=0.23.1 (from tree_sitter_language_pack->-r requirements.txt (line 6))
  Downloading tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tree-sitter-embedded-template>=0.23.2 (from tree_sitter_language_pack->-r requirements.txt (line 6))
  Downloading tree_sitter_embedded_template-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting tree-sitter-yaml>=0.7.0 (from tree_sitter_language_pack->-r requirements.txt (line 6))
  Downloading tree_sitter_yaml-0.7.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading tree_sitter_language_pack-0.9

In [21]:
# Import required libraries
from llama_index.core.node_parser import CodeSplitter
from llama_index.core import Document
import json

## Test Code Examples

Let's define various test code snippets to test the code splitter with different programming constructs.

In [22]:
# Test code 1: a single small class with a constructor and four short methods.
# This is only a string literal; it is passed to the splitters as text and is
# never executed by this cell.
test_code_1 = '''
class Calculator:
    """A simple calculator class."""
    
    def __init__(self):
        self.history = []
    
    def add(self, a, b):
        """Add two numbers."""
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result
    
    def subtract(self, a, b):
        """Subtract two numbers."""
        result = a - b
        self.history.append(f"{a} - {b} = {result}")
        return result
    
    def multiply(self, a, b):
        """Multiply two numbers."""
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result
    
    def get_history(self):
        """Get calculation history."""
        return self.history
'''

In [23]:
# Test code 2: module-level imports, a timing decorator, a decorated function,
# a recursive function, and a class with one decorated method and two private
# helpers. This is only a string literal; nothing in it is executed by this cell.
test_code_2 = '''
import functools
import time
from typing import List, Dict, Optional

def timing_decorator(func):
    """Decorator to measure function execution time."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {end_time - start_time:.4f} seconds")
        return result
    return wrapper

@timing_decorator
def process_data(data: List[Dict]) -> Optional[Dict]:
    """Process a list of dictionaries and return aggregated results."""
    if not data:
        return None
    
    total_count = len(data)
    sum_values = sum(item.get('value', 0) for item in data)
    avg_value = sum_values / total_count if total_count > 0 else 0
    
    result = {
        'total_count': total_count,
        'sum_values': sum_values,
        'average_value': avg_value,
        'max_value': max(item.get('value', 0) for item in data),
        'min_value': min(item.get('value', 0) for item in data)
    }
    
    return result

def fibonacci(n: int) -> int:
    """Calculate fibonacci number recursively."""
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

class DataProcessor:
    """A class for processing various data types."""
    
    def __init__(self, config: Dict):
        self.config = config
        self.processed_items = 0
    
    @timing_decorator
    def batch_process(self, items: List) -> List:
        """Process items in batches."""
        batch_size = self.config.get('batch_size', 10)
        results = []
        
        for i in range(0, len(items), batch_size):
            batch = items[i:i + batch_size]
            batch_result = self._process_batch(batch)
            results.extend(batch_result)
            self.processed_items += len(batch)
        
        return results
    
    def _process_batch(self, batch: List) -> List:
        """Process a single batch of items."""
        return [self._transform_item(item) for item in batch]
    
    def _transform_item(self, item):
        """Transform a single item based on configuration."""
        transform_type = self.config.get('transform', 'none')
        
        if transform_type == 'uppercase' and isinstance(item, str):
            return item.upper()
        elif transform_type == 'square' and isinstance(item, (int, float)):
            return item ** 2
        else:
            return item
'''

In [24]:
# Test code 3: a Flask + SQLAlchemy sample containing a User model, module-level
# database setup, three route handlers, and an `if __name__ == '__main__'` guard.
# This is only a string literal; nothing in this cell executes the Flask code.
test_code_3 = '''
from flask import Flask, request, jsonify
from sqlalchemy import create_engine, Column, Integer, String, DateTime
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
import os

app = Flask(__name__)
Base = declarative_base()

class User(Base):
    """User model for the database."""
    __tablename__ = 'users'
    
    id = Column(Integer, primary_key=True)
    username = Column(String(80), unique=True, nullable=False)
    email = Column(String(120), unique=True, nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow)
    
    def to_dict(self):
        return {
            'id': self.id,
            'username': self.username,
            'email': self.email,
            'created_at': self.created_at.isoformat() if self.created_at else None
        }

# Database setup
DATABASE_URL = os.getenv('DATABASE_URL', 'sqlite:///app.db')
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)

@app.route('/api/users', methods=['GET'])
def get_users():
    """Get all users from the database."""
    session = Session()
    try:
        users = session.query(User).all()
        return jsonify([user.to_dict() for user in users])
    except Exception as e:
        return jsonify({'error': str(e)}), 500
    finally:
        session.close()

@app.route('/api/users', methods=['POST'])
def create_user():
    """Create a new user."""
    session = Session()
    try:
        data = request.get_json()
        
        if not data or 'username' not in data or 'email' not in data:
            return jsonify({'error': 'Username and email are required'}), 400
        
        user = User(username=data['username'], email=data['email'])
        session.add(user)
        session.commit()
        
        return jsonify(user.to_dict()), 201
    except Exception as e:
        session.rollback()
        return jsonify({'error': str(e)}), 500
    finally:
        session.close()

@app.route('/api/users/<int:user_id>', methods=['GET'])
def get_user(user_id):
    """Get a specific user by ID."""
    session = Session()
    try:
        user = session.query(User).filter(User.id == user_id).first()
        if not user:
            return jsonify({'error': 'User not found'}), 404
        return jsonify(user.to_dict())
    except Exception as e:
        return jsonify({'error': str(e)}), 500
    finally:
        session.close()

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)
'''

## Initialize Code Splitters

Let's create code splitters with different configurations to test how they handle the code.

In [25]:
# Build three Python splitters in one pass: library defaults, a "small" budget
# and a "large" budget. Each override dict is applied on top of language="python".
# NOTE(review): the recorded Test 1 output shows a 27-line chunk from the
# chunk_lines=20 splitter, so max_chars appears to be the limit that actually
# drives splitting — confirm against the CodeSplitter documentation.
splitter_default, splitter_small_chunks, splitter_large_chunks = [
    CodeSplitter.from_defaults(language="python", **overrides)
    for overrides in (
        {},
        {"chunk_lines": 20, "chunk_lines_overlap": 5, "max_chars": 800},
        {"chunk_lines": 80, "chunk_lines_overlap": 20, "max_chars": 3000},
    )
]

print("Code splitters initialized successfully!")

Code splitters initialized successfully!


## Helper Functions for Testing

In [26]:
def analyze_chunks(chunks, title="Analysis"):
    """Print a per-chunk size report.

    Args:
        chunks: Sequence of chunk strings to describe.
        title: Label shown in the report header.

    Prints the number of chunks, then for each chunk (numbered from 1) its
    newline-separated line count, its character count, and a preview of its
    first line. A preview of the last line is added when the chunk has more
    than one line. Returns None.
    """
    def _preview(label, line):
        # An empty line gets a placeholder; anything else is cut to 50
        # characters and is always followed by "...", even when nothing was cut.
        if not line:
            return f"  {label}: (empty)"
        return f"  {label}: {line[:50]}..."

    print(f"\n=== {title} ===")
    print(f"Number of chunks: {len(chunks)}")

    for number, chunk in enumerate(chunks, start=1):
        chunk_lines = chunk.split('\n')
        print(f"\nChunk {number}:")
        print(f"  Lines: {len(chunk_lines)}")
        print(f"  Characters: {len(chunk)}")
        print(_preview("First line", chunk_lines[0]))
        if len(chunk_lines) >= 2:
            print(_preview("Last line", chunk_lines[-1]))

def display_chunk_content(chunks, max_chunks=3):
    """Print the text of the leading chunks, each under a numbered header.

    Args:
        chunks: Sequence of chunks to show.
        max_chunks: Upper bound on how many chunks are printed.

    Each printed chunk is followed by a blank line and a 50-character rule.
    Returns None.
    """
    divider = "=" * 50
    shown_count = min(max_chunks, len(chunks))
    print(f"\n=== Chunk Contents (showing first {shown_count} chunks) ===")

    for position, chunk in enumerate(chunks[:max_chunks], start=1):
        # One print call with sep="\n" emits the same three lines as three prints.
        print(f"\n--- Chunk {position} ---", chunk, "\n" + divider, sep="\n")

## Test 1: Simple Calculator Class

In [27]:
# Baseline for the Calculator fixture: split with the default-configured
# splitter, then print the size report followed by the chunk text.
chunks_1_default = splitter_default.split_text(test_code_1)
analyze_chunks(chunks=chunks_1_default, title="Test 1 - Default Splitter")
display_chunk_content(chunks=chunks_1_default)


=== Test 1 - Default Splitter ===
Number of chunks: 1

Chunk 1:
  Lines: 27
  Characters: 685
  First line: class Calculator:...
  Last line:         return self.history...

=== Chunk Contents (showing first 1 chunks) ===

--- Chunk 1 ---
class Calculator:
    """A simple calculator class."""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        """Add two numbers."""
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result

    def subtract(self, a, b):
        """Subtract two numbers."""
        result = a - b
        self.history.append(f"{a} - {b} = {result}")
        return result

    def multiply(self, a, b):
        """Multiply two numbers."""
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result

    def get_history(self):
        """Get calculation history."""
        return self.history



In [28]:
# Same Calculator fixture, this time through the small-chunk splitter, so the
# result can be compared with the default run.
chunks_1_small = splitter_small_chunks.split_text(test_code_1)
analyze_chunks(chunks=chunks_1_small, title="Test 1 - Small Chunks Splitter")
display_chunk_content(chunks=chunks_1_small)


=== Test 1 - Small Chunks Splitter ===
Number of chunks: 1

Chunk 1:
  Lines: 27
  Characters: 685
  First line: class Calculator:...
  Last line:         return self.history...

=== Chunk Contents (showing first 1 chunks) ===

--- Chunk 1 ---
class Calculator:
    """A simple calculator class."""

    def __init__(self):
        self.history = []

    def add(self, a, b):
        """Add two numbers."""
        result = a + b
        self.history.append(f"{a} + {b} = {result}")
        return result

    def subtract(self, a, b):
        """Subtract two numbers."""
        result = a - b
        self.history.append(f"{a} - {b} = {result}")
        return result

    def multiply(self, a, b):
        """Multiply two numbers."""
        result = a * b
        self.history.append(f"{a} * {b} = {result}")
        return result

    def get_history(self):
        """Get calculation history."""
        return self.history



## Test 2: Complex Functions and Classes

In [29]:
# Split the decorator/class fixture with the default-configured splitter and
# show at most the first two chunks.
chunks_2_default = splitter_default.split_text(test_code_2)
analyze_chunks(chunks=chunks_2_default, title="Test 2 - Default Splitter")
display_chunk_content(chunks=chunks_2_default, max_chunks=2)


=== Test 2 - Default Splitter ===
Number of chunks: 2

Chunk 1:
  Lines: 40
  Characters: 1203
  First line: import functools...
  Last line:     return fibonacci(n-1) + fibonacci(n-2)...

Chunk 2:
  Lines: 35
  Characters: 1192
  First line: class DataProcessor:...
  Last line:             return item...

=== Chunk Contents (showing first 2 chunks) ===

--- Chunk 1 ---
import functools
import time
from typing import List, Dict, Optional

def timing_decorator(func):
    """Decorator to measure function execution time."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {end_time - start_time:.4f} seconds")
        return result
    return wrapper

@timing_decorator
def process_data(data: List[Dict]) -> Optional[Dict]:
    """Process a list of dictionaries and return aggregated results."""
    if not data:
        return None

    total

In [30]:
# Same fixture through the large-chunk splitter; at most two chunks are shown.
chunks_2_large = splitter_large_chunks.split_text(test_code_2)
analyze_chunks(chunks=chunks_2_large, title="Test 2 - Large Chunks Splitter")
display_chunk_content(chunks=chunks_2_large, max_chunks=2)


=== Test 2 - Large Chunks Splitter ===
Number of chunks: 1

Chunk 1:
  Lines: 76
  Characters: 2397
  First line: import functools...
  Last line:             return item...

=== Chunk Contents (showing first 1 chunks) ===

--- Chunk 1 ---
import functools
import time
from typing import List, Dict, Optional

def timing_decorator(func):
    """Decorator to measure function execution time."""
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"{func.__name__} took {end_time - start_time:.4f} seconds")
        return result
    return wrapper

@timing_decorator
def process_data(data: List[Dict]) -> Optional[Dict]:
    """Process a list of dictionaries and return aggregated results."""
    if not data:
        return None

    total_count = len(data)
    sum_values = sum(item.get('value', 0) for item in data)
    avg_value = sum_values / total_count if total_coun

## Test 3: Flask API Code

In [None]:
# Split the Flask/SQLAlchemy fixture with the default-configured splitter and
# show at most the first two chunks.
chunks_3_default = splitter_default.split_text(test_code_3)
analyze_chunks(chunks=chunks_3_default, title="Test 3 - Default Splitter")
display_chunk_content(chunks=chunks_3_default, max_chunks=2)

In [None]:
# Same Flask fixture through the small-chunk splitter; up to three chunks shown.
chunks_3_small = splitter_small_chunks.split_text(test_code_3)
analyze_chunks(chunks=chunks_3_small, title="Test 3 - Small Chunks Splitter")
display_chunk_content(chunks=chunks_3_small, max_chunks=3)

## Test 4: Creating Documents with Metadata

In [None]:
def create_documents_from_chunks(chunks, file_path="test_file.py"):
    """Wrap every chunk in a ``Document`` that carries position and size metadata.

    Args:
        chunks: Iterable of chunk strings.
        file_path: Value stored under the "file_path" metadata key of every
            document.

    Returns:
        A list with one ``Document`` per chunk, in input order. Each document's
        metadata holds "file_path", "chunk_index" (0-based position),
        "chunk_size" (character count) and "line_count" (number of
        newline-separated segments).
    """
    return [
        Document(
            text=chunk,
            metadata={
                "file_path": file_path,
                "chunk_index": position,
                "chunk_size": len(chunk),
                # Newline count + 1 equals the length of chunk.split('\n').
                "line_count": chunk.count('\n') + 1,
            },
        )
        for position, chunk in enumerate(chunks)
    ]

# Turn the default-splitter chunks of test code 2 into Documents and print each
# document's metadata with the first 100 characters of its text.
docs = create_documents_from_chunks(chunks_2_default, "complex_functions.py")

print(f"Created {len(docs)} documents")
for i, doc in enumerate(docs):
    print(
        f"\nDocument {i+1}:",
        f"  Metadata: {doc.metadata}",
        f"  Text preview: {doc.text[:100]}...",
        sep="\n",
    )

## Test 5: Comparing Different Splitter Configurations

In [None]:
def compare_splitters(test_code, test_name, splitters=None):
    """Run several splitters over the same code and print a comparison table.

    Args:
        test_code: Source text handed to each splitter's ``split_text``.
        test_name: Label printed in the banner above the table.
        splitters: Optional mapping of display name to an object exposing
            ``split_text(text)``. When omitted, the three notebook-level
            splitters (``splitter_default``, ``splitter_small_chunks``,
            ``splitter_large_chunks``) are used, which is the original
            behaviour.

    Returns:
        A dict keyed by splitter name, in the mapping's iteration order. Each
        value is a dict with "chunk_count", "total_chars", "avg_chunk_size"
        and "avg_lines". Both averages are 0 when a splitter yields no chunks.
    """
    if splitters is None:
        # Resolved at call time so the function still works unchanged for
        # existing callers that rely on the notebook globals.
        splitters = {
            "Default": splitter_default,
            "Small Chunks": splitter_small_chunks,
            "Large Chunks": splitter_large_chunks,
        }

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"COMPARISON FOR {test_name}")
    print(banner)

    results = {}
    for name, splitter in splitters.items():
        chunks = splitter.split_text(test_code)
        # Compute each total once and reuse it for the averages.
        chunk_count = len(chunks)
        total_chars = sum(len(chunk) for chunk in chunks)
        total_lines = sum(len(chunk.split('\n')) for chunk in chunks)
        results[name] = {
            "chunk_count": chunk_count,
            "total_chars": total_chars,
            "avg_chunk_size": total_chars / chunk_count if chunk_count else 0,
            "avg_lines": total_lines / chunk_count if chunk_count else 0,
        }

    # Display comparison table
    print(f"{'Splitter':<15} {'Chunks':<8} {'Total Chars':<12} {'Avg Size':<10} {'Avg Lines':<10}")
    print("-" * 65)

    for name, stats in results.items():
        print(f"{name:<15} {stats['chunk_count']:<8} {stats['total_chars']:<12} "
              f"{stats['avg_chunk_size']:<10.1f} {stats['avg_lines']:<10.1f}")

    return results

# Run the comparison once per fixture, in order. Each call prints its own table
# and returns the per-splitter statistics, which are kept for later inspection.
comparison_1, comparison_2, comparison_3 = [
    compare_splitters(code, label)
    for code, label in (
        (test_code_1, "CALCULATOR CLASS"),
        (test_code_2, "COMPLEX FUNCTIONS"),
        (test_code_3, "FLASK API"),
    )
]

## Test 6: Edge Cases and Special Scenarios

In [None]:
# Edge case 1: a minimal two-line function.
short_code = """
def hello():
    print("Hello, World!")
"""

# Edge case 2: a module-level multi-line string literal followed by a small
# function. The inner triple quotes are backslash-escaped because the fixture
# itself is written as a triple-double-quoted string.
long_string_code = """
long_text = \"\"\"
This is a very long string that contains multiple lines and might affect
how the code splitter handles the splitting. It includes various characters
and formatting that could be challenging for the parser to handle correctly.
The string continues here with more content to make it even longer and
test the boundaries of the code splitter functionality.
\"\"\"

def process_text(text):
    return text.upper()
"""

# Edge case 3: leading comments plus one function with a long docstring and
# several inline comments.
comment_heavy_code = """
# This is a comment at the top
# Another comment line

def documented_function(param1, param2):
    \"\"\"
    This function demonstrates how the code splitter handles
    functions with extensive documentation.
    
    Args:
        param1: The first parameter
        param2: The second parameter
    
    Returns:
        The result of processing the parameters
    
    Raises:
        ValueError: If parameters are invalid
    \"\"\"
    # Inline comment
    if not param1:  # Another inline comment
        raise ValueError("param1 cannot be None")
    
    # Process the parameters
    result = param1 + param2
    return result  # Return comment
"""

print("Testing edge cases...")

# Minimal function: report the chunk count and the first chunk, if there is one.
short_chunks = splitter_default.split_text(short_code)
print(
    f"\nShort code chunks: {len(short_chunks)}",
    f"Content: {short_chunks[0] if short_chunks else 'No chunks'}",
    sep="\n",
)

# Multi-line string literal fixture: size report only.
long_string_chunks = splitter_default.split_text(long_string_code)
analyze_chunks(chunks=long_string_chunks, title="Long String Code")

# Comment- and docstring-heavy fixture: size report only.
comment_chunks = splitter_default.split_text(comment_heavy_code)
analyze_chunks(chunks=comment_chunks, title="Comment Heavy Code")

## Summary and Conclusions

In [None]:
def generate_summary():
    """Print a fixed recap of the notebook: tests, observations, recommendations.

    Takes no arguments and returns None. The text is hard-coded; it is not
    derived from any result computed elsewhere in the notebook.
    """
    rule = "=" * 70
    sections = (
        ("Tests Performed:", (
            "1. ✓ Simple Calculator Class - Basic class with methods",
            "2. ✓ Complex Functions - Decorators, type hints, complex logic",
            "3. ✓ Flask API Code - Web framework with database models",
            "4. ✓ Document Creation - Converting chunks to Document objects",
            "5. ✓ Splitter Comparison - Different configuration comparisons",
            "6. ✓ Edge Cases - Short code, long strings, comments",
        )),
        ("Key Observations:", (
            "• The CodeSplitter respects Python AST structure",
            "• Different configurations produce different chunking strategies",
            "• Smaller chunk sizes result in more granular splits",
            "• The splitter handles various Python constructs well",
            "• Metadata can be easily attached to chunks for tracking",
        )),
        ("Recommendations:", (
            "• Use default settings for balanced chunk sizes",
            "• Adjust chunk_lines and max_chars based on your use case",
            "• Consider overlap for maintaining context between chunks",
            "• Test with your specific code patterns before production use",
        )),
    )

    # Header: blank line, rule, title, rule.
    print("\n" + rule)
    print("CODE SPLITTER TEST SUMMARY")
    print(rule)

    # Each section is a blank line, its heading, then one entry per line.
    for heading, entries in sections:
        print("\n" + heading)
        for entry in entries:
            print(entry)

    # Footer: blank line followed by the closing rule.
    print("\n" + rule)

# NOTE(review): the summary text is static; it prints the same check marks and
# observations whether or not the test cells above were executed.
generate_summary()