# 04 - Advanced Configuration Examples

## Setup

If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook.

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory

# Make the Neo4j factory known to GraphStoreFactory before any store is requested.
GraphStoreFactory.register(Neo4jGraphStoreFactory)

# Connection info for both stores is read from the environment;
# a missing variable raises KeyError at the corresponding line.
graph_store = GraphStoreFactory.for_graph_store(
    os.environ['GRAPH_STORE']
)
vector_store = VectorStoreFactory.for_vector_store(
    os.environ['VECTOR_STORE']
)

# Build the lexical graph index on top of the two stores.
graph_index = LexicalGraphIndex(graph_store, vector_store)

## Advanced Configuration Examples

### Batch Processing Multiple File Types from Multiple Sources

You can combine multiple reader providers to process different file types, drawn from both local paths and S3, in a single batch.

In [None]:
import os
from pathlib import Path
from graphrag_toolkit.lexical_graph.indexing.load.readers import (
    StructuredDataReaderProvider, StructuredDataReaderConfig,
    MarkdownReaderProvider, MarkdownReaderConfig
)

# One reader provider per supported file extension. Each metadata_fn returns
# a format label ('source') and the path the document was read from.
readers = {
    '.csv': StructuredDataReaderProvider(StructuredDataReaderConfig(
        pandas_config={"sep": ","},
        metadata_fn=lambda path: {'source': 'csv', 'file_path': path}
    )),
    '.json': StructuredDataReaderProvider(StructuredDataReaderConfig(
        metadata_fn=lambda path: {'source': 'json', 'file_path': path}
    )),
    '.xlsx': StructuredDataReaderProvider(StructuredDataReaderConfig(
        metadata_fn=lambda path: {'source': 'excel', 'file_path': path}
    )),
    '.md': MarkdownReaderProvider(MarkdownReaderConfig(
        metadata_fn=lambda path: {'source': 'markdown', 'file_path': path}
    ))
}

# Define file sources (mix of local and S3)
file_sources = [
    # Local files
    'artifacts/sample.csv',
    'artifacts/sample.md',
    # S3 files
    's3://config-test-bucket-188967239867/artifacts/sample.json',
    's3://config-test-bucket-188967239867/artifacts/sample.xlsx'
]

all_docs = []

for file_path in file_sources:
    # Path.suffix looks only at the final path component, so one expression
    # covers local paths and S3 URIs alike. Splitting the whole URI on '.'
    # would mistake a dotted bucket name or an extensionless key for an
    # extension (e.g. '.s3://bucket/key').
    file_ext = Path(file_path).suffix.lower()

    if file_ext not in readers:
        print(f"  Skipping {file_path} - unsupported file type: {file_ext}")
        continue

    try:
        source_type = 's3' if file_path.startswith('s3://') else 'local'
        print(f"Processing {file_path} ({source_type}) with {file_ext} reader...")
        docs = readers[file_ext].read(file_path)
        all_docs.extend(docs)
        print(f"  Loaded {len(docs)} documents")
    except Exception as e:
        # Deliberately broad: this is a best-effort batch, so a failure on
        # one file is reported and the remaining files are still processed.
        print(f"  Error processing {file_path}: {e}")

if all_docs:
    print(f"\nTotal documents (chunks) loaded: {len(all_docs)}")
    print("Document sources:")
    # Tally documents by the 'source' label found in their metadata.
    sources = {}
    for doc in all_docs:
        source = doc.metadata.get('source', 'unknown')
        sources[source] = sources.get(source, 0) + 1

    for source, count in sources.items():
        print(f"  {source}: {count} documents")

    # Index all documents together
    print("\nIndexing all documents...")
    graph_index.extract_and_build(all_docs, show_progress=True)
else:
    # Reached when nothing was loaded, whether because every file was
    # skipped or because every read failed.
    print("No supported documents found")
    print("Supported file types: .csv, .json, .xlsx, .md")



### Custom Metadata Functions

You can create sophisticated metadata functions to enrich your documents:

In [None]:
import datetime
from pathlib import Path
from pathlib import Path
from graphrag_toolkit.lexical_graph.indexing.load.readers import (
    StructuredDataReaderProvider, StructuredDataReaderConfig,
    MarkdownReaderProvider, MarkdownReaderConfig
)

def advanced_file_metadata(file_path):
    """Build a metadata dict for a local file or an S3 object.

    Anything whose string form starts with ``s3://`` is delegated to
    ``s3_metadata``. For local paths the file is stat'ed exactly once, so
    the size and both timestamps come from the same snapshot.

    Args:
        file_path: Local filesystem path or ``s3://`` URI.

    Returns:
        dict: For local paths, the keys ``source``, ``file_path``,
        ``file_name``, ``file_extension``, ``file_size``, ``created_date``,
        ``modified_date`` and ``processing_date``. When the file cannot be
        stat'ed, ``file_size`` is 0 and the two file dates are ``None``.
        For S3 URIs, whatever ``s3_metadata`` returns.
    """
    # S3 URIs are not local paths; hand them off before touching the filesystem.
    if str(file_path).startswith('s3://'):
        return s3_metadata(file_path)

    path = Path(file_path)

    # Stat once instead of exists()+stat() per field: this avoids the race
    # where the file disappears between the check and the use, and keeps
    # size and timestamps consistent with each other.
    try:
        stat_result = path.stat()
    except (OSError, ValueError):
        # Missing or otherwise unreadable path: fall back to placeholders.
        stat_result = None

    def _iso(timestamp):
        """Format a POSIX timestamp as a local-time ISO 8601 string."""
        return datetime.datetime.fromtimestamp(timestamp).isoformat()

    found = stat_result is not None

    return {
        'source': 'file',
        'file_path': str(path),
        'file_name': path.name,
        'file_extension': path.suffix,
        'file_size': stat_result.st_size if found else 0,
        # NOTE: st_ctime is the creation time on Windows but the time of the
        # last metadata change on Unix.
        'created_date': _iso(stat_result.st_ctime) if found else None,
        'modified_date': _iso(stat_result.st_mtime) if found else None,
        'processing_date': datetime.datetime.now().isoformat()
    }

def youtube_metadata(video_url):
    """Build a metadata dict for a YouTube video URL.

    The video ID is taken from the first 11-character run of
    ``[0-9A-Za-z_-]`` that follows ``v=`` or ``/`` and is not itself part of
    a longer run of those characters.

    Args:
        video_url: URL string of the video.

    Returns:
        dict: The keys ``source``, ``video_url``, ``video_id``,
        ``content_type``, ``platform`` and ``processing_date``.
        ``video_id`` is ``'unknown'`` when no ID can be found in the URL.
    """
    import re

    # The negative lookahead requires the 11-character token to end there.
    # Without it, the first 11 characters of any longer path segment
    # (a channel ID, for example) would be reported as a video ID.
    video_id_match = re.search(
        r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])',
        video_url,
    )
    video_id = video_id_match.group(1) if video_id_match else 'unknown'

    return {
        'source': 'youtube',
        'video_url': video_url,
        'video_id': video_id,
        'content_type': 'transcript',
        'platform': 'youtube',
        'processing_date': datetime.datetime.now().isoformat()
    }

def s3_metadata(s3_path):
    """Describe an S3 object using only the text of its path.

    The file name and extension are parsed from ``s3_path``; the remaining
    fields are fixed labels plus the current local time.

    Args:
        s3_path: S3 path of the object, e.g. ``s3://bucket/key.json``.

    Returns:
        dict: The keys ``source``, ``s3_path``, ``file_name``,
        ``file_extension``, ``storage_type``, ``provider`` and
        ``processing_date``.
    """
    parsed = Path(s3_path)

    details = dict(source='s3', s3_path=s3_path)
    details['file_name'] = parsed.name
    details['file_extension'] = parsed.suffix
    details.update(storage_type='cloud', provider='aws')
    details['processing_date'] = datetime.datetime.now().isoformat()
    return details

# Build each provider separately, all sharing advanced_file_metadata as the
# metadata function, then assemble the extension -> provider mapping.
csv_provider = StructuredDataReaderProvider(
    StructuredDataReaderConfig(
        pandas_config={"sep": ","},
        metadata_fn=advanced_file_metadata,
    )
)
json_provider = StructuredDataReaderProvider(
    StructuredDataReaderConfig(metadata_fn=advanced_file_metadata)
)
xlsx_provider = StructuredDataReaderProvider(
    StructuredDataReaderConfig(metadata_fn=advanced_file_metadata)
)
markdown_provider = MarkdownReaderProvider(
    MarkdownReaderConfig(metadata_fn=advanced_file_metadata)
)

# Keys are lower-case file extensions including the leading dot.
readers = {
    '.csv': csv_provider,
    '.json': json_provider,
    '.xlsx': xlsx_provider,
    '.md': markdown_provider,
}


In [None]:
# Define file sources (mix of local and S3)
file_sources = [
    # Local files
    'artifacts/sample.csv',
    'artifacts/sample.md',
    # S3 files
    's3://config-test-bucket-188967239867/artifacts/sample.json',
    's3://config-test-bucket-188967239867/artifacts/sample.xlsx'
]

all_docs = []

for file_path in file_sources:
    # Path.suffix looks only at the final path component, so one expression
    # covers local paths and S3 URIs alike. Splitting the whole URI on '.'
    # would mistake a dotted bucket name or an extensionless key for an
    # extension (e.g. '.s3://bucket/key').
    file_ext = Path(file_path).suffix.lower()

    if file_ext not in readers:
        print(f"  Skipping {file_path} - unsupported file type: {file_ext}")
        continue

    try:
        source_type = 's3' if file_path.startswith('s3://') else 'local'
        print(f"Processing {file_path} ({source_type}) with {file_ext} reader...")
        docs = readers[file_ext].read(file_path)
        all_docs.extend(docs)
        print(f"  Loaded {len(docs)} documents")
    except Exception as e:
        # Deliberately broad: this is a best-effort batch, so a failure on
        # one file is reported and the remaining files are still processed.
        print(f"  Error processing {file_path}: {e}")

# Demonstrate rich metadata
if all_docs:
    print(f"\nTotal documents loaded: {len(all_docs)}")

    # Print the full metadata of the first document seen for each distinct
    # 'source' value; later documents with the same source are not printed.
    print("\nDetailed metadata examples:")
    seen_sources = set()
    for doc in all_docs:
        source = doc.metadata.get('source', 'unknown')
        if source not in seen_sources:
            print(f"\n{source.upper()} file metadata:")
            for key, value in doc.metadata.items():
                print(f"  {key}: {value}")
            seen_sources.add(source)

    # Index all documents
    print("\nIndexing all documents...")
    graph_index.extract_and_build(all_docs, show_progress=True)
else:
    # Reached when nothing was loaded, whether because every file was
    # skipped or because every read failed.
    print("No supported documents found")



### Key Features:

1. **Flexible Configuration**: Each provider supports custom metadata functions
2. **Error Handling**: Per-file error handling, so a missing file or network issue is reported without stopping the rest of the batch
3. **Batch Processing**: Ability to process multiple file types together
4. **Cloud Integration**: Native AWS S3 support with proper credential handling
5. **Rich Metadata**: Support for extracting detailed file and content metadata

### Next Steps:

- Place sample files in the `artifacts/` directory to test the file readers
- Configure AWS credentials to test the S3 reader
- Try different YouTube videos with captions for transcript extraction
- Experiment with custom metadata functions for your specific use cases