# Graph Analysis & Visualization

This notebook explores the knowledge graph structure in Neo4j:
- Graph statistics and metrics
- Document category and chunk distribution analysis
- Embedding coverage and document metadata completeness
- Graph connectivity and integrity checks
- Search similarity patterns and a topic-category coverage heatmap
- JSON export of graph data for external analysis

## 1. Setup and Connection

In [None]:
import sys
sys.path.append('..')

from neo4j_rag import Neo4jRAG
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json

# Set visualization style
# NOTE(review): the 'seaborn-v0_8-*' style names are, as far as I know, only
# registered by matplotlib >= 3.6 — confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Connect to Neo4j
# Neo4jRAG() is constructed with no arguments here; connection settings
# presumably come from its defaults — verify in neo4j_rag.
# `rag` is reused by every later cell and is closed in the final Cleanup cell.
rag = Neo4jRAG()
print("✅ Connected to Neo4j")

## 2. Graph Overview Statistics

In [None]:
# Get basic statistics.
# Chunks are counted per document first (WITH d, COUNT(c)) so that the size
# aggregates see each document exactly once. Aggregating directly over the
# OPTIONAL MATCH rows would repeat a document once per chunk and therefore
# weight AVG(SIZE(d.content)) by chunk count.
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)
        OPTIONAL MATCH (d)-[:HAS_CHUNK]->(c:Chunk)
        WITH d, COUNT(c) AS chunks_per_doc
        RETURN 
            COUNT(d) as total_documents,
            SUM(chunks_per_doc) as total_chunks,
            AVG(SIZE(d.content)) as avg_doc_size,
            MIN(SIZE(d.content)) as min_doc_size,
            MAX(SIZE(d.content)) as max_doc_size
    """)
    
    # Consume the single aggregate row while the session is still open.
    stats = result.single()

total_documents = stats['total_documents']
total_chunks = stats['total_chunks']

print("📊 Graph Overview:")
print("=" * 50)
print(f"Total Documents: {total_documents}")
print(f"Total Chunks: {total_chunks}")

# The size aggregates are null when there are no documents (or no document has
# a `content` property), so guard before applying numeric format specs.
print(f"\nDocument Size Statistics:")
if stats['avg_doc_size'] is not None:
    print(f"  Average: {stats['avg_doc_size']:.0f} characters")
    print(f"  Minimum: {stats['min_doc_size']} characters")
    print(f"  Maximum: {stats['max_doc_size']} characters")
else:
    print("  N/A (no document content found)")

# Avoid ZeroDivisionError on an empty graph.
if total_documents:
    print(f"\nAverage chunks per document: {total_chunks/total_documents:.1f}")
else:
    print("\nAverage chunks per document: N/A (no documents)")

## 3. Document Categories and Distribution

In [None]:
# Analyze document categories.
# Chunks are counted per document before grouping by category so that
# AVG(SIZE(d.content)) is a per-document average rather than one weighted by
# how many chunks each document has.
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)
        OPTIONAL MATCH (d)-[:HAS_CHUNK]->(c:Chunk)
        WITH d, COUNT(c) AS chunks_per_doc
        RETURN 
            d.category as category,
            COUNT(d) as doc_count,
            SUM(chunks_per_doc) as chunk_count,
            AVG(SIZE(d.content)) as avg_size
        ORDER BY doc_count DESC
    """)
    
    categories = []
    for record in result:
        # The 'category' key is always present in the record; a missing
        # property arrives as None, so a .get() default would never apply.
        category = record['category']
        avg_size = record['avg_size']
        categories.append({
            # str() keeps the bar-chart tick labels homogeneous.
            'Category': 'Unknown' if category is None else str(category),
            'Documents': record['doc_count'],
            'Chunks': record['chunk_count'],
            # avg_size is null when no document in the group has content.
            'Avg Size': 'N/A' if avg_size is None else f"{avg_size:.0f}"
        })

# Create DataFrame
df_categories = pd.DataFrame(categories)
print("📁 Document Categories:")
print(df_categories.to_string(index=False))

# Visualize distribution (skipped when the graph has no documents)
if len(categories) > 0:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    
    # Documents by category
    ax1.bar(df_categories['Category'], df_categories['Documents'], color='steelblue')
    ax1.set_xlabel('Category')
    ax1.set_ylabel('Number of Documents')
    ax1.set_title('Documents by Category')
    ax1.tick_params(axis='x', rotation=45)
    
    # Chunks by category
    ax2.bar(df_categories['Category'], df_categories['Chunks'], color='coral')
    ax2.set_xlabel('Category')
    ax2.set_ylabel('Number of Chunks')
    ax2.set_title('Chunks by Category')
    ax2.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()

## 4. Chunk Distribution Analysis

In [None]:
# Analyze chunk distribution across documents.
# Aggregation is keyed on the document node itself (WITH d, ...) rather than on
# d.source, so two documents sharing a source are not merged into one row —
# the table and histogram below are labelled "per Document".
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
        WITH d,
             COUNT(c) as chunk_count,
             AVG(SIZE(c.text)) as avg_chunk_size,
             MIN(c.chunk_index) as min_index,
             MAX(c.chunk_index) as max_index
        RETURN 
            d.source as source,
            chunk_count,
            avg_chunk_size,
            min_index,
            max_index
        ORDER BY chunk_count DESC
    """)
    
    chunk_dist = []
    for record in result:
        # A missing `source` property arrives as None; len(None) would raise.
        source = record['source']
        source = 'Unknown' if source is None else str(source)
        avg_chunk_size = record['avg_chunk_size']
        chunk_dist.append({
            'Source': source[:30] + '...' if len(source) > 30 else source,
            'Chunks': record['chunk_count'],
            # Null when none of the document's chunks has a `text` property.
            'Avg Chunk Size': 'N/A' if avg_chunk_size is None else f"{avg_chunk_size:.0f}",
            'Index Range': f"{record['min_index']}-{record['max_index']}"
        })

# Display top documents by chunk count (rows are already sorted by the query)
df_chunks = pd.DataFrame(chunk_dist[:10])  # Top 10
print("\n📊 Top Documents by Chunk Count:")
print(df_chunks.to_string(index=False))

# Visualize chunk distribution
if len(chunk_dist) > 0:
    plt.figure(figsize=(10, 6))
    chunk_counts = [d['Chunks'] for d in chunk_dist]
    mean_chunks = np.mean(chunk_counts)
    plt.hist(chunk_counts, bins=20, color='skyblue', edgecolor='black')
    plt.xlabel('Number of Chunks per Document')
    plt.ylabel('Frequency')
    plt.title('Distribution of Chunks Across Documents')
    plt.axvline(mean_chunks, color='red', linestyle='--', label=f'Mean: {mean_chunks:.1f}')
    plt.legend()
    plt.show()

## 5. Embedding Coverage and Quality

In [None]:
# Check embedding coverage.
# COUNT(c.embedding) counts only chunks whose `embedding` property is non-null.
with rag.driver.session() as session:
    result = session.run("""
        MATCH (c:Chunk)
        RETURN 
            COUNT(c) as total_chunks,
            COUNT(c.embedding) as chunks_with_embedding,
            AVG(SIZE(c.embedding)) as avg_embedding_size
    """)
    
    embed_stats = result.single()

total_chunks = embed_stats['total_chunks']
chunks_with_embedding = embed_stats['chunks_with_embedding']
avg_embedding_size = embed_stats['avg_embedding_size']

print("🧮 Embedding Statistics:")
print("=" * 50)
print(f"Total Chunks: {total_chunks}")
print(f"Chunks with Embeddings: {chunks_with_embedding}")

# Guard against ZeroDivisionError when the graph holds no chunks.
if total_chunks:
    print(f"Coverage: {chunks_with_embedding/total_chunks*100:.1f}%")
else:
    print("Coverage: N/A (no chunks in the graph)")

# AVG(...) is null when no chunk has an embedding; formatting None with
# ':.0f' would raise TypeError.
if avg_embedding_size is not None:
    print(f"Embedding Dimensions: {avg_embedding_size:.0f}")
else:
    print("Embedding Dimensions: N/A (no embeddings stored)")

# Sample embedding analysis: show the first five components of one embedding
with rag.driver.session() as session:
    result = session.run("""
        MATCH (c:Chunk)
        WHERE c.embedding IS NOT NULL
        RETURN c.embedding[0..5] as sample_values
        LIMIT 1
    """)
    
    # single() yields None when no chunk has an embedding.
    sample = result.single()
    if sample is not None:
        print(f"\nSample embedding values (first 5): {sample['sample_values']}")

## 6. Document Metadata Analysis

In [None]:
# Analyze document metadata
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)
        RETURN 
            d.source as source,
            d.category as category,
            d.topic as topic,
            d.author as author,
            d.created as created,
            SIZE(d.content) as size
        ORDER BY size DESC
    """)
    
    documents = []
    for record in result:
        # Every RETURN column is present as a key in the record, so a .get()
        # default never fires; missing properties arrive as None and must be
        # replaced explicitly before slicing or formatting.
        source = record['source']
        source = 'Unknown' if source is None else str(source)
        category = record['category']
        topic = record['topic']
        author = record['author']
        size = record['size']
        documents.append({
            'Source': source[:25] + '...' if len(source) > 25 else source,
            'Category': 'N/A' if category is None else category,
            'Topic': 'N/A' if topic is None else topic,
            'Author': 'N/A' if author is None else str(author)[:15],
            # size is null when the document has no `content` property.
            'Size': 'N/A' if size is None else f"{size:,}"
        })

# Display document metadata
df_docs = pd.DataFrame(documents)
print("\n📄 Document Metadata:")
print(df_docs.to_string(index=False))

# Metadata completeness: COUNT(d.<prop>) counts only non-null property values
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)
        RETURN 
            COUNT(d) as total,
            COUNT(d.category) as with_category,
            COUNT(d.topic) as with_topic,
            COUNT(d.author) as with_author,
            COUNT(d.created) as with_created
    """)
    
    meta_complete = result.single()

print("\n📊 Metadata Completeness:")
total = meta_complete['total']
for field in ['category', 'topic', 'author', 'created']:
    count = meta_complete[f'with_{field}']
    # Avoid ZeroDivisionError when the graph has no documents.
    percent = f"{count/total*100:.0f}%" if total else "N/A"
    print(f"  {field.capitalize()}: {count}/{total} ({percent})")

## 7. Graph Connectivity Analysis

In [None]:
# Analyze graph connectivity: three independent counts over HAS_CHUNK edges
with rag.driver.session() as session:
    # Orphaned chunks (should be 0): chunks with no incoming HAS_CHUNK edge
    result = session.run("""
        MATCH (c:Chunk)
        WHERE NOT (c)<-[:HAS_CHUNK]-()
        RETURN COUNT(c) as orphaned_chunks
    """)
    orphaned = result.single()['orphaned_chunks']
    
    # Documents without chunks: no outgoing HAS_CHUNK edge
    result = session.run("""
        MATCH (d:Document)
        WHERE NOT (d)-[:HAS_CHUNK]->()
        RETURN COUNT(d) as docs_without_chunks
    """)
    no_chunks = result.single()['docs_without_chunks']
    
    # Relationship statistics
    result = session.run("""
        MATCH ()-[r:HAS_CHUNK]->()
        RETURN COUNT(r) as total_relationships
    """)
    total_rels = result.single()['total_relationships']

print("🔗 Graph Connectivity:")
print("=" * 50)
print(f"Total HAS_CHUNK relationships: {total_rels}")
print(f"Orphaned chunks: {orphaned}")
print(f"Documents without chunks: {no_chunks}")

if orphaned > 0:
    print("⚠️ Warning: Found orphaned chunks. Consider cleanup.")
if no_chunks > 0:
    print("⚠️ Warning: Found documents without chunks. May need re-processing.")
# The success message must depend on BOTH checks; attaching it as the `else`
# of the second `if` alone would report success despite orphaned chunks.
if orphaned == 0 and no_chunks == 0:
    print("✅ Graph integrity check passed!")

## 8. Search Pattern Analysis

In [None]:
# Analyze similarity distribution for sample queries
test_queries = [
    "What is Neo4j?",
    "How does graph database work?",
    "RAG implementation",
    "Vector embeddings"
]

similarity_data = []
# Scores are cached per query so each vector search runs once and the table
# and the plots below are guaranteed to describe the same result set.
scores_by_query = {}

for query in test_queries:
    results = rag.vector_search(query, k=10)
    if not results:
        continue
    # Assumes each hit is a mapping with a numeric 'score' entry — confirm
    # against Neo4jRAG.vector_search.
    scores = [r['score'] for r in results]
    scores_by_query[query] = scores
    similarity_data.append({
        'Query': query,
        'Max Score': max(scores),
        'Min Score': min(scores),
        'Avg Score': np.mean(scores),
        'Std Dev': np.std(scores)
    })

if similarity_data:
    df_sim = pd.DataFrame(similarity_data)
    print("\n🔍 Similarity Score Analysis:")
    print(df_sim.to_string(index=False, float_format='%.3f'))
    
    # Visualize similarity distributions (2x2 grid, so at most four queries)
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    axes = axes.flatten()
    
    for ax, query in zip(axes, test_queries[:4]):
        title = f'Query: "{query[:20]}..."' if len(query) > 20 else f'Query: "{query}"'
        ax.set_title(title)
        scores = scores_by_query.get(query)
        if scores:
            ax.bar(range(len(scores)), scores, color='teal')
            ax.set_xlabel('Result Rank')
            ax.set_ylabel('Similarity Score')
            ax.grid(axis='y', alpha=0.3)
        else:
            # Make an empty panel self-explanatory instead of leaving blank axes.
            ax.text(0.5, 0.5, 'No results', ha='center', va='center',
                    transform=ax.transAxes)
    
    plt.tight_layout()
    plt.show()

## 9. Knowledge Coverage Heatmap

In [None]:
# Knowledge coverage: for every (topic, category) pair, count the distinct
# chunks whose text contains the topic (case-insensitive) and whose parent
# document carries that category.
topics = ['Neo4j', 'Graph', 'Database', 'Vector', 'RAG', 'Embedding', 'Search', 'Query']
categories_list = ['tutorial', 'guide', 'documentation', 'example']

# Rows are topics, columns are categories.
coverage_matrix = np.zeros((len(topics), len(categories_list)))

with rag.driver.session() as session:
    # np.ndindex walks the matrix in row-major order: all categories for the
    # first topic, then all categories for the second topic, and so on.
    for row, col in np.ndindex(*coverage_matrix.shape):
        cell = session.run("""
                MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
                WHERE toLower(c.text) CONTAINS toLower($topic)
                  AND (d.category = $category OR $category = 'any')
                RETURN COUNT(DISTINCT c) as count
            """, topic=topics[row], category=categories_list[col]).single()
        coverage_matrix[row, col] = cell['count']

# Render the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    coverage_matrix,
    ax=ax,
    annot=True,
    fmt='.0f',
    cmap='YlOrRd',
    xticklabels=categories_list,
    yticklabels=topics,
    cbar_kws={'label': 'Number of Chunks'},
)
ax.set_title('Knowledge Coverage Heatmap\n(Topics vs Categories)')
ax.set_xlabel('Category')
ax.set_ylabel('Topic')
plt.tight_layout()
plt.show()

# Summary statistics derived from the row and column totals
chunks_per_topic = coverage_matrix.sum(axis=1)
chunks_per_category = coverage_matrix.sum(axis=0)
top_topic = topics[np.argmax(chunks_per_topic)]
top_category = categories_list[np.argmax(chunks_per_category)]

print("\n📊 Coverage Summary:")
print(f"Most covered topic: {top_topic}")
print(f"Most populated category: {top_category}")
print(f"Total topic mentions: {int(coverage_matrix.sum())}")

## 10. Export Graph Data for Analysis

In [None]:
# Export graph data for external analysis.
# `json` and `datetime` come from the import cell at the top of the notebook;
# they are not re-imported here.
export_data = {
    'export_date': datetime.now().isoformat(),
    'statistics': {},
    'documents': [],
    'sample_chunks': []
}

# Get statistics
export_data['statistics'] = rag.get_stats()

# Get document list (one row per document, with per-document chunk aggregates)
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)
        OPTIONAL MATCH (d)-[:HAS_CHUNK]->(c:Chunk)
        RETURN d.id as id, d.source as source, d.category as category,
               COUNT(c) as chunk_count, AVG(SIZE(c.text)) as avg_chunk_size
    """)
    
    for record in result:
        export_data['documents'].append({
            'id': record['id'],
            'source': record['source'],
            'category': record['category'],
            'chunk_count': record['chunk_count'],
            # Null (JSON null) for documents that have no chunks.
            'avg_chunk_size': record['avg_chunk_size']
        })

# Get sample chunks
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
        RETURN c.text as text, c.chunk_index as index, d.source as source
        LIMIT 5
    """)
    
    for record in result:
        # A chunk without a `text` property arrives as None; treat it as empty.
        text = record['text'] or ''
        export_data['sample_chunks'].append({
            # Only mark the preview as truncated when something was cut off.
            'text': text[:100] + '...' if len(text) > 100 else text,
            'index': record['index'],
            'source': record['source']
        })

# Serialize once so the reported size matches exactly what is written.
# default=str is a safety net for values json cannot encode natively; whether
# rag.get_stats() ever returns such values is not visible here — TODO confirm.
export_payload = json.dumps(export_data, indent=2, default=str)

# Save to file (explicit encoding and newline keep the output platform-independent)
export_file = 'graph_analysis_export.json'
with open(export_file, 'w', encoding='utf-8', newline='\n') as f:
    f.write(export_payload)

print(f"✅ Graph data exported to {export_file}")
print(f"\n📊 Export Summary:")
print(f"  - Documents: {len(export_data['documents'])}")
print(f"  - Sample chunks: {len(export_data['sample_chunks'])}")
print(f"  - Total size: {len(export_payload.encode('utf-8'))} bytes")

## Cleanup

In [None]:
# Close connection
# Releases the `rag` object created in the setup cell; cells that use `rag`
# should not be re-run after this point without re-running the setup cell.
rag.close()
print("✅ Connection closed")

## Summary

This notebook provided comprehensive analysis of your Neo4j knowledge graph:

- **Graph Statistics**: Document and chunk counts, size distributions
- **Category Analysis**: Distribution of content across categories
- **Connectivity**: Relationship integrity and orphaned node detection
- **Embedding Coverage**: Verification of vector embeddings
- **Search Patterns**: Similarity score distributions
- **Knowledge Coverage**: Topic-category heatmap
- **Data Export**: JSON export for external analysis

### Next Steps:
- Explore **04_knowledge_discovery.ipynb** for semantic exploration
- Use **05_query_optimization.ipynb** for performance tuning
- Review exported data for insights about your knowledge base