# Knowledge Discovery & Semantic Exploration

This notebook helps you discover insights and patterns in your knowledge graph:
- Semantic similarity exploration
- Topic clustering and relationships
- Content gaps and recommendations
- Cross-document connections
- Knowledge path finding

## 1. Setup and Imports

In [None]:
import sys
sys.path.append('..')

from neo4j_rag import Neo4jRAG, RAGQueryEngine
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import networkx as nx

# Set style
# Apply one matplotlib style sheet and one seaborn palette so every plot in
# the notebook shares the same look.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('deep')

# Initialize connections
# `rag` and `engine` are module-level objects reused by the later cells;
# `rag.close()` is called in the Cleanup cell at the end of the notebook.
# NOTE(review): the success message is printed as soon as both objects are
# constructed -- whether construction actually verifies connectivity depends
# on Neo4jRAG; confirm.
rag = Neo4jRAG()
engine = RAGQueryEngine(rag)
print("✅ Connected to Neo4j RAG System")

## 2. Semantic Similarity Network

In [None]:
# Fetch up to 100 chunks that have an embedding, then rank every pair of
# chunks by the cosine similarity of their embeddings.
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
        WHERE c.embedding IS NOT NULL
        RETURN c.text as text, c.embedding as embedding,
               d.source as source, d.category as category
        LIMIT 100
    """)

    chunks = []
    embeddings = []

    # Records are consumed while the session is still open.
    for record in result:
        chunks.append({
            # Keep only a 100-character preview; a null text becomes ''.
            'text': (record['text'] or '')[:100],
            'source': record['source'],
            # 'category' is always a key of the record (it is listed in
            # RETURN), so a .get() default would never fire; map a null
            # property to 'unknown' explicitly instead.
            'category': record['category'] or 'unknown',
        })
        embeddings.append(record['embedding'])

if embeddings:
    # Pairwise cosine similarity between all fetched embeddings
    embeddings_array = np.array(embeddings)
    similarity_matrix = cosine_similarity(embeddings_array)

    # Zero the diagonal so a chunk is never reported as similar to itself
    np.fill_diagonal(similarity_matrix, 0)

    # Each unordered pair (i < j) appears exactly once in the strict upper
    # triangle; the index order matches a nested "for i / for j > i" loop.
    rows, cols = np.triu_indices(len(chunks), k=1)
    similar_pairs = sorted(
        zip(rows.tolist(), cols.tolist(), similarity_matrix[rows, cols].tolist()),
        key=lambda pair: pair[2],
        reverse=True,
    )

    # Show the most similar pairs
    top_n = 5
    print("🔗 Most Similar Content Pairs:\n")
    for rank, (i, j, score) in enumerate(similar_pairs[:top_n], start=1):
        print(f"{rank}. Similarity: {score:.3f}")
        print(f"   Text 1: {chunks[i]['text'][:50]}...")
        print(f"   Text 2: {chunks[j]['text'][:50]}...")
        print()

## 3. Topic Clustering

In [None]:
# Cluster the chunk embeddings with K-means and plot them in a 2-D PCA
# projection. Relies on `embeddings`, `embeddings_array` and `chunks` from
# the Semantic Similarity cell.
if len(embeddings) >= 2:
    # Perform K-means clustering (never more clusters than samples)
    n_clusters = min(5, len(embeddings))  # Adjust based on data
    # n_init is set explicitly: the library default differs between
    # scikit-learn releases, which would change results and emit warnings.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(embeddings_array)

    # Reduce dimensions for visualization
    pca = PCA(n_components=2, random_state=42)
    embeddings_2d = pca.fit_transform(embeddings_array)
    pc1_share, pc2_share = pca.explained_variance_ratio_[:2]

    # Visualize clusters
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                          c=cluster_labels, cmap='viridis',
                          alpha=0.6, s=100)
    # Cluster ids are discrete, so put ticks on the integer labels only
    plt.colorbar(scatter, label='Cluster', ticks=range(n_clusters))
    plt.xlabel(f'PC1 ({pc1_share:.1%} variance)')
    plt.ylabel(f'PC2 ({pc2_share:.1%} variance)')
    plt.title('Knowledge Clusters in Embedding Space')

    # Add cluster centers, projected with the same fitted PCA
    centers_2d = pca.transform(kmeans.cluster_centers_)
    plt.scatter(centers_2d[:, 0], centers_2d[:, 1],
                marker='*', s=300, c='red', edgecolor='black', linewidth=2)

    plt.grid(alpha=0.3)
    plt.show()

    # Analyze cluster composition
    print("\n📊 Cluster Analysis:")
    for cluster_id in range(n_clusters):
        cluster_chunks = [
            chunk for chunk, label in zip(chunks, cluster_labels)
            if label == cluster_id
        ]
        category_counts = Counter(c['category'] for c in cluster_chunks)

        print(f"\nCluster {cluster_id} ({len(cluster_chunks)} chunks):")
        print(f"  Categories: {dict(category_counts)}")
        if cluster_chunks:
            print(f"  Sample: {cluster_chunks[0]['text'][:60]}...")
elif embeddings:
    # A 2-component PCA cannot be fitted on a single sample
    print("⚠️ Need at least 2 chunks with embeddings to cluster; skipping.")

## 4. Knowledge Gaps Detection

In [None]:
# Concepts the knowledge base is expected to cover
key_concepts = [
    "installation",
    "configuration",
    "performance tuning",
    "security",
    "backup",
    "migration",
    "troubleshooting",
    "best practices",
    "API reference",
    "examples"
]

# One row per concept: rate coverage by the best vector-search score
coverage_results = []

for concept in key_concepts:
    hits = rag.vector_search(concept, k=5)

    if not hits:
        # Nothing came back for this concept at all
        row = {
            'Concept': concept,
            'Coverage': 'Missing',
            'Max Score': 0,
            'Avg Score': 0,
            'Matches': 0
        }
    else:
        avg_score = np.mean([hit['score'] for hit in hits])
        max_score = max([hit['score'] for hit in hits])

        # Bucket by the best score: > 0.7 Good, > 0.5 Partial, else Weak
        if max_score > 0.7:
            level = 'Good'
        elif max_score > 0.5:
            level = 'Partial'
        else:
            level = 'Weak'

        row = {
            'Concept': concept,
            'Coverage': level,
            'Max Score': max_score,
            'Avg Score': avg_score,
            'Matches': len(hits)
        }

    coverage_results.append(row)

# Tabular view of the coverage analysis
df_coverage = pd.DataFrame(coverage_results)
print("🔍 Knowledge Coverage Analysis:\n")
print(df_coverage.to_string(index=False, float_format='%.3f'))

# Two side-by-side charts: level counts (left) and per-concept scores (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Left: how many concepts fall into each coverage level
coverage_counts = df_coverage['Coverage'].value_counts()
colors = {'Good': 'green', 'Partial': 'yellow', 'Weak': 'orange', 'Missing': 'red'}
bar_colors = [colors.get(level, 'gray') for level in coverage_counts.index]
ax1.bar(coverage_counts.index, coverage_counts.values, color=bar_colors)
ax1.set_xlabel('Coverage Level')
ax1.set_ylabel('Number of Concepts')
ax1.set_title('Knowledge Coverage Distribution')

# Right: best score per concept with the two rating thresholds marked
ax2.barh(df_coverage['Concept'], df_coverage['Max Score'], color='steelblue')
ax2.axvline(0.7, color='green', linestyle='--', alpha=0.5, label='Good threshold')
ax2.axvline(0.5, color='orange', linestyle='--', alpha=0.5, label='Weak threshold')
ax2.set_xlabel('Maximum Similarity Score')
ax2.set_title('Concept Coverage Scores')
ax2.legend()

plt.tight_layout()
plt.show()

# Concepts rated Weak or Missing become content recommendations
needs_content = df_coverage['Coverage'].isin(['Weak', 'Missing'])
weak_concepts = df_coverage.loc[needs_content, 'Concept'].tolist()
if weak_concepts:
    print("\n⚠️ Recommended areas for content addition:")
    for concept in weak_concepts:
        print(f"  - {concept}")

## 5. Cross-Document Connections

In [None]:
# Find connections between documents based on shared topic keywords.
# NOTE: the two MATCH clauses are joined only by d1.id < d2.id, so the query
# evaluates every chunk pair of every document pair; it can be slow on large
# graphs.
with rag.driver.session() as session:
    result = session.run("""
        MATCH (d1:Document)-[:HAS_CHUNK]->(c1:Chunk)
        MATCH (d2:Document)-[:HAS_CHUNK]->(c2:Chunk)
        WHERE d1.id < d2.id
        WITH d1, d2, 
             COLLECT(DISTINCT CASE WHEN c1.text CONTAINS 'Neo4j' AND c2.text CONTAINS 'Neo4j' THEN 'Neo4j' END) +
             COLLECT(DISTINCT CASE WHEN c1.text CONTAINS 'graph' AND c2.text CONTAINS 'graph' THEN 'graph' END) +
             COLLECT(DISTINCT CASE WHEN c1.text CONTAINS 'database' AND c2.text CONTAINS 'database' THEN 'database' END) +
             COLLECT(DISTINCT CASE WHEN c1.text CONTAINS 'query' AND c2.text CONTAINS 'query' THEN 'query' END) as shared_topics
        WHERE SIZE([t IN shared_topics WHERE t IS NOT NULL]) > 0
        RETURN d1.source as doc1, d2.source as doc2, 
               [t IN shared_topics WHERE t IS NOT NULL] as topics,
               SIZE([t IN shared_topics WHERE t IS NOT NULL]) as connection_strength
        ORDER BY connection_strength DESC
        LIMIT 10
    """)

    connections = []       # rows for the printed table (names truncated)
    connection_edges = []  # (doc1, doc2, strength) with untruncated names
    for record in result:
        # A null source would break the slicing below; use a placeholder
        doc1 = record['doc1'] or '<no source>'
        doc2 = record['doc2'] or '<no source>'
        strength = record['connection_strength']

        connections.append({
            'Document 1': doc1[:30],
            'Document 2': doc2[:30],
            'Shared Topics': ', '.join(record['topics']),
            'Strength': strength
        })
        connection_edges.append((doc1, doc2, strength))

if connections:
    df_connections = pd.DataFrame(connections)
    print("🔗 Cross-Document Connections:\n")
    print(df_connections.to_string(index=False))

    # Create network visualization.
    # Nodes are keyed by the full source so that documents sharing the same
    # first 30 characters (common for file paths) stay separate nodes; only
    # the drawn label is shortened.
    G = nx.Graph()
    for doc1, doc2, strength in connection_edges:
        G.add_edge(doc1, doc2, weight=strength)

    plt.figure(figsize=(12, 8))
    # Fixed seed keeps the layout identical between runs
    pos = nx.spring_layout(G, k=1, iterations=50, seed=42)

    # Draw nodes with shortened labels
    nx.draw_networkx_nodes(G, pos, node_color='lightblue',
                           node_size=1000, alpha=0.7)
    node_labels = {node: node[:30] for node in G.nodes}
    nx.draw_networkx_labels(G, pos, labels=node_labels, font_size=8)

    # Draw edges with thickness proportional to connection strength
    edge_widths = [data['weight'] * 2 for _, _, data in G.edges(data=True)]
    nx.draw_networkx_edges(G, pos, width=edge_widths,
                           alpha=0.5, edge_color='gray')

    plt.title('Document Connection Network')
    plt.axis('off')
    plt.show()

## 6. Semantic Search Patterns

In [None]:
# Run one query through each retrieval strategy and compare what comes back
test_query = "How to optimize Neo4j performance?"

print(f"🔍 Query: '{test_query}'\n")

# Strategy 1: pure vector search
print("1️⃣ Vector Search Results:")
vector_results = rag.vector_search(test_query, k=3)
for rank, hit in enumerate(vector_results, start=1):
    preview = hit['text'][:80]
    print(f"   {rank}. Score: {hit['score']:.3f} - {preview}...")

# Strategy 2: hybrid search
print("\n2️⃣ Hybrid Search Results:")
hybrid_results = rag.hybrid_search(test_query, k=3)
for rank, hit in enumerate(hybrid_results, start=1):
    preview = hit['text'][:80]
    print(f"   {rank}. Score: {hit['score']:.3f} - {preview}...")

# Strategy 3: full RAG query through the engine
print("\n3️⃣ RAG Query with Answer Generation:")
response = engine.query(test_query)
answer_preview = response['answer'][:200]
print(f"   Answer: {answer_preview}...")
print(f"   Sources used: {len(response['sources'])}")

# Results are matched between strategies by their first 50 characters
vector_texts = {hit['text'][:50] for hit in vector_results}
hybrid_texts = {hit['text'][:50] for hit in hybrid_results}
overlap = vector_texts & hybrid_texts

only_vector = vector_texts - hybrid_texts
only_hybrid = hybrid_texts - vector_texts

print(f"\n📊 Search Strategy Comparison:")
print(f"   Vector unique results: {len(only_vector)}")
print(f"   Hybrid unique results: {len(only_hybrid)}")
print(f"   Overlapping results: {len(overlap)}")

## 7. Topic Distribution Analysis

In [None]:
# For each topic keyword, count matching chunks grouped by chunk index
topics_to_analyze = ['Neo4j', 'graph', 'database', 'vector', 'embedding', 'search']

# topic -> (list of chunk indices, list of counts), in query order
topic_distribution = {}

# Case-insensitive substring match; the keyword is passed as a parameter
topic_query = """
            MATCH (d:Document)-[:HAS_CHUNK]->(c:Chunk)
            WHERE toLower(c.text) CONTAINS toLower($topic)
            RETURN c.chunk_index as chunk_index, COUNT(*) as count
            ORDER BY chunk_index
        """

with rag.driver.session() as session:
    for topic in topics_to_analyze:
        rows = [
            (row['chunk_index'], row['count'])
            for row in session.run(topic_query, topic=topic)
        ]
        indices = [chunk_index for chunk_index, _ in rows]
        counts = [count for _, count in rows]
        topic_distribution[topic] = (indices, counts)

# One subplot per topic on a 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# zip stops at the shorter sequence, so topics beyond the grid are skipped
for ax, topic in zip(axes, topics_to_analyze):
    indices, counts = topic_distribution[topic]
    if not indices:
        ax.text(0.5, 0.5, 'No data', ha='center', va='center')
        ax.set_title(f'Topic: "{topic}"')
        continue

    ax.bar(indices, counts, color='teal', alpha=0.7)
    ax.set_xlabel('Chunk Index')
    ax.set_ylabel('Occurrences')
    ax.set_title(f'Topic: "{topic}"')
    ax.grid(axis='y', alpha=0.3)

plt.suptitle('Topic Distribution Across Document Chunks')
plt.tight_layout()
plt.show()

# Text summary for topics that matched at least one chunk
print("\n📊 Topic Coverage Summary:")
for topic in topics_to_analyze:
    indices, counts = topic_distribution[topic]
    if not counts:
        continue
    total = sum(counts)
    distinct = len(set(indices))
    print(f"  {topic}: {total} occurrences across {distinct} unique chunks")

## 8. Question Answering Capabilities

In [None]:
# (question type, question text) pairs covering different styles of query
questions = [
    ("Definition", "What is Neo4j?"),
    ("How-to", "How do I create a graph database?"),
    ("Comparison", "What's the difference between graph and relational databases?"),
    ("Best Practice", "What are best practices for Neo4j?"),
    ("Troubleshooting", "How to debug Neo4j queries?")
]

# One metrics row per question, collected for the summary table
qa_results = []

for q_type, question in questions:
    print(f"\n❓ [{q_type}] {question}")

    # Ask the engine, retrieving 3 chunks
    response = engine.query(question, k=3)

    # Simple response metrics
    answer = response['answer']
    answer_length = len(answer)
    sources_count = len(response['sources'])
    scores = response['relevance_scores']
    if scores:
        avg_relevance = np.mean(scores)
    else:
        # No scores reported: treat relevance as zero
        avg_relevance = 0

    print(f"💡 Answer: {answer[:150]}...")
    print(f"📊 Metrics: {sources_count} sources, {answer_length} chars, {avg_relevance:.3f} relevance")

    row = {
        'Type': q_type,
        'Question': question[:40],
        'Answer Length': answer_length,
        'Sources': sources_count,
        'Avg Relevance': avg_relevance
    }
    qa_results.append(row)

# Summary table
df_qa = pd.DataFrame(qa_results)
print("\n📊 Question Answering Performance:")
print(df_qa.to_string(index=False, float_format='%.3f'))

## 9. Content Recommendations

In [None]:
# Generate content recommendations based on current knowledge
def find_related_content(query_text, exclude_source=None, k=5):
    """Return up to ``k`` deduplicated vector-search hits for ``query_text``.

    Args:
        query_text: Text to search for.
        exclude_source: If truthy, hits whose ``metadata['source']`` equals
            this value are dropped.
        k: Maximum number of recommendations to return. Values <= 0 yield an
            empty list without running a search.

    Returns:
        A list of hit dicts in the order the search returned them. Two hits
        are treated as duplicates when the first 100 characters of their
        text match.
    """
    # Previously the length check only ran after the first append, so a
    # non-positive k could still return one item.
    if k <= 0:
        return []

    # Over-fetch so that filtering and deduplication can still fill k slots
    results = rag.vector_search(query_text, k=k * 2)

    recommendations = []
    seen_texts = set()

    for result in results:
        text_preview = result['text'][:100]
        if text_preview in seen_texts:
            continue

        # Tolerate hits with no metadata key or with metadata set to None
        source = (result.get('metadata') or {}).get('source')
        if exclude_source and source == exclude_source:
            continue

        seen_texts.add(text_preview)
        recommendations.append(result)
        if len(recommendations) >= k:
            break

    return recommendations

# Use a sample sentence as the reference and list the closest content
sample_text = "Neo4j uses Cypher query language for graph traversal"

print(f"📝 Reference Content: '{sample_text}'\n")
print("🎯 Recommended Related Content:\n")

recommendations = find_related_content(sample_text, k=5)

for rank, rec in enumerate(recommendations, start=1):
    content_preview = rec['text'][:100]
    print(f"{rank}. Relevance: {rec['score']:.3f}")
    print(f"   Content: {content_preview}...")

    # The source line is printed only when the hit carries one
    metadata = rec['metadata'] if 'metadata' in rec else {}
    if 'source' in metadata:
        print(f"   Source: {metadata['source']}")
    print()

## 10. Knowledge Graph Insights Export

In [None]:
# Generate a comprehensive insights report, print it, and save it as JSON.
import json

# Results from earlier cells are optional: in a notebook cell they live in the
# module globals, and any of them may be missing if its cell was skipped.
earlier_cells = globals()
content_gaps = list(earlier_cells.get('weak_concepts') or [])
sampled_chunks = earlier_cells.get('chunks') or []

discovery_date = pd.Timestamp.now().isoformat()

# Knowledge coverage
stats = rag.get_stats()

insights = {
    'discovery_date': discovery_date,
    'knowledge_coverage': {
        'total_documents': stats['documents'],
        'total_chunks': stats['chunks'],
        # max(..., 1) avoids dividing by zero on an empty graph
        'avg_chunks_per_doc': stats['chunks'] / max(stats['documents'], 1)
    },
    'topic_clusters': {},
    # Content gaps come from the coverage analysis, if it was run
    'content_gaps': content_gaps,
    'recommendations': []
}

# Topic clusters come from the clustering cell, if it was run
if 'n_clusters' in earlier_cells:
    insights['topic_clusters'] = {
        'number_of_clusters': earlier_cells['n_clusters'],
        'clustering_method': 'KMeans'
    }

# Recommendations
# Only the first 20 fetched chunks are inspected; a missing or null category
# counts as 'unknown'.
distinct_categories = {
    chunk.get('category') or 'unknown' for chunk in sampled_chunks[:20]
}

if content_gaps:
    gap_advice = "Add more content on: " + ", ".join(content_gaps[:3])
else:
    gap_advice = "Knowledge base is well-covered"

if 'examples' in content_gaps:
    examples_advice = "Consider adding more examples and troubleshooting guides"
else:
    examples_advice = "Example coverage is good"

insights['recommendations'] = [
    gap_advice,
    f"Current focus areas: {len(distinct_categories)} distinct categories",
    examples_advice
]

# Display insights
print("🎯 Knowledge Discovery Insights:\n")
print("📊 Coverage:")
for key, value in insights['knowledge_coverage'].items():
    print(f"   {key}: {value}")

print("\n🔍 Content Gaps:")
for gap in insights['content_gaps'][:5]:
    print(f"   - {gap}")

print("\n💡 Recommendations:")
for rec in insights['recommendations']:
    print(f"   • {rec}")

# Save insights; default=str stringifies any value json cannot encode natively
with open('knowledge_discovery_insights.json', 'w', encoding='utf-8') as f:
    json.dump(insights, f, indent=2, default=str)

print("\n✅ Insights saved to knowledge_discovery_insights.json")

## Cleanup

In [None]:
# Close connections
# Run this last: every earlier cell that queries the graph uses `rag`
# (directly or through `engine`).
# NOTE(review): presumably this closes the underlying Neo4j driver -- confirm
# in Neo4jRAG.close().
rag.close()
print("✅ Connections closed")

## Summary

This notebook provided deep insights into your knowledge graph:

- **Semantic Networks**: Discovered relationships between content
- **Topic Clustering**: Identified natural content groupings
- **Knowledge Gaps**: Found areas needing more documentation
- **Cross-Document Connections**: Mapped relationships between documents
- **Search Patterns**: Compared different search strategies
- **Content Recommendations**: Generated related content suggestions

### Key Takeaways:
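1. K-means groups the sampled chunks into clusters; check the cluster composition in Section 3 to see whether they line up with your key topics
2. Concepts rated "Weak" or "Missing" in Section 4 mark content gaps that could be filled for better coverage
3. Hybrid and vector search can return different results; Section 6 measures how much they overlap, not which strategy is more accurate
4. Documents that share topic keywords are linked in the connection network from Section 5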

### Next Steps:
- Review **05_query_optimization.ipynb** for performance tuning
- Add content to fill identified knowledge gaps
- Use insights to improve document organization