In [23]:
!pip install -r requirements.txt

Collecting tqdm (from -r requirements.txt (line 2))
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [1]:
import json
import re
from typing import List, Dict
from neo4j import GraphDatabase
from pathlib import Path
from tqdm import tqdm

class JSONToKGProcessor:
    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
        self.documents = {}
        self.chunks = []
        
    def close(self):
        self.driver.close()
    
    def clear_database(self):
        """Clear all existing data"""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            print("Database cleared successfully!")
    
    def test_connection(self):
        """Test Neo4j connection"""
        try:
            with self.driver.session() as session:
                result = session.run("RETURN 1 as test")
                print("✅ Neo4j connection successful!")
                return True
        except Exception as e:
            print(f"❌ Connection failed: {e}")
            return False
        finally:
            pass
    
    def load_json_files(self, json_dir: str):
        """Load all JSON files from directory"""
        json_path = Path(json_dir)
        self.documents.clear()  # Clear existing documents
        self.chunks.clear()     # Clear existing chunks
        
        for json_file in json_path.glob("*.json"):
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
                spec_id = data["metadata"]["specification_id"]
                self.documents[spec_id] = data
                self.chunks.extend(data["chunks"])
        print(f"Loaded {len(self.documents)} documents with {len(self.chunks)} chunks")
    
    def build_knowledge_graph(self):
        """Build complete knowledge graph"""
        with self.driver.session() as session:
            # Create constraints (only if they don't exist)
            try:
                session.run("CREATE CONSTRAINT doc_id IF NOT EXISTS FOR (d:Document) REQUIRE d.spec_id IS UNIQUE")
                session.run("CREATE CONSTRAINT chunk_id IF NOT EXISTS FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE")
            except Exception as e:
                print(f"Constraints may already exist: {e}")
            
            # Create document nodes
            self.create_document_nodes(session)
            
            # Create chunk nodes and relationships
            self.create_chunk_nodes(session)
            
            # Create cross-references
            self.create_cross_references(session)
            
            # Create indexes
            try:
                session.run("CREATE INDEX chunk_content IF NOT EXISTS FOR (c:Chunk) ON (c.content)")
                session.run("CREATE INDEX chunk_section IF NOT EXISTS FOR (c:Chunk) ON (c.section_id)")
            except Exception as e:
                print(f"Indexes may already exist: {e}")
    
    def create_document_nodes(self, session):
        """Create document nodes using MERGE"""
        print("Creating document nodes...")
        
        for spec_id, data in tqdm(self.documents.items()):
            session.run("""
                MERGE (d:Document {spec_id: $spec_id})
                ON CREATE SET
                    d.version = $version,
                    d.title = $title,
                    d.file_path = $file_path,
                    d.total_chunks = $total_chunks,
                    d.export_date = $export_date,
                    d.created_at = datetime()
                ON MATCH SET
                    d.version = $version,
                    d.title = $title,
                    d.file_path = $file_path,
                    d.total_chunks = $total_chunks,
                    d.export_date = $export_date,
                    d.updated_at = datetime()
            """, 
                spec_id=spec_id,
                version=data["metadata"]["version"],
                title=data["metadata"]["title"],
                file_path=data["metadata"]["file_path"],
                total_chunks=data["export_info"]["total_chunks"],
                export_date=data["export_info"]["export_date"]
            )
    
    def create_chunk_nodes(self, session):
        """Create chunk nodes using MERGE and CONTAINS relationships"""
        print("Creating chunk nodes...")
        
        for chunk in tqdm(self.chunks):
            # Extract spec_id from chunk_id (e.g., "ts_29.388_1" -> "ts_29.388")
            chunk_id_parts = chunk["chunk_id"].split("_")
            if len(chunk_id_parts) >= 3:
                spec_id = f"{chunk_id_parts[0]}_{chunk_id_parts[1]}.{chunk_id_parts[2]}"
            else:
                spec_id = chunk["chunk_id"]  # fallback
            
            # Create chunk node using MERGE
            session.run("""
                MERGE (c:Chunk {chunk_id: $chunk_id})
                ON CREATE SET 
                    c.section_id = $section_id,
                    c.section_title = $section_title,
                    c.content = $content,
                    c.chunk_type = $chunk_type,
                    c.spec_id = $spec_id,
                    c.created_at = datetime()
                ON MATCH SET
                    c.section_id = $section_id,
                    c.section_title = $section_title,
                    c.content = $content,
                    c.chunk_type = $chunk_type,
                    c.spec_id = $spec_id,
                    c.updated_at = datetime()
            """, 
                chunk_id=chunk["chunk_id"],
                section_id=chunk["section_id"],
                section_title=chunk["section_title"],
                content=chunk["content"],
                chunk_type=chunk["chunk_type"],
                spec_id=spec_id
            )
        
        # Create CONTAINS relationships
        print("Creating CONTAINS relationships...")
        session.run("""
            MATCH (d:Document), (c:Chunk)
            WHERE d.spec_id = c.spec_id
            MERGE (d)-[:CONTAINS]->(c)
        """)
    
    def create_cross_references(self, session):
        """Create cross-reference relationships"""
        print("Creating cross-references...")
        
        for chunk in tqdm(self.chunks):
            if chunk["cross_references"]:
                source_chunk_id = chunk["chunk_id"]
                
                for ref_spec in chunk["cross_references"]:
                    # Create reference relationships to documents
                    session.run("""
                        MATCH (source:Chunk {chunk_id: $source_id})
                        MATCH (target_doc:Document {spec_id: $target_spec})
                        MERGE (source)-[:REFERENCES {
                            ref_type: 'spec_reference',
                            target_spec: $target_spec
                        }]->(target_doc)
                    """, source_id=source_chunk_id, target_spec=ref_spec)
                    
                    # Also create references to specific chunks if they exist
                    session.run("""
                        MATCH (source:Chunk {chunk_id: $source_id})
                        MATCH (target_chunk:Chunk)
                        WHERE target_chunk.spec_id = $target_spec
                        MERGE (source)-[:REFERENCES_CHUNK {
                            ref_type: 'chunk_reference',
                            target_spec: $target_spec
                        }]->(target_chunk)
                    """, source_id=source_chunk_id, target_spec=ref_spec)
    
    def query_cross_references(self, topic: str, max_results: int = 10) -> List[Dict]:
        """Query for cross-references related to a topic"""
        with self.driver.session() as session:
            # Find primary chunks about the topic
            primary_query = """
            MATCH (c:Chunk)
            WHERE toLower(c.content) CONTAINS toLower($topic)
            RETURN c.chunk_id as chunk_id, c.section_title as title, 
                   c.content as content, c.spec_id as spec_id
            ORDER BY size(c.content) DESC
            LIMIT $limit
            """
            
            primary_results = session.run(primary_query, topic=topic, limit=max_results)
            primary_chunks = [dict(record) for record in primary_results]
            
            if not primary_chunks:
                return []
            
            # For each primary chunk, find references
            enhanced_results = []
            for chunk in primary_chunks:
                ref_query = """
                MATCH (primary:Chunk {chunk_id: $chunk_id})
                MATCH (primary)-[:REFERENCES_CHUNK]->(ref_chunk:Chunk)
                WHERE toLower(ref_chunk.content) CONTAINS toLower($topic)
                RETURN ref_chunk.chunk_id as ref_chunk_id, 
                       ref_chunk.section_title as ref_title,
                       ref_chunk.content as ref_content,
                       ref_chunk.spec_id as ref_spec_id
                LIMIT 3
                """
                
                ref_results = session.run(ref_query, chunk_id=chunk["chunk_id"], topic=topic)
                references = [dict(record) for record in ref_results]
                
                enhanced_results.append({
                    "primary": chunk,
                    "references": references
                })
            
            return enhanced_results
    
    def get_section_references(self, spec_id: str, section_id: str) -> Dict:
        """Get all references for a specific section"""
        with self.driver.session() as session:
            query = """
            MATCH (c:Chunk {spec_id: $spec_id, section_id: $section_id})
            MATCH (c)-[:REFERENCES]->(ref_doc:Document)
            MATCH (ref_doc)-[:CONTAINS]->(ref_chunk:Chunk)
            RETURN c.chunk_id as source_chunk,
                   c.section_title as source_title,
                   ref_chunk.chunk_id as ref_chunk,
                   ref_chunk.section_title as ref_title,
                   ref_chunk.spec_id as ref_spec
            """
            
            results = session.run(query, spec_id=spec_id, section_id=section_id)
            return [dict(record) for record in results]
    
    def get_statistics(self):
        """Get database statistics"""
        with self.driver.session() as session:
            stats = {}
            
            # Count nodes by type
            result = session.run("""
                MATCH (n) 
                RETURN labels(n)[0] as label, count(n) as count
            """)
            
            for record in result:
                stats[record['label']] = record['count']
            
            # Count relationships
            rel_result = session.run("""
                MATCH ()-[r]->() 
                RETURN type(r) as rel_type, count(r) as count
            """)
            
            stats['relationships'] = {}
            for record in rel_result:
                stats['relationships'][record['rel_type']] = record['count']
            
            return stats
    
    def process_json_to_kg(self, json_dir: str, clear_first: bool = True):
        """Complete pipeline from JSON to Knowledge Graph"""
        # Test connection first
        if not self.test_connection():
            return False
        
        # Clear database if requested
        if clear_first:
            print("Clearing existing database...")
            self.clear_database()
            
        print("Loading JSON files...")
        self.load_json_files(json_dir)
        
        print("Building knowledge graph...")
        self.build_knowledge_graph()
        
        print("Knowledge graph built successfully!")
        
        # Show statistics
        stats = self.get_statistics()
        print(f"\nDatabase Statistics:")
        for node_type, count in stats.items():
            if node_type != 'relationships':
                print(f"  {node_type}: {count} nodes")
        
        print("  Relationships:")
        for rel_type, count in stats.get('relationships', {}).items():
            print(f"    {rel_type}: {count}")
        
        return True

In [2]:
# Use the corrected processor from the artifact above
processor = JSONToKGProcessor(
    neo4j_uri="neo4j://localhost:7687",
    neo4j_user="neo4j",
    neo4j_password="password"
)

try:
    # This will clear database first, then process
    processor.process_json_to_kg("/home/linguyen/3GPP/3GPP_JSON_DOC/demo/", True)
    
    # Example queries
    results = processor.query_cross_references("AMF", max_results=5)
    print(f"Found {len(results)} cross-references for 'AMF'")
    
finally:
    processor.close()

✅ Neo4j connection successful!
Clearing existing database...
Database cleared successfully!
Loading JSON files...
Loaded 68 documents with 13030 chunks
Building knowledge graph...
Creating document nodes...


100%|██████████| 68/68 [00:01<00:00, 45.05it/s]


Creating chunk nodes...


100%|██████████| 13030/13030 [01:05<00:00, 198.02it/s]


Creating CONTAINS relationships...
Creating cross-references...


100%|██████████| 13030/13030 [07:10<00:00, 30.24it/s] 


Knowledge graph built successfully!

Database Statistics:
  Document: 68 nodes
  Chunk: 13024 nodes
  Relationships:
    REFERENCES: 6087
Found 5 cross-references for 'AMF'


In [7]:
processor = JSONToKGProcessor(
    neo4j_uri="neo4j://localhost:7687",
    neo4j_user="neo4j",
    neo4j_password="password")
results = processor.query_cross_references("AMF", max_results=5)
print(f"Found {len(results)} cross-references for 'AMF'")
print(results[0])  # Display the results of the query

Found 5 cross-references for 'AMF'
{'primary': {'chunk_id': 'ts_29.503_6.10.9', 'title': 'Security', 'content': 'As indicated in 3GPP\xa0TS\xa033.501\xa0[6] and 3GPP\xa0TS\xa029.500\xa0[4], the access to the Nudm_UEID API may be authorized by means of the OAuth2 protocol (see IETF\xa0RFC\xa06749\xa0[18]), based on local configuration, using the "Client Credentials" authorization grant, where the NRF (see 3GPP\xa0TS\xa029.510\xa0[19]) plays the role of the authorization server.\nIf OAuth2 is used, an NF Service Consumer, prior to consuming services offered by the Nudm_UEID API, shall obtain a "token" from the authorization server, by invoking the Access Token Request service, as described in 3GPP\xa0TS\xa029.510\xa0[19], clause 5.4.2.2.\nNOTE:\tWhen multiple NRFs are deployed in a network, the NRF used as authorization server is the same NRF that the NF Service Consumer used for discovering the Nudm_UEID service.\nThe Nudm_UEID API defines a single scope "nudm-ueid" for OAuth2 authoriza