# 3GPP Knowledge Graph Builder v2 - Simple Version

This is a simplified version that completely avoids null property issues.

In [None]:
!pip install neo4j tqdm

In [None]:
import json
import re
from typing import List, Dict, Optional
from neo4j import GraphDatabase
from pathlib import Path
from tqdm import tqdm
import hashlib

class SimpleKGProcessorV2:
    """Build a Neo4j knowledge graph from exported 3GPP JSON documents.

    The graph holds ``Document`` and ``Chunk`` nodes joined by ``CONTAINS``
    relationships, plus one ``REFERENCES_SPEC`` relationship from a chunk to
    each loaded document it cites through an external cross-reference.
    """

    def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
        """Open a Neo4j driver and start with empty in-memory state."""
        self.driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))
        self.documents = {}  # specification_id -> parsed JSON document
        self.chunks = []  # chunk dicts collected from every loaded document

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def clear_database(self):
        """Delete every node and relationship in the database."""
        with self.driver.session() as session:
            session.run("MATCH (n) DETACH DELETE n")
            print("Database cleared successfully!")

    def test_connection(self):
        """Run a trivial query; return True on success, False on any error."""
        try:
            with self.driver.session() as session:
                session.run("RETURN 1 as test")
                print("✅ Neo4j connection successful!")
                return True
        except Exception as e:
            print(f"❌ Connection failed: {e}")
            return False

    def load_json_files(self, json_dir: str):
        """Replace the in-memory documents and chunks with those in ``json_dir``.

        Every ``*.json`` file directly inside ``json_dir`` is read. Each file
        must provide ``metadata.specification_id`` and ``chunks``; a missing
        key raises ``KeyError``.
        """
        json_path = Path(json_dir)
        self.documents.clear()
        self.chunks.clear()

        # glob() order depends on the filesystem; sorting makes repeated runs
        # load documents and chunks in the same order.
        for json_file in sorted(json_path.glob("*.json")):
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            spec_id = data["metadata"]["specification_id"]
            self.documents[spec_id] = data
            self.chunks.extend(data["chunks"])
        print(f"Loaded {len(self.documents)} documents with {len(self.chunks)} chunks")

    def build_knowledge_graph(self):
        """Create constraints, then documents, chunks and external references."""
        constraint_statements = (
            "CREATE CONSTRAINT doc_id IF NOT EXISTS FOR (d:Document) REQUIRE d.spec_id IS UNIQUE",
            "CREATE CONSTRAINT chunk_id IF NOT EXISTS FOR (c:Chunk) REQUIRE c.chunk_id IS UNIQUE",
        )
        with self.driver.session() as session:
            # Constraint creation stays best-effort, but each statement is
            # attempted on its own and a failure is reported instead of being
            # silently swallowed.
            for statement in constraint_statements:
                try:
                    session.run(statement)
                except Exception as e:
                    print(f"Warning: could not create constraint: {e}")

            # Create documents
            self.create_documents(session)

            # Create chunks
            self.create_chunks(session)

            # Create simple references - NO null properties ever
            self.create_simple_references(session)

    def create_documents(self, session):
        """Merge one ``Document`` node per loaded specification."""
        print("Creating documents...")
        for spec_id, data in tqdm(self.documents.items()):
            session.run("""
                MERGE (d:Document {spec_id: $spec_id})
                SET d.version = $version,
                    d.title = $title,
                    d.total_chunks = $total_chunks
            """,
                spec_id=spec_id,
                version=data["metadata"]["version"],
                title=data["metadata"]["title"],
                total_chunks=data["export_info"]["total_chunks"]
            )

    def create_chunks(self, session):
        """Merge one ``Chunk`` node per loaded chunk and link it to its document."""
        print("Creating chunks...")
        for chunk in tqdm(self.chunks):
            # The owning spec is rebuilt from the first three "_"-separated
            # parts of the chunk id as "<p0>_<p1>.<p2>"; shorter ids are used
            # unchanged.
            chunk_id_parts = chunk["chunk_id"].split("_")
            if len(chunk_id_parts) >= 3:
                spec_id = f"{chunk_id_parts[0]}_{chunk_id_parts[1]}.{chunk_id_parts[2]}"
            else:
                spec_id = chunk["chunk_id"]

            # `or {}` also covers an explicit JSON null, which .get() alone
            # would pass through as None.
            content_meta = chunk.get("content_metadata") or {}

            session.run("""
                MERGE (c:Chunk {chunk_id: $chunk_id})
                SET c.section_id = $section_id,
                    c.section_title = $section_title,
                    c.content = $content,
                    c.chunk_type = $chunk_type,
                    c.spec_id = $spec_id,
                    c.word_count = $word_count,
                    c.complexity_score = $complexity_score,
                    c.key_terms = $key_terms
            """,
                chunk_id=chunk["chunk_id"],
                section_id=chunk["section_id"],
                section_title=chunk["section_title"],
                content=chunk["content"],
                chunk_type=chunk["chunk_type"],
                spec_id=spec_id,
                word_count=content_meta.get("word_count", 0),
                complexity_score=content_meta.get("complexity_score", 0.0),
                key_terms=content_meta.get("key_terms", [])
            )

        # Create CONTAINS relationships
        print("Creating CONTAINS relationships...")
        session.run("""
            MATCH (d:Document), (c:Chunk)
            WHERE d.spec_id = c.spec_id
            MERGE (d)-[:CONTAINS]->(c)
        """)

    def create_simple_references(self, session):
        """Create a ``REFERENCES_SPEC`` relationship for every external cross-reference."""
        print("Creating references...")

        for chunk in tqdm(self.chunks):
            source_chunk_id = chunk["chunk_id"]
            # Tolerate both a missing key and an explicit JSON null.
            cross_refs = chunk.get("cross_references") or {}

            for ref in cross_refs.get("external") or []:
                self.create_safe_external_reference(session, source_chunk_id, ref)

    def create_safe_external_reference(self, session, source_chunk_id: str, ref: dict):
        """Merge one ``REFERENCES_SPEC`` relationship from a chunk to a document.

        Nothing is written when the source chunk or the target document is
        absent, because the ``MATCH`` clauses then yield no rows. The
        relationship is merged on ``ref_uid``, so running the build again (or
        a repeated reference in the data) updates the existing relationship
        instead of adding a duplicate.

        Args:
            session: Open Neo4j session used to run the query.
            source_chunk_id: ``chunk_id`` of the referencing chunk.
            ref: Reference dict; must contain ``target_spec``, ``ref_id``,
                ``ref_type`` and ``confidence``. ``target_item`` is optional.

        Raises:
            KeyError: If a required key is missing from ``ref``.
        """
        target_spec = ref["target_spec"]
        ref_id = ref["ref_id"]

        # Short, stable identifier derived from the reference's own fields.
        ref_uid = hashlib.md5(
            f"{source_chunk_id}_{target_spec}_{ref_id}".encode()
        ).hexdigest()[:10]

        props = {
            "ref_id": ref_id,
            "ref_type": ref["ref_type"],
            "confidence": ref["confidence"],
            "is_external": True,
        }

        # target_item is stored only when it is a non-blank string other than
        # the literal "null"; any other value is left off the relationship.
        target_item = ref.get("target_item")
        if isinstance(target_item, str) and target_item.strip() and target_item != "null":
            props["target_item"] = target_item

        # A single query replaces the former existence check, CREATE and
        # follow-up SET (which scanned every REFERENCES_SPEC relationship).
        session.run("""
            MATCH (source:Chunk {chunk_id: $source_id})
            MATCH (target_doc:Document {spec_id: $target_spec})
            MERGE (source)-[r:REFERENCES_SPEC {ref_uid: $ref_uid}]->(target_doc)
            SET r += $props
        """,
            source_id=source_chunk_id,
            target_spec=target_spec,
            ref_uid=ref_uid,
            props=props
        )

    def get_statistics(self):
        """Return node counts keyed by first label, plus a ``relationships`` dict.

        The returned dict maps each node's first label to its count and holds
        relationship counts per type under the ``'relationships'`` key.
        """
        with self.driver.session() as session:
            stats = {}

            # Count nodes
            result = session.run("""
                MATCH (n) 
                RETURN labels(n)[0] as label, count(n) as count
            """)

            for record in result:
                stats[record['label']] = record['count']

            # Count relationships
            rel_result = session.run("""
                MATCH ()-[r]->() 
                RETURN type(r) as rel_type, count(r) as count
            """)

            stats['relationships'] = {}
            for record in rel_result:
                stats['relationships'][record['rel_type']] = record['count']

            return stats

    def process_json_to_kg_simple(self, json_dir: str, clear_first: bool = True):
        """Run the whole pipeline: connect, optionally clear, load, build, report.

        Args:
            json_dir: Directory holding the exported JSON files.
            clear_first: When true, wipe the database before building.

        Returns:
            False if the connection test fails, otherwise True.
        """
        if not self.test_connection():
            return False

        if clear_first:
            print("Clearing existing database...")
            self.clear_database()

        print("Loading JSON v2 files...")
        self.load_json_files(json_dir)

        print("Building simple knowledge graph...")
        self.build_knowledge_graph()

        print("Simple knowledge graph built successfully!")

        # Show statistics
        stats = self.get_statistics()
        print("\nDatabase Statistics:")
        for node_type, count in stats.items():
            if node_type != 'relationships':
                print(f"  {node_type}: {count} nodes")

        print("  Relationships:")
        for rel_type, count in stats.get('relationships', {}).items():
            print(f"    {rel_type}: {count}")

        return True

print("Simple KG Processor V2 loaded successfully!")

## Build Simple Knowledge Graph

In [None]:
# Build the graph end to end; the driver is closed even when the build raises.
processor = SimpleKGProcessorV2(
    "neo4j://localhost:7687",
    "neo4j",
    "password",
)

try:
    # Clear the database, load the v2 JSON export and build the graph.
    success = processor.process_json_to_kg_simple(
        json_dir="/home/linguyen/3GPP/3GPP_JSON_DOC/processed_json_v2/",
        clear_first=True,
    )

    if success:
        # One print call emits the same banner lines, newline-separated.
        print(
            "\n" + "=" * 50,
            "Simple Knowledge Graph built successfully!",
            "This version completely avoids null property issues.",
            "=" * 50,
            sep="\n",
        )

finally:
    processor.close()

## Test the Simple Knowledge Graph

In [None]:
# Smoke-test the graph with two read-only queries.
processor = SimpleKGProcessorV2(
    neo4j_uri="neo4j://localhost:7687",
    neo4j_user="neo4j",
    neo4j_password="password"
)

try:
    with processor.driver.session() as session:
        print("Testing simple queries:")
        print("="*30)

        # Find chunks whose content mentions SCP (case-insensitive)
        result = session.run("""
            MATCH (c:Chunk)
            WHERE toLower(c.content) CONTAINS 'scp'
            RETURN c.spec_id as spec, c.section_title as title, c.chunk_type as type
            LIMIT 5
        """)

        print("\nChunks containing 'SCP':")
        for record in result:
            print(f"  {record['spec']} - {record['title']} ({record['type']})")

        # Find references. Cypher sorts null first under ORDER BY ... DESC, and
        # a null confidence cannot be rendered with the :.2f format below, so
        # relationships without a confidence are excluded in the query.
        result = session.run("""
            MATCH (source:Chunk)-[r:REFERENCES_SPEC]->(target:Document)
            WHERE r.confidence IS NOT NULL
            RETURN source.spec_id as from_spec, target.spec_id as to_spec, r.confidence as confidence
            ORDER BY confidence DESC
            LIMIT 5
        """)

        print("\nTop references by confidence:")
        for record in result:
            print(f"  {record['from_spec']} → {record['to_spec']} (confidence: {record['confidence']:.2f})")

finally:
    processor.close()

print("\n✅ Simple Knowledge Graph is working perfectly!")
print("No null property errors - completely safe approach.")

## Fix Missing REFERENCES_CHUNK Relationships

This section adds the missing REFERENCES_CHUNK relationships based on internal cross-references from the JSON data.

In [None]:
class ReferencesFixer:
    """Backfill REFERENCES_CHUNK relationships from the JSON internal cross-references."""

    def __init__(self, neo4j_uri: str = "neo4j://localhost:7687", 
                 neo4j_user: str = "neo4j", neo4j_password: str = "password"):
        """Open a Neo4j driver and start with an empty chunk lookup."""
        credentials = (neo4j_user, neo4j_password)
        self.driver = GraphDatabase.driver(neo4j_uri, auth=credentials)
        self.chunks_data = {}  # chunk_id -> chunk dict read from the JSON files

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def load_processed_json(self, json_dir: str):
        """Index every chunk found in the ``*.json`` files of ``json_dir`` by chunk id.

        A file that cannot be read or parsed is reported and skipped.
        """
        for source_file in Path(json_dir).glob("*.json"):
            try:
                with open(source_file, 'r', encoding='utf-8') as handle:
                    document = json.load(handle)

                for entry in document.get("chunks", []):
                    self.chunks_data[entry["chunk_id"]] = entry

            except Exception as e:
                print(f"Error loading {source_file}: {e}")

        print(f"Loaded {len(self.chunks_data)} chunks from JSON files")

    def add_missing_chunk_references(self):
        """Walk all loaded chunks and link each one to its internal reference targets."""
        print("Adding missing REFERENCES_CHUNK relationships...")

        with self.driver.session() as session:
            total_linked = 0

            for source_id, chunk in tqdm(self.chunks_data.items()):
                for ref in chunk.get("cross_references", {}).get("internal", []):
                    target_id = ref.get("target_chunk_id")
                    # References without a target chunk id cannot be linked.
                    if not target_id:
                        continue
                    if self._create_chunk_reference(session, source_id, target_id, ref):
                        total_linked += 1

            print(f"Added {total_linked} REFERENCES_CHUNK relationships")

    def _create_chunk_reference(self, session, source_chunk_id: str, 
                               target_chunk_id: str, ref_data: dict) -> bool:
        """Merge a REFERENCES_CHUNK relationship between two existing chunks.

        Returns True once the merge query has run, and False when either chunk
        is missing or any error occurs (the error is printed).
        """
        try:
            ref_id = ref_data.get('ref_id', '')

            # Short identifier derived from both chunk ids and the reference id.
            digest = hashlib.md5(f"{source_chunk_id}_{target_chunk_id}_{ref_id}".encode())
            ref_uid = digest.hexdigest()[:10]

            # Both endpoints must already be present in the graph.
            both_present = session.run("""
                MATCH (source:Chunk {chunk_id: $source_id})
                MATCH (target:Chunk {chunk_id: $target_id})
                RETURN count(*) as exists
            """, source_id=source_chunk_id, target_id=target_chunk_id).single()["exists"]

            if both_present == 0:
                return False

            # MERGE keeps the relationship unique per ref_uid.
            session.run("""
                MATCH (source:Chunk {chunk_id: $source_id})
                MATCH (target:Chunk {chunk_id: $target_id})
                MERGE (source)-[r:REFERENCES_CHUNK {ref_uid: $ref_uid}]->(target)
                SET r.ref_id = $ref_id,
                    r.ref_type = $ref_type,
                    r.confidence = $confidence,
                    r.is_internal = true
            """, 
                source_id=source_chunk_id,
                target_id=target_chunk_id,
                ref_uid=ref_uid,
                ref_id=ref_id,
                ref_type=ref_data.get("ref_type", "internal"),
                confidence=ref_data.get("confidence", 0.8)
            )

            return True

        except Exception as e:
            print(f"Error creating reference {source_chunk_id} -> {target_chunk_id}: {e}")
            return False

    def verify_references(self):
        """Print both reference counts; return True if any REFERENCES_CHUNK exists."""
        with self.driver.session() as session:
            chunk_ref_count = session.run("""
                MATCH ()-[r:REFERENCES_CHUNK]->()
                RETURN count(r) as chunk_ref_count
            """).single()["chunk_ref_count"]

            spec_ref_count = session.run("""
                MATCH ()-[r:REFERENCES_SPEC]->()
                RETURN count(r) as spec_ref_count
            """).single()["spec_ref_count"]

            print("Database relationship counts:")
            print(f"  REFERENCES_CHUNK: {chunk_ref_count}")
            print(f"  REFERENCES_SPEC: {spec_ref_count}")

            return chunk_ref_count > 0

    def fix_missing_references(self, json_dir: str = "/home/linguyen/3GPP/3GPP_JSON_DOC/processed_json_v2/"):
        """Load the JSON files, add the relationships, then verify and report."""
        print("Loading processed JSON files...")
        self.load_processed_json(json_dir)

        print("Adding missing REFERENCES_CHUNK relationships...")
        self.add_missing_chunk_references()

        print("Verifying references...")
        success = self.verify_references()

        print(
            "✅ Missing REFERENCES_CHUNK relationships added successfully!"
            if success
            else "⚠️ No REFERENCES_CHUNK relationships found - check internal refs in JSON data"
        )

        return success

print("ReferencesFixer class loaded successfully!")

In [None]:
# Backfill REFERENCES_CHUNK relationships; the driver is closed even on failure.
fixer = ReferencesFixer(
    "neo4j://localhost:7687",
    "neo4j",
    "password",
)

try:
    success = fixer.fix_missing_references(
        json_dir="/home/linguyen/3GPP/3GPP_JSON_DOC/processed_json_v2/",
    )

    if success:
        # One print call emits the same banner lines, newline-separated.
        print(
            "\n" + "=" * 50,
            "✅ Database references fixed successfully!",
            "The REFERENCES_CHUNK relationships are now available.",
            "=" * 50,
            sep="\n",
        )
finally:
    fixer.close()

## Create Term Nodes from Abbreviations

This section extracts abbreviations and definitions from document chunks and creates Term nodes in the knowledge graph. Term nodes help resolve abbreviations like "SCP" to their full names like "Service Communication Proxy" and track which specs define them.

In [None]:
import sys
sys.path.insert(0, '/home/linguyen/3GPP')
from term_extractor import TermExtractor, ExtractedTerm

class TermNodeBuilder:
    """Create Term nodes in Neo4j from abbreviation and definition chunks."""

    def __init__(self, neo4j_uri: str = "neo4j://localhost:7687", 
                 neo4j_user: str = "neo4j", neo4j_password: str = "password"):
        """Open a Neo4j driver and prepare an extractor and an empty term dictionary."""
        credentials = (neo4j_user, neo4j_password)
        self.driver = GraphDatabase.driver(neo4j_uri, auth=credentials)
        self.extractor = TermExtractor()
        self.term_dict = {}  # abbreviation -> consolidated term record

    def close(self):
        """Close the underlying Neo4j driver."""
        self.driver.close()

    def extract_terms_from_chunks(self, json_dir: str):
        """Collect terms from every abbreviation/definition chunk under ``json_dir``.

        A file that fails at any point is reported and skipped. Returns the
        consolidated term dictionary.
        """
        print("Extracting terms from JSON files...")
        candidate_files = list(Path(json_dir).glob("*.json"))

        for source_file in tqdm(candidate_files):
            try:
                with open(source_file, 'r', encoding='utf-8') as handle:
                    document = json.load(handle)

                spec_id = document["metadata"]["specification_id"]

                for chunk in document.get("chunks", []):
                    heading = chunk.get("section_title", "").lower()
                    body = chunk.get("content", "")

                    # The section title picks the extractor; 'abbreviation'
                    # wins when a title contains both words.
                    if 'abbreviation' in heading:
                        extract = self.extractor.extract_abbreviations
                    elif 'definition' in heading:
                        extract = self.extractor.extract_definitions
                    else:
                        continue

                    self._merge_terms(extract(body, spec_id))

            except Exception as e:
                print(f"Error processing {source_file}: {e}")

        print(f"Extracted {len(self.term_dict)} unique terms")
        return self.term_dict

    def _merge_terms(self, terms: list):
        """Fold ``terms`` into the consolidated dictionary, keyed by abbreviation.

        The first occurrence of an abbreviation fixes its name, type and
        primary spec; later occurrences only add new source specs.
        """
        for term in terms:
            key = term.abbreviation
            known = self.term_dict.get(key)

            if known is None:
                self.term_dict[key] = {
                    'abbreviation': key,
                    'full_name': term.full_name,
                    'term_type': term.term_type,
                    'source_specs': [term.source_spec],
                    'primary_spec': term.source_spec
                }
                continue

            # Already known: record the spec only if it is new for this term.
            if term.source_spec not in known['source_specs']:
                known['source_specs'].append(term.source_spec)

    def create_term_constraint(self, session):
        """Request a uniqueness constraint on ``Term.abbreviation``; report any failure."""
        try:
            session.run("CREATE CONSTRAINT term_abbr IF NOT EXISTS FOR (t:Term) REQUIRE t.abbreviation IS UNIQUE")
        except Exception as e:
            print(f"Constraint may already exist: {e}")
        else:
            print("Term constraint created")

    def create_term_nodes(self):
        """Write every consolidated term to Neo4j and link it to its source documents."""
        print("Creating Term nodes in Neo4j...")

        with self.driver.session() as session:
            self.create_term_constraint(session)

            written = 0
            for abbr, term_data in tqdm(self.term_dict.items()):
                try:
                    # Upsert the Term node itself.
                    session.run("""
                        MERGE (t:Term {abbreviation: $abbr})
                        SET t.full_name = $full_name,
                            t.term_type = $term_type,
                            t.source_specs = $source_specs,
                            t.primary_spec = $primary_spec
                    """,
                        abbr=abbr,
                        full_name=term_data['full_name'],
                        term_type=term_data['term_type'],
                        source_specs=term_data['source_specs'],
                        primary_spec=term_data['primary_spec']
                    )

                    # One DEFINED_IN link per spec that has a Document node.
                    for spec_id in term_data['source_specs']:
                        session.run("""
                            MATCH (t:Term {abbreviation: $abbr})
                            MATCH (d:Document {spec_id: $spec_id})
                            MERGE (t)-[:DEFINED_IN]->(d)
                        """,
                            abbr=abbr,
                            spec_id=spec_id
                        )

                    written += 1

                except Exception as e:
                    print(f"Error creating term '{abbr}': {e}")

            print(f"Created {written} Term nodes")

    def verify_terms(self):
        """Print term statistics and a few sample terms; return True if any Term exists."""
        with self.driver.session() as session:
            term_count = session.run("MATCH (t:Term) RETURN count(t) as count").single()["count"]
            rel_count = session.run("MATCH ()-[r:DEFINED_IN]->() RETURN count(r) as count").single()["count"]

            # Look up a handful of well-known abbreviations as a spot check.
            samples = session.run("""
                MATCH (t:Term)
                WHERE t.abbreviation IN ['SCP', 'AMF', 'SMF', 'UPF', 'PCF']
                RETURN t.abbreviation as abbr, t.full_name as full_name, 
                       t.source_specs as specs
                ORDER BY t.abbreviation
            """)

            print("\nTerm Node Statistics:")
            print(f"  Total Term nodes: {term_count}")
            print(f"  DEFINED_IN relationships: {rel_count}")
            print("\nSample 5G Core NF Terms:")
            for row in samples:
                all_specs = row['specs']
                # Show at most three specs and mark the list as truncated.
                shown = all_specs[:3]
                suffix = '...' if len(all_specs) > 3 else ''
                print(f"  {row['abbr']}: {row['full_name']}")
                print(f"    Defined in: {', '.join(shown)}{suffix}")

            return term_count > 0

    def build_term_nodes(self, json_dir: str = "/home/linguyen/3GPP/3GPP_JSON_DOC/processed_json_v2/"):
        """Extract terms, write them to Neo4j, verify, and return the verification result."""
        banner = "=" * 50
        print(banner)
        print("Building Term Nodes from Abbreviations/Definitions")
        print(banner)

        self.extract_terms_from_chunks(json_dir)
        self.create_term_nodes()
        success = self.verify_terms()

        print(
            "\n✅ Term nodes created successfully!"
            if success
            else "\n⚠️ No Term nodes created - check JSON files"
        )

        return success

print("TermNodeBuilder class loaded successfully!")

In [None]:
# Extract terms and write Term nodes; the driver is closed even on failure.
term_builder = TermNodeBuilder(
    "neo4j://localhost:7687",
    "neo4j",
    "password",
)

try:
    success = term_builder.build_term_nodes(
        json_dir="/home/linguyen/3GPP/3GPP_JSON_DOC/processed_json_v2/",
    )

    if success:
        # One print call emits the same banner lines, newline-separated.
        print(
            "\n" + "=" * 50,
            "✅ Term nodes added to Knowledge Graph!",
            "You can now query terms like:",
            "  MATCH (t:Term {abbreviation: 'SCP'}) RETURN t",
            "=" * 50,
            sep="\n",
        )
finally:
    term_builder.close()