# Custom Knowledge Graph Building and Entity Resolution Pipeline

This notebook demonstrates:
1. Building a knowledge graph from ACLED data using the CustomKGPipeline
2. Performing entity resolution using embedding-based similarity matching
3. Merging similar entities to create a cleaner knowledge graph

In [1]:
import sys
import os

# Add the parent directory (graphrag_pipeline) to the Python path (needed for importing
# modules in parent directory)
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Utilities
import asyncio
from dotenv import load_dotenv, find_dotenv
import os
import json
from google import genai
import polars as pl
import numpy as np
from library.kg_builder import CustomKGPipeline, build_kg_from_df
from library.kg_builder.utilities import GeminiLLM
from neo4j_graphrag.experimental.components.resolver import (
    SpaCySemanticMatchResolver, FuzzyMatchResolver, SinglePropertyExactMatchResolver
)
import tqdm.notebook as tqdm

# Neo4j and Neo4j GraphRAG imports
import neo4j
from neo4j_graphrag.embeddings import SentenceTransformerEmbeddings

In [2]:
load_dotenv(find_dotenv(), override=True)

gemini_api_key = os.getenv('GEMINI_API_KEY')
if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY environment variable is not set.")

print("✓ Gemini API key loaded successfully")

✓ Gemini API key loaded successfully


## Setup Requirements

Ensure the SpaCy model for entity resolution is installed:

In [3]:
import importlib.util
import subprocess
import sys
import spacy


def ensure_spacy_model(model_name):
    """Ensure SpaCy model is installed, install if not present."""
    if importlib.util.find_spec(model_name) is None:
        print(f"Installing SpaCy model: {model_name}...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
    else:
        print(f"✓ SpaCy model '{model_name}' is available")

# Install required model for entity resolution
ensure_spacy_model("en_core_web_lg")

✓ SpaCy model 'en_core_web_lg' is available


In [4]:
import os
import json

# Load configuration
config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')
with open(os.path.join(config_files_path, 'kg_building_config.json'), 'r') as f:
    config = json.load(f)

print("✓ Configuration loaded successfully")

✓ Configuration loaded successfully


# 1. Data Loading and Preparation

The data is loaded here as a reference, but it is loaded again inside the pipeline below.

In [5]:
# Load ACLED data
file_path = os.path.join(parent_dir, 'data', 'factal', 'Factal_Sudan_2025-06-01_2025-06-28.parquet')

try:
    df1 = pl.read_parquet(file_path)
    df1 = df1.head(10)  # Use first 10 rows for testing
    print(f"Loaded {len(df1)} rows from ACLED data")
    print(f"Columns: {df1.columns}")
except FileNotFoundError:
    print(f"File not found: {file_path}")
    # List available files
    data_dir = os.path.dirname(file_path)
    if os.path.exists(data_dir):
        print(f"Available files in {data_dir}:")
        for file in os.listdir(data_dir):
            print(f"  - {file}")
    raise

Loaded 10 rows from ACLED data
Columns: ['country_keyword', 'item_id', 'url', 'text', 'domain', 'date', 'severity', 'country', 'state', 'town', 'location', 'topic', 'theme', 'tag', 'topics', 'topic_summary']


# 2. Knowledge Graph Construction

## Pipeline Configuration

The pipeline uses SpaCy Semantic Matching for entity resolution, which merges nodes with similar textual properties.

In [6]:
# Example usage code
async def build_knowledge_graph():
    """Main function to build knowledge graph from ACLED data."""

    # Setup paths and environment
    config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')
    load_dotenv(os.path.join(config_files_path, '.env'), override=True)

    # Load configuration
    with open(os.path.join(config_files_path, 'kg_building_config.json'), 'r') as f:
        config = json.load(f)

    # Get credentials
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')

    if not all([neo4j_uri, neo4j_username, neo4j_password, gemini_api_key]):
        raise ValueError("Missing required environment variables")

    # Load and prepare data
    file_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'factal', 'Factal_Sudan_2025-06-01_2025-06-28.parquet')

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Data file not found: {file_path}")

    df = pl.read_parquet(file_path).head(10)

    # Convert date column to string format
    if 'date' in df.columns:
        df = df.with_columns([
            pl.col('date').dt.strftime('%Y-%m-%d').alias('date')
        ])

    print(f"Processing {len(df)} documents...")

    # Initialize components
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )

    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])

    # Build knowledge graph
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:

        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(
            driver,
            filter_query=None,
            resolve_properties=["name"],
            similarity_threshold=0.95,
            spacy_model="en_core_web_lg"
        )

        # Initialize KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if not config['prompt_template_config'].get('use_default', True) else None,
            text_splitter_config=config['text_splitter_config'],
            resolver=resolver,
            examples_config=None,
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )

        # Define document metadata mapping
        document_metadata_mapping = {
            'date': 'date',
            'url': 'url',
            'domain': 'domain'
        }

        # Process dataframe
        results = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='item_id',
            text_column='text',
            document_metadata_mapping=document_metadata_mapping,
            document_id_column='item_id'
        )

    return results

# Execute pipeline
print("🚀 Starting Knowledge Graph construction...")
all_results = await build_knowledge_graph()
print(f"✅ Processed {len(all_results)} documents successfully")

🚀 Starting Knowledge Graph construction...
Processing 10 documents...
Processing row 1 of 10
Processing row 1 of 10
Result: run_id='1bf781c0-e47c-4775-9b79-1bc98240c565' result={'resolver': {'number_of_nodes_to_resolve': 38, 'number_of_created_nodes': 2}}
Elapsed time: 32.71 seconds
Estimated time remaining: 294.42 seconds

Processing row 2 of 10
Result: run_id='1bf781c0-e47c-4775-9b79-1bc98240c565' result={'resolver': {'number_of_nodes_to_resolve': 38, 'number_of_created_nodes': 2}}
Elapsed time: 32.71 seconds
Estimated time remaining: 294.42 seconds

Processing row 2 of 10
Result: run_id='d26b9f6c-1725-41ac-b485-4f41dd5df8a6' result={'resolver': {'number_of_nodes_to_resolve': 41, 'number_of_created_nodes': 2}}
Elapsed time: 60.51 seconds
Estimated time remaining: 242.03 seconds

Processing row 3 of 10
Result: run_id='d26b9f6c-1725-41ac-b485-4f41dd5df8a6' result={'resolver': {'number_of_nodes_to_resolve': 41, 'number_of_created_nodes': 2}}
Elapsed time: 60.51 seconds
Estimated time re

# 3. Entity Resolution and Deduplication

In [None]:
# Import necessary modules
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase
import numpy as np
import os
import tqdm

# Connect to Neo4j
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=("neo4j", os.getenv("NEO4J_PASSWORD"))
)

# Load embedding model
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# List of relevant node labels for deduplication
ENTITY_LABELS = ["Event", "Actor", "Country", "ADM1", "Location"]

def get_embedding(text):
    """Generate embedding for text."""
    return embedding_model.encode(text).tolist()

def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors."""
    if not vec1 or not vec2:
        return 0
    
    dot = np.dot(vec1, vec2)
    norm1, norm2 = np.linalg.norm(vec1), np.linalg.norm(vec2)
    return dot / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0

def get_all_entities():
    """Retrieve all entities from the knowledge graph."""
    all_entities = []
    query_template = """
    MATCH (n:{label})
    RETURN elementId(n) AS id, labels(n) AS labels, n.name AS name, properties(n) AS properties
    """
    
    with driver.session() as session:
        for label in ENTITY_LABELS:
            result = session.run(query_template.format(label=label)).data()
            all_entities.extend(result)
    
    return all_entities

def find_similar_entities(threshold=0.7):
    """Find and create relationships between similar entities."""
    entities = get_all_entities()
    print(f"Processing {len(entities)} entities...")
    
    # Generate embeddings for each entity
    for entity in entities:
        # Filter entity labels
        filtered_labels = [l for l in entity['labels'] if l not in ["__KGBuilder__", "__Entity__"]]
        entity['primary_label'] = filtered_labels[0] if filtered_labels else entity['labels'][0]
        
        # Create text representation for embedding
        text = f"Type: {entity['primary_label']}\nName: {entity['name']}\n"
        for key, value in entity['properties'].items():
            if key != 'embedding' and value is not None:
                text += f"{key}: {str(value)}\n"
        
        entity['embedding'] = get_embedding(text)
    
    # Find similar pairs
    similar_pairs = []
    for i, e1 in enumerate(entities):
        for j, e2 in enumerate(entities[i + 1:], i + 1):
            # Only compare entities with same label
            if e1['primary_label'] != e2['primary_label']:
                continue
            
            similarity = cosine_similarity(e1['embedding'], e2['embedding'])
            if similarity > threshold:
                similar_pairs.append({
                    "id1": e1['id'],
                    "id2": e2['id'],
                    "name1": e1['name'],
                    "name2": e2['name'],
                    "type1": e1['primary_label'],
                    "type2": e2['primary_label'],
                    "similarity": similarity
                })
    
    # Create SAME_AS relationships
    create_query = """
    MATCH (a), (b)
    WHERE elementId(a) = $id1 AND elementId(b) = $id2
    MERGE (a)-[:SAME_AS {similarity: $similarity}]->(b)
    """
    
    with driver.session() as session:
        for pair in tqdm.tqdm(similar_pairs, desc="Creating similarity relationships"):
            session.run(create_query, pair)
    
    return similar_pairs

def merge_similar_nodes(threshold=0.89):
    """Merge nodes that have SAME_AS relationships above the threshold."""
    merge_query = """
    MATCH (n1)-[r:SAME_AS]->(n2)
    WHERE n1 IS NOT NULL AND n2 IS NOT NULL AND r.similarity >= $threshold
    
    // Copy properties from n2 to n1 if they don't exist
    WITH n1, n2, [key IN keys(n2) WHERE NOT key IN keys(n1)] AS newKeys
    FOREACH (key IN newKeys | SET n1[key] = n2[key])
    
    // Transfer outgoing relationships
    WITH n1, n2
    OPTIONAL MATCH (n2)-[outRel]->(target)
    WHERE target IS NOT NULL AND type(outRel) <> 'SAME_AS'
    WITH n1, n2, outRel, target, type(outRel) AS relType
    WHERE NOT EXISTS((n1)-[:`${relType}`]->(target))
    FOREACH (_ IN CASE WHEN outRel IS NOT NULL THEN [1] ELSE [] END |
        CREATE (n1)-[newRel:`${relType}`]->(target)
        SET newRel = properties(outRel)
    )
    
    // Transfer incoming relationships
    WITH DISTINCT n1, n2
    OPTIONAL MATCH (source)-[inRel]->(n2)
    WHERE source IS NOT NULL AND source <> n1 AND type(inRel) <> 'SAME_AS'
    WITH n1, n2, inRel, source, type(inRel) AS relType
    WHERE NOT EXISTS((source)-[:`${relType}`]->(n1))
    FOREACH (_ IN CASE WHEN inRel IS NOT NULL THEN [1] ELSE [] END |
        CREATE (source)-[newRel:`${relType}`]->(n1)
        SET newRel = properties(inRel)
    )
    
    // Delete the duplicate node
    WITH DISTINCT n1, n2
    DETACH DELETE n2
    RETURN count(n2) AS mergedCount
    """
    
    try:
        with driver.session() as session:
            result = session.run(merge_query, {"threshold": threshold})
            record = result.single()
            return record["mergedCount"] if record else 0
    except Exception as e:
        print(f"Error during node merging: {e}")
        return 0

def check_apoc():
    try:
        with driver.session() as session:
            session.run("CALL apoc.help('create')")
            print("APOC is available.")
            return True
    except Exception as e:
        print(f"APOC not available: {e}")
        return False

# === MAIN EXECUTION ===
check_apoc()
pairs = find_similar_entities(threshold=0.7)
pairs = sorted(pairs, key=lambda x: x["similarity"], reverse=True)
print(f"Found {len(pairs)} similar entity pairs")

APOC is available.
Found 42 entities to process


Computing similarity scores:   0%|          | 0/92 [00:00<?, ?it/s]

Found 92 similar entity pairs.


In [None]:
# Display top 10 most similar entity pairs
print("Top 10 most similar entity pairs:")
print("-" * 80)
for i, pair in enumerate(similar_pairs[:10]):
    print(f"{i+1}. {pair['name1']} ↔ {pair['name2']}")
    print(f"   Type: {pair['type1']}, Similarity: {pair['similarity']:.3f}")
    print()

[{'id1': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:0', 'id2': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:76', 'name1': 'M9C2+H2X', 'name2': 'M9C2+H2X', 'type1': 'Location', 'type2': 'Location', 'similarity': np.float64(0.990342315165928)}, {'id1': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:3', 'id2': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:123', 'name1': 'RSF', 'name2': 'RSF militia', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9174598874812924)}, {'id1': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:35', 'id2': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:38', 'name1': 'people', 'name2': 'civilian', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9080286401331917)}, {'id1': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:31', 'id2': '4:ae0cc0c9-9095-4ba5-8c4f-024072a30c7c:101', 'name1': "United Nations International Children's Emergency Fund", 'name2': 'UNICEF', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9011329872515563)}, {'id1': '4:ae0cc0c9-9095-4ba5-8c4f

In [None]:
# Merge similar entities with high similarity threshold
print("🔄 Merging similar entities...")
merged_count = merge_similar_nodes(threshold=0.89)
print(f"✅ Successfully merged {merged_count} duplicate nodes")

# Close the driver connection
if 'driver' in locals():
    driver.close()
    print("📝 Database connection closed")

Merged 4 nodes.


# 4. Summary

This notebook demonstrated the complete pipeline for building and refining a knowledge graph:

1. **Data Loading**: Loaded ACLED conflict data from Sudan
2. **Knowledge Graph Construction**: Created entities, relationships, and document nodes with metadata
3. **Entity Resolution**: Found similar entities using embedding-based similarity
4. **Deduplication**: Merged duplicate entities to create a cleaner graph

The resulting knowledge graph contains deduplicated entities with proper relationships, ready for downstream analysis and querying.