# Custom Knowledge Graph Building and Entity Resolution Pipeline

This notebook demonstrates:
1. Building a knowledge graph from ACLED data using the CustomKGPipeline
2. Performing entity resolution using embedding-based similarity matching
3. Merging similar entities to create a cleaner knowledge graph

In [8]:
import sys
import os

# Add the parent directory (graphrag_pipeline) to the Python path (needed for importing
# modules in parent directory)
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Utilities
import asyncio
from dotenv import load_dotenv, find_dotenv
import os
import json
from google import genai
import polars as pl
from library.kg_builder import CustomKGPipeline, build_kg_from_df
from library.kg_builder.utilities import GeminiLLM
from neo4j_graphrag.experimental.components.resolver import (
    SpaCySemanticMatchResolver, FuzzyMatchResolver, SinglePropertyExactMatchResolver
)
import tqdm.notebook as tqdm

# Neo4j and Neo4j GraphRAG imports
import neo4j
from neo4j_graphrag.embeddings import SentenceTransformerEmbeddings

In [None]:
# Pull variables from the nearest .env file, letting its values override the environment
load_dotenv(find_dotenv(), override=True)

# Fail fast when the key is missing or empty
gemini_api_key = os.environ.get('GEMINI_API_KEY')
if gemini_api_key is None or gemini_api_key == "":
    raise ValueError("GEMINI_API_KEY environment variable is not set.")

print("✓ Gemini API key loaded successfully")

name='models/embedding-gecko-001' display_name='Embedding Gecko' description='Obtain a distributed representation of a text.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=1024 output_token_limit=1 supported_actions=['embedText', 'countTextTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-1.0-pro-vision-latest' display_name='Gemini 1.0 Pro Vision' description='The original Gemini 1.0 Pro Vision model version which was optimized for image understanding. Gemini 1.0 Pro Vision was deprecated on July 12, 2024. Move to a newer Gemini version.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=12288 output_token_limit=4096 supported_actions=['generateContent', 'countTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-pro-vision' display_name='Gemini 1.0 Pro Vi

## Setup Requirements

Ensure the SpaCy model for entity resolution is installed:

In [None]:
import importlib.util
import subprocess
import sys
import spacy


def ensure_spacy_model(model_name):
    """Check that a SpaCy model package can be found and download it when it cannot.

    Args:
        model_name: Name of the model package to look up, e.g. "en_core_web_lg".

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
    """
    model_spec = importlib.util.find_spec(model_name)
    if model_spec is not None:
        print(f"✓ SpaCy model '{model_name}' is available")
        return

    # Not importable: run "python -m spacy download <model>" with the current interpreter
    print(f"Installing SpaCy model: {model_name}...")
    download_cmd = [sys.executable, "-m", "spacy", "download", model_name]
    subprocess.check_call(download_cmd)

# Install required model for entity resolution
ensure_spacy_model("en_core_web_lg")

Model 'en_core_web_lg' is already installed.


In [None]:
import os
import json

# Resolve the config folder here so this cell does not depend on a variable that
# is otherwise only created inside build_knowledge_graph() further down
config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')

# NOTE(review): build_knowledge_graph() reads 'kg_building_config.json' while this
# cell reads 'kg_building_config2.json' — confirm which file is intended.
with open(os.path.join(config_files_path, 'kg_building_config2.json'), 'r', encoding='utf-8') as f:
    config = json.load(f)

print("✓ Configuration loaded successfully")

{'use_default': False,
 'template': 'You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph that will be used for creating security reports for different countries.\n\nExtract the entities (nodes) and specify their type from the following Input text.\nAlso extract the relationships between these nodes. The relationship direction goes from the start node to the end node.\n\nReturn result as JSON using the following format:\n{"nodes": [{"id": "0", "label": "the type of entity", "properties": {"name": "name of entity" }}],\n"relationships": [{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {"details": "Description of the relationship"}}]}\n\n- Use only the information from the Input text. Do not add any additional information.\n- Make sure to create as many nodes and relationships as needed to offer rich context.\n- Use only the provided schema.\n\nInput text:\n{text}'}

# 1. Data Loading and Preparation

The data is loaded here as a reference, but it is loaded again inside the pipeline below.

In [None]:
# Read the Factal export, rename the topic and body columns to "Country" and
# "Text", and keep the first 10 rows
df1 = (
    pl.read_csv(os.path.join('sample_data', 'factal_single_topic_report-2025-05-01-2025-06-05.csv'))
    .rename({"Associated topics": "Country", "Published text": "Text"})
    .head(10)
    # 1-based row number, cast to a string so it is treated as a document ID
    .with_row_index(name="id", offset=1)
    .with_columns(pl.col('id').cast(pl.String))
)

df1.head(2)

id,Published date,Severity,Text,Translated text,Original language,Source URL,Status,Country
str,str,i64,str,str,str,str,str,str
"""1""","""2025-06-03 15:28:28.271179+00:…",3,"""WFP and UNICEF now say five me…",,,"""https://www.unicef.org/press-r…","""published""","""Sudan"""
"""2""","""2025-06-03 10:10:04.994458+00:…",3,"""""Multiple casualties"" after WF…",,,"""https://www.reuters.com/world/…","""published""","""Sudan"""


### Load Admin1 locations from HDX database

In [6]:
# Location of the HDX admin-1 P-code export. The default is the original
# machine-specific path; set the ADMIN1_PCODES_CSV environment variable to
# point at a local copy instead of editing this cell.
admin1_csv_path = os.getenv("ADMIN1_PCODES_CSV", r"C:\Users\matia\Downloads\global_pcodes_adm_1_2.csv")
admin1 = pl.read_csv(admin1_csv_path)

# Keep only the Sudan rows, tag them with the country name and expose the
# place name under the "Text" column
df2 = (
    admin1.filter(pl.col('Location') == 'SDN')
    .with_columns(pl.lit('Sudan').alias('Country'))
    .rename({"Name": "Text"})
)

# Parent P-code -> state name
sudan_states_mapping = {
    "SD01": "Khartoum",
    "SD02": "North Darfur",
    "SD03": "South Darfur",
    "SD04": "West Darfur",
    "SD05": "East Darfur",
    "SD06": "Central Darfur",
    "SD07": "South Kordofan",
    "SD08": "Blue Nile",
    "SD09": "White Nile",
    "SD10": "Red Sea",
    "SD11": "Kassala",
    "SD12": "Gedaref",
    "SD13": "North Kordofan",
    "SD14": "Sennar",
    "SD15": "Aj Jazirah",
    "SD16": "River Nile",
    "SD17": "Northern",
    "SD18": "West Kordofan",
    "SD19": "Abyei PCA"
}

# Rows whose parent is the country itself use their own name as Admin1;
# all other rows get the name of their parent state.
df2 = df2.with_columns(
    pl.when(pl.col("Parent P-Code") == "SDN")
    .then(pl.col("Text"))
    .otherwise(
        # replace_strict() supersedes the deprecated replace(..., default=...);
        # codes missing from the mapping fall back to the raw parent P-code
        pl.col("Parent P-Code").replace_strict(sudan_states_mapping, default=pl.col("Parent P-Code"))
    )
    .alias("Admin1")
)

df2 = df2.select(['Text', 'Admin1', 'Country'])
df2 = df2.head(15)

# Create an index for each row if df2 doesn't already have an 'id' column
if 'id' not in df2.columns:
    df2 = df2.with_row_index(name="id", offset=1)
    # Convert the "id" to a string to ensure it is treated as a document ID
    df2 = df2.with_columns(pl.col('id').cast(pl.String))

df2.head(2)

(Deprecated in version 1.0.0)
  pl.col("Parent P-Code").replace(sudan_states_mapping, default=pl.col("Parent P-Code"))


id,Text,Admin1,Country
str,str,str,str
"""1""","""Khartoum""","""Khartoum""","""Sudan"""
"""2""","""North Darfur""","""North Darfur""","""Sudan"""


In [None]:
# Load ACLED data
# NOTE(review): this rebinds df1, which an earlier cell filled from the Factal
# CSV — confirm which source the pipeline below is meant to process.
file_path = os.path.join(parent_dir, 'data', 'acled', 'Acled_Sudan_2025-05-01_2025-05-31.parquet')

try:
    # Use first 10 rows for testing
    df1 = pl.read_parquet(file_path).head(10)
except FileNotFoundError:
    print(f"❌ File not found: {file_path}")
    # Show what is in the expected folder (if it exists) before re-raising
    data_dir = os.path.dirname(file_path)
    if os.path.exists(data_dir):
        print(f"Available files in {data_dir}:")
        for entry in os.listdir(data_dir):
            print(f"  - {entry}")
    raise
else:
    print(f"✓ Loaded {len(df1)} rows from ACLED data")
    print(f"Columns: {df1.columns}")

# 2. Running the pipeline

In [7]:
# Use the custom template only when the config explicitly turns the default off;
# a missing 'use_default' key counts as "use the default" (same rule as the
# CustomKGPipeline setup in build_knowledge_graph)
prompt_template = config['prompt_template_config']['template'] if not config['prompt_template_config'].get('use_default', True) else None
print(prompt_template)

You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph that will be used for creating security reports for different countries.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.

Return result as JSON using the following format:
{"nodes": [{"id": "0", "label": "the type of entity", "properties": {"name": "name of entity" }}],
"relationships": [{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {"details": "Description of the relationship"}}]}

- Use only the information from the Input text. Do not add any additional information.
- Make sure to create as many nodes and relationships as needed to offer rich context.
- Use only the provided schema.

Input text:
{text}


## 2.2. With a data frame

## Pipeline Configuration

More information about the resolvers can be found in the [user guide](https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_kg_builder.html#entity-resolver). The available resolvers range from the most aggressive (spaCy semantic matching) to the most conservative (exact matching); trying several of them gives a broad overview of how each one performs.

#### With Factal

In [8]:
# Example usage code
async def build_knowledge_graph():
    """Build the knowledge graph from the module-level ``df1`` and ``df2`` frames.

    Loads credentials from the ``.env`` file in the ``config_files`` folder and
    the pipeline settings from ``kg_building_config.json``, creates the LLM,
    embedder, entity resolver and ``CustomKGPipeline``, then runs
    ``build_kg_from_df`` on ``df1`` followed by ``df2``.

    Returns:
        list: The results returned by ``build_kg_from_df`` for ``df1``,
        followed by those for ``df2``.

    Raises:
        ValueError: If any of NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD or
            GEMINI_API_KEY is missing from the environment.
    """
    # Find path to config_files folder
    config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')
    load_dotenv(os.path.join(config_files_path, '.env'), override=True)

    with open(os.path.join(config_files_path, 'kg_building_config.json'), 'r') as f:
        config = json.load(f)

    # Get credentials
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')

    if not all([neo4j_uri, neo4j_username, neo4j_password, gemini_api_key]):
        raise ValueError("Missing required environment variables")

    # Report the number of rows that are actually sent through the pipeline
    print(f"Processing {len(df1) + len(df2)} documents...")

    # Initialize components
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )

    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])

    # Collected locally so the function does not depend on a pre-existing global
    all_results = []

    # Build knowledge graph
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:

        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(
            driver,
            filter_query=None,
            resolve_properties=["name"],
            similarity_threshold=0.95,
            spacy_model="en_core_web_lg"
        )

        # Initialize KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if not config['prompt_template_config'].get('use_default', True) else None,
            text_splitter_config=config['text_splitter_config'],
            resolver=resolver,
            examples_config=None,
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )

        # Metadata mapping for the first dataframe (presumably metadata key ->
        # dataframe column, as in metadata_mapping2 below).
        # NOTE(review): confirm these column names exist in df1; the Factal
        # frame built earlier exposes "Published date" and "Source URL" instead.
        metadata_mapping1 = {
            'date': 'date',
            'url': 'url',
            'domain': 'domain'
        }

        # Metadata mapping for the second dataframe (admin-1 locations)
        metadata_mapping2 = {
            "name": "Text",
            "location": "Text",
            "adm1": "Admin1",
            "country": "Country"
        }

        # Process the First dataframe
        print("Processing the first dataframe...")
        results_df1 = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df1,
            document_base_field='id',
            text_column='Text',
            document_metadata_mapping=metadata_mapping1,
            document_id_column=None  # Use default document ID generation
        )
        all_results.extend(results_df1)

        # Process the Second dataframe
        print("Processing the second dataframe...")
        results_df2 = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df2,
            document_base_field='id',
            text_column='Text',
            document_metadata_mapping=metadata_mapping2,
            document_id_column=None  # Use default document ID generation
        )
        all_results.extend(results_df2)

    return all_results

# Execute pipeline
# Cell-level `await` relies on IPython's autoawait support; in a plain script
# the coroutine would have to be run with asyncio.run() instead.
print("🚀 Starting Knowledge Graph construction...")
all_results = await build_knowledge_graph()
print(f"✅ Processed {len(all_results)} documents successfully")

Processing the first dataframe...
Processing row 1 of 10
Result: run_id='ac672be4-5835-474e-bd25-d2668f510f4b' result={'resolver': {'number_of_nodes_to_resolve': 5, 'number_of_created_nodes': 0}}
Elapsed time: 6.21 seconds
Estimated time remaining: 55.86 seconds

Processing row 2 of 10
Result: run_id='843a3db5-4e71-4a81-b54c-3fdf072a73c7' result={'resolver': {'number_of_nodes_to_resolve': 13, 'number_of_created_nodes': 5}}
Elapsed time: 20.29 seconds
Estimated time remaining: 81.17 seconds

Processing row 3 of 10
Result: run_id='86a56c3d-1e2a-45b7-aac0-3648abf87989' result={'resolver': {'number_of_nodes_to_resolve': 14, 'number_of_created_nodes': 2}}
Elapsed time: 49.06 seconds
Estimated time remaining: 114.48 seconds

Processing row 4 of 10
Result: run_id='740f8de1-52ed-4baa-9bcc-8ba9851c5f63' result={'resolver': {'number_of_nodes_to_resolve': 19, 'number_of_created_nodes': 1}}
Elapsed time: 69.82 seconds
Estimated time remaining: 104.73 seconds

Processing row 5 of 10
Result: run_id=

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 5
}
].
LLM response has improper format for chunk_index=0


Result: run_id='df66005a-d949-4c9d-8b0e-d19b5bf5e181' result={'resolver': {'number_of_nodes_to_resolve': 47, 'number_of_created_nodes': 0}}
Elapsed time: 34.10 seconds
Estimated time remaining: 12.40 seconds

Processing row 12 of 15


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 3
}
].
LLM response has improper format for chunk_index=0


Result: run_id='50622147-54c3-4820-b5bd-55a8e5097ebf' result={'resolver': {'number_of_nodes_to_resolve': 47, 'number_of_created_nodes': 0}}
Elapsed time: 36.70 seconds
Estimated time remaining: 9.18 seconds

Processing row 13 of 15


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
}
].
LLM response has improper format for chunk_index=0


Result: run_id='4cba2cc0-c28c-41b4-8725-be06dd493315' result={'resolver': {'number_of_nodes_to_resolve': 47, 'number_of_created_nodes': 0}}
Elapsed time: 39.28 seconds
Estimated time remaining: 6.04 seconds

Processing row 14 of 15


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-2.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 10
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 58
}
].
LLM response has improper format for chunk_index=0


Result: run_id='9540a96c-06b2-46b4-9b42-6f728541658f' result={'resolver': {'number_of_nodes_to_resolve': 47, 'number_of_created_nodes': 0}}
Elapsed time: 41.86 seconds
Estimated time remaining: 2.99 seconds

Processing row 15 of 15
Result: run_id='cfa29aaf-0289-40bf-8031-f45e9f73e394' result={'resolver': {'number_of_nodes_to_resolve': 48, 'number_of_created_nodes': 0}}
Elapsed time: 47.50 seconds
Estimated time remaining: 0.00 seconds

Processed 25 documents


# 3. Entity Resolution and Deduplication

In [None]:
# Import necessary modules
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase

# Connect to Neo4j with the same environment variables build_knowledge_graph()
# reads; the username falls back to "neo4j" when NEO4J_USERNAME is not set
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=(os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD"))
)

# Load embedding model
embedding_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

# List of relevant node labels for deduplication
ENTITY_LABELS = ["Event", "Actor", "Country", "ADM1", "Location"]

def get_all_entities():
    """Fetch id, name, labels and properties of every node carrying one of ENTITY_LABELS.

    Returns:
        list[dict]: One dict per matched node with the keys ``id``, ``name``,
        ``labels`` and ``properties``. The labels are queried one at a time, so
        a node carrying several of them is returned once per matching label.
    """
    query_template = """
    MATCH (n:{label})
    RETURN elementId(n) AS id, n.name AS name, labels(n) AS labels, properties(n) AS properties
    """

    with driver.session() as session:
        return [
            row
            for label in ENTITY_LABELS
            for row in session.run(query_template.format(label=label)).data()
        ]

def _embed_text(text):
    """Encode ``text`` with the notebook's SentenceTransformer ``embedding_model``."""
    return embedding_model.encode(text)


def _cosine_similarity(vec_a, vec_b):
    """Return the cosine similarity of two vectors as a float (0.0 if either has zero norm)."""
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    a = np.asarray(vec_a, dtype=float)
    b = np.asarray(vec_b, dtype=float)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)


def find_similar_entities(threshold=0.7):
    """Find similar entities and link each similar pair with a SAME_AS relationship.

    Every entity returned by ``get_all_entities`` is turned into a short text
    (type, name and all non-null properties except ``embedding``), embedded,
    and compared against every other entity that has the same primary label.

    Args:
        threshold: Pairs whose cosine similarity is strictly greater than this
            value are treated as similar.

    Returns:
        list[dict]: One dict per similar pair with the keys ``id1``, ``id2``,
        ``name1``, ``name2``, ``type1``, ``type2`` and ``similarity``.

    Side effects:
        Runs a MERGE that creates ``(a)-[:SAME_AS {similarity}]->(b)`` in Neo4j
        for every returned pair.
    """
    entities = get_all_entities()
    print(f"Processing {len(entities)} entities...")

    # Generate embeddings for each entity
    for entity in entities:
        # Prefer a domain label over the "__KGBuilder__"/"__Entity__" marker labels
        filtered_labels = [l for l in entity['labels'] if l not in ["__KGBuilder__", "__Entity__"]]
        entity['primary_label'] = filtered_labels[0] if filtered_labels else entity['labels'][0]

        # Create text representation for embedding
        text = f"Type: {entity['primary_label']}\nName: {entity['name']}\n"
        for key, value in entity['properties'].items():
            if key != 'embedding' and value is not None:
                text += f"{key}: {str(value)}\n"

        entity['embedding'] = _embed_text(text)

    # Find similar pairs (each unordered pair is compared once)
    similar_pairs = []
    for i, e1 in enumerate(entities):
        for e2 in entities[i + 1:]:
            # Only compare entities with same label
            if e1['primary_label'] != e2['primary_label']:
                continue

            similarity = _cosine_similarity(e1['embedding'], e2['embedding'])
            if similarity > threshold:
                similar_pairs.append({
                    "id1": e1['id'],
                    "id2": e2['id'],
                    "name1": e1['name'],
                    "name2": e2['name'],
                    "type1": e1['primary_label'],
                    "type2": e2['primary_label'],
                    "similarity": similarity
                })

    # Create SAME_AS relationships
    create_query = """
    MATCH (a), (b)
    WHERE elementId(a) = $id1 AND elementId(b) = $id2
    MERGE (a)-[:SAME_AS {similarity: $similarity}]->(b)
    """

    with driver.session() as session:
        for pair in tqdm.tqdm(similar_pairs, desc="Creating similarity relationships"):
            session.run(create_query, pair)

    return similar_pairs

def merge_similar_nodes(threshold=0.89):
    """Merge nodes linked by a SAME_AS relationship at or above ``threshold``.

    Pairs are merged one at a time, highest similarity first, so that chains
    such as A~B and B~C are re-evaluated after every merge instead of
    referring to a node that an earlier merge already removed. The merge
    itself is delegated to ``apoc.refactor.mergeNodes`` (APOC must be
    installed, see ``check_apoc``): the first node of each pair is kept, its
    existing property values win, and the relationships of the second node
    are moved onto it.

    Args:
        threshold: Minimum ``similarity`` value a SAME_AS relationship needs
            for its two nodes to be merged. The default matches the value
            passed by the notebook's final merge cell.

    Returns:
        int: Number of merges performed. If an error occurs, the count
        reached before the error is returned after printing the error.
    """
    merge_query = """
    MATCH (n1)-[r:SAME_AS]->(n2)
    WHERE r.similarity >= $threshold AND n1 <> n2
    WITH n1, n2, r.similarity AS similarity
    ORDER BY similarity DESC
    LIMIT 1

    // Remove every SAME_AS link between the pair so the merge leaves no self-loop
    MATCH (n1)-[link:SAME_AS]-(n2)
    DELETE link

    // Fold n2 into n1: keep n1's values, add n2's missing properties, move relationships
    WITH DISTINCT n1, n2
    CALL apoc.refactor.mergeNodes([n1, n2], {properties: 'discard', mergeRels: true})
    YIELD node
    RETURN count(node) AS mergedCount
    """

    merged_total = 0
    try:
        with driver.session() as session:
            # Each iteration removes one node and at least one SAME_AS link,
            # so the loop ends once no qualifying pair is left.
            while True:
                record = session.run(merge_query, {"threshold": threshold}).single()
                merged_now = record["mergedCount"] if record else 0
                if not merged_now:
                    break
                merged_total += merged_now
    except Exception as e:
        print(f"Error during node merging: {e}")
    return merged_total

def check_apoc():
    """Report whether an APOC procedure can be called on the connected database.

    Returns:
        bool: True if ``CALL apoc.help('create')`` ran without raising an
        exception, otherwise False. The outcome is also printed.
    """
    try:
        with driver.session() as db_session:
            db_session.run("CALL apoc.help('create')")
            print("APOC is available.")
            apoc_ok = True
    except Exception as exc:
        print(f"APOC not available: {exc}")
        apoc_ok = False
    return apoc_ok

# === MAIN EXECUTION ===
check_apoc()
# Link similar entities, then order the pairs from most to least similar
pairs = sorted(find_similar_entities(), key=lambda pair: pair["similarity"], reverse=True)
print(f"Found {len(pairs)} similar entity pairs.")

APOC is available.
Found 48 entities to process


Computing similarity scores:   0%|          | 0/94 [00:00<?, ?it/s]

Found 94 similar entity pairs.


In [22]:
# Preview the first 10 pairs (the list is sorted by descending similarity)
# rather than printing the whole list
pairs[:10]

[{'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:227', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:282', 'name1': 'Kutum', 'name2': 'Kubum', 'type1': 'Location', 'type2': 'Location', 'similarity': np.float64(0.9554210593764909)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:19', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:368', 'name1': 'military', 'name2': 'army', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9545791105950588)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:4', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:53', 'name1': 'Sudan doctors union', 'name2': 'Sudan Doctors Network', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9404681965196532)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:47', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:105', 'name1': 'Cholera outbreak', 'name2': 'cholera-related deaths and infections', 'type1': 'Event', 'type2': 'Event', 'similarity': np.float64(0.9379905887483682)}, {'id1': '4:47781cee

In [None]:
# Run the merge with merge_similar_nodes' default arguments and print the count it returns
merged = merge_similar_nodes()
print(f"Merged {merged} nodes.")

### Node similarity

In [None]:
# Merge similar entities with high similarity threshold
# (the merge query only considers SAME_AS links whose similarity is >= this value)
print("🔄 Merging similar entities...")
merged_count = merge_similar_nodes(threshold=0.89)
print(f"✅ Successfully merged {merged_count} duplicate nodes")

# Close the driver connection
# At cell level locals() is the module namespace, so this guards against the
# driver never having been created
if 'driver' in locals():
    driver.close()
    print("📝 Database connection closed")

# 4. Summary

This notebook demonstrated the complete pipeline for building and refining a knowledge graph:

1. **Data Loading**: Loaded ACLED conflict data from Sudan
2. **Knowledge Graph Construction**: Created entities, relationships, and document nodes with metadata
3. **Entity Resolution**: Found similar entities using embedding-based similarity
4. **Deduplication**: Merged duplicate entities to create a cleaner graph

The resulting knowledge graph contains deduplicated entities with proper relationships, ready for downstream analysis and querying.