# 0. Libraries

In [23]:
import sys
import os

# Add the parent directory (graphrag_pipeline) to the Python path (needed for importing
# modules in parent directory)
parent_dir = os.path.dirname(os.getcwd())
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Utilities
import asyncio
from dotenv import load_dotenv, find_dotenv
import os
import json
from google import genai
import polars as pl
import numpy as np
from library.kg_builder import CustomKGPipeline, build_kg_from_df
from library.kg_builder.utilities import GeminiLLM
from neo4j_graphrag.experimental.components.resolver import (
    SpaCySemanticMatchResolver, FuzzyMatchResolver, SinglePropertyExactMatchResolver
)
import tqdm.notebook as tqdm

# Neo4j and Neo4j GraphRAG imports
import neo4j
from neo4j_graphrag.embeddings import SentenceTransformerEmbeddings

Let's first check the available Gemini models.

In [24]:
load_dotenv(find_dotenv(), override=True)

gemini_api_key = os.getenv('GEMINI_API_KEY')

if gemini_api_key:
    client = genai.Client(api_key=gemini_api_key)  # Configure the API key for genai
else:
    raise ValueError("GEMINI_API_KEY environment variable is not set.")

# Display available models
for model in client.models.list():
    print(model)

name='models/embedding-gecko-001' display_name='Embedding Gecko' description='Obtain a distributed representation of a text.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=1024 output_token_limit=1 supported_actions=['embedText', 'countTextTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-1.0-pro-vision-latest' display_name='Gemini 1.0 Pro Vision' description='The original Gemini 1.0 Pro Vision model version which was optimized for image understanding. Gemini 1.0 Pro Vision was deprecated on July 12, 2024. Move to a newer Gemini version.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=12288 output_token_limit=4096 supported_actions=['generateContent', 'countTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-pro-vision' display_name='Gemini 1.0 Pro Vi

We also have to make sure that the corresponding SpaCy model for text embedding used at the resolving step is installed.

In [25]:
import importlib.util
import subprocess
import sys

import spacy


def ensure_spacy_model(model_name):
    if importlib.util.find_spec(model_name) is None:
        print(f"Model '{model_name}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
    else:
        print(f"Model '{model_name}' is already installed.")

# Use it for 'en_core_web_lg'
ensure_spacy_model("en_core_web_lg")  # Model used for resolving entities in the KG pipeline

Model 'en_core_web_lg' is already installed.


In [36]:
config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')

with open(os.path.join(config_files_path, 'kg_building_config.json'), 'r') as f:
    config = json.load(f)

config['prompt_template_config']

{'use_default': False,
 'template': 'You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph that will be used for creating security reports for different countries.\n\nExtract the entities (nodes) and specify their type, location, affiliation, severity, etc. from the Input text.nThe text describes a security incident. The text contains names of the following entitites: Country, State, Town, Location of the incident, Actors.\nThe text also contains additional data like Actors affiliations, severity of the incident, number of fatalities, type of event - store these details as node properties.\nAlso extract the relationships between these nodes. The relationship direction goes from the start node to the end node.\n\nReturn result as JSON using the following format:\n{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity", "type": "type of node", "location": "location, town, state, country 

# 1. Loading the data

The data is loaded here as a reference, but it is loaded again inside the pipeline below.

## 1.2. Factal sample data

In [31]:
# Use parent_dir to construct the absolute path to the data file
file_path = os.path.join(parent_dir, 'data', 'factal', 'Factal_Sudan_2025-06-01_2025-06-28.parquet')

# Check if file exists before reading
if os.path.exists(file_path):
	df1 = pl.read_parquet(file_path)
	df1 = df1.head(10)
else:
	# List available files in the directory to help find the correct file
	data_dir = os.path.join(parent_dir, 'data', 'factal')
	if os.path.exists(data_dir):
		print(f"Available files in {data_dir}:")
		for file in os.listdir(data_dir):
			print(f"- {file}")
	else:
		print(f"Directory not found: {data_dir}")

# 2. Running the pipeline

In [37]:
prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None
print(prompt_template)

You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph that will be used for creating security reports for different countries.

Extract the entities (nodes) and specify their type, location, affiliation, severity, etc. from the Input text.nThe text describes a security incident. The text contains names of the following entitites: Country, State, Town, Location of the incident, Actors.
The text also contains additional data like Actors affiliations, severity of the incident, number of fatalities, type of event - store these details as node properties.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.

Return result as JSON using the following format:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity", "type": "type of node", "location": "location, town, state, country that the node is associated with" }} }}],
"r

## 2.2. With a data frame

### A. Using the `SpaCySemanticMatchResolver`

More useful information about the resolvers can be found in the [user guide](https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_kg_builder.html#entity-resolver). Below, we use different resolvers (from the most aggressive - spaCy to the most conservative - exact matching) to get a broad overview of the performance results.

#### With Factal

In [38]:
# Example usage code
async def main():

    # Find path to config_files folder
    config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')

    # Load environment variables from .env file
    load_dotenv(os.path.join(config_files_path, '.env'), override=True)
    
    with open(os.path.join(config_files_path, 'kg_building_config.json'), 'r') as f:
        config = json.load(f)
    
    # Neo4j connection
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    
    # Check if gemini_api_key is set
    if gemini_api_key:
        pass
    else:
        raise ValueError("Gemini API key is not set. Please provide a valid API key.")

    # Initialize LLM
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )
    
    # Initialize embedder
    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])
    
    # Configure text splitter
    text_splitter_config = config['text_splitter_config'] 
    
    # Create the pipeline - use with statement to ensure proper resource management
    # and to ensure the driver is closed after use
    all_results = []
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        
        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(  # Merge nodes with same label and similar textual properties
            driver,
            filter_query=None,  # "WHERE (entity)-[:FROM_CHUNK]->(:Chunk)-[:FROM_DOCUMENT]->(doc:Document {id = 'docId'}",  # Used to reduce the resolution scope to a specific document
            resolve_properties=["name"],  # Properties to use for resolution (default is "name")
            similarity_threshold=0.8,  # The similarity threshold above which nodes are merged (default is 0.8). Higher threshold will result in less false positives, but may miss some matches. 
            spacy_model="en_core_web_lg"  # spaCy model to use for resolution (default is "en_core_web_lg")
        )

        # Initialize the custom KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None,
            text_splitter_config=text_splitter_config,
            resolver=resolver,
            examples_config=None,  # Use None if no examples are provided
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )
    
        
        # Process the First dataframe
        print("Processing the first dataframe...")
        results_df1 = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df1,
            document_base_field='item_id',
            text_column='text',
            document_id_column='item_id'  # Use default document ID generation
        )
        all_results.extend(results_df1)
    
    return all_results

# Asyncio event loop to run the main function in a Jupyter notebook
all_results = await main()
print(f"Processed {len(all_results)} documents")

# # Asyncio event loop to run the main function in a script
# if __name__ == "__main__":
#     results = asyncio.run(main())
#     print(f"Processed {len(results)} documents")

Processing the first dataframe...
Processing row 1 of 10
Result: run_id='2819be1a-f71b-4e44-a3bc-679508a05e3b' result={'resolver': {'number_of_nodes_to_resolve': 43, 'number_of_created_nodes': 2}}
Elapsed time: 30.22 seconds
Estimated time remaining: 271.98 seconds

Processing row 2 of 10
Result: run_id='54e10403-ff25-4220-9c08-44d047787ea6' result={'resolver': {'number_of_nodes_to_resolve': 46, 'number_of_created_nodes': 2}}
Elapsed time: 50.27 seconds
Estimated time remaining: 201.08 seconds

Processing row 3 of 10
Result: run_id='90ad9e47-e250-4de7-9bd9-81b1972fec60' result={'resolver': {'number_of_nodes_to_resolve': 51, 'number_of_created_nodes': 1}}
Elapsed time: 69.79 seconds
Estimated time remaining: 162.84 seconds

Processing row 4 of 10
Result: run_id='1b55f714-0f15-4c41-bb72-16cc68551050' result={'resolver': {'number_of_nodes_to_resolve': 56, 'number_of_created_nodes': 0}}
Elapsed time: 111.48 seconds
Estimated time remaining: 167.23 seconds

Processing row 5 of 10
Result: ru

# Entity Resolution Pipeline

In [10]:
# Import necessary modules
from sentence_transformers import SentenceTransformer
from neo4j import GraphDatabase

# Connect to Neo4j
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=("neo4j", os.getenv("NEO4J_PASSWORD"))
)

# Load embedding model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def get_embedding(text):
    return model.encode(text).tolist()

# List of relevant node labels for deduplication
ENTITY_LABELS = ["Event", "Actor", "Country", "ADM1", "Location"]

def get_all_entities():
    query_template = """
    MATCH (n:{label})
    RETURN elementId(n) AS id, labels(n) AS labels, n.name AS name, properties(n) AS properties
    """
    all_entities = []
    with driver.session() as session:
        for label in ENTITY_LABELS:
            result = session.run(query_template.format(label=label)).data()
            all_entities.extend(result)
    return all_entities

def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None or len(vec1) == 0 or len(vec2) == 0:
        return 0  # If either vector is None or empty, return 0 similarity
    dot = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0

def find_similar_entities(threshold=0.7):
    entities = get_all_entities()
    print(f"Found {len(entities)} entities to process")
    
    # Compute embeddings
    for entity in entities:
        # Filter out "__KGBuilder__" from labels
        filtered_labels = [l for l in entity['labels'] if l != "__KGBuilder__" and l != "__Entity__"]
        if filtered_labels:  # If there are any labels left
            entity['primary_label'] = filtered_labels[0]
        else:
            entity['primary_label'] = entity['labels'][0]  # Fallback if only __KGBuilder__ is present
            
        # Start with entity label/type and name
        text = f"Type: {entity['primary_label']}\nName: {entity['name']}\n"
        
        # Rest of embedding code remains the same
        for key, value in entity['properties'].items():
            if key != 'embedding' and value is not None:
                if isinstance(value, (list, dict)):
                    formatted_value = str(value)
                else:
                    formatted_value = value
                text += f"{key}: {formatted_value}\n"
        
        entity['embedding'] = get_embedding(text)
     
    # Find similar pairs with the new label comparison
    similar_pairs = []
    total_comparisons = sum(range(len(entities)))
    for i, e1 in enumerate(entities):
        for j, e2 in enumerate(entities[i + 1:], i + 1):
            # Only compare if they have the same primary label (excluding __KGBuilder__)
            if e1['primary_label'] != e2['primary_label'] or e1['primary_label'] == "__KGBuilder__":
                continue
            
            sim = cosine_similarity(e1['embedding'], e2['embedding'])
            if sim > threshold:
                similar_pairs.append({
                    "id1": e1['id'],
                    "id2": e2['id'],
                    "name1": e1['name'],
                    "name2": e2['name'],
                    "type1": e1['primary_label'],
                    "type2": e2['primary_label'],
                    "similarity": sim
                })
    
    # Create SAME_AS relationships
    query = """
    MATCH (a), (b)
    WHERE elementId(a) = $id1 AND elementId(b) = $id2
    MERGE (a)-[:SAME_AS {similarity: $similarity}]->(b)
    """
    with driver.session() as session:
        for pair in tqdm.tqdm(similar_pairs, desc="Computing similarity scores"):
            session.run(query, pair)
    
    return similar_pairs

def merge_similar_nodes():
    merge_query = """
    // Process one pair of nodes at a time to avoid conflicts
    MATCH (n1)-[r:SAME_AS]->(n2)
    WHERE n1 IS NOT NULL AND n2 IS NOT NULL
    
    // Copy properties from n2 to n1 if they don't exist in n1
    WITH n1, n2, [key IN keys(n2) WHERE NOT key IN keys(n1)] AS newKeys
    FOREACH (key IN newKeys | SET n1[key] = n2[key])
    
    // Get all outgoing relationships from n2 (except SAME_AS)
    WITH n1, n2
    OPTIONAL MATCH (n2)-[outRel]->(target)
    WHERE target IS NOT NULL AND type(outRel) <> 'SAME_AS'
    
    // Create equivalent relationships from n1 if they don't already exist
    WITH n1, n2, outRel, target, type(outRel) AS relType
    WHERE NOT EXISTS((n1)-[:`${relType}`]->(target))
    
    // Create new relationship with the same properties
    FOREACH (_ IN CASE WHEN outRel IS NOT NULL THEN [1] ELSE [] END |
        CREATE (n1)-[newRel:`${relType}`]->(target)
        SET newRel = properties(outRel)
    )
    
    // Return the node pair for the next phase
    WITH DISTINCT n1, n2
    
    // Handle incoming relationships
    OPTIONAL MATCH (source)-[inRel]->(n2)
    WHERE source IS NOT NULL AND source <> n1 AND type(inRel) <> 'SAME_AS'
    
    // Create equivalent relationships to n1 if they don't already exist
    WITH n1, n2, inRel, source, type(inRel) AS relType
    WHERE NOT EXISTS((source)-[:`${relType}`]->(n1))
    
    // Create new relationship with the same properties
    FOREACH (_ IN CASE WHEN inRel IS NOT NULL THEN [1] ELSE [] END |
        CREATE (source)-[newRel:`${relType}`]->(n1)
        SET newRel = properties(inRel)
    )
    
    // Return distinct pairs for deletion phase
    WITH DISTINCT n1, n2
    
    // Delete the second node and all its relationships
    DETACH DELETE n2
    
    RETURN count(n2) AS mergedCount
    """
    
    try:
        with driver.session() as session:
            result = session.run(merge_query)
            record = result.single()
            return record["mergedCount"] if record else 0
    except Exception as e:
        print(f"Error during node merging: {e}")
        return 0

def check_apoc():
    try:
        with driver.session() as session:
            session.run("CALL apoc.help('create')")
            print("APOC is available.")
            return True
    except Exception as e:
        print(f"APOC not available: {e}")
        return False

# === MAIN EXECUTION ===
check_apoc()
pairs = find_similar_entities()
pairs = sorted(pairs, key=lambda x: x["similarity"], reverse=True)
print(f"Found {len(pairs)} similar entity pairs.")

APOC is available.
Found 48 entities to process


Computing similarity scores:   0%|          | 0/94 [00:00<?, ?it/s]

Found 94 similar entity pairs.


In [22]:
print(pairs)

[{'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:227', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:282', 'name1': 'Kutum', 'name2': 'Kubum', 'type1': 'Location', 'type2': 'Location', 'similarity': np.float64(0.9554210593764909)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:19', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:368', 'name1': 'military', 'name2': 'army', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9545791105950588)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:4', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:53', 'name1': 'Sudan doctors union', 'name2': 'Sudan Doctors Network', 'type1': 'Actor', 'type2': 'Actor', 'similarity': np.float64(0.9404681965196532)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:47', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:105', 'name1': 'Cholera outbreak', 'name2': 'cholera-related deaths and infections', 'type1': 'Event', 'type2': 'Event', 'similarity': np.float64(0.9379905887483682)}, {'id1': '4:47781cee

In [None]:
merged = merge_similar_nodes()
print(f"Merged {merged} nodes.")

### Node similarity

In [None]:
node_similarity_query = """
CALL gds.nodeSimilarity.stream('amazonGraph')
YIELD node1, node2, similarity as node_similarity
WHERE 'Company' IN labels(gds.util.asNode(node1)) AND 'Company' IN labels(gds.util.asNode(node2))
AND node_similarity < 1
RETURN gds.util.asNode(node1).name AS Company1, gds.util.asNode(node2).name AS Company2, node_similarity
ORDER BY node_similarity DESCENDING, Company1, Company2
"""

def results_to_df(query: str) -> pd.DataFrame:
    results = gds.execute_query(query)[0]
    df = pd.DataFrame(results, columns=results[0].keys())
    return df

df_node_similarity = results_to_df(node_similarity_query)
print(df_node_similarity)

### Create SAME_AS relationships

In [None]:
def create_same_as_relationship(df, column_name):
    # Iterate over the DataFrame rows
    for _, row in df.iterrows():
        node1 = row[column_name + '1']
        node2 = row[column_name + '2']

        # Run Cypher query to create 'SAME_AS' relationship
        score = row["combined_score"]
        if score > 0.20:
            query = f"MATCH (n1), (n2) WHERE n1.name = '{node1}' AND n2.name = '{node2}' CREATE (n1)-[:SAME_AS]->(n2)"
            gds.execute_query(query)

create_same_as_relationship(selected_df, "Company")

### Merge nodes with SAME_AS relationship

In [None]:
merge_query = """
MATCH (n1)-[r:SAME_AS]->(n2)
WITH n1, n2, collect(r) as relsToDel

FOREACH (rel IN relsToDel | DELETE rel)
WITH collect(DISTINCT n1) + collect(DISTINCT n2) AS nodesToMerge

UNWIND nodesToMerge AS node

WITH collect(DISTINCT node) AS uniqueNodesToMerge
CALL apoc.refactor.mergeNodes(uniqueNodesToMerge, {mergeRels:true}) YIELD node
RETURN node
"""

gds.execute_query(merge_query)