# 0. Libraries

In [1]:
import sys
import os

# Add the parent directory (graphrag_pipeline) to the Python path so that the
# project-local modules imported below (e.g. `library.kg_builder`) can be found.
# NOTE: this relies on the kernel's working directory being the notebook's own folder.
parent_dir = os.path.dirname(os.getcwd())
# Only append once, so re-running the cell does not add duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Standard library
import asyncio
import json
import os

# Third-party
import neo4j
import numpy as np
import polars as pl
import tqdm.notebook as tqdm
from dotenv import load_dotenv
from google import genai
from neo4j import GraphDatabase
from neo4j_graphrag.embeddings import SentenceTransformerEmbeddings
from neo4j_graphrag.experimental.components.resolver import (
    SpaCySemanticMatchResolver, FuzzyMatchResolver, SinglePropertyExactMatchResolver
)
from sentence_transformers import SentenceTransformer  # embedding model (entity similarity)

# Project-local (found through the parent directory added to sys.path above)
from library.kg_builder import CustomKGPipeline, build_kg_from_df
from library.kg_builder.utilities import GeminiLLM

Let's first load the environment variables and create the Gemini client; the cell fails fast if `GEMINI_API_KEY` is not set.

In [2]:
# Read the local .env file; override=True lets its values replace variables
# that are already set in the process environment.
load_dotenv('.env', override=True)

gemini_api_key = os.getenv('GEMINI_API_KEY')

# Stop right away when the key is missing; otherwise build the Gemini client with it.
if not gemini_api_key:
    raise ValueError("GEMINI_API_KEY environment variable is not set.")

client = genai.Client(api_key=gemini_api_key)

We also have to make sure that the spaCy model used by the entity resolver (to embed entity names at the resolution step) is installed.

In [3]:
import importlib.util
import subprocess
import sys

import spacy


def ensure_spacy_model(model_name):
    """Download the spaCy model ``model_name`` unless it is already importable.

    The model counts as installed when ``importlib`` can locate a module with
    that name. Otherwise ``python -m spacy download <model_name>`` is run with
    the current interpreter; a failing download raises
    ``subprocess.CalledProcessError``.

    Args:
        model_name: Name of the spaCy model package, e.g. ``"en_core_web_lg"``.
    """
    already_installed = importlib.util.find_spec(model_name) is not None
    if already_installed:
        print(f"Model '{model_name}' is already installed.")
        return

    print(f"Model '{model_name}' not found. Installing...")
    download_command = [sys.executable, "-m", "spacy", "download", model_name]
    subprocess.check_call(download_command)

# Make sure 'en_core_web_lg' is available: it is the model name passed as
# `spacy_model` to SpaCySemanticMatchResolver further down in this notebook.
ensure_spacy_model("en_core_web_lg")  # Model used for resolving entities in the KG pipeline

Model 'en_core_web_lg' is already installed.


# 1. Loading the data

A small sample of the data is loaded here for reference only; the pipeline below reads its own data file again.

## 1.2. Google News sample data

In [7]:
# Resolve the sample file relative to the project root (the parent of the working
# directory, i.e. graphrag_pipeline) instead of a machine-specific absolute path,
# so the cell runs on any checkout of the repository.
path = os.path.join(
    os.path.dirname(os.getcwd()),
    'data',
    'google_news',
    'google_news_India_2025-06-26_2025-07-01.parquet',
)
df = pl.read_parquet(path)
df = df.head(20)  # keep only a small sample for inspection

if df['date'].dtype == pl.Date:
    # Convert date column to string if it is of type Date
    df = df.with_columns(pl.col('date').cast(pl.String))

df.head(2)

title,google_link,source,id,date,decoded_url,full_text
str,str,str,str,str,str,str
"""Shubhanshu Shukla: Astronaut b…","""https://news.google.com/rss/ar…","""BBC""","""GN_IND26683""","""2025-06-26""","""https://www.bbc.com/news/artic…","""On 26 June 2025, Astronaut bec…"
"""New report: India vs. Pakistan…","""https://news.google.com/rss/ar…","""NewsNation""","""GN_IND70889""","""2025-06-26""","""https://www.newsnationnow.com/…",


### Load Admin1 locations from HDX database

# 2. Running the pipeline

In [8]:
# Configuration files live in <project root>/config_files, where the project root
# is the parent of the working directory.
config_files_path = os.path.join(os.path.dirname(os.getcwd()), 'config_files')

# JSON is UTF-8 by specification; being explicit avoids the platform default codec
# (e.g. cp1252 on Windows) corrupting non-ASCII characters in the prompt template.
# NOTE(review): this cell reads 'kg_building_config2.json' while build_knowledge_graph()
# reads 'kg_building_config.json' — confirm that the difference is intentional.
with open(os.path.join(config_files_path, 'kg_building_config2.json'), 'r', encoding='utf-8') as f:
    config = json.load(f)

prompt_template = config['prompt_template_config']

# Show the extraction prompt that is configured in this file.
print(prompt_template['template'])

You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph that will be used for creating security reports for different countries.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.

Return result as JSON using the following format:
{"nodes": [{"id": "0", "label": "the type of entity", "properties": {"name": "name of entity" }}],
"relationships": [{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {"details": "Description of the relationship"}}]}

- Use only the information from the Input text. Do not add any additional information.
- Make sure to create as many nodes and relationships as needed to offer rich context.
- Use only the provided schema.

Input text:
{text}


## 2.2. With a data frame

### A. Using the `SpaCySemanticMatchResolver`

More information about the resolvers can be found in the [user guide](https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_kg_builder.html#entity-resolver). Below, we use different resolvers, from the most aggressive (spaCy semantic matching) to the most conservative (exact matching), to get a broad overview of the performance results.

### Embed Locations from list of Admin units

In [6]:
# Pull the Neo4j connection settings and the Gemini key from the environment in
# one go; any variable that is not set comes back as None.
neo4j_uri, neo4j_username, neo4j_password, gemini_api_key = (
    os.getenv(variable_name)
    for variable_name in ('NEO4J_URI', 'NEO4J_USERNAME', 'NEO4J_PASSWORD', 'GEMINI_API_KEY')
)


In [None]:
# Example usage code
async def build_knowledge_graph():
    """Build the knowledge graph from a sample of Google News articles.

    The function loads the environment and the KG-building configuration from
    the ``config_files`` folder, reads the first 20 rows of the Google News
    parquet file, and runs them through ``CustomKGPipeline`` with a
    ``SpaCySemanticMatchResolver`` for entity resolution.

    Returns:
        list: The results returned by ``build_kg_from_df``, collected in a list.

    Raises:
        ValueError: If any of NEO4J_URI, NEO4J_USERNAME, NEO4J_PASSWORD or
            GEMINI_API_KEY is missing from the environment.
        FileNotFoundError: If the parquet data file does not exist.
    """
    # Find path to config_files folder (sibling of the notebook's working directory)
    project_root = os.path.dirname(os.getcwd())
    config_files_path = os.path.join(project_root, 'config_files')
    load_dotenv(os.path.join(config_files_path, '.env'), override=True)

    # JSON is UTF-8 by specification; do not rely on the platform default codec.
    with open(os.path.join(config_files_path, 'kg_building_config.json'), 'r', encoding='utf-8') as f:
        config = json.load(f)

    # Get credentials
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')

    if not all([neo4j_uri, neo4j_username, neo4j_password, gemini_api_key]):
        raise ValueError("Missing required environment variables")

    # Load and prepare data. The existence check has to come BEFORE the read,
    # otherwise a missing file surfaces as a reader error and the check is dead code.
    file_path = os.path.join(
        project_root, 'data', 'google_news',
        'google_news_Sudan_2025-05-01_2025-06-01_chunk1.parquet'
    )
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Data file not found: {file_path}")

    df = pl.read_parquet(file_path)
    df = df.head(20)

    # The date column is cast to a string when the parquet file stores it as Date.
    if df['date'].dtype == pl.Date:
        df = df.with_columns(pl.col('date').cast(pl.String))

    print(f"Processing {len(df)} documents...")

    # Initialize components
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )

    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])

    # Use the configured prompt only when the config opts out of the default one.
    prompt_config = config['prompt_template_config']
    prompt_template = None if prompt_config.get('use_default', True) else prompt_config['template']

    all_results = []
    # Build knowledge graph; the driver is closed when the `with` block exits.
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:

        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(
            driver,
            filter_query=None,
            resolve_properties=["name"],
            similarity_threshold=0.95,
            spacy_model="en_core_web_lg"
        )

        # Initialize KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=prompt_template,
            text_splitter_config=config['text_splitter_config'],
            resolver=resolver,
            examples_config=None,
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )

        # Define document metadata mapping
        metadata_mapping = {
            'date': 'date',
            'url': 'decoded_url',
            'source': 'source'
        }

        # Process the First dataframe - MAKE A LOOP OUT OF THIS
        print("Processing the first dataframe...")
        results_df1 = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='id',
            text_column='full_text',
            document_metadata_mapping=metadata_mapping,
            document_id_column=None  # Use default document ID generation
        )
        all_results.extend(results_df1)

    return all_results


# Execute pipeline
# Top-level `await` is used here; it works in notebook cells because IPython
# supports awaiting coroutines directly at cell level.
print("🚀 Starting Knowledge Graph construction...")
all_results = await build_knowledge_graph()
print(f"✅ Processed {len(all_results)} documents successfully")

🚀 Starting Knowledge Graph construction...
Processing 20 documents...
Processing the first dataframe...
Processing row 1 of 20
Skipping row 1 due to empty text
Elapsed time: 0.00 seconds
Estimated time remaining: 0.01 seconds

Processing row 2 of 20
Result: run_id='696f47d0-3e7c-4043-a5b6-bc5b4e53f723' result={'resolver': {'number_of_nodes_to_resolve': 64, 'number_of_created_nodes': 32}}
Elapsed time: 42.57 seconds
Estimated time remaining: 383.17 seconds

Processing row 3 of 20
Result: run_id='118efb6b-3828-4ac6-88fa-4544f4ae81bd' result={'resolver': {'number_of_nodes_to_resolve': 68, 'number_of_created_nodes': 1}}
Elapsed time: 98.07 seconds
Estimated time remaining: 555.74 seconds

Processing row 4 of 20
Result: run_id='82ac0068-6a3d-4a5c-91ea-c4440382ec06' result={'resolver': {'number_of_nodes_to_resolve': 75, 'number_of_created_nodes': 1}}
Elapsed time: 119.52 seconds
Estimated time remaining: 478.09 seconds

Processing row 5 of 20
Result: run_id='05e196d3-8f2b-453a-9eae-bb3b15b1d

NameError: name 'all_results' is not defined

# Entity Resolution Pipeline

In [None]:
# Connect to Neo4j
# The username is read from NEO4J_USERNAME, like in the rest of this notebook,
# and falls back to the previously hard-coded "neo4j" when the variable is unset.
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=(os.getenv("NEO4J_USERNAME", "neo4j"), os.getenv("NEO4J_PASSWORD"))
)

# Load embedding model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

def get_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return the vector as a list."""
    vector = model.encode(text)
    return vector.tolist()

# Node labels considered for deduplication: get_all_entities() runs one MATCH
# per label in this list, interpolating the label into its Cypher template.
ENTITY_LABELS = ["Event", "Actor", "Location"]

def get_all_entities():
    """Fetch every node carrying one of the labels in ``ENTITY_LABELS``.

    One query is run per label through the module-level ``driver``. Each
    returned item is a dict with the keys ``id`` (element id), ``name``,
    ``labels`` and ``properties``. Rows are concatenated in label order, so a
    node that carries several of the listed labels appears once per label.
    """
    cypher_template = """
    MATCH (n:{label})
    RETURN elementId(n) AS id, n.name AS name, labels(n) AS labels, properties(n) AS properties
    """
    with driver.session() as session:
        return [
            row
            for label in ENTITY_LABELS
            for row in session.run(cypher_template.format(label=label)).data()
        ]

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a plain Python float.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``), or None.
        vec2: Second vector, or None.

    Returns:
        float: ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector is None, empty, or has zero norm.
    """
    # Missing or empty vectors carry no information: treat them as dissimilar.
    if vec1 is None or vec2 is None or len(vec1) == 0 or len(vec2) == 0:
        return 0.0

    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    # A zero-norm vector would cause a division by zero.
    if not (norm1 > 0 and norm2 > 0):
        return 0.0

    # Convert the numpy scalar so every code path returns the same built-in type.
    return float(np.dot(vec1, vec2) / (norm1 * norm2))

def find_similar_entities(threshold=0.7):
    """Find pairs of similar entities and link them with SAME_AS relationships.

    Every entity returned by ``get_all_entities()`` is turned into a text made
    of its primary label, its name and its non-null properties, and embedded
    with ``get_embedding()``. Entities that share the same primary label are
    compared pairwise with ``cosine_similarity()``; each pair scoring strictly
    above ``threshold`` gets a ``SAME_AS`` relationship (written with MERGE)
    that stores the score.

    Side effects: adds the keys ``primary_label`` and ``embedding`` to every
    entity dict and writes relationships through the module-level ``driver``.

    Args:
        threshold: Minimum cosine similarity (exclusive) for a pair to be kept.

    Returns:
        list[dict]: One dict per similar pair with the keys ``id1``, ``id2``,
        ``name1``, ``name2``, ``type1``, ``type2`` and ``similarity``.
    """
    # Labels filtered out when picking an entity's primary label.
    internal_labels = ("__KGBuilder__", "__Entity__")

    entities = get_all_entities()
    print(f"Found {len(entities)} entities to process")

    # Compute embeddings
    for entity in entities:
        domain_labels = [label for label in entity['labels'] if label not in internal_labels]
        # Fall back to the first label when only internal labels are present.
        entity['primary_label'] = domain_labels[0] if domain_labels else entity['labels'][0]

        # Start with entity label/type and name, then append every other property.
        text = f"Type: {entity['primary_label']}\nName: {entity['name']}\n"
        for key, value in entity['properties'].items():
            # Skip a stored embedding (it would only add noise) and null values.
            if key != 'embedding' and value is not None:
                # f-string formatting already renders lists and dicts via str().
                text += f"{key}: {value}\n"

        entity['embedding'] = get_embedding(text)

    # Find similar pairs, comparing each unordered pair exactly once.
    similar_pairs = []
    for i, e1 in enumerate(entities):
        # Entities whose only label is the internal __KGBuilder__ one are never compared.
        if e1['primary_label'] == "__KGBuilder__":
            continue
        for e2 in entities[i + 1:]:
            # Only compare entities of the same type.
            if e1['primary_label'] != e2['primary_label']:
                continue

            # Plain Python float: the value is sent to Neo4j as a query parameter.
            sim = float(cosine_similarity(e1['embedding'], e2['embedding']))
            if sim > threshold:
                similar_pairs.append({
                    "id1": e1['id'],
                    "id2": e2['id'],
                    "name1": e1['name'],
                    "name2": e2['name'],
                    "type1": e1['primary_label'],
                    "type2": e2['primary_label'],
                    "similarity": sim
                })

    # Create SAME_AS relationships
    query = """
    MATCH (a), (b)
    WHERE elementId(a) = $id1 AND elementId(b) = $id2
    MERGE (a)-[:SAME_AS {similarity: $similarity}]->(b)
    """
    with driver.session() as session:
        # The progress bar tracks the relationship writes, one query per pair.
        for pair in tqdm.tqdm(similar_pairs, desc="Creating SAME_AS relationships"):
            session.run(query, pair)

    return similar_pairs

def check_apoc():
    """Report whether the APOC procedures can be called on the connected database.

    Runs ``CALL apoc.help('create')`` through the module-level ``driver`` and
    prints the outcome.

    Returns:
        bool: True when the call succeeds, False when it raises (the error is printed).
    """
    try:
        with driver.session() as session:
            session.run("CALL apoc.help('create')")
    except Exception as error:
        print(f"APOC not available: {error}")
        return False
    else:
        print("APOC is available.")
        return True

# === MAIN EXECUTION ===
check_apoc()
pairs = find_similar_entities()
# Order the pairs in place, most similar first.
pairs.sort(key=lambda pair: pair["similarity"], reverse=True)
print(f"Found {len(pairs)} similar entity pairs.")

APOC is available.
Found 206 entities to process


Computing similarity scores:   0%|          | 0/20805 [00:00<?, ?it/s]

BufferError: Existing exports of data: object cannot be re-sized

In [None]:
# Preview only the first few pairs: printing the whole list floods the notebook
# output when thousands of pairs are found.
print(pairs[:10])

In [None]:
def merge_similar_nodes():
    """Merge the nodes that are linked by SAME_AS relationships.

    Pairs are processed one at a time, each in its own query, so that a node
    taking part in several SAME_AS pairs is never referenced after it has been
    merged away. For each pair ``(n1)-[:SAME_AS]->(n2)``:

    1. every SAME_AS relationship between the two nodes is deleted;
    2. ``apoc.refactor.mergeNodes`` merges ``n2`` into ``n1``. Relationships are
       moved with their original type and properties, and with
       ``properties: 'discard'`` the values already set on ``n1`` win while
       missing ones are taken from ``n2``.

    The previous pure-Cypher version wrote ``${relType}`` inside backticks,
    which Cypher reads as a literal relationship type rather than a variable,
    so moved relationships lost their real type.

    Merging is transitive: if A is SAME_AS B and B is SAME_AS C, all three end
    up in a single node. Requires the APOC plugin (see ``check_apoc``).

    Returns:
        int: Number of merges performed. If an error occurs it is printed and
        the number of merges completed so far is returned (0 if none).
    """
    merge_one_pair_query = """
    // Pick a single pair per query so chained pairs never reference a deleted node
    MATCH (n1)-[:SAME_AS]->(n2)
    WHERE n1 <> n2
    WITH n1, n2 LIMIT 1

    // Drop the SAME_AS links between the two nodes before merging them
    MATCH (n1)-[same:SAME_AS]-(n2)
    DELETE same

    WITH DISTINCT n1, n2
    CALL apoc.refactor.mergeNodes(
        [n1, n2],
        {properties: 'discard', mergeRels: true, produceSelfRel: false}
    )
    YIELD node
    RETURN count(node) AS mergedCount
    """

    merged_count = 0
    try:
        with driver.session() as session:
            # Every successful round removes one node, so the loop terminates.
            while True:
                record = session.run(merge_one_pair_query).single()
                merged_this_round = record["mergedCount"] if record else 0
                if merged_this_round == 0:
                    break
                merged_count += merged_this_round
    except Exception as e:
        print(f"Error during node merging: {e}")
    return merged_count

# Run the merge and report the count returned by merge_similar_nodes().
merged = merge_similar_nodes()
print(f"Merged {merged} nodes.")

### Node similarity

In [None]:
# NOTE(review): this query targets a GDS graph projection named 'amazonGraph' and
# nodes labelled 'Company', while the rest of this notebook works with the labels
# Event / Actor / Location — confirm that both exist before running this cell.
node_similarity_query = """
CALL gds.nodeSimilarity.stream('amazonGraph')
YIELD node1, node2, similarity as node_similarity
WHERE 'Company' IN labels(gds.util.asNode(node1)) AND 'Company' IN labels(gds.util.asNode(node2))
AND node_similarity < 1
RETURN gds.util.asNode(node1).name AS Company1, gds.util.asNode(node2).name AS Company2, node_similarity
ORDER BY node_similarity DESCENDING, Company1, Company2
"""

def results_to_df(query: str) -> pd.DataFrame:
    """Run ``query`` with ``gds.execute_query`` and return its rows as a DataFrame.

    The first element of the value returned by ``gds.execute_query`` is used as
    the list of rows, and the keys of the first row become the column names.

    NOTE(review): neither ``gds`` nor ``pd`` is defined in the visible part of
    this notebook — they must be created/imported before this cell is run.

    Args:
        query: Cypher query to execute.

    Returns:
        pd.DataFrame: One row per result record; an empty frame when the query
        returns no rows.
    """
    results = gds.execute_query(query)[0]
    if len(results) == 0:
        # Without rows there is no first record to read the column names from.
        return pd.DataFrame()
    return pd.DataFrame(results, columns=results[0].keys())

df_node_similarity = results_to_df(node_similarity_query)
print(df_node_similarity)

### Create SAME_AS relationships

In [None]:
def create_same_as_relationship(df, column_name, min_score=0.20):
    """Create SAME_AS relationships for the row pairs whose score is high enough.

    For every row of ``df`` whose ``combined_score`` is strictly greater than
    ``min_score``, a ``SAME_AS`` relationship is created between the nodes whose
    ``name`` equals the values in the ``<column_name>1`` and ``<column_name>2``
    columns.

    The names are passed as query parameters instead of being interpolated into
    the Cypher text, so names containing quotes neither break the query nor
    allow Cypher injection.

    Args:
        df: DataFrame exposing ``iterrows()`` with the columns
            ``<column_name>1``, ``<column_name>2`` and ``combined_score``.
        column_name: Common prefix of the two name columns (e.g. ``"Company"``).
        min_score: Score threshold (exclusive); defaults to the previous
            hard-coded value of 0.20.
    """
    query = (
        "MATCH (n1), (n2) WHERE n1.name = $name1 AND n2.name = $name2 "
        "CREATE (n1)-[:SAME_AS]->(n2)"
    )

    # Iterate over the DataFrame rows
    for _, row in df.iterrows():
        node1 = row[column_name + '1']
        node2 = row[column_name + '2']

        if row["combined_score"] > min_score:
            # `gds` is expected to expose execute_query(query, parameters_=...).
            gds.execute_query(query, parameters_={"name1": node1, "name2": node2})

# NOTE(review): `selected_df` is not defined anywhere in the visible notebook —
# confirm where it is supposed to come from before running this cell.
create_same_as_relationship(selected_df, "Company")

### Merge nodes with SAME_AS relationship

In [None]:
# Cypher that deletes the SAME_AS relationships and then merges nodes with APOC.
# NOTE(review): the query gathers EVERY node that takes part in any SAME_AS
# relationship into one list (`uniqueNodesToMerge`) and passes that single list to
# apoc.refactor.mergeNodes in one call. Unrelated duplicate groups therefore look
# like they would be collapsed into one node — confirm this is intended.
merge_query = """
MATCH (n1)-[r:SAME_AS]->(n2)
WITH n1, n2, collect(r) as relsToDel

FOREACH (rel IN relsToDel | DELETE rel)
WITH collect(DISTINCT n1) + collect(DISTINCT n2) AS nodesToMerge

UNWIND nodesToMerge AS node

WITH collect(DISTINCT node) AS uniqueNodesToMerge
CALL apoc.refactor.mergeNodes(uniqueNodesToMerge, {mergeRels:true}) YIELD node
RETURN node
"""

# NOTE(review): `gds` is not defined in the visible part of this notebook.
gds.execute_query(merge_query)