# 0. Libraries

In [1]:
# Utilities
import asyncio
import json
import os

# Neo4j and Neo4j GraphRAG imports
import neo4j
import numpy as np
import pandas as pd
import polars as pl
import requests
import tqdm.notebook as tqdm
import wikipedia
from dotenv import find_dotenv, load_dotenv
from duckduckgo_search import DDGS
from google import genai
from neo4j import GraphDatabase
from neo4j_graphrag.embeddings import SentenceTransformerEmbeddings
from neo4j_graphrag.experimental.components.resolver import (
    FuzzyMatchResolver,
    SinglePropertyExactMatchResolver,
    SpaCySemanticMatchResolver,
)
from neo4j_graphrag_custom.kg_builder import (
    CustomKGPipeline,
    GeminiLLM,
    build_kg_from_df,
)
from sentence_transformers import SentenceTransformer

Let's first check the available Gemini models.

In [2]:
load_dotenv('.env', override=True)

gemini_api_key = os.getenv('GEMINI_API_KEY')

if gemini_api_key:
    client = genai.Client(api_key=gemini_api_key)  # Configure the API key for genai
else:
    raise ValueError("GEMINI_API_KEY environment variable is not set.")

# Display available models
for model in client.models.list():
    print(model)

name='models/embedding-gecko-001' display_name='Embedding Gecko' description='Obtain a distributed representation of a text.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=1024 output_token_limit=1 supported_actions=['embedText', 'countTextTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-1.0-pro-vision-latest' display_name='Gemini 1.0 Pro Vision' description='The original Gemini 1.0 Pro Vision model version which was optimized for image understanding. Gemini 1.0 Pro Vision was deprecated on July 12, 2024. Move to a newer Gemini version.' version='001' endpoints=None labels=None tuned_model_info=TunedModelInfo(base_model=None, create_time=None, update_time=None) input_token_limit=12288 output_token_limit=4096 supported_actions=['generateContent', 'countTokens'] default_checkpoint_id=None checkpoints=None
name='models/gemini-pro-vision' display_name='Gemini 1.0 Pro Vi

We also have to make sure that the corresponding SpaCy model for text embedding used at the resolving step is installed.

In [3]:
import importlib.util
import subprocess
import sys

import spacy


def ensure_spacy_model(model_name):
    if importlib.util.find_spec(model_name) is None:
        print(f"Model '{model_name}' not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "spacy", "download", model_name])
    else:
        print(f"Model '{model_name}' is already installed.")

# Use it for 'en_core_web_lg'
ensure_spacy_model("en_core_web_lg")  # Model used for resolving entities in the KG pipeline

Model 'en_core_web_lg' is already installed.


In [4]:
with open('kg_building_config.json', 'r') as f:
    config = json.load(f)

config['examples_config']

{'pass_examples': False,
 'examples': [{'input_text': "Text: On January 1, 2023, a significant conflict erupted in the Middle East involving multiple countries and organizations. The conflict, named 'Middle East Conflict 2023', lasted until March 15, 2023. Key actors included the 'Middle East Coalition' and the 'Opposing Forces'. The conflict resulted in a high level of destruction and instability in the region.",
   'schema': {'nodes': [{'id': '0',
      'label': 'Event',
      'properties': {'name': 'Middle East Conflict 2023',
       'date': '2023-01-01',
       'end_date': '2023-03-15',
       'type': 'Conflict',
       'severity': 5,
       'description': 'A significant conflict in the Middle East.'}},
     {'id': '1',
      'label': 'Actor',
      'properties': {'name': 'Middle East Coalition', 'type': 'Organization'}},
     {'id': '2',
      'label': 'Actor',
      'properties': {'name': 'Opposing Forces', 'type': 'Organization'}},
     {'id': '3',
      'label': 'Region',
     

# 1. Loading the data

The data is loaded here as a reference, but it is loaded again inside the pipeline below.

## 1.2. Factal sample data

In [5]:
global df
df = pl.read_csv('factal_single_topic_report-2025-05-01-2025-06-05.csv')

# Create an index for each row
df = df.with_row_index(name="id", offset=1)
# Rename "Associated topics" column to "Country"
df = df.rename({"Associated topics": "Country"})

df=df.head(20)
df.head(2)

id,Published date,Severity,Published text,Translated text,Original language,Source URL,Status,Country
u32,str,i64,str,str,str,str,str,str
1,"""2025-06-03 15:28:28.271179+00:…",3,"""WFP and UNICEF now say five me…",,,"""https://www.unicef.org/press-r…","""published""","""Sudan"""
2,"""2025-06-03 10:10:04.994458+00:…",3,"""""Multiple casualties"" after WF…",,,"""https://www.reuters.com/world/…","""published""","""Sudan"""


### Load Admin1 locations from HDX database

In [6]:
global df2
admin1 = pd.read_csv(r"C:\Users\matia\Downloads\global_pcodes_adm_1_2.csv")
df2 = admin1[admin1['Location'] == 'SDN'].copy()
df2 = df2.reset_index(drop=True)
df2 = df2.reset_index(names='id')
df2['id'] = df2['id'] + 1
df2['Country'] = 'Sudan'
df2.head(2)

Unnamed: 0,id,Location,Admin Level,P-Code,Name,Parent P-Code,Valid from date,Country
0,1,SDN,1,SD01,Khartoum,SDN,2020-08-31,Sudan
1,2,SDN,1,SD02,North Darfur,SDN,2020-08-31,Sudan


# 2. Running the pipeline

In [7]:
# Open configuration file
script_dir = os.getcwd()
config_path = os.path.join(script_dir, 'kg_building_config.json')
with open(config_path, 'r') as config_file:
    config = json.load(config_file)
prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None
print(prompt_template)

You are a top-tier algorithm designed for extracting information in structured formats to build a knowledge graph that will be used for creating security reports for different countries.

Extract the entities (nodes) and specify their type from the following Input text.
Also extract the relationships between these nodes. The relationship direction goes from the start node to the end node.

Output ONLY valid JSON with this exact structure without any commentary, explanation, or markdown formatting:
{{"nodes": [ {{"id": "0", "label": "the type of entity", "properties": {{"name": "name of entity" }} }}],
"relationships": [{{"type": "TYPE_OF_RELATIONSHIP", "start_node_id": "0", "end_node_id": "1", "properties": {{"details": "Description of the relationship"}} }}] }}

- Use only the information from the Input text. Do not add any additional information.
- If the input text is empty, return empty Json.
- Make sure to create as many nodes and relationships as needed to offer rich context for 

## 2.2. With a data frame

### A. Using the `SpaCySemanticMatchResolver`

More useful information about the resolvers can be found in the [user guide](https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_kg_builder.html#entity-resolver). Below, we use different resolvers (from the most aggressive - spaCy to the most conservative - exact matching) to get a broad overview of the performance results.

#### With Factal

In [None]:
# Example usage code
async def main():

    # Load configuration and setup

    script_dir = os.getcwd()

    # script_dir = os.path.dirname(os.path.abspath(__file__))  # Uncomment if running as a script

    # Load environment variables from a .env file
    dotenv_path = os.path.join(script_dir, '.env')
    load_dotenv(dotenv_path, override=True)

    # Open configuration file from JSON format
    config_path = os.path.join(script_dir, 'kg_building_config.json')
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)
    
    # Neo4j connection
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    
    # Check if gemini_api_key is set
    if gemini_api_key:
        pass
    else:
        raise ValueError("Gemini API key is not set. Please provide a valid API key.")

    # Initialize LLM
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )
    
    # Initialize embedder
    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])
    
    # Configure text splitter
    text_splitter_config = config['text_splitter_config']
    
    # Load data 
    #df_path = os.path.join(script_dir, 'factal_single_topic_report-2025-05-01-2025-06-05.csv')
    #df = pl.read_csv(    df = df.with_row_index(name="id", offset=1)

    # Access the global df variable
    global df
    # Add row index only once
    #df = df.with_row_index(name="id", offset=1)

    # Convert the "id" to a string to ensure it is treated as a document ID
    df = df.with_columns(pl.col('id').cast(pl.String))
    
    # Create the pipeline - use with statement to ensure proper resource management
    # and to ensure the driver is closed after use
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        
        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(  # Merge nodes with same label and similar textual properties
            driver,
            filter_query=None,  # "WHERE (entity)-[:FROM_CHUNK]->(:Chunk)-[:FROM_DOCUMENT]->(doc:Document {id = 'docId'}",  # Used to reduce the resolution scope to a specific document
            resolve_properties=["name"],  # Properties to use for resolution (default is "name")
            similarity_threshold=0.8,  # The similarity threshold above which nodes are merged (default is 0.8). Higher threshold will result in less false positives, but may miss some matches. 
            spacy_model="en_core_web_lg"  # spaCy model to use for resolution (default is "en_core_web_lg")
        )

        # Initialize the custom KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None,
            text_splitter_config=text_splitter_config,
            resolver=resolver,
            examples_config=None,  # Use None if no examples are provided
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )
        
        # Define metadata mapping (document properties additional to base field 
        # to dataframe columns)
        metadata_mapping = {
            "source": "Source URL",
            "published_date": "Published date"
        }
        
        # Process the dataframe
        results = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='id',
            text_column='Published text',
            document_metadata_mapping=metadata_mapping,
            document_id_column=None  # Use default document ID generation
        )
    
    return results

# Asyncio event loop to run the main function in a Jupyter notebook
results = await main()
print(f"Processed {len(results)} documents")

# # Asyncio event loop to run the main function in a script
# if __name__ == "__main__":
#     results = asyncio.run(main())
#     print(f"Processed {len(results)} documents")

# NASTIA

In [None]:
async def main():
    global df, df2 

    # Load configuration and setup
    script_dir = os.getcwd()

    # Load environment variables from a .env file
    dotenv_path = os.path.join(script_dir, '.env')
    load_dotenv(dotenv_path, override=True)

    # Open configuration file from JSON format
    config_path = os.path.join(script_dir, 'kg_building_config.json')
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Neo4j connection
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')

    # Check if gemini_api_key is set
    if gemini_api_key:
        pass
    else:
        raise ValueError("Gemini API key is not set. Please provide a valid API key.")

    # Initialize LLM
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )

    # Initialize embedder
    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])

    # Configure text splitter
    text_splitter_config = config['text_splitter_config']

    # Define metadata mapping (document properties additional to base field
    # to dataframe columns)
    metadata_mapping = {
        "source": "Source URL",
        "published_date": "Published date",
        "country": "Country"
    }

    # Factal DF
    global df
    df = df.with_columns(pl.col('id').cast(pl.String))

    # Admin1 DF
    global df2

    # Create the pipeline - use with statement to ensure proper resource management
    # and to ensure the driver is closed after use
    all_results = []
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:

        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(
            driver,
            filter_query=None,
            resolve_properties=["name"],
            similarity_threshold=0.8,
            spacy_model="en_core_web_lg"
        )

        # Initialize the custom KG pipeline (this needs to be done only once)
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None,
            text_splitter_config=text_splitter_config,
            resolver=resolver,
            examples_config=None,
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )

        # --- Process the first dataframe ---
        print("\nProcessing first DataFrame...")
        results_df1 = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='id',
            text_column='Published text',
            document_metadata_mapping=metadata_mapping,
            document_id_column=None
        )
        all_results.extend(results_df1)
        print(f"Finished processing first DataFrame. Total documents processed: {len(all_results)}")

        # --- Process the second dataframe (if loaded successfully) ---
        if df2 is not None and not df2.is_empty():
            print("\nProcessing second DataFrame...")
            results_df2 = await build_kg_from_df(
                kg_pipeline=kg_pipeline,
                df=df2, # Pass the second DataFrame here
                document_base_field='id',
                text_column='Name', # Make sure this column name exists in df2
                document_metadata_mapping=metadata_mapping, # Reuse mapping or define a new one if df2 has different metadata
                document_id_column=None
            )
            all_results.extend(results_df2)
            print(f"Finished processing second DataFrame. Total documents processed: {len(all_results)}")
        else:
            print("Second DataFrame was not processed (either not loaded or empty).")

    return all_results

# Asyncio event loop to run the main function in a Jupyter notebook
results = await main()
print(f"Processed {len(results)} documents")


Processing first DataFrame...
Processing row 1 of 20


LLM response is not valid JSON for chunk_index=0


Result: run_id='a566a80a-3644-4348-9d69-39279e8f59e7' result={'resolver': {'number_of_nodes_to_resolve': 0, 'number_of_created_nodes': 0}}
Elapsed time: 9.59 seconds
Estimated time remaining: 182.24 seconds

Processing row 2 of 20


### B. Using the `FuzzyMatchResolver`

In [None]:
# Example usage code
async def main():

    # Load configuration and setup
    script_dir = os.getcwd()

    # Load environment variables from a .env file
    dotenv_path = os.path.join(script_dir, '.env')
    load_dotenv(dotenv_path, override=True)

    # Open configuration file from JSON format
    config_path = os.path.join(script_dir, 'kg_building_config.json')
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)
    
    # Neo4j connection
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    
    # Check if gemini_api_key is set
    if gemini_api_key:
        pass
    else:
        raise ValueError("Gemini API key is not set. Please provide a valid API key.")

    # Initialize LLM
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )
    
    # Initialize embedder
    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])
    
    # Configure text splitter
    text_splitter_config = config['text_splitter_config']
    
    # Use the global dataframe instead of trying to load the non-existent parquet file
    global df
    
    # Convert the "id" to a string to ensure it is treated as a document ID
    df = df.with_columns(pl.col('id').cast(pl.String))
    
    # Create the pipeline
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        
        # Initialize entity resolver
        resolver = FuzzyMatchResolver(  # Merge nodes with same label and similar textual properties
            driver,
            filter_query=None,
            resolve_properties=["name"],
            similarity_threshold=0.8,
        )

        # Initialize the custom KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None,
            text_splitter_config=text_splitter_config,
            resolver=resolver,
            examples_config=None,
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )
        
        # Define metadata mapping - adjust to match your factal CSV columns
        metadata_mapping = {
            "source": "Source URL",
            "published_date": "Published date"
        }
        
        # Process the dataframe - use column names from your factal CSV
        results = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='id',
            text_column='Published text', 
            document_metadata_mapping=metadata_mapping,
            document_id_column=None
        )
    
    return results

# Asyncio event loop to run the main function
results = await main()
print(f"Processed {len(results)} documents")

### C. Using the `SinglePropertyExactMatchResolver`

In [None]:
# Example usage code
async def main():

    # Load configuration and setup

    script_dir = os.getcwd()

    # script_dir = os.path.dirname(os.path.abspath(__file__))  # Uncomment if running as a script

    # Load environment variables from a .env file
    dotenv_path = os.path.join(script_dir, '.env')
    load_dotenv(dotenv_path, override=True)

    # Open configuration file from JSON format
    config_path = os.path.join(script_dir, 'kg_building_config.json')
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)
    
    # Neo4j connection
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    
    # Check if gemini_api_key is set
    if gemini_api_key:
        pass
    else:
        raise ValueError("Gemini API key is not set. Please provide a valid API key.")

    # Initialize LLM
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )
    
    # Initialize embedder
    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])
    
    # Configure text splitter
    text_splitter_config = config['text_splitter_config']
    
    # Load data
    df_path = os.path.join(script_dir, 'FILTERED_DATAFRAME.parquet')
    df = pl.read_parquet(df_path)

    # Convert 'date' column to string format (from YYYYMMDD to YYYY-MM-DD)
    df = df.with_columns(pl.col('date').cast(pl.String))
    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format='%Y%m%d'))
    df = df.with_columns(pl.col('date').dt.strftime('%Y-%m-%d'))

    # Create subset of the dataframe for testing
    df = df.head(10)
    
    # Create the pipeline - use with statement to ensure proper resource management
    # and to ensure the driver is closed after use
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        
        # Initialize entity resolver
        resolver = SinglePropertyExactMatchResolver(  # Merge nodes with same label and exact property
            driver,
            filter_query=None,  # "WHERE (entity)-[:FROM_CHUNK]->(:Chunk)-[:FROM_DOCUMENT]->(doc:Document {id = 'docId'}",  # Used to reduce the resolution scope to a specific document
            resolve_property="name"  # Property to use for resolution (default is "name")
        )

        # Initialize the custom KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None,
            text_splitter_config=text_splitter_config,
            resolver=resolver,
            examples_config=None,
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5,
        )
        
        # Define metadata mapping (document properties additional to base field 
        # to dataframe columns)
        metadata_mapping = {
            "source": "url",
            "published_date": "date"
        }
        
        # Process the dataframe
        results = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='title',
            text_column='full_text',
            document_metadata_mapping=metadata_mapping,
            document_id_column=None  # Use default document ID generation
        )
    
    return results

# Asyncio event loop to run the main function in a Jupyter notebook
results = await main()
print(f"Processed {len(results)} documents")

# # Asyncio event loop to run the main function in a script
# if __name__ == "__main__":
#     results = asyncio.run(main())
#     print(f"Processed {len(results)} documents")

### Insights

- The **spaCy** resolver is the one which yields the smallest graph, and the one which resolves the most nodes among the default resolvers. However, it seems to be the slowest resolver.
- The `FuzzyMatchResolver` does not seem to work very well out of the box, as the number of duplicate nodes and edges increases significantly.
- Finally, the `SinglePropertyExactMatchResolver` produces the largest graph among the 3 resolvers, but prunes a relatively decent amount of nodes and edges when running the instance twice.

### Ex-post resolver

Note that it is also possible to run a resolver ex-post, after the graph has been created (see the [user guide for the resolvers](https://neo4j.com/docs/neo4j-graphrag-python/current/user_guide_kg_builder.html#entity-resolver)) and outside the pipeline of the KG building.

In [None]:
# Neo4j connection
neo4j_uri = os.getenv('NEO4J_URI')
neo4j_username = os.getenv('NEO4J_USERNAME')
neo4j_password = os.getenv('NEO4J_PASSWORD')

with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        
        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(  # Merge nodes with same label and similar textual properties
            driver,
            filter_query=None,  # "WHERE (entity)-[:FROM_CHUNK]->(:Chunk)-[:FROM_DOCUMENT]->(doc:Document {id = 'docId'}",  # Used to reduce the resolution scope to a specific document
            resolve_properties=["name"],  # Properties to use for resolution (default is "name")
            similarity_threshold=0.8,  # The similarity threshold above which nodes are merged (default is 0.8). Higher threshold will result in less false positives, but may miss some matches. 
            spacy_model="en_core_web_lg"  # spaCy model to use for resolution (default is "en_core_web_lg")
        )

        result = await resolver.run()
        print("Entity resolution completed.")

print(result)

Entity resolution completed.
number_of_nodes_to_resolve=164 number_of_created_nodes=0


## 2.3. Creating the KG without a schema

## 2.4. Passing examples for few-shot learning

Ensure that the `pass_examples` key in the JSON configuration file is set to `true`.

In [None]:
# Example usage code
async def main():

    # Load configuration and setup

    script_dir = os.getcwd()

    # script_dir = os.path.dirname(os.path.abspath(__file__))  # Uncomment if running as a script

    # Load environment variables from a .env file
    dotenv_path = os.path.join(script_dir, '.env')
    load_dotenv(dotenv_path, override=True)

    # Open configuration file from JSON format
    config_path = os.path.join(script_dir, 'kg_building_config.json')
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)
    
    # Neo4j connection
    neo4j_uri = os.getenv('NEO4J_URI')
    neo4j_username = os.getenv('NEO4J_USERNAME')
    neo4j_password = os.getenv('NEO4J_PASSWORD')
    gemini_api_key = os.getenv('GEMINI_API_KEY')
    
    # Check if gemini_api_key is set
    if gemini_api_key:
        pass
    else:
        raise ValueError("Gemini API key is not set. Please provide a valid API key.")

    # Initialize LLM
    llm = GeminiLLM(
        model_name=config['llm_config']['model_name'],
        google_api_key=gemini_api_key,
        model_params=config['llm_config']['model_params']
    )
    
    # Initialize embedder
    embedder = SentenceTransformerEmbeddings(model=config['embedder_config']['model_name'])
    
    # Configure text splitter
    text_splitter_config = config['text_splitter_config']
    
    # Load data
    df_path = os.path.join(script_dir, 'FILTERED_DATAFRAME.parquet')
    df = pl.read_parquet(df_path)

    # Convert 'date' column to string format (from YYYYMMDD to YYYY-MM-DD)
    df = df.with_columns(pl.col('date').cast(pl.String))
    df = df.with_columns(pl.col('date').str.strptime(pl.Date, format='%Y%m%d'))
    df = df.with_columns(pl.col('date').dt.strftime('%Y-%m-%d'))

    # Create subset of the dataframe for testing
    df = df.head(10)
    
    # Create the pipeline - use with statement to ensure proper resource management
    # and to ensure the driver is closed after use
    with neo4j.GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password)) as driver:
        
        # Initialize entity resolver
        resolver = SpaCySemanticMatchResolver(  # Merge nodes with same label and similar textual properties
            driver,
            filter_query=None,  # "WHERE (entity)-[:FROM_CHUNK]->(:Chunk)-[:FROM_DOCUMENT]->(doc:Document {id = 'docId'}",  # Used to reduce the resolution scope to a specific document
            resolve_properties=["name"],  # Properties to use for resolution (default is "name")
            similarity_threshold=0.8,  # The similarity threshold above which nodes are merged (default is 0.8). Higher threshold will result in less false positives, but may miss some matches. 
            spacy_model="en_core_web_lg"  # spaCy model to use for resolution (default is "en_core_web_lg")
        )

        # Initialize the custom KG pipeline
        kg_pipeline = CustomKGPipeline(
            llm=llm,
            driver=driver,
            embedder=embedder,
            schema_config=config['schema_config'],
            prompt_template=config['prompt_template_config']['template'] if config['prompt_template_config'].get('use_default') == False else None,
            text_splitter_config=text_splitter_config,
            resolver=resolver,
            examples_config=config['examples_config'],  # Use None if no examples are provided
            on_error='RAISE',
            batch_size=1000,
            max_concurrency=5
        )
        
        # Define metadata mapping (document properties additional to base field 
        # to dataframe columns)
        metadata_mapping = {
            "source": "url",
            "published_date": "date"
        }
        
        # Process the dataframe
        results = await build_kg_from_df(
            kg_pipeline=kg_pipeline,
            df=df,
            document_base_field='title',
            text_column='full_text',
            document_metadata_mapping=metadata_mapping,
            document_id_column=None  # Use default document ID generation
        )
    
    return results

# Asyncio event loop to run the main function in a Jupyter notebook
results = await main()
print(f"Processed {len(results)} documents")

# # Asyncio event loop to run the main function in a script
# if __name__ == "__main__":
#     results = asyncio.run(main())
#     print(f"Processed {len(results)} documents")

Obviously, the examples provided have to be useful for the LLM.

# Entity Resolution Pipeline

In [None]:
# Connect to Neo4j
driver = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=("neo4j", os.getenv("NEO4J_PASSWORD"))
)

# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    return model.encode(text).tolist()

# List of relevant node labels for deduplication
ENTITY_LABELS = ["Event", "Actor", "Country", "ADM1", "Location"]

def get_all_entities():
    query_template = """
    MATCH (n:{label})
    RETURN elementId(n) AS id, labels(n)[0] AS label, n.name AS name, properties(n) AS properties

    """
    all_entities = []
    with driver.session() as session:
        for label in ENTITY_LABELS:
            result = session.run(query_template.format(label=label)).data()
            all_entities.extend(result)
    return all_entities

def cosine_similarity(vec1, vec2):
    if vec1 is None or vec2 is None or len(vec1) == 0 or len(vec2) == 0:
        return 0  # If either vector is None or empty, return 0 similarity
    dot = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot / (norm1 * norm2) if norm1 > 0 and norm2 > 0 else 0

def find_similar_entities(threshold=0.5):
    entities = get_all_entities()
    
    # Compute embeddings
    for entity in entities:
        text = f"Name: {entity['name']}\n"
        for key, value in entity['properties'].items():
            if key != 'name' and key != 'embedding' and value:
                text += f"{key}: {value}\n"
        entity['embedding'] = get_embedding(text)
    
    # Find similar pairs
    similar_pairs = []
    for i, e1 in enumerate(entities):
        for j, e2 in enumerate(entities[i + 1:], i + 1):
            if e1['label'] != e2['label']:  # Only compare same types
                continue
            sim = cosine_similarity(e1['embedding'], e2['embedding'])
            if sim > threshold:
                similar_pairs.append({
                    "id1": e1['id'],
                    "id2": e2['id'],
                    "name1": e1['name'],
                    "name2": e2['name'],
                    "type1": e1['properties'].get('type', e1['label']),  # Get type from properties or fall back to label
                    "type2": e2['properties'].get('type', e2['label']),
                    "similarity": sim
                })
    
    # Create SAME_AS relationships
    query = """
    MATCH (a), (b)
    WHERE elementId(a) = $id1 AND elementId(b) = $id2
    MERGE (a)-[:SAME_AS {similarity: $similarity}]->(b)
    """
    with driver.session() as session:
        for pair in similar_pairs:
            session.run(query, pair)
    
    return similar_pairs

def merge_similar_nodes():
    merge_query = """
    // Process one pair of nodes at a time to avoid conflicts
    MATCH (n1)-[r:SAME_AS]->(n2)
    WHERE n1 IS NOT NULL AND n2 IS NOT NULL
    
    // Copy properties from n2 to n1 if they don't exist in n1
    WITH n1, n2, [key IN keys(n2) WHERE NOT key IN keys(n1)] AS newKeys
    FOREACH (key IN newKeys | SET n1[key] = n2[key])
    
    // Get all outgoing relationships from n2 (except SAME_AS)
    WITH n1, n2
    OPTIONAL MATCH (n2)-[outRel]->(target)
    WHERE target IS NOT NULL AND type(outRel) <> 'SAME_AS'
    
    // Create equivalent relationships from n1 if they don't already exist
    WITH n1, n2, outRel, target, type(outRel) AS relType
    WHERE NOT EXISTS((n1)-[:`${relType}`]->(target))
    
    // Create new relationship with the same properties
    FOREACH (_ IN CASE WHEN outRel IS NOT NULL THEN [1] ELSE [] END |
        CREATE (n1)-[newRel:`${relType}`]->(target)
        SET newRel = properties(outRel)
    )
    
    // Return the node pair for the next phase
    WITH DISTINCT n1, n2
    
    // Handle incoming relationships
    OPTIONAL MATCH (source)-[inRel]->(n2)
    WHERE source IS NOT NULL AND source <> n1 AND type(inRel) <> 'SAME_AS'
    
    // Create equivalent relationships to n1 if they don't already exist
    WITH n1, n2, inRel, source, type(inRel) AS relType
    WHERE NOT EXISTS((source)-[:`${relType}`]->(n1))
    
    // Create new relationship with the same properties
    FOREACH (_ IN CASE WHEN inRel IS NOT NULL THEN [1] ELSE [] END |
        CREATE (source)-[newRel:`${relType}`]->(n1)
        SET newRel = properties(inRel)
    )
    
    // Return distinct pairs for deletion phase
    WITH DISTINCT n1, n2
    
    // Delete the second node and all its relationships
    DETACH DELETE n2
    
    RETURN count(n2) AS mergedCount
    """
    
    try:
        with driver.session() as session:
            result = session.run(merge_query)
            record = result.single()
            return record["mergedCount"] if record else 0
    except Exception as e:
        print(f"Error during node merging: {e}")
        return 0

def check_apoc():
    try:
        with driver.session() as session:
            session.run("CALL apoc.help('create')")
            print("APOC is available.")
            return True
    except Exception as e:
        print(f"APOC not available: {e}")
        return False

# === MAIN EXECUTION ===
check_apoc()
print("Resolving similar entities...")
pairs = find_similar_entities().sort(by="similarity", ascending=False)
print(f"Found {len(pairs)} similar entity pairs.")

APOC is available.
Resolving similar entities...
Found 1008 similar entity pairs.


In [49]:
print(pairs)

[{'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:52', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:204', 'name1': '727 cholera cases', 'name2': 'Cholera outbreak', 'type1': 'Disease Cases', 'type2': 'Disease Outbreak', 'similarity': np.float64(0.83766637758409)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:52', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:208', 'name1': '727 cholera cases', 'name2': 'cholera-related deaths and infections', 'type1': 'Disease Cases', 'type2': 'Disease Outbreak', 'similarity': np.float64(0.8441095024319051)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:184', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:192', 'name1': 'closure of main hospital', 'name2': 'Hospital', 'type1': 'Closure', 'type2': 'medical facility', 'similarity': np.float64(0.8070646261053283)}, {'id1': '4:47781cee-4592-4361-af19-475a2abd1ee7:204', 'id2': '4:47781cee-4592-4361-af19-475a2abd1ee7:208', 'name1': 'Cholera outbreak', 'name2': 'cholera-related deaths and infections',

In [None]:
merged = merge_similar_nodes()
print(f"Merged {merged} nodes.")

In [21]:
gds = GraphDatabase.driver(
    os.getenv("NEO4J_URI"),
    auth=(
        "neo4j",
        os.getenv("NEO4J_PASSWORD")
    )
)

client = genai.Client(api_key=gemini_api_key)

### Node similarity

In [None]:
node_similarity_query = """
CALL gds.nodeSimilarity.stream('amazonGraph')
YIELD node1, node2, similarity as node_similarity
WHERE 'Company' IN labels(gds.util.asNode(node1)) AND 'Company' IN labels(gds.util.asNode(node2))
AND node_similarity < 1
RETURN gds.util.asNode(node1).name AS Company1, gds.util.asNode(node2).name AS Company2, node_similarity
ORDER BY node_similarity DESCENDING, Company1, Company2
"""

def results_to_df(query: str) -> pd.DataFrame:
    results = gds.execute_query(query)[0]
    df = pd.DataFrame(results, columns=results[0].keys())
    return df

df_node_similarity = results_to_df(node_similarity_query)
print(df_node_similarity)

### Embeddings and Cosine Similarity

In [None]:
companies_query = """
MATCH (c:Company)
RETURN c, properties(c) as properties, id(c) as id
"""

def properties_to_text(node) -> str:
    """Given node, return string of all its properties"""
    property_text = ""
    properties = [p for p in node.get('properties').keys() if p != 'embedding']
    for property_name in properties:
        property_text += f"{property_name}: {node.get('properties').get(property_name)}\n"
    return property_text

def add_node_embedding(node):
    text = properties_to_text(node)
    embedding = get_embedding(text) # Assuming get_embedding is defined elsewhere, likely from OpenAI as seen in previous context
    add_embeddings_query = f"""
    MATCH (n)
    WHERE id(n) = {node.get('id')}
    SET n.embedding = {embedding}
    """
    gds.execute_query(add_embeddings_query)
    
results = gds.execute_query(companies_query)[0]
for r in results:
    add_node_embedding(r)

cosine_similarity_query = """
// Cosine Similarity
MATCH (c1:Company), (c2:Company)
WHERE id(c1) < id(c2)
WITH c1, c2, gds.similarity.cosine(c1.embedding, c2.embedding) AS cosine_similarity
WHERE cosine_similarity < 1
RETURN c1.name AS Company1, c2.name AS Company2, cosine_similarity
ORDER BY cosine_similarity DESC
"""

df_cosine_similarity = results_to_df(cosine_similarity_query)
print(df_cosine_similarity)

### Combine similarity scores

In [None]:
combined_df = pd.merge(df_cosine_similarity, df_node_similarity, how='inner', on=("Company1", "Company2"))
combined_df["combined_score"] = combined_df["node_similarity"] * combined_df["cosine_similarity"]
combined_df.sort_values(by="combined_score", ascending=False, inplace=True)

selected_df = combined_df[["Company1", "Company2", "combined_score", "node_similarity", "cosine_similarity"]]
print(selected_df)

### Create SAME_AS relationships

In [None]:
def create_same_as_relationship(df, column_name):
    # Iterate over the DataFrame rows
    for _, row in df.iterrows():
        node1 = row[column_name + '1']
        node2 = row[column_name + '2']

        # Run Cypher query to create 'SAME_AS' relationship
        score = row["combined_score"]
        if score > 0.20:
            query = f"MATCH (n1), (n2) WHERE n1.name = '{node1}' AND n2.name = '{node2}' CREATE (n1)-[:SAME_AS]->(n2)"
            gds.execute_query(query)

create_same_as_relationship(selected_df, "Company")

### Merge nodes with SAME_AS relationship

In [None]:
merge_query = """
MATCH (n1)-[r:SAME_AS]->(n2)
WITH n1, n2, collect(r) as relsToDel

FOREACH (rel IN relsToDel | DELETE rel)
WITH collect(DISTINCT n1) + collect(DISTINCT n2) AS nodesToMerge

UNWIND nodesToMerge AS node

WITH collect(DISTINCT node) AS uniqueNodesToMerge
CALL apoc.refactor.mergeNodes(uniqueNodesToMerge, {mergeRels:true}) YIELD node
RETURN node
"""

gds.execute_query(merge_query)