# 02 - Separate Extract and Build

## Setup

If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook.

## Local extract to folder

See [Run the extract and build stages separately](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#run-the-extract-and-build-stages-separately)

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import FileBasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

from llama_index.readers.web import SimpleWebPageReader

# Emit toolkit log messages at INFO level so pipeline progress shows up in the cell output
set_logging_config('INFO')

# Make the FalkorDB factory known to GraphStoreFactory before a graph store is requested
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Handler that receives the extraction output; configured with the local 'extracted' directory
extracted_docs = FileBasedDocs(docs_directory='extracted')

# Named checkpoint handed to extract() below
checkpoint = Checkpoint('extraction-checkpoint')

# Store connection strings are read from the environment (populated by %dotenv above)
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(graph_store, vector_store)

# Source pages to run through extraction
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html',
]


def url_as_metadata(url):
    """Return the metadata dict attached to a loaded page: its source URL."""
    return {'url': url}


# Fetch the pages as plain text, tagging each document with the URL it came from
web_reader = SimpleWebPageReader(html_to_text=True, metadata_fn=url_as_metadata)
docs = web_reader.load_data(doc_urls)

# Extract only: results go to the handler instead of being built into the stores here
graph_index.extract(
    docs,
    handler=extracted_docs,
    checkpoint=checkpoint,
    show_progress=True,
)

# Report the collection id exposed by the handler
collection_id = extracted_docs.collection_id

print('Extraction complete', f'collection_id: {collection_id}', sep='\n')

## Extraction to S3

See [Run the extract and build stages separately](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#run-the-extract-and-build-stages-separately)

In [2]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

from llama_index.readers.web import SimpleWebPageReader

# Emit toolkit log messages at INFO level so pipeline progress shows up in the cell output
set_logging_config('INFO')

# Make the FalkorDB factory known to GraphStoreFactory before a graph store is requested
GraphStoreFactory.register(FalkorDBGraphStoreFactory)

# Handler that receives the extraction output; region and bucket come from the
# environment, while the key prefix and collection id are fixed demo values
extracted_docs = S3BasedDocs(
    region=os.environ['AWS_REGION'],
    bucket_name=os.environ['S3_BUCKET_NAME'],
    key_prefix='key_prefix',
    collection_id='demo123',
)

# Named checkpoint handed to extract() below
checkpoint = Checkpoint('s3-extraction-checkpoint')

# Store connection strings are read from the environment (populated by %dotenv above)
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(graph_store, vector_store)

# Source pages to run through extraction
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html',
]


def url_as_metadata(url):
    """Return the metadata dict attached to a loaded page: its source URL."""
    return {'url': url}


# Fetch the pages as plain text, tagging each document with the URL it came from
web_reader = SimpleWebPageReader(html_to_text=True, metadata_fn=url_as_metadata)
docs = web_reader.load_data(doc_urls)

# Extract only: results go to the handler instead of being built into the stores here
graph_index.extract(
    docs,
    handler=extracted_docs,
    checkpoint=checkpoint,
    show_progress=True,
)

# Report the collection id exposed by the handler
collection_id = extracted_docs.collection_id

print('Extraction complete', f'collection_id: {collection_id}', sep='\n')

2025-05-09 12:57:20:INFO:g.l.i.e.extraction_pipeline:Running extraction pipeline [batch_size: 100, num_workers: 2]


Extracting propositions [nodes: 5, num_workers: 4]: 100%|██████████| 5/5 [00:04<00:00,  1.19it/s]
Extracting propositions [nodes: 10, num_workers: 4]: 100%|██████████| 10/10 [00:06<00:00,  1.53it/s]
Extracting topics [nodes: 5, num_workers: 4]: 100%|██████████| 5/5 [00:10<00:00,  2.16s/it]
Extracting topics [nodes: 10, num_workers: 4]: 100%|██████████| 10/10 [00:14<00:00,  1.42s/it]


2025-05-09 12:57:44:INFO:g.l.i.b.build_pipeline:Running build pipeline [batch_size: 4, num_workers: 1, job_sizes: [492], batch_writes_enabled: True, batch_write_size: 25]
Extraction complete
collection_id: demo123


## Using batch inference with the LexicalGraphIndex

Before running this section, make sure you have reviewed `batch-extraction.md`. To create the required permissions, see `setup-bedrock-batch.md` in the `lexical-graph-hybrid-dev/aws` folder.

In [None]:
!pip install PyMuPDF llama-index[pdf]


In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import (
    LexicalGraphIndex,
    GraphRAGConfig,
    IndexingConfig,
    set_logging_config,
)
from graphrag_toolkit.lexical_graph.storage import (
    GraphStoreFactory,
    VectorStoreFactory,
)
from graphrag_toolkit.lexical_graph.storage.graph.falkordb import FalkorDBGraphStoreFactory
from graphrag_toolkit.lexical_graph.indexing.extract import BatchConfig
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file import PyMuPDFReader

def batch_extract_and_load():
    """Run extraction over a local directory using an index configured for batch inference.

    All settings are read from environment variables:
    EXTRACTION_BATCH_SIZE (optional, defaults to 4), AWS_REGION,
    S3_BATCH_BUCKET_NAME, BATCH_KEY_PREFIX_01, AWS_ACCOUNT, BATCH_ROLE_NAME,
    BATCH_CHECKPOINT_01, GRAPH_STORE, VECTOR_STORE and BATCH_SOURCE_DIR.

    Raises:
        KeyError: If any required environment variable is not set.
        ValueError: If EXTRACTION_BATCH_SIZE cannot be converted to an int.

    NOTE(review): extract() is called here without a ``handler`` argument,
    unlike the earlier cells in this notebook -- confirm whether the extracted
    output is meant to be persisted or built into the stores by this function.
    """
    set_logging_config("INFO")

    # Make the FalkorDB factory known to GraphStoreFactory before a graph store is requested
    GraphStoreFactory.register(FalkorDBGraphStoreFactory)

    # Batch size may be overridden through the environment; falls back to 4
    requested_batch_size = os.environ.get("EXTRACTION_BATCH_SIZE", 4)
    GraphRAGConfig.extraction_batch_size = int(requested_batch_size)

    # S3 location and IAM role for the batch configuration
    aws_region = os.environ["AWS_REGION"]
    batch_bucket = os.environ["S3_BATCH_BUCKET_NAME"]
    batch_prefix = os.environ["BATCH_KEY_PREFIX_01"]
    account_id = os.environ["AWS_ACCOUNT"]
    role_name = os.environ["BATCH_ROLE_NAME"]
    batch_settings = BatchConfig(
        region=aws_region,
        bucket_name=batch_bucket,
        key_prefix=batch_prefix,
        role_arn=f'arn:aws:iam::{account_id}:role/{role_name}',
    )

    index_settings = IndexingConfig(batch_config=batch_settings)
    progress_checkpoint = Checkpoint(os.environ["BATCH_CHECKPOINT_01"])

    graph_backend = GraphStoreFactory.for_graph_store(os.environ["GRAPH_STORE"])
    vector_backend = VectorStoreFactory.for_vector_store(os.environ["VECTOR_STORE"])

    index = LexicalGraphIndex(
        graph_backend,
        vector_backend,
        indexing_config=index_settings,
    )

    # PDFs are parsed with PyMuPDF; other file types use the directory reader's defaults
    pdf_reader = PyMuPDFReader()
    source_reader = SimpleDirectoryReader(
        input_dir=os.environ["BATCH_SOURCE_DIR"],
        file_extractor={".pdf": pdf_reader},
    )
    source_docs = source_reader.load_data()

    index.extract(source_docs, checkpoint=progress_checkpoint, show_progress=True)

# Run the batch job: the call executes as soon as this cell runs and reads its
# settings from the environment variables referenced inside the function.
batch_extract_and_load()
