# 02 - Separate Extract and Build

## Setup

If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook.

## Local extract to folder

See [Run the extract and build stages separately](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#run-the-extract-and-build-stages-separately)

In [None]:
import logging

# Quieten the Neo4j driver: raise the notifications logger and the top-level
# 'neo4j' logger to ERROR so that lower-severity records are dropped.
for _neo4j_logger_name in ('neo4j.notifications', 'neo4j'):
    logging.getLogger(_neo4j_logger_name).setLevel(logging.ERROR)

# In addition, switch the top-level 'neo4j' logger off entirely.
logging.getLogger('neo4j').disabled = True


In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory
from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import FileBasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

# Import GraphRAG web reader instead of LlamaIndex directly
from graphrag_toolkit.lexical_graph.indexing.load.readers import WebReaderProvider, WebReaderConfig

set_logging_config('INFO')

# Make the Neo4j implementation available through GraphStoreFactory before
# any graph store is requested from it
GraphStoreFactory.register(Neo4jGraphStoreFactory)

# Web pages to run through the extract stage
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html',
]


def _web_metadata(url):
    """Build the metadata dict supplied to the web reader for *url*."""
    return {'url': url, 'source': 'web', 'domain': 'aws.amazon.com'}


# Handler given to extract(); configured with the local 'extracted' directory
extracted_docs = FileBasedDocs(docs_directory='extracted')

# Named checkpoint passed to the extract call below
checkpoint = Checkpoint('extraction-checkpoint')

# Graph and vector store settings are taken from the GRAPH_STORE and
# VECTOR_STORE environment variables
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])
graph_index = LexicalGraphIndex(graph_store, vector_store)

# Web reader with html_to_text enabled and per-URL metadata
web_config = WebReaderConfig(html_to_text=True, metadata_fn=_web_metadata)
web_reader = WebReaderProvider(web_config)

# Fetch the pages through the GraphRAG web reader
docs = web_reader.read(doc_urls)

# Run the extract stage only, passing the file-based handler and checkpoint
graph_index.extract(
    docs,
    handler=extracted_docs,
    checkpoint=checkpoint,
    show_progress=True,
)

# Report the collection id exposed by the handler
collection_id = extracted_docs.collection_id

print('Extraction complete')
print(f'collection_id: {collection_id}')


## Extraction to S3

See [Run the extract and build stages separately](https://github.com/awslabs/graphrag-toolkit/blob/main/docs/lexical-graph/indexing.md#run-the-extract-and-build-stages-separately)

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory
from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

# Import GraphRAG web reader instead of LlamaIndex directly
from graphrag_toolkit.lexical_graph.indexing.load.readers import WebReaderProvider, WebReaderConfig

set_logging_config('INFO')

# Make the Neo4j implementation available through GraphStoreFactory before
# any graph store is requested from it
GraphStoreFactory.register(Neo4jGraphStoreFactory)

# Web pages to run through the extract stage
doc_urls = [
    'https://docs.aws.amazon.com/neptune/latest/userguide/intro.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/what-is-neptune-analytics.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-features.html',
    'https://docs.aws.amazon.com/neptune-analytics/latest/userguide/neptune-analytics-vs-neptune-database.html',
]


def _web_metadata(url):
    """Build the metadata dict supplied to the web reader for *url*."""
    return {
        'url': url,
        'source': 'web',
        'domain': 'aws.amazon.com',
        'document_type': 'documentation',
    }


# S3 location for the extracted output, read from the environment
s3_region = os.environ['AWS_REGION']
s3_bucket = os.environ['S3_BUCKET_EXTRACK_BUILD_BATCH_NAME']
s3_prefix = os.environ["EXTRACT_BUILD_PREFIX"]

# Handler given to extract(); configured with the bucket, prefix and an
# explicit collection id
extracted_docs = S3BasedDocs(
    region=s3_region,
    bucket_name=s3_bucket,
    key_prefix=s3_prefix,
    collection_id='web-docs',
)

# Named checkpoint passed to the extract call below
checkpoint = Checkpoint('s3-extraction-web-docs-checkpoint-01')

# Graph and vector store settings are taken from the GRAPH_STORE and
# VECTOR_STORE environment variables
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])
graph_index = LexicalGraphIndex(graph_store, vector_store)

# Web reader with html_to_text enabled and per-URL metadata
web_config = WebReaderConfig(html_to_text=True, metadata_fn=_web_metadata)
web_reader = WebReaderProvider(web_config)

# Fetch the pages through the GraphRAG web reader
docs = web_reader.read(doc_urls)

# Run the extract stage only, passing the S3-based handler and checkpoint
graph_index.extract(
    docs,
    handler=extracted_docs,
    checkpoint=checkpoint,
    show_progress=True,
)

# Report the collection id exposed by the handler
collection_id = extracted_docs.collection_id

print('Extraction complete')
print(f'collection_id: {collection_id}')


## Using batch inference with the LexicalGraphIndex, writing to Amazon S3 and DynamoDB

Before running this cell, make sure you have reviewed `batch-extraction.md`. To create the required permissions, see `setup-bedrock-batch.md` in the `lexical-graph-hybrid-dev/aws` folder.

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import (
    GraphRAGConfig,
    IndexingConfig
    )

from graphrag_toolkit.lexical_graph.storage.graph.neo4j_graph_store_factory import Neo4jGraphStoreFactory
from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint
from graphrag_toolkit.lexical_graph.indexing.extract import BatchConfig

# Import GraphRAG readers instead of LlamaIndex directly
from graphrag_toolkit.lexical_graph.indexing.load.readers import (
    DirectoryReaderProvider, DirectoryReaderConfig,
    PDFReaderProvider, PDFReaderConfig
)

set_logging_config('INFO')

# Extraction batch size: EXTRACTION_BATCH_SIZE from the environment, else 4
batch_size = os.environ.get("EXTRACTION_BATCH_SIZE", 4)
GraphRAGConfig.extraction_batch_size = int(batch_size)

# Batch settings, all read from the environment
aws_region = os.environ["AWS_REGION"]
batch_bucket = os.environ["S3_BUCKET_EXTRACK_BUILD_BATCH_NAME"]
batch_prefix = os.environ["BATCH_PREFIX"]
aws_account = os.environ["AWS_ACCOUNT"]
batch_role_name = os.environ["BATCH_ROLE_NAME"]

batch_config = BatchConfig(
    region=aws_region,
    bucket_name=batch_bucket,
    key_prefix=batch_prefix,
    role_arn=f'arn:aws:iam::{aws_account}:role/{batch_role_name}',
)

# Indexing configuration that carries the batch settings
indexing_config = IndexingConfig(batch_config=batch_config)

# Make the Neo4j implementation available through GraphStoreFactory before
# any graph store is requested from it
GraphStoreFactory.register(Neo4jGraphStoreFactory)

# Handler given to extract(); same region and bucket as the batch config,
# with its own key prefix and an explicit collection id
extracted_docs = S3BasedDocs(
    region=aws_region,
    bucket_name=batch_bucket,
    key_prefix=os.environ["EXTRACT_BUILD_PREFIX"],
    collection_id='best-practices',
)

# Named checkpoint passed to the extract call below
checkpoint = Checkpoint('extraction-best-practices-checkpoint-01')

# Graph and vector store settings are taken from the GRAPH_STORE and
# VECTOR_STORE environment variables
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(
    graph_store,
    vector_store,
    indexing_config=indexing_config,
)


def _directory_metadata(path):
    """Build the metadata dict supplied to the directory reader for *path*."""
    return {
        'source': 'directory',
        'file_path': path,
        'document_type': 'best_practices',
        'collection': 'best-practices',
    }


def _pdf_metadata(path):
    """Build the metadata dict supplied to the PDF reader config for *path*."""
    return {
        'source': 'pdf',
        'file_path': path,
        'document_type': 'best_practices',
    }


# Directory reader configuration: recurse through SOURCE_DIR, PDF files only
source_dir = os.environ["SOURCE_DIR"]
directory_config = DirectoryReaderConfig(
    input_dir=source_dir,
    recursive=True,
    required_exts=[".pdf"],
    metadata_fn=_directory_metadata,
)

# PDF reader configuration with return_full_document disabled.
# NOTE(review): pdf_config is built here but never passed to a reader in this
# cell - confirm whether it should be wired in or removed.
pdf_config = PDFReaderConfig(
    return_full_document=False,
    metadata_fn=_pdf_metadata,
)

directory_reader = DirectoryReaderProvider(directory_config)

# Read the documents from the source directory
docs = directory_reader.read(source_dir)

# Run the extract stage only, passing the S3-based handler and checkpoint
graph_index.extract(
    docs,
    handler=extracted_docs,
    checkpoint=checkpoint,
    show_progress=True,
)

# Report the collection id exposed by the handler
collection_id = extracted_docs.collection_id

print('Extraction complete')
print(f'collection_id: {collection_id}')
