# 02 - Cloud Build

## Setup

If you haven't already, install the toolkit and dependencies using the [Setup](./00-Setup.ipynb) notebook.

## Build

In [None]:
%reload_ext dotenv
%dotenv

import os

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

# Configure the toolkit's logging at the 'INFO' level.
set_logging_config('INFO')

# S3 location of the documents to build from.
# NOTE(review): bucket name, key prefix and collection id are hard-coded here
# (the key prefix looks like a placeholder) — confirm they match your environment.
docs_location = {
    'region': 'us-east-1',
    'bucket_name': 'ccms-rag-extract-188967239867',
    'key_prefix': 'key_prefix',
    'collection_id': 'demo123',
}
docs = S3BasedDocs(**docs_location)

# Named checkpoint handed to the build call below.
checkpoint = Checkpoint('s3-extraction-checkpoint')

# Both store settings are mandatory environment variables; a missing one raises KeyError.
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(graph_store, vector_store)
graph_index.build(docs, checkpoint=checkpoint, show_progress=True)

print('Build complete')

## Build from S3 and DynamoDB

In [None]:
%reload_ext dotenv
%dotenv

import os
import time
from datetime import datetime, timezone

import boto3

from graphrag_toolkit.lexical_graph import LexicalGraphIndex, set_logging_config
from graphrag_toolkit.lexical_graph.storage import GraphStoreFactory
from graphrag_toolkit.lexical_graph.storage import VectorStoreFactory
from graphrag_toolkit.lexical_graph.indexing.load import S3BasedDocs
from graphrag_toolkit.lexical_graph.indexing.build import Checkpoint

# Configure the toolkit's logging at the 'INFO' level.
set_logging_config('INFO')

# Open the DynamoDB table named by DYNAMODB_NAME in the region given by AWS_REGION.
# Both variables are mandatory; a missing one raises KeyError here.
dynamodb = boto3.resource(
    'dynamodb',
    region_name=os.environ['AWS_REGION'],
)
table = dynamodb.Table(
    os.environ['DYNAMODB_NAME'],
)

# COLLECTION_ID is optional and falls back to 'demo123' when unset.
collection_id = os.getenv('COLLECTION_ID', 'demo123')

# Check for existing IN_PROGRESS record.
# `status` is a DynamoDB reserved word, so the filter must refer to it through the
# `#status` placeholder declared in ExpressionAttributeNames; using the bare name
# makes DynamoDB reject the request.
query_kwargs = {
    'KeyConditionExpression': 'collection_id = :id',
    'FilterExpression': '#status = :status',
    'ExpressionAttributeNames': {'#status': 'status'},
    'ExpressionAttributeValues': {
        ':id': collection_id,
        ':status': 'IN_PROGRESS'
    }
}

items = []
try:
    # The filter is applied after each page has been read, so a page can come back
    # empty while later pages still contain a match. Keep paging until a match is
    # found or DynamoDB reports there is nothing left to read.
    while True:
        response = table.query(**query_kwargs)
        items.extend(response.get('Items', []))
        next_key = response.get('LastEvaluatedKey')
        if items or next_key is None:
            break
        query_kwargs['ExclusiveStartKey'] = next_key
except Exception as e:
    # Raise rather than call exit(1): IPython replaces the builtin with its own exit
    # helper, which is not guaranteed to stop the running cell, so the code below
    # would otherwise execute without a record.
    raise RuntimeError(
        f"Error querying DynamoDB for IN_PROGRESS record: {str(e)}"
    ) from e

# Checked outside the try block so this failure is not re-reported as a query error.
if not items:
    raise RuntimeError(
        f"Error: No IN_PROGRESS record found for collection_id {collection_id}"
    )

# Get the existing record (the first match, as before)
existing_record = items[0]

# Extract S3BasedDocs parameters from the record.
# All four fields are required; the first one that is missing aborts the cell.
try:
    s3_region = existing_record['aws_region']
    s3_bucket_name = existing_record['s3_bucket']
    s3_key_prefix = existing_record['s3_key_prefix']
    s3_collection_id = existing_record['collection_id']
except KeyError as e:
    # Raise rather than call exit(1): IPython replaces the builtin with its own exit
    # helper, which is not guaranteed to stop the running cell, so the build would
    # otherwise carry on with undefined parameters.
    raise RuntimeError(
        f"Error: Missing required field {str(e)} in IN_PROGRESS record"
    ) from e

# Build the document source from the S3 location read from the DynamoDB record.
docs_location = {
    'region': s3_region,
    'bucket_name': s3_bucket_name,
    'key_prefix': s3_key_prefix,
    'collection_id': s3_collection_id,
}
docs = S3BasedDocs(**docs_location)

# Named checkpoint handed to the build call later in this cell.
checkpoint = Checkpoint('s3-extraction-checkpoint')

# Both store settings are mandatory environment variables; a missing one raises KeyError.
graph_store = GraphStoreFactory.for_graph_store(os.environ['GRAPH_STORE'])
vector_store = VectorStoreFactory.for_vector_store(os.environ['VECTOR_STORE'])

graph_index = LexicalGraphIndex(graph_store, vector_store)

# Track start time and metadata
start_time = time.time()
status = 'BUILD'
error_message = None
# Timezone-aware replacement for the deprecated datetime.utcnow(); swapping the
# '+00:00' suffix for 'Z' keeps the stored string format unchanged.
completion_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')

# Update the existing record to BUILD.
# NOTE(review): put_item replaces the whole item, so any attribute of the existing
# record that is not listed below is dropped, and if the table has a sort key it
# would need to be included here — confirm against the table schema.
item = {
    'collection_id': collection_id,
    'completion_date': completion_date,
    'status': status,
    'reader_type': existing_record.get('reader_type', 'UNKNOWN'),
    's3_bucket': s3_bucket_name,
    's3_key_prefix': s3_key_prefix,
    'graph_store': os.environ['GRAPH_STORE'],
    'vector_store': os.environ['VECTOR_STORE'],
    'aws_region': s3_region,
    # Same '...Z' rendering as before, without the deprecated utcfromtimestamp().
    'start_time': datetime.fromtimestamp(start_time, tz=timezone.utc).isoformat().replace('+00:00', 'Z'),
    'duration': 0,
    'document_count': existing_record.get('document_count', 0),
    'error_message': None,
    'checkpoint': 's3-extraction-checkpoint',
    'user_id': os.environ.get('USER_ID', existing_record.get('user_id', 'unknown')),
    'environment_variables': {
        'EXTRACTION_MODEL': os.environ.get('EXTRACTION_MODEL', ''),
        'EMBEDDINGS_MODEL': os.environ.get('EMBEDDINGS_MODEL', ''),
        'EMBEDDINGS_DIMENSIONS': os.environ.get('EMBEDDINGS_DIMENSIONS', '')
    }
}

try:
    table.put_item(Item=item)
    print(f"Updated DynamoDB record for collection {collection_id} to BUILD")
except Exception as e:
    # Raise rather than call exit(1): IPython replaces the builtin with its own exit
    # helper, which is not guaranteed to stop the running cell, so the graph build
    # would otherwise start even though the status record could not be written.
    raise RuntimeError(
        f"Error updating DynamoDB record to BUILD: {str(e)}"
    ) from e

# Run the graph build. Any failure is captured in `status` / `error_message`
# instead of propagating, so the code after this block still runs.
try:
    graph_index.build(docs, checkpoint=checkpoint, show_progress=True)
except Exception as build_error:
    error_message = str(build_error)
    status = 'FAILED'
    print(f"Graph build failed: {error_message}")
else:
    status = 'COMPLETED'

# Remember the outcome of the build itself: `status` may be overwritten below if
# the DynamoDB write fails, and the closing message should reflect the build.
build_succeeded = status == 'COMPLETED'

# Update DynamoDB record with final status.
# Timezone-aware replacement for the deprecated datetime.utcnow(); swapping the
# '+00:00' suffix for 'Z' keeps the stored string format unchanged.
completion_date = datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z')
duration = int(time.time() - start_time)  # whole seconds since start_time

item.update({
    'completion_date': completion_date,
    'status': status,
    'duration': duration,
    'error_message': error_message
})

try:
    table.put_item(Item=item)
    print(f"Updated DynamoDB record for collection {collection_id} with status {status}")
except Exception as e:
    print(f"Error updating final DynamoDB record: {str(e)}")
    # Best effort: mark the record FAILED, noting both the build outcome and the
    # write error, and try the write once more.
    status = 'FAILED'
    error_message = f"Build: {error_message or 'Success'}, DynamoDB: {str(e)}"
    item.update({
        'status': status,
        'error_message': error_message
    })
    try:
        table.put_item(Item=item)
        print(f"Updated failure record for collection {collection_id} in DynamoDB")
    except Exception as retry_error:
        # Nothing more can be recorded; report the error and continue.
        print(f"Error updating failure DynamoDB record: {str(retry_error)}")

# Only claim completion when the build actually succeeded.
print('Build complete' if build_succeeded else 'Build failed')
print(f'collection_id: {collection_id}')