# Step 0: Setup

This notebook initializes the environment and sets up the document for processing through the Intelligent Document Processing (IDP) pipeline.

**Outputs:**
- Document object with basic metadata
- S3 buckets created
- Sample document uploaded
- Environment configured

## 1. Install Dependencies

In [None]:
# Relative path to the directory that contains `lib/` and `samples/`
# (resolved against the notebook's working directory).
ROOTDIR="../.."
# Sample PDF that is uploaded to the input bucket later in this notebook.
SAMPLE_PDF_PATH = f"{ROOTDIR}/samples/rvl_cdip_package.pdf"

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to the
# editable install below are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# First uninstall existing package (to ensure we get the latest version)
%pip uninstall -y idp_common

# Install the IDP common package with all components in development mode
# (ROOTDIR is interpolated by IPython from the variable set in the previous cell)
%pip install -q -e "{ROOTDIR}/lib/idp_common_pkg[dev, all]"

# Check installed version
%pip show idp_common | grep -E "Version|Location"

# Optionally use a .env file for environment variables
# (python-dotenv is optional: when it is not installed the import fails and this step is skipped)
try:
    from dotenv import load_dotenv
    load_dotenv()  # reads variables from a .env file into the process environment
except ImportError:
    pass

## 2. Import Libraries and Configure Logging

In [None]:
import os
import json
import yaml
import boto3
import logging
import datetime
from pathlib import Path

# Import base libraries
from idp_common.models import Document, Status

# Logging: WARNING by default, INFO for the idp_common package
logging.basicConfig(level=logging.WARNING)
idp_logger = logging.getLogger('idp_common')
idp_logger.setLevel(logging.INFO)

print("Libraries imported successfully")

## 3. Load Configuration Files

In [None]:
# Load all configuration files
# Each YAML file contributes its top-level keys to the single CONFIG dictionary,
# which is written to config.json at the end of this notebook.
config_dir = Path("config")
CONFIG = {}

# Load each configuration file
config_files = [
    "ocr.yaml",
    "classification.yaml", 
    "extraction.yaml",
    "assessment.yaml",
    "summarization.yaml",
    "evaluation.yaml",
    "classes.yaml"
]

for config_file in config_files:
    config_path = config_dir / config_file
    if not config_path.exists():
        # A missing file is reported but does not stop the setup
        print(f"Warning: {config_file} not found")
        continue

    with open(config_path, 'r', encoding='utf-8') as f:
        file_config = yaml.safe_load(f)

    # safe_load returns None for an empty file; treat that as "no sections"
    # instead of letting CONFIG.update(None) fail with an opaque TypeError
    if file_config is None:
        file_config = {}

    # Only a mapping can be merged into CONFIG; fail with a message that names the file
    if not isinstance(file_config, dict):
        raise TypeError(
            f"{config_path} must contain a YAML mapping at the top level, "
            f"got {type(file_config).__name__}"
        )

    # Files listed later override earlier ones when they define the same top-level key
    CONFIG.update(file_config)
    print(f"Loaded {config_file}")

print(f"\nLoaded configuration sections: {list(CONFIG.keys())}")

## 4. Set Up Environment

In [None]:
# Export the metric namespace and the AWS region as environment variables.
# The region is the one boto3 resolves for a default session, falling back to us-east-1.
os.environ['METRIC_NAMESPACE'] = 'IDP-Modular-Pipeline'
region = boto3.session.Session().region_name or 'us-east-1'
os.environ['AWS_REGION'] = region

# Look up the AWS account ID of the current credentials; it is part of the default bucket names
sts_client = boto3.client('sts')
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity["Account"]

# Bucket names: the IDP_*_BUCKET_NAME environment variables win, otherwise
# the names are derived from the account ID and region
default_input_bucket = f"idp-modular-input-{account_id}-{region}"
default_output_bucket = f"idp-modular-output-{account_id}-{region}"
input_bucket_name = os.environ.get("IDP_INPUT_BUCKET_NAME", default_input_bucket)
output_bucket_name = os.environ.get("IDP_OUTPUT_BUCKET_NAME", default_output_bucket)

# Echo the resolved settings so they can be checked before any bucket is touched
print("Environment setup:")
print(f"METRIC_NAMESPACE: {os.environ.get('METRIC_NAMESPACE')}")
print(f"AWS_REGION: {os.environ.get('AWS_REGION')}")
print(f"Input bucket: {input_bucket_name}")
print(f"Output bucket: {output_bucket_name}")
print(f"SAMPLE_PDF_PATH: {SAMPLE_PDF_PATH}")

## 5. Set Up S3 Buckets and Upload Sample File

In [None]:
# Create S3 client
# (no region is passed here, so the client uses boto3's default region resolution)
s3_client = boto3.client('s3')

# Function to create a bucket if it doesn't exist
def ensure_bucket_exists(bucket_name):
    """Create the S3 bucket ``bucket_name`` unless it already exists.

    Uses the module-level ``s3_client`` and ``region``. The bucket is only
    created when the existence check reports "not found"; any other failure
    of the check (access denied, missing credentials, network problems) is
    raised to the caller instead of being mistaken for a missing bucket.

    Args:
        bucket_name: Name of the bucket to check and, if missing, create.

    Raises:
        Exception: Whatever the S3 client raised when the existence check
            failed for a reason other than "not found", or when creating
            the bucket (or waiting for it) failed.
    """
    try:
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"Bucket {bucket_name} already exists")
        return
    except s3_client.exceptions.ClientError as e:
        # head_bucket reports a missing bucket through the HTTP status code
        error_code = str(e.response.get("Error", {}).get("Code", ""))
        if error_code not in ("404", "NoSuchBucket", "NotFound"):
            print(f"Error checking bucket {bucket_name}: {str(e)}")
            raise

    try:
        if region == 'us-east-1':
            # us-east-1 is the default location and must not be sent as a LocationConstraint
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client.create_bucket(
                Bucket=bucket_name,
                CreateBucketConfiguration={'LocationConstraint': region}
            )
        print(f"Created bucket: {bucket_name}")

        # Wait for bucket to be accessible
        waiter = s3_client.get_waiter('bucket_exists')
        waiter.wait(Bucket=bucket_name)
    except Exception as e:
        print(f"Error creating bucket {bucket_name}: {str(e)}")
        raise

# Make sure the input and output buckets are both available
for bucket_name in (input_bucket_name, output_bucket_name):
    ensure_bucket_exists(bucket_name)

# Upload the sample PDF to the input bucket under a key that carries the current local timestamp
upload_timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sample_file_key = "modular-sample-" + upload_timestamp + ".pdf"
with open(SAMPLE_PDF_PATH, 'rb') as sample_pdf:
    s3_client.upload_fileobj(sample_pdf, input_bucket_name, sample_file_key)

print(f"Uploaded sample file to: s3://{input_bucket_name}/{sample_file_key}")

## 6. Initialize Document Object

In [None]:
# Build the Document for the uploaded sample file; it starts in the QUEUED state
document = Document(
    id="bank_statement",
    status=Status.QUEUED,
    input_bucket=input_bucket_name,
    input_key=sample_file_key,
    output_bucket=output_bucket_name,
)

# Report the key fields of the new document, one per line
print(
    f"Created document with ID: {document.id}",
    f"Status: {document.status.value}",
    f"Input: s3://{document.input_bucket}/{document.input_key}",
    f"Output bucket: {document.output_bucket}",
    sep="\n",
)

## 7. Save Document and Configuration for Next Step

In [None]:
# All artifacts of this step are written under .data/step0_setup
data_dir = Path(".data/step0_setup")
data_dir.mkdir(parents=True, exist_ok=True)

document_path = data_dir / "document.json"
config_path = data_dir / "config.json"
env_path = data_dir / "environment.json"

# Environment details resolved earlier in this notebook
env_info = {
    'input_bucket': input_bucket_name,
    'output_bucket': output_bucket_name,
    'region': region,
    'sample_file_key': sample_file_key,
    'account_id': account_id
}

# Document object, serialized with its own to_json()
with open(document_path, 'w') as document_out:
    document_out.write(document.to_json())

# Merged configuration
with open(config_path, 'w') as config_out:
    json.dump(CONFIG, config_out, indent=2)

# Environment info
with open(env_path, 'w') as env_out:
    json.dump(env_info, env_out, indent=2)

print(f"Saved document to: {document_path}")
print(f"Saved configuration to: {config_path}")
print(f"Saved environment info to: {env_path}")

## 8. Summary

In [None]:
# Recap of what this notebook produced, followed by a pointer to the next notebook
summary_lines = [
    "=== Step 0: Setup Complete ===",
    f"✅ Document initialized: {document.id}",
    f"✅ Configuration loaded: {len(CONFIG)} sections",
    f"✅ S3 buckets ready: {input_bucket_name}, {output_bucket_name}",
    f"✅ Sample file uploaded: {sample_file_key}",
    f"✅ Data saved to: .data/step0_setup/",
    "\n📌 Next step: Run step1_ocr.ipynb",
]
print("\n".join(summary_lines))