# Batch Image Tessellation for SageMaker

Large-scale processing of TIFF images with automatic slicing and metadata generation.

In [None]:
# Install required packages if needed
!pip install pillow numpy tqdm

In [None]:
import os
from batch_tesselate import BatchTesselator, BatchJob

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# INPUT_DIR must be updated to point at the dataset to process.
INPUT_DIR = '/home/ec2-user/SageMaker/12gb_dataset'
OUTPUT_DIR = '/home/ec2-user/SageMaker/processed_output'

# ---------------------------------------------------------------------------
# Batch job settings
# ---------------------------------------------------------------------------
#   num_pieces      split each image into this many pieces
#   max_workers     adjust based on instance specs
#   min_image_size  skip images smaller than this many pixels
#   skip_existing   skip already processed images
#   resume_mode     enable resume capability
job_config = BatchJob(
    # Where to read from / write to
    input_dir=INPUT_DIR,
    output_dir=OUTPUT_DIR,
    # How each image is handled
    num_pieces=8,
    min_image_size=1024,
    # Execution behaviour
    max_workers=4,
    skip_existing=True,
    resume_mode=True,
    log_level='INFO',
)

# Echo the effective settings so they can be checked before running the batch.
print(
    f'Input directory: {INPUT_DIR}',
    f'Output directory: {OUTPUT_DIR}',
    sep='\n',
)
print(f'Configuration: {job_config}')

In [None]:
# Build the batch processor from the job configuration created earlier.
processor = BatchTesselator(job_config)

# Scan for TIFF images so the work list can be reviewed before the batch
# run is started.
tiff_files = processor.find_tiff_images()
print(f'Found {len(tiff_files)} TIFF images to process')

# Preview at most the first 10 entries. Each entry unpacks as a
# (full_path, rel_path) pair; only the relative path is displayed, so the
# full path is discarded with `_`.
if tiff_files:
    print('\nFirst 10 files to process:')
    # start=1 gives human-friendly 1-based numbering without manual arithmetic.
    for position, (_, rel_path) in enumerate(tiff_files[:10], start=1):
        print(f'{position:2d}. {rel_path}')

In [None]:
# Launch the batch run.
# WARNING: this processes ALL TIFF files found -- verify the configuration
# above is correct before executing this cell.

# Status banner, one message per line.
print(
    'Starting batch processing...',
    'This may take several hours for large datasets',
    'You can interrupt and resume later if needed',
    sep='\n',
)

# Keep the returned report so it can be inspected in later cells.
report = processor.run_batch()

In [None]:
# Package the processed results into a ZIP archive
import shutil

# Normalise OUTPUT_DIR before deriving the archive name: with a trailing
# separator (e.g. '/data/out/') plain concatenation would yield
# '/data/out/_results', i.e. a ZIP written *inside* the directory that is
# being archived.
archive_base = os.path.normpath(OUTPUT_DIR) + '_results'
zip_filename = f'{archive_base}.zip'
print(f'Creating ZIP archive: {zip_filename}')

# make_archive returns the path of the archive it actually wrote; use that
# instead of the predicted name so the size lookup below cannot point at a
# different file.
zip_filename = shutil.make_archive(archive_base, 'zip', OUTPUT_DIR)

# Report the archive size (bytes / 1024**3)
zip_size = os.path.getsize(zip_filename) / (1024**3)  # GB
print(f'ZIP file created: {zip_filename} ({zip_size:.2f} GB)')

print('\nReady for download and S3 upload!')