In [3]:
import sys
import os
from pathlib import Path
import subprocess
import json

# Add current directory to path to import data_global
# In Jupyter notebooks, use os.getcwd() instead of __file__
notebook_dir = Path(os.getcwd()) if '__file__' not in globals() else Path(__file__).parent
sys.path.insert(0, str(notebook_dir))

# Import the DATASETS class
from data_global import DATASETS as DS

print("‚úÖ Successfully imported DATASETS module")
print("Available classes:", [attr for attr in dir(DS) if not attr.startswith('_')])

‚úÖ Successfully imported DATASETS module
Available classes: ['CONSOLIDATE', 'COPY', 'CREATE', 'EXTRACT', 'MERGE', 'MODALITY', 'MOVE', 'SPLIT', 'STATE', 'TYPE']


# DETR Dataset Upload to S3 - HITL Demo Pipeline

**Date:** December 26, 2025  
**Purpose:** Upload DETR utility detection datasets to S3 for Human-in-the-Loop (HITL) workflow  
**S3 Endpoint:** https://s3.ohl-inspection.com  
**Target Bucket:** siemens-hitl-demo

## Proposed S3 Structure
```
s3://siemens-hitl-demo/
‚îú‚îÄ‚îÄ raw/
‚îÇ   ‚îú‚îÄ‚îÄ insulators/          # Original insulator dataset
‚îÇ   ‚îú‚îÄ‚îÄ crossarms/           # Original crossarm dataset  
‚îÇ   ‚îú‚îÄ‚îÄ utility-poles/       # Original utility pole dataset
‚îÇ   ‚îî‚îÄ‚îÄ merged/              # Combined DETR dataset (923 images)
‚îÇ       ‚îú‚îÄ‚îÄ train/
‚îÇ       ‚îú‚îÄ‚îÄ valid/
‚îÇ       ‚îî‚îÄ‚îÄ test/
‚îî‚îÄ‚îÄ consolidated/            # Single folder with all data for HITL
    ‚îú‚îÄ‚îÄ images/
    ‚îî‚îÄ‚îÄ _annotations.coco.json
```

In [8]:
# Configuration
AWS_CLI = r"C:\Users\Z0057P7S\miniconda3\python.exe -m awscli"
S3_ENDPOINT = "https://s3.ohl-inspection.com"
BUCKET_NAME = "prahlad-siemens-hitl-demo"  # Updated to unique bucket name

# Local paths
DETR_MERGED_PATH = Path(r"c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged")
WORKSPACE_PATH = Path(r"c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens")
TEMP_CONSOLIDATE_PATH = WORKSPACE_PATH / "temp_consolidated"

print("üìÅ Configuration:")
print(f"   DETR Merged Dataset: {DETR_MERGED_PATH}")
print(f"   Workspace: {WORKSPACE_PATH}")
print(f"   Temp Consolidate: {TEMP_CONSOLIDATE_PATH}")
print(f"   S3 Bucket: {BUCKET_NAME}")
print(f"   S3 Endpoint: {S3_ENDPOINT}")

# Verify local dataset exists
if DETR_MERGED_PATH.exists():
    print(f"\n‚úÖ DETR merged dataset found")
    splits = ['train', 'valid', 'test']
    for split in splits:
        split_path = DETR_MERGED_PATH / split
        if split_path.exists():
            files = list(split_path.glob('*'))
            print(f"   {split}: {len(files)} files")
else:
    print(f"\n‚ùå ERROR: DETR merged dataset not found at {DETR_MERGED_PATH}")

üìÅ Configuration:
   DETR Merged Dataset: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged
   Workspace: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens
   Temp Consolidate: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\temp_consolidated
   S3 Bucket: prahlad-siemens-hitl-demo
   S3 Endpoint: https://s3.ohl-inspection.com

‚úÖ DETR merged dataset found
   train: 1 files
   valid: 1 files
   test: 1 files


## Step 1: Check S3 Connection and Create Bucket

In [5]:
def run_aws_command(command, capture_output=True):
    """Helper function to run AWS CLI commands"""
    full_cmd = f'{AWS_CLI} {command} --endpoint-url {S3_ENDPOINT}'
    print(f"üîß Running: {full_cmd}")
    
    result = subprocess.run(
        full_cmd,
        shell=True,
        capture_output=capture_output,
        text=True
    )
    
    if result.returncode == 0:
        if capture_output and result.stdout:
            print(f"‚úÖ Success:\n{result.stdout}")
        return True, result.stdout
    else:
        print(f"‚ùå Error:\n{result.stderr}")
        return False, result.stderr

# Check current buckets
print("=" * 60)
print("Checking existing S3 buckets...")
print("=" * 60)
success, output = run_aws_command("s3 ls")

# Check if our target bucket exists
if BUCKET_NAME in output:
    print(f"\n‚úÖ Bucket '{BUCKET_NAME}' already exists")
else:
    print(f"\n‚ö†Ô∏è  Bucket '{BUCKET_NAME}' does not exist")
    print(f"Creating bucket '{BUCKET_NAME}'...")
    
    # Create bucket without location constraint (Ceph doesn't use AWS regions)
    success, output = run_aws_command(f"s3 mb s3://{BUCKET_NAME}")
    
    if success:
        print(f"‚úÖ Successfully created bucket '{BUCKET_NAME}'")
    else:
        print(f"‚ùå Failed to create bucket. Check permissions and endpoint.")

Checking existing S3 buckets...
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 ls --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
2025-12-17 10:23:10 prahlad-test-bucket


‚ö†Ô∏è  Bucket 'siemens-hitl-demo' does not exist
Creating bucket 'siemens-hitl-demo'...
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 mb s3://siemens-hitl-demo --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
2025-12-17 10:23:10 prahlad-test-bucket


‚ö†Ô∏è  Bucket 'siemens-hitl-demo' does not exist
Creating bucket 'siemens-hitl-demo'...
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 mb s3://siemens-hitl-demo --endpoint-url https://s3.ohl-inspection.com
‚ùå Error:
make_bucket failed: s3://siemens-hitl-demo An error occurred (InvalidLocationConstraint) when calling the CreateBucket operation: The eu-central-1 location constraint is not valid.

‚ùå Failed to create bucket. Check permissions and endpoint.
‚ùå Error:
make_bucket failed: s3:/

## Step 2: Consolidate DETR Dataset using CONSOLIDATE.json_COCO_V0()

In [6]:
# Clean up any previous consolidation
if TEMP_CONSOLIDATE_PATH.exists():
    print(f"üßπ Cleaning up previous consolidation at {TEMP_CONSOLIDATE_PATH}")
    import shutil
    shutil.rmtree(TEMP_CONSOLIDATE_PATH)

# Create consolidated dataset from train/valid/test splits
print("=" * 60)
print("Consolidating DETR dataset using DS.CONSOLIDATE.json_COCO_V0()")
print("=" * 60)

result = DS.CONSOLIDATE.json_COCO_V0(
    input_dir=str(DETR_MERGED_PATH),
    output_dir=str(WORKSPACE_PATH / "temp_consolidated"),
    dataset_name="detr_utility_merged",
    dataset_description="DETR Utility Inventory Dataset - Insulators, Crossarms, Utility Poles",
    dataset_version="1.0",
    dataset_year=2025,
    dataset_contributer="Prahlad Menon, Vijay Kovuru, Bhargav Bompalli, Erick Allage",
    dataset_url="https://s3.ohl-inspection.com/siemens-hitl-demo"
)

print("\n‚úÖ Consolidation complete!")
print(json.dumps(result, indent=2))

# Verify consolidated output
consolidated_dir = Path(result['consolidated_directory'])
if consolidated_dir.exists():
    files = list(consolidated_dir.glob('*'))
    images = [f for f in files if f.suffix.lower() in ['.jpg', '.jpeg', '.png']]
    json_files = list(consolidated_dir.glob('*.json'))
    
    print(f"\nüìä Consolidated Dataset Summary:")
    print(f"   Location: {consolidated_dir}")
    print(f"   Total files: {len(files)}")
    print(f"   Image files: {len(images)}")
    print(f"   JSON files: {len(json_files)}")
    print(f"   Images in JSON: {result['num_images']}")
    print(f"   Annotations: {result['num_annotations']}")
    print(f"   Categories: {result['num_categories']}")

Consolidating DETR dataset using DS.CONSOLIDATE.json_COCO_V0()
Found 3 COCO annotation files to consolidate
Processing c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\test\_annotations.coco.json
Processing c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\train\_annotations.coco.json
Processing c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\valid\_annotations.coco.json
Copying images to consolidated directory...

‚úÖ Consolidation complete!
{
  "input_directory": "c:\\Users\\Z0057P7S\\OneDrive - Siemens Energy\\Documents\\MenonSiemens\\DETR\\utility-inventory-detr-main\\datasets\\processed\\merged",
  "output_directory": "c:\\Users\\Z0057P7S\\OneDrive - Siemens Energy\\Documents\\MenonSiemens\\temp_consolidated\\detr_utility_merged",
  "consolidated_directo

## Step 3: Upload Split Datasets to S3 (raw/merged/)

In [9]:
print("=" * 60)
print("Uploading split datasets (train/valid/test) to S3")
print("=" * 60)

# Upload each split to raw/merged/ in S3
splits = ['train', 'valid', 'test']

for split in splits:
    split_path = DETR_MERGED_PATH / split
    
    if not split_path.exists():
        print(f"‚ö†Ô∏è  Skipping {split} - directory not found")
        continue
    
    s3_prefix = f"s3://{BUCKET_NAME}/raw/merged/{split}/"
    
    print(f"\nüì§ Uploading {split} split...")
    print(f"   Source: {split_path}")
    print(f"   Destination: {s3_prefix}")
    
    # Use AWS CLI sync command with STANDARD storage class
    cmd = f's3 sync "{split_path}" {s3_prefix} --storage-class STANDARD'
    success, output = run_aws_command(cmd, capture_output=True)
    
    if success:
        print(f"‚úÖ {split} uploaded successfully")
    else:
        print(f"‚ùå Failed to upload {split}")

print("\n‚úÖ All splits uploaded to s3://{BUCKET_NAME}/raw/merged/")

Uploading split datasets (train/valid/test) to S3

üì§ Uploading train split...
   Source: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\train
   Destination: s3://prahlad-siemens-hitl-demo/raw/merged/train/
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 sync "c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\train" s3://prahlad-siemens-hitl-demo/raw/merged/train/ --storage-class STANDARD --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
Completed 480.6 KiB/480.6 KiB (3.2 MiB/s) with 1 file(s) remaining
upload: ..\..\DETR\utility-inventory-detr-main\datasets\processed\merged\train\_annotations.coco.json to s3://prahlad-siemens-hitl-demo/raw/merged/train/_annotations.coco.json

‚úÖ train uploaded successfully

üì§ Uploading valid split...
   Source: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\M

## Step 4: Upload Consolidated Dataset to S3

In [None]:
print("=" * 60)
print("Uploading consolidated dataset to S3")
print("=" * 60)

# Upload consolidated dataset to S3
consolidated_path = Path(result['consolidated_directory'])
s3_consolidated_prefix = f"s3://{BUCKET_NAME}/consolidated/"

print(f"\nüì§ Uploading consolidated dataset...")
print(f"   Source: {consolidated_path}")
print(f"   Destination: {s3_consolidated_prefix}")

# Sync consolidated folder to S3 with STANDARD storage class
cmd = f's3 sync "{consolidated_path}" {s3_consolidated_prefix} --storage-class STANDARD'
success, output = run_aws_command(cmd, capture_output=True)

if success:
    print(f"‚úÖ Consolidated dataset uploaded successfully")
    print(f"\nüìä Upload Summary:")
    print(f"   Bucket: {BUCKET_NAME}")
    print(f"   Endpoint: {S3_ENDPOINT}")
    print(f"   Consolidated path: s3://{BUCKET_NAME}/consolidated/")
else:
    print(f"‚ùå Failed to upload consolidated dataset")