In [3]:
import sys
import os
from pathlib import Path
import subprocess
import json

# Make the notebook's own directory importable so `data_global` resolves.
# Notebooks have no __file__, so fall back to the kernel's working directory;
# when this code runs as a plain script, use the script's folder instead.
notebook_dir = Path(os.getcwd()) if '__file__' not in globals() else Path(__file__).parent

# Re-running this cell must not stack duplicate entries at the front of
# sys.path, so only insert when the directory is not already first.
notebook_dir_entry = str(notebook_dir)
if sys.path[:1] != [notebook_dir_entry]:
    sys.path.insert(0, notebook_dir_entry)

# Import the DATASETS class
from data_global import DATASETS as DS

print("‚úÖ Successfully imported DATASETS module")
# List the public attributes of DATASETS as a quick sanity check of the import.
print("Available classes:", [attr for attr in dir(DS) if not attr.startswith('_')])

‚úÖ Successfully imported DATASETS module
Available classes: ['CONSOLIDATE', 'COPY', 'CREATE', 'EXTRACT', 'MERGE', 'MODALITY', 'MOVE', 'SPLIT', 'STATE', 'TYPE']


# DETR Dataset Upload to S3 - HITL Demo Pipeline

**Date:** December 26, 2025  
**Purpose:** Upload DETR utility detection datasets to S3 for Human-in-the-Loop (HITL) workflow  
**S3 Endpoint:** https://s3.ohl-inspection.com  
**Target Bucket:** prahlad-siemens-hitl-demo

## Proposed S3 Structure
```
s3://prahlad-siemens-hitl-demo/
├── raw/
│   ├── insulators/          # Original insulator dataset
│   ├── crossarms/           # Original crossarm dataset  
│   ├── utility-poles/       # Original utility pole dataset
│   └── merged/              # Combined DETR dataset (923 images)
│       ├── train/
│       ├── valid/
│       └── test/
└── consolidated/            # Single folder with all data for HITL
    ├── images/
    └── _annotations.coco.json
```

In [8]:
# Configuration
# --- AWS CLI invocation and S3 target ---
AWS_CLI = r"C:\Users\Z0057P7S\miniconda3\python.exe -m awscli"
S3_ENDPOINT = "https://s3.ohl-inspection.com"
BUCKET_NAME = "prahlad-siemens-hitl-demo"  # Updated to unique bucket name

# --- Local dataset locations ---
WORKSPACE_PATH = Path(r"c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens")
DETR_MERGED_PATH = Path(r"c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged")
TEMP_CONSOLIDATE_PATH = WORKSPACE_PATH / "temp_consolidated"

# Echo the effective settings so the cell output documents the run.
print("üìÅ Configuration:")
print(f"   DETR Merged Dataset: {DETR_MERGED_PATH}")
print(f"   Workspace: {WORKSPACE_PATH}")
print(f"   Temp Consolidate: {TEMP_CONSOLIDATE_PATH}")
print(f"   S3 Bucket: {BUCKET_NAME}")
print(f"   S3 Endpoint: {S3_ENDPOINT}")

# Sanity check: report how many entries each split folder holds, or flag a
# missing dataset root. Split folders that do not exist are silently skipped.
if not DETR_MERGED_PATH.exists():
    print(f"\n‚ùå ERROR: DETR merged dataset not found at {DETR_MERGED_PATH}")
else:
    print(f"\n‚úÖ DETR merged dataset found")
    splits = ['train', 'valid', 'test']
    for split in splits:
        split_path = DETR_MERGED_PATH / split
        if not split_path.exists():
            continue
        files = list(split_path.glob('*'))
        print(f"   {split}: {len(files)} files")

üìÅ Configuration:
   DETR Merged Dataset: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged
   Workspace: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens
   Temp Consolidate: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\temp_consolidated
   S3 Bucket: prahlad-siemens-hitl-demo
   S3 Endpoint: https://s3.ohl-inspection.com

‚úÖ DETR merged dataset found
   train: 1 files
   valid: 1 files
   test: 1 files


## Step 1: Check S3 Connection and Create Bucket

In [5]:
def run_aws_command(command, capture_output=True):
    """Run an AWS CLI sub-command against the configured S3 endpoint.

    The command line is assembled as
    ``{AWS_CLI} {command} --endpoint-url {S3_ENDPOINT}`` and executed through
    the shell, so ``command`` must only ever be built from trusted,
    notebook-authored values.

    Args:
        command: AWS CLI arguments, e.g. ``"s3 ls"``.
        capture_output: When True (default) stdout/stderr are captured and
            echoed by this helper; when False they are not captured.

    Returns:
        A ``(success, text)`` tuple. ``success`` is True when the process
        exited with code 0. ``text`` is the captured stdout on success or the
        captured stderr on failure. It is always a ``str`` (empty when nothing
        was captured), so callers can safely run ``in`` checks on it.
    """
    full_cmd = f'{AWS_CLI} {command} --endpoint-url {S3_ENDPOINT}'
    print(f"üîß Running: {full_cmd}")

    # NOTE: shell=True is kept because AWS_CLI is itself a multi-word command
    # string ("python.exe -m awscli"); do not pass untrusted input here.
    result = subprocess.run(
        full_cmd,
        shell=True,
        capture_output=capture_output,
        text=True
    )

    # With capture_output=False both streams are None; normalise to "" so the
    # return type is stable regardless of the capture mode.
    stdout_text = result.stdout or ""
    stderr_text = result.stderr or ""

    if result.returncode == 0:
        if capture_output and stdout_text:
            print(f"‚úÖ Success:\n{stdout_text}")
        return True, stdout_text

    print(f"‚ùå Error:\n{stderr_text}")
    return False, stderr_text

# Check current buckets
print("=" * 60)
print("Checking existing S3 buckets...")
print("=" * 60)
success, output = run_aws_command("s3 ls")

if not success:
    # A failed listing says nothing about whether the bucket exists, so do not
    # fall through to bucket creation on the back of an error message.
    print(f"\n‚ùå Could not list buckets. Check credentials and endpoint {S3_ENDPOINT}.")
else:
    # Each `s3 ls` line looks like "<date> <time> <bucket-name>"; compare whole
    # names so e.g. "siemens-hitl-demo" does not match "prahlad-siemens-hitl-demo".
    existing_buckets = {
        line.split()[-1] for line in (output or "").splitlines() if line.strip()
    }

    # Check if our target bucket exists
    if BUCKET_NAME in existing_buckets:
        print(f"\n‚úÖ Bucket '{BUCKET_NAME}' already exists")
    else:
        print(f"\n‚ö†Ô∏è  Bucket '{BUCKET_NAME}' does not exist")
        print(f"Creating bucket '{BUCKET_NAME}'...")

        # Create bucket without location constraint (Ceph doesn't use AWS regions)
        # NOTE(review): `s3 mb` derives the constraint from the CLI's configured
        # region; if the endpoint rejects it, try passing `--region us-east-1`
        # — confirm against the gateway before changing.
        success, output = run_aws_command(f"s3 mb s3://{BUCKET_NAME}")

        if success:
            print(f"‚úÖ Successfully created bucket '{BUCKET_NAME}'")
        else:
            print(f"‚ùå Failed to create bucket. Check permissions and endpoint.")

Checking existing S3 buckets...
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 ls --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
2025-12-17 10:23:10 prahlad-test-bucket


‚ö†Ô∏è  Bucket 'siemens-hitl-demo' does not exist
Creating bucket 'siemens-hitl-demo'...
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 mb s3://siemens-hitl-demo --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
2025-12-17 10:23:10 prahlad-test-bucket


‚ö†Ô∏è  Bucket 'siemens-hitl-demo' does not exist
Creating bucket 'siemens-hitl-demo'...
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 mb s3://siemens-hitl-demo --endpoint-url https://s3.ohl-inspection.com
‚ùå Error:
make_bucket failed: s3://siemens-hitl-demo An error occurred (InvalidLocationConstraint) when calling the CreateBucket operation: The eu-central-1 location constraint is not valid.

‚ùå Failed to create bucket. Check permissions and endpoint.
‚ùå Error:
make_bucket failed: s3:/

## Step 2: Consolidate DETR Dataset using CONSOLIDATE.json_COCO_V0()

In [6]:
import shutil

# Clean up any previous consolidation so stale files cannot leak into this run.
if TEMP_CONSOLIDATE_PATH.exists():
    print(f"üßπ Cleaning up previous consolidation at {TEMP_CONSOLIDATE_PATH}")
    shutil.rmtree(TEMP_CONSOLIDATE_PATH)

# Create consolidated dataset from train/valid/test splits
print("=" * 60)
print("Consolidating DETR dataset using DS.CONSOLIDATE.json_COCO_V0()")
print("=" * 60)

result = DS.CONSOLIDATE.json_COCO_V0(
    input_dir=str(DETR_MERGED_PATH),
    # Same directory that was cleaned above; use the shared constant so the
    # cleanup target and the output target cannot drift apart.
    output_dir=str(TEMP_CONSOLIDATE_PATH),
    dataset_name="detr_utility_merged",
    dataset_description="DETR Utility Inventory Dataset - Insulators, Crossarms, Utility Poles",
    dataset_version="1.0",
    dataset_year=2025,
    # "contributer" (sic) is the keyword name this call was written against.
    dataset_contributer="Prahlad Menon, Vijay Kovuru, Bhargav Bompalli, Erick Allage",
    # Derive the URL from the configured endpoint/bucket instead of a
    # hard-coded bucket name that no longer matches BUCKET_NAME.
    dataset_url=f"{S3_ENDPOINT}/{BUCKET_NAME}"
)

print("\n‚úÖ Consolidation complete!")
print(json.dumps(result, indent=2))

# Verify consolidated output: compare what is on disk with the counts the
# consolidation call reported.
consolidated_dir = Path(result['consolidated_directory'])
if consolidated_dir.exists():
    files = list(consolidated_dir.glob('*'))
    images = [f for f in files if f.suffix.lower() in ['.jpg', '.jpeg', '.png']]
    json_files = list(consolidated_dir.glob('*.json'))

    print(f"\nüìä Consolidated Dataset Summary:")
    print(f"   Location: {consolidated_dir}")
    print(f"   Total files: {len(files)}")
    print(f"   Image files: {len(images)}")
    print(f"   JSON files: {len(json_files)}")
    print(f"   Images in JSON: {result['num_images']}")
    print(f"   Annotations: {result['num_annotations']}")
    print(f"   Categories: {result['num_categories']}")
else:
    # Previously this case produced no output at all, hiding a failed run.
    print(f"\n‚ùå Consolidated directory not found at {consolidated_dir}")

Consolidating DETR dataset using DS.CONSOLIDATE.json_COCO_V0()
Found 3 COCO annotation files to consolidate
Processing c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\test\_annotations.coco.json
Processing c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\train\_annotations.coco.json
Processing c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\valid\_annotations.coco.json
Copying images to consolidated directory...

‚úÖ Consolidation complete!
{
  "input_directory": "c:\\Users\\Z0057P7S\\OneDrive - Siemens Energy\\Documents\\MenonSiemens\\DETR\\utility-inventory-detr-main\\datasets\\processed\\merged",
  "output_directory": "c:\\Users\\Z0057P7S\\OneDrive - Siemens Energy\\Documents\\MenonSiemens\\temp_consolidated\\detr_utility_merged",
  "consolidated_directo

## Step 3: Upload Split Datasets to S3 (raw/merged/)

In [9]:
print("=" * 60)
print("Uploading split datasets (train/valid/test) to S3")
print("=" * 60)

# Upload each split to raw/merged/ in S3
splits = ['train', 'valid', 'test']

# Track outcomes so the closing message reflects what actually happened.
skipped_splits = []
failed_splits = []

for split in splits:
    split_path = DETR_MERGED_PATH / split

    if not split_path.exists():
        print(f"‚ö†Ô∏è  Skipping {split} - directory not found")
        skipped_splits.append(split)
        continue

    s3_prefix = f"s3://{BUCKET_NAME}/raw/merged/{split}/"

    print(f"\nüì§ Uploading {split} split...")
    print(f"   Source: {split_path}")
    print(f"   Destination: {s3_prefix}")

    # Use AWS CLI sync command with STANDARD storage class.
    # The local path is quoted because it contains spaces.
    cmd = f's3 sync "{split_path}" {s3_prefix} --storage-class STANDARD'
    success, output = run_aws_command(cmd, capture_output=True)

    if success:
        print(f"‚úÖ {split} uploaded successfully")
    else:
        print(f"‚ùå Failed to upload {split}")
        failed_splits.append(split)

# Only claim full success when every split was found and synced; the original
# message was also missing its f-prefix and printed a literal "{BUCKET_NAME}".
if failed_splits or skipped_splits:
    print(f"\n‚ö†Ô∏è  Upload incomplete - failed: {failed_splits}, skipped: {skipped_splits}")
else:
    print(f"\n‚úÖ All splits uploaded to s3://{BUCKET_NAME}/raw/merged/")

Uploading split datasets (train/valid/test) to S3

üì§ Uploading train split...
   Source: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\train
   Destination: s3://prahlad-siemens-hitl-demo/raw/merged/train/
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 sync "c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\datasets\processed\merged\train" s3://prahlad-siemens-hitl-demo/raw/merged/train/ --storage-class STANDARD --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
Completed 480.6 KiB/480.6 KiB (3.2 MiB/s) with 1 file(s) remaining
upload: ..\..\DETR\utility-inventory-detr-main\datasets\processed\merged\train\_annotations.coco.json to s3://prahlad-siemens-hitl-demo/raw/merged/train/_annotations.coco.json

‚úÖ train uploaded successfully

üì§ Uploading valid split...
   Source: c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\M

## Step 4: Upload Consolidated Dataset to S3

In [None]:
print("=" * 60)
print("Uploading consolidated dataset to S3")
print("=" * 60)

# Source is the directory reported by the consolidation step; destination is
# the bucket's consolidated/ prefix.
consolidated_path = Path(result['consolidated_directory'])
s3_consolidated_prefix = f"s3://{BUCKET_NAME}/consolidated/"

print(f"\nüì§ Uploading consolidated dataset...")
print(f"   Source: {consolidated_path}")
print(f"   Destination: {s3_consolidated_prefix}")

# Mirror the local folder to S3 (STANDARD storage class); the local path is
# quoted because it contains spaces.
cmd = f's3 sync "{consolidated_path}" {s3_consolidated_prefix} --storage-class STANDARD'
success, output = run_aws_command(cmd, capture_output=True)

if not success:
    print(f"‚ùå Failed to upload consolidated dataset")
else:
    print(f"‚úÖ Consolidated dataset uploaded successfully")
    print(f"\nüìä Upload Summary:")
    print(f"   Bucket: {BUCKET_NAME}")
    print(f"   Endpoint: {S3_ENDPOINT}")
    print(f"   Consolidated path: s3://{BUCKET_NAME}/consolidated/")

## Step 5: Verify S3 Upload and List Bucket Contents

In [10]:
print("=" * 60)
print("Verifying S3 Bucket Structure")
print("=" * 60)

# Each entry is (heading to print, AWS CLI listing command to run).
listing_checks = [
    ("\nüìÇ Bucket structure:",
     f"s3 ls s3://{BUCKET_NAME}/ --recursive --human-readable --summarize"),
    ("\nüìÇ Raw merged datasets:",
     f"s3 ls s3://{BUCKET_NAME}/raw/merged/ --recursive"),
    ("\nüìÇ Consolidated dataset:",
     f"s3 ls s3://{BUCKET_NAME}/consolidated/ --recursive"),
]

# Remember which listings failed so the closing banner does not claim success
# when a prefix could not be listed.
failed_listings = []
for heading, listing_cmd in listing_checks:
    print(heading)
    success, output = run_aws_command(listing_cmd)
    if not success:
        failed_listings.append(listing_cmd)

print("\n" + "=" * 60)
if failed_listings:
    print(f"‚ö†Ô∏è  Verification incomplete - {len(failed_listings)} listing(s) failed")
else:
    print("‚úÖ DETR Dataset Upload Complete!")
print("=" * 60)
print(f"\nüéØ Next Steps for HITL Demo:")
print(f"   1. Team can access datasets at: s3://{BUCKET_NAME}/")
print(f"   2. Use 'consolidated/' folder for single-dataset HITL workflow")
print(f"   3. Use 'raw/merged/' folders for split-based training/validation")
print(f"   4. Implement HITL evaluation scripts to:")
print(f"      - Load model predictions")
print(f"      - Calculate IOU metrics")
print(f"      - Generate correction workflow")
# The counts below come from the `result` dict produced by the consolidation cell.
print(f"\nüìù Dataset Info:")
print(f"   Endpoint: {S3_ENDPOINT}")
print(f"   Bucket: {BUCKET_NAME}")
print(f"   Images: {result['num_images']}")
print(f"   Annotations: {result['num_annotations']}")
print(f"   Categories: {result['num_categories']}")

Verifying S3 Bucket Structure

üìÇ Bucket structure:
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 ls s3://prahlad-siemens-hitl-demo/ --recursive --human-readable --summarize --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
2025-12-26 12:18:51   53.2 KiB raw/merged/test/_annotations.coco.json
2025-12-26 12:18:42  480.6 KiB raw/merged/train/_annotations.coco.json
2025-12-26 12:18:47  109.0 KiB raw/merged/valid/_annotations.coco.json

Total Objects: 3
   Total Size: 642.8 KiB


üìÇ Raw merged datasets:
üîß Running: C:\Users\Z0057P7S\miniconda3\python.exe -m awscli s3 ls s3://prahlad-siemens-hitl-demo/raw/merged/ --recursive --endpoint-url https://s3.ohl-inspection.com
‚úÖ Success:
2025-12-26 12:18:51   53.2 KiB raw/merged/test/_annotations.coco.json
2025-12-26 12:18:42  480.6 KiB raw/merged/train/_annotations.coco.json
2025-12-26 12:18:47  109.0 KiB raw/merged/valid/_annotations.coco.json

Total Objects: 3
   Total Size: 642.8 KiB


üìÇ Raw merged datase

## ⚠️ Important Finding: Images Not in Repository

The DETR repository only contains **annotation JSON files** (metadata), not the actual images.  
The images (923 total) were excluded from git due to size constraints.

### Current S3 Upload Status:
✅ **Uploaded:** Annotation JSONs only (642.8 KiB total)  
❌ **Missing:** Actual image files (~923 images)

### To Complete the Dataset Upload:

**Option 1: Download from Roboflow** (Original Source)
- Use `DETR/utility-inventory-detr-main/scripts/01_download_datasets.py`
- Download the 3 source datasets from Roboflow
- Extract using `00_extract_datasets.py`
- Then upload images to S3

**Option 2: Use Local Training Images** (If Available)
- Check if images exist locally from previous training runs
- Verify images match the annotation JSON references
- Upload directly to S3

**Option 3: Request from Team**
- Check with Vijay Kovuru who completed the training
- May have access to the full dataset with images

## 📥 How to Get the DETR Images - Step by Step Guide

### Method 1: Manual Download from Roboflow (Recommended - No API Key Needed)

**Step 1: Visit Each Dataset and Download**

1. **Insulators Dataset** (599 images)
   - URL: https://universe.roboflow.com/sofia-valdivieso-von-teuber/insulators-wo6lb/dataset/3
   - Click "Download Dataset"
   - Select format: **COCO**
   - Download ZIP file: `Insulators.v5i.coco.zip`

2. **Crossarm Dataset** (207 images)
   - URL: https://universe.roboflow.com/project-91iyv/song-crossarm-zqkmo
   - Click "Download Dataset"
   - Select format: **COCO**
   - Download ZIP file: `song crossarm.v6i.coco.zip`

3. **Utility-pole Dataset** (218 images)
   - URL: https://universe.roboflow.com/project-6kpfk/utility-pole-hdbuh
   - Click "Download Dataset"
   - Select format: **COCO**
   - Download ZIP file: `utility-pole.v4i.coco.zip`

**Step 2: Save ZIP files to workspace**
```
Save all 3 ZIP files to:
c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main\
```

**Step 3: Extract and Process Datasets**
```powershell
# Run in terminal from DETR directory:
cd "c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main"

# Check which files exist
python scripts\01_download_datasets.py

# Extract ZIP files
python scripts\00_extract_datasets.py

# Clean datasets
python scripts\02_clean_datasets.py

# Merge into unified dataset
python scripts\03_merge_datasets.py
```

**Step 4: Re-run This Notebook**
Once images are extracted, re-run the upload cells in this notebook. The images will be:
- In `datasets/processed/merged/train/`, `valid/`, `test/`
- Automatically uploaded to S3 with the annotation JSONs

---

### Method 2: Use Roboflow API (If You Have API Key)

**Step 1: Get Roboflow API Key**
- Sign up at https://roboflow.com/
- Go to Settings → API Keys
- Copy your API key

**Step 2: Set Environment Variable**
```powershell
# In PowerShell:
$env:ROBOFLOW_API_KEY = "your_api_key_here"
```

**Step 3: Run Download Script**
```powershell
cd "c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main"
python scripts\01_download_datasets.py  # Will use API to download
python scripts\00_extract_datasets.py
python scripts\02_clean_datasets.py
python scripts\03_merge_datasets.py
```

---

### Method 3: Request from Team Member

Contact **Vijay Kovuru** (ext) who completed the training. He may have:
- The original ZIP files
- Already extracted datasets with images
- Access to a shared drive with the full dataset

---

### Expected Result After Download

Once complete, you should have:
```
DETR/utility-inventory-detr-main/datasets/
├── raw/
│   ├── insulators/      (extracted ZIP 1)
│   ├── crossarm/        (extracted ZIP 2)
│   └── utility-pole/    (extracted ZIP 3)
└── processed/
    └── merged/
        ├── train/       (713 .jpg images + _annotations.coco.json)
        ├── valid/       (134 .jpg images + _annotations.coco.json)
        └── test/        (76 .jpg images + _annotations.coco.json)

**Total Size:** ~200-500 MB for all images

In [11]:
# Final Status Summary
# The sizes and counts below are hard-coded figures, not values computed in
# this cell. The bucket name is taken from BUCKET_NAME so the report cannot
# drift from the configuration.
print("=" * 70)
print("DETR Dataset S3 Upload - Status Report")
print("=" * 70)
print("\n‚úÖ COMPLETED:")
print("  ‚Ä¢ AWS CLI configured and tested")
print(f"  ‚Ä¢ S3 bucket created: {BUCKET_NAME}")
print("  ‚Ä¢ DATASETS.CONSOLIDATE.json_COCO_V0() function validated")
print("  ‚Ä¢ Annotation JSONs uploaded (642.8 KiB)")
print("    - train: 713 images metadata")
print("    - valid: 134 images metadata")
print("    - test: 76 images metadata")
print("  ‚Ä¢ Total: 923 images metadata, 1,024 annotations, 3 categories")

print("\n‚ö†Ô∏è  PENDING:")
print("  ‚Ä¢ Download 923 image files from Roboflow")
print("  ‚Ä¢ Upload images to S3 (est. 200-500 MB)")

print("\nüìä S3 Structure (Current):")
print(f"  s3://{BUCKET_NAME}/")
print("  ‚îî‚îÄ‚îÄ raw/merged/")
print("      ‚îú‚îÄ‚îÄ test/_annotations.coco.json")
print("      ‚îú‚îÄ‚îÄ train/_annotations.coco.json")
print("      ‚îî‚îÄ‚îÄ valid/_annotations.coco.json")

print("\nüéØ Next Steps for Team:")
print("  1. Download images from Roboflow using DETR scripts")
print("  2. Re-run this notebook to upload images")
print("  3. Implement HITL evaluation scripts")
print("  4. Build YOLOv11-OBB converter")
print("  5. Develop correction UI workflow")

print("\nüìù Resources:")
print(f"  ‚Ä¢ S3 Endpoint: {S3_ENDPOINT}")
print(f"  ‚Ä¢ Bucket: {BUCKET_NAME}")
print("  ‚Ä¢ Notebook: AWS/DataSelection/data_global.ipynb")
print("  ‚Ä¢ Review Doc: REVIEW-DETR-AWS-DataSelection.md")
print("=" * 70)

DETR Dataset S3 Upload - Status Report

‚úÖ COMPLETED:
  ‚Ä¢ AWS CLI configured and tested
  ‚Ä¢ S3 bucket created: prahlad-siemens-hitl-demo
  ‚Ä¢ DATASETS.CONSOLIDATE.json_COCO_V0() function validated
  ‚Ä¢ Annotation JSONs uploaded (642.8 KiB)
    - train: 713 images metadata
    - valid: 134 images metadata
    - test: 76 images metadata
  ‚Ä¢ Total: 923 images metadata, 1,024 annotations, 3 categories

‚ö†Ô∏è  PENDING:
  ‚Ä¢ Download 923 image files from Roboflow
  ‚Ä¢ Upload images to S3 (est. 200-500 MB)

üìä S3 Structure (Current):
  s3://prahlad-siemens-hitl-demo/
  ‚îî‚îÄ‚îÄ raw/merged/
      ‚îú‚îÄ‚îÄ test/_annotations.coco.json
      ‚îú‚îÄ‚îÄ train/_annotations.coco.json
      ‚îî‚îÄ‚îÄ valid/_annotations.coco.json

üéØ Next Steps for Team:
  1. Download images from Roboflow using DETR scripts
  2. Re-run this notebook to upload images
  3. Implement HITL evaluation scripts
  4. Build YOLOv11-OBB converter
  5. Develop correction UI workflow

üìù Resources:
  ‚Ä¢ S3 End

## 🚀 Quick Start Script (Run After Downloading ZIPs)

In [12]:
# Automated script to check, extract, and process datasets
import subprocess
import sys

DETR_PATH = Path(r"c:\Users\Z0057P7S\OneDrive - Siemens Energy\Documents\MenonSiemens\DETR\utility-inventory-detr-main")
SCRIPTS_PATH = DETR_PATH / "scripts"


def _run_detr_script(script_name):
    """Run one script from SCRIPTS_PATH with the kernel's interpreter.

    The script is executed with DETR_PATH as its working directory. Its stdout
    is echoed, and stderr is echoed too when the exit code is non-zero.

    Args:
        script_name: File name of the script inside SCRIPTS_PATH.

    Returns:
        The ``subprocess.CompletedProcess`` for the run.
    """
    completed = subprocess.run(
        [sys.executable, str(SCRIPTS_PATH / script_name)],
        cwd=str(DETR_PATH),
        capture_output=True,
        text=True
    )
    print(completed.stdout)
    if completed.returncode != 0:
        print(f"‚ùå {script_name} exited with code {completed.returncode}:\n{completed.stderr}")
    return completed


print("=" * 70)
print("DETR Dataset Download & Processing Automation")
print("=" * 70)

# Step 1: Check for ZIP files
# A dedicated name is used here (not `result`) so the consolidation dict that
# other cells read from `result` is not overwritten by this cell.
print("\nüì¶ Step 1: Checking for downloaded ZIP files...")
zip_check = _run_detr_script("01_download_datasets.py")

if "All ZIP files already exist" in zip_check.stdout:
    print("‚úÖ All ZIPs found! Proceeding to extraction...\n")

    # Steps 2-4 run in order; each entry is (heading, script file name).
    pipeline_steps = [
        ("üìÇ Step 2: Extracting ZIP files...", "00_extract_datasets.py"),
        ("\nüßπ Step 3: Cleaning datasets...", "02_clean_datasets.py"),
        ("\nüîÄ Step 4: Merging datasets...", "03_merge_datasets.py"),
    ]

    failed_script = None
    for heading, script_name in pipeline_steps:
        print(heading)
        if _run_detr_script(script_name).returncode != 0:
            # Later steps consume the output of earlier ones, so stop at the
            # first failure instead of reporting a completed pipeline.
            failed_script = script_name
            break

    print("\n" + "=" * 70)
    if failed_script is None:
        print("‚úÖ DATASET PROCESSING COMPLETE!")
        print("=" * 70)
        print("\nüì§ Next: Re-run the S3 upload cells above to upload images")
    else:
        print(f"‚ùå DATASET PROCESSING STOPPED at {failed_script}")
        print("=" * 70)

else:
    print("\n‚ö†Ô∏è  Please download the ZIP files manually first:")
    print("\n1. Visit the Roboflow URLs listed above")
    print("2. Download each dataset in COCO format")
    print(f"3. Save ZIP files to: {DETR_PATH}")
    print("4. Re-run this cell")

print("\n" + "=" * 70)

DETR Dataset Download & Processing Automation

üì¶ Step 1: Checking for downloaded ZIP files...
Download Roboflow COCO Dataset ZIP Files

NOTE: These are public Roboflow datasets.
You have two options:

Option 1: Manual Download (Recommended)
  1. Visit each dataset URL below
  2. Click 'Download' and select 'COCO' format
  3. Save ZIP files to workspace root or project root

Option 2: Roboflow API (if you have API key)
  Set ROBOFLOW_API_KEY environment variable
  Or edit this script to add your API key


‚ö†Ô∏è  No ROBOFLOW_API_KEY found - using manual download instructions

Checking for existing ZIP files...
  ‚úó Missing: Insulators.v5i.coco.zip
     URL: https://universe.roboflow.com/sofia-valdivieso-von-teuber/insulators-wo6lb/dataset/3
  ‚úó Missing: song crossarm.v6i.coco.zip
     URL: https://universe.roboflow.com/project-91iyv/song-crossarm-zqkmo/browse?queryText=&pageSize=50&startingIndex=0&browseQuery=true
  ‚úó Missing: utility-pole.v4i.coco.zip
     URL: https://universe