In [3]:
# 01_data_preprocessing.ipynb - REVISED CELL 1

# --- Cell 1: Initial Setup and Path Definitions ---
import os
import subprocess
import tarfile
from pathlib import Path
import time
import re
import json
import shutil # For potential directory cleanup
from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter progress bars

# Define the base directory *within* the SageMaker persistent volume
# The current working directory when you open a Jupyter Notebook/Lab
# is usually /home/ec2-user/SageMaker/.
# So, we want to create our project folder directly there.
base_sagemaker_dir = Path('/home/ec2-user/SageMaker/')
# Your project's root directory will be created inside SageMaker's persistent storage
project_root_name = 'spacenet-building-detection'
project_root_dir = base_sagemaker_dir / project_root_name

# Now, define all subdirectories relative to this project_root_dir
code_dir = project_root_dir / 'code'
notebooks_dir = project_root_dir / 'notebooks'
data_dir = project_root_dir / 'data'
data_raw_dir = data_dir / 'raw'
data_processed_dir = data_dir / 'processed'
scripts_dir = project_root_dir / 'scripts'

# Create all necessary directories.
# Since project_root_dir is inside /home/ec2-user/SageMaker/,
# and base_sagemaker_dir already exists and is writable, this should work.
project_root_dir.mkdir(parents=True, exist_ok=True) # Create the top-level project folder
code_dir.mkdir(parents=True, exist_ok=True)
notebooks_dir.mkdir(parents=True, exist_ok=True)
data_raw_dir.mkdir(parents=True, exist_ok=True)
data_processed_dir.mkdir(parents=True, exist_ok=True)
scripts_dir.mkdir(parents=True, exist_ok=True)


print(f"Project root created at: {project_root_dir}")
print(f"Raw data download target: {data_raw_dir}")
print(f"Processed data target: {data_processed_dir}")

# SageMaker S3 details for final upload
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket() # This is your SageMaker-associated S3 bucket
s3_prefix = 'spacenet-building-detection' # S3 folder prefix for your project data

print(f"Default S3 bucket: {default_bucket}")
print(f"S3 prefix for uploads: {s3_prefix}")

OSError: [Errno 45] Operation not supported: '/home/ec2-user'

In [1]:
# 01_data_preprocessing.ipynb

# --- Cell 1: Initial Setup and Path Definitions ---
import os
import subprocess
import tarfile
from pathlib import Path
import time
import re
import json
import shutil # For potential directory cleanup
from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter progress bars

# Define base directories on the Notebook Instance's EBS volume
# These paths are relative to your JupyterLab working directory, typically /home/ec2-user/SageMaker/
# If your notebook is inside 'spacenet-building-detection/notebooks/', adjust parent_dir accordingly
# For simplicity, let's assume you've cloned/created 'spacenet-building-detection' directly under /home/ec2-user/SageMaker/
project_root_dir = Path('/home/ec2-user/SageMaker/spacenet-building-detection')
data_raw_dir = project_root_dir / 'data' / 'raw'
data_processed_dir = project_root_dir / 'data' / 'processed'
scripts_dir = project_root_dir / 'scripts'

data_raw_dir.mkdir(parents=True, exist_ok=True)
data_processed_dir.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root_dir}")
print(f"Raw data download target: {data_raw_dir}")
print(f"Processed data target: {data_processed_dir}")

# SageMaker S3 details for final upload
import sagemaker
import boto3

sagemaker_session = sagemaker.Session()
default_bucket = sagemaker_session.default_bucket() # This is your SageMaker-associated S3 bucket
s3_prefix = 'spacenet-building-detection' # S3 folder prefix for your project data

print(f"Default S3 bucket: {default_bucket}")
print(f"S3 prefix for uploads: {s3_prefix}")


# --- Cell 2: Download SpaceNet Tarballs to Notebook Instance EBS ---
# This downloads directly from the public SpaceNet S3 bucket to your notebook's EBS.
print("Starting download of SpaceNet tarballs to Notebook Instance EBS...")

train_tar_name = 'SN2_buildings_train_AOI_2_Vegas.tar.gz'
test_tar_name = 'AOI_2_Vegas_test_public.tar.gz'

train_tar_path = data_raw_dir / train_tar_name
test_tar_path = data_raw_dir / test_tar_name

# Function to download if not exists
def download_if_not_exists(s3_path, local_path):
    if not local_path.exists():
        print(f"Downloading {s3_path} to {local_path}...")
        try:
            # Using subprocess for aws cli as it handles large files better than boto3 for direct download
            subprocess.run(['aws', 's3', 'cp', s3_path, str(local_path)], check=True)
            print(f"✓ Downloaded: {local_path}")
        except subprocess.CalledProcessError as e:
            print(f"❌ Error downloading {local_path}: {e}")
            raise
    else:
        print(f"✓ Found existing: {local_path} (Skipping download)")

public_spacenet_s3_base = 's3://spacenet-dataset/spacenet/SN2_buildings/tarballs/'

download_if_not_exists(f'{public_spacenet_s3_base}{train_tar_name}', train_tar_path)
download_if_not_exists(f'{public_spacenet_s3_base}{test_tar_name}', test_tar_path)

print("Downloads complete or files already present.")


# --- Cell 3: Extract SpaceNet Tarballs on Notebook Instance EBS ---
def extract_with_safety(tar_path, extract_to_dir):
    print(f"🔍 Opening tarball: {tar_path.name}")
    extract_to_dir.mkdir(parents=True, exist_ok=True) # Ensure target directory exists

    # Check if the primary expected folder inside tar is already there, e.g., 'AOI_2_Vegas_Train'
    expected_extracted_folder = extract_to_dir / tar_path.name.replace('.tar.gz', '').replace('_train', '').replace('_public', '').replace('SN2_buildings', 'AOI_2_Vegas_Train' if 'train' in tar_path.name else 'AOI_2_Vegas_Test_Public')

    if expected_extracted_folder.exists() and list(expected_extracted_folder.iterdir()):
        print(f"⚠️  {expected_extracted_folder.name} directory already exists and is not empty!")
        response = input(f"Do you want to re-extract {tar_path.name}? (y/n): ")
        if response.lower() != 'y':
            print("Skipping extraction...")
            return True
        else:
            print(f"Removing existing {expected_extracted_folder.name} for re-extraction...")
            shutil.rmtree(expected_extracted_folder, ignore_errors=True)

    try:
        with tarfile.open(tar_path, 'r:gz') as tar:
            members = tar.getmembers()
            total_files = len(members)
            print(f"📁 Found {total_files} files to extract to {extract_to_dir}")

            start_time = time.time()
            for i, member in enumerate(tqdm(members, desc=f"Extracting {tar_path.name}")): # Use tqdm for progress bar
                tar.extract(member, path=extract_to_dir)
                if (i + 1) % 500 == 0 or (time.time() - start_time) > 10:
                    elapsed = time.time() - start_time
                    progress = ((i + 1) / total_files) * 100
                    rate = (i + 1) / elapsed if elapsed > 0 else 0
                    eta = (total_files - (i + 1)) / rate if rate > 0 else 0
                    # print(f"📊 Progress: {i + 1}/{total_files} ({progress:.1f}%) | Rate: {rate:.1f} files/sec | ETA: {eta:.0f}s")
                    start_time = time.time()
            print("✓ Extraction complete")
    except Exception as e:
        print(f"❌ Error during extraction: {e}")
        return False
    return True

print("🚀 Starting extraction of training data...")
train_extract_success = extract_with_safety(train_tar_path, data_processed_dir)

print("\n🚀 Starting extraction of test data...")
test_extract_success = extract_with_safety(test_tar_path, data_processed_dir)

if train_extract_success and test_extract_success:
    print("\nAll extractions completed.")
    print("\nExtracted contents in 'data/processed/':")
    for item in data_processed_dir.iterdir():
        if item.is_dir():
            print(f"📁 {item.name}")
else:
    print("❌ One or more extractions failed.")


# --- Cell 4: Run `preprocess_data.py` to Generate Masks ---
# Your `preprocess_data.py` script needs to be present in 'scripts/' directory.
# We will pass the necessary paths to it.

# This part requires the content of scripts/preprocess_data.py to be in a file.
# So, make sure you have copied scripts/preprocess_data.py from your local machine
# into the 'scripts/' directory on the Notebook Instance.

# Example of how to run a Python script from a Jupyter notebook cell
# %run magic command is convenient for this

# Define paths for preprocess_data.py
spacenet_raw_data_dir = data_processed_dir / 'AOI_2_Vegas_Train' # Input for preprocessing
processed_output_dir = data_processed_dir / 'processed_masks' # Output for masks

# Pass these as command line arguments or ensure script hardcodes them correctly
# For this, let's include the preprocess_data.py content here for self-contained notebook cell.
# Alternatively, you can run it as: %run {scripts_dir}/preprocess_data.py

print(f"\nRunning preprocessing script to generate masks. Input: {spacenet_raw_data_dir}, Output: {processed_output_dir}")

# You MUST ensure `scripts/preprocess_data.py` is uploaded to your Notebook Instance
# inside the `spacenet-building-detection/scripts/` folder.

# Option 1: Execute the script directly if it's uploaded
# %run {scripts_dir}/preprocess_data.py

# Option 2: Define the function from preprocess_data.py in this cell and call it
# This avoids needing to upload the script separately to the Notebook Instance
# and makes the notebook self-contained for the preprocessing step.

# Paste the content of your `preprocess_data.py` here:
# --- START OF PASTE FROM preprocess_data.py ---
import rasterio
from rasterio.features import rasterize
import geopandas as gpd
from shapely.geometry import Polygon, MultiPolygon
# from shapely.ops import cascaded_union # Not used in refined version
import numpy as np
from sklearn.model_selection import train_test_split

def extract_image_id_full(filename):
    match = re.search(r'(AOI_2_Vegas_img\d+)', filename)
    return match.group(1) if match else path.stem

def create_mask_from_geojson(image_path, geojson_path, output_mask_path):
    with rasterio.open(image_path) as src:
        transform = src.transform
        width = src.width
        height = src.height
        image_crs = src.crs

    try:
        gdf = gpd.read_file(geojson_path)
    except Exception as e: # Catch broader exceptions for geojson loading
        print(f"Skipping {geojson_path}: Error reading file - {e}")
        return False

    if gdf.crs and gdf.crs != image_crs:
        gdf = gdf.to_crs(image_crs)

    geometries = []
    for geom in gdf.geometry:
        if geom and geom.is_valid: # Check for None and validity
            if isinstance(geom, (Polygon, MultiPolygon)):
                geometries.append(geom)
        # else: print(f"Skipping invalid/empty geometry in {geojson_path}")

    mask = np.zeros((height, width), dtype=np.uint8)
    if geometries:
        shapes_to_rasterize = [(geom, 1) for geom in geometries]
        if shapes_to_rasterize:
            try:
                temp_mask = rasterize(
                    shapes=shapes_to_rasterize,
                    out_shape=(height, width),
                    transform=transform,
                    fill=0,
                    all_touched=True,
                    dtype=np.uint8
                )
                mask = np.logical_or(mask, temp_mask).astype(np.uint8)
            except Exception as e:
                print(f"Error rasterizing {geojson_path}: {e}")
                return False

    mask_meta = src.meta.copy()
    mask_meta.update(dtype=rasterio.uint8, count=1)

    output_mask_path.parent.mkdir(parents=True, exist_ok=True)
    with rasterio.open(output_mask_path, 'w', **mask_meta) as dst:
        dst.write(mask, 1)
    return True

def process_spacenet_dataset(base_data_dir, output_base_dir, split_ratio=0.8, rgb_only=True):
    images_dir = Path(base_data_dir)
    geojson_dir = Path(base_data_dir) / 'geojson' / 'buildings'

    if rgb_only:
        input_images_path = images_dir / 'RGB-PanSharpen'
        image_suffix = '.tif'
    else:
        input_images_path = images_dir / 'MUL-PanSharpen'
        image_suffix = '.tif'

    print(f"Collecting images from: {input_images_path}")
    all_image_files = sorted(list(input_images_path.glob(f'*{image_suffix}')))
    print(f"Found {len(all_image_files)} image files.")

    image_id_to_paths = {extract_image_id_full(p): p for p in all_image_files}
    all_geojson_files = sorted(list(geojson_dir.glob('*.geojson')))
    geojson_id_to_paths = {extract_image_id_full(p.name): p for p in all_geojson_files}

    common_image_ids = sorted(list(image_id_to_paths.keys() & geojson_id_to_paths.keys()))
    print(f"Found {len(common_image_ids)} image/GeoJSON pairs.")

    output_train_masks_dir = Path(output_base_dir) / 'train' / 'masks'
    output_val_masks_dir = Path(output_base_dir) / 'val' / 'masks'
    output_train_masks_dir.mkdir(parents=True, exist_ok=True)
    output_val_masks_dir.mkdir(parents=True, exist_ok=True)

    train_ids, val_ids = train_test_split(common_image_ids, test_size=1-split_ratio, random_state=42)
    print(f"Splitting data: {len(train_ids)} for training, {len(val_ids)} for validation.")

    for split_type, ids in [('train', train_ids), ('val', val_ids)]:
        print(f"\nProcessing {split_type} set...")
        current_output_mask_dir = output_train_masks_dir if split_type == 'train' else output_val_masks_dir
        for img_id in tqdm(ids, desc=f"Generating {split_type} masks"):
            image_path = image_id_to_paths[img_id]
            geojson_path = geojson_id_to_paths[img_id]
            output_mask_path = current_output_mask_dir / f"{img_id}.tif"
            create_mask_from_geojson(image_path, geojson_path, output_mask_path)

    print("\nData preprocessing complete.")
    train_mask_count = len(list(output_train_masks_dir.iterdir()))
    val_mask_count = len(list(output_val_masks_dir.iterdir()))
    print(f"Generated {train_mask_count} train masks and {val_mask_count} val masks.")


# Call the processing function
process_spacenet_dataset(
    base_data_dir=spacenet_raw_data_dir,
    output_base_dir=processed_output_dir,
    rgb_only=True # Set to False if you want to use MUL-PanSharpen (8 bands)
)
# --- END OF PASTE FROM preprocess_data.py ---


# --- Cell 5: Upload Processed Data to Your S3 Bucket ---
print("\nUploading processed data to your S3 bucket...")

# Define source paths on the Notebook Instance's EBS
source_rgb_images = data_processed_dir / 'AOI_2_Vegas_Train' / 'RGB-PanSharpen' # Adjust if MUL-PanSharpen
source_processed_masks = data_processed_dir / 'processed_masks'

# Define target paths on your S3 bucket
s3_raw_images_target = f's3://{default_bucket}/{s3_prefix}/raw_images/RGB-PanSharpen/'
s3_processed_masks_target = f's3://{default_bucket}/{s3_prefix}/processed_masks/'

# Function to upload recursively
def upload_recursive(local_path, s3_target_path):
    print(f"Uploading {local_path} to {s3_target_path}...")
    try:
        subprocess.run(['aws', 's3', 'cp', str(local_path), s3_target_path, '--recursive', '--quiet'], check=True)
        print(f"✓ Uploaded: {local_path}")
    except subprocess.CalledProcessError as e:
        print(f"❌ Error uploading {local_path}: {e}")
        raise

upload_recursive(source_rgb_images, s3_raw_images_target)
upload_recursive(source_processed_masks / 'train' / 'masks', f'{s3_processed_masks_target}train/masks/')
upload_recursive(source_processed_masks / 'val' / 'masks', f'{s3_processed_masks_target}val/masks/')

# You can add test data upload here if you processed it
# upload_recursive(data_processed_dir / 'AOI_2_Vegas_Test_Public' / 'RGB-PanSharpen', f's3://{default_bucket}/{s3_prefix}/raw_images/RGB-PanSharpen-Test/')
# upload_recursive(source_processed_masks / 'test' / 'masks', f'{s3_processed_masks_target}test/masks/')

print("\nAll required data uploaded to S3.")

OSError: [Errno 45] Operation not supported: '/home/ec2-user'