In [1]:
import os
import numpy as np
import tifffile as tiff
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import math

# ========== CONFIG ==========
# Input rasters: the source image and its mask.
ORIGINAL_PATH = r"F:\Exact Match Masking\trail_final_image_original_2.tif"
MASK_PATH = r"F:\Exact Match Masking\trail_final_image_2.tif"

# Root folder that receives every file this script writes.
OUT_DIR = r"F:\Exact Match Masking\geotile"

# Tiling geometry, in pixels: square window edge and the step between
# consecutive window origins.
TILE_SIZE = 512
STRIDE = 256

# Road-content threshold a mask tile must exceed to be kept by the
# filtering step.
ROAD_THRESHOLD = 100

# Fraction of the kept tiles handed to the test split.
TEST_SPLIT = 0.25
# =============================

# Create the output root up front; exist_ok makes re-runs harmless.
os.makedirs(OUT_DIR, exist_ok=True)

def ensure_dir(d):
    """Create directory *d* (including missing parents) if it is absent.

    Args:
        d: Path of the directory to create.

    Raises:
        FileExistsError: If *d* already exists but is not a directory.
    """
    # A separate "exists?" check followed by makedirs can race with another
    # process creating the same folder; exist_ok=True makes the call
    # idempotent in a single step.
    os.makedirs(d, exist_ok=True)

def read_image(path):
    """Load a TIFF with tifffile and normalise its channel layout.

    Args:
        path: Location of the TIFF file to read.

    Returns:
        The loaded array. A 2-D array is replicated into three identical
        channels along a new last axis; a 3-D array whose last axis has
        length 4 is cut down to its first three channels; any other layout
        is returned exactly as read.
    """
    pixels = tiff.imread(path)

    if pixels.ndim == 2:
        # Single-band raster: repeat it so the result has 3 channels.
        return np.stack([pixels] * 3, axis=-1)

    if pixels.ndim == 3 and pixels.shape[2] == 4:
        # Four channels: keep the first three (the 4th is presumably alpha).
        return pixels[..., :3]

    return pixels

def read_mask(path):
    """Load a mask TIFF with tifffile and reduce it to a single band.

    Args:
        path: Location of the mask TIFF file.

    Returns:
        The array as read, except that a 3-D array is reduced to index 0 of
        its last axis.
    """
    raw = tiff.imread(path)
    # Only the first entry of the last axis is used for multi-band masks.
    return raw[..., 0] if raw.ndim == 3 else raw

def generate_tiles(image, tile_size=512, stride=256):
    """Cut *image* into square windows taken on a regular grid.

    Windows are produced row by row (top to bottom, then left to right).
    Only windows that fit entirely inside the image are produced, so any
    remainder at the bottom or right edge that the grid does not reach is
    left out.

    Args:
        image: Array whose first two axes are indexed as rows and columns.
        tile_size: Edge length of each square window.
        stride: Step between the origins of neighbouring windows.

    Returns:
        A list of slices of *image* (for NumPy arrays these are views, not
        copies). The list is empty when the image is smaller than
        *tile_size* along either axis.
    """
    height, width = image.shape[:2]
    row_starts = range(0, height - tile_size + 1, stride)
    col_starts = range(0, width - tile_size + 1, stride)
    return [
        image[top:top + tile_size, left:left + tile_size]
        for top in row_starts
        for left in col_starts
    ]

# ---------- Step 1: Load images ----------
print("Loading images...")
img = read_image(ORIGINAL_PATH)
mask = read_mask(MASK_PATH)
print("Image shape:", img.shape, "Mask shape:", mask.shape)

# Image and mask tiles are paired purely by position further down, so both
# rasters must have the same height and width. Without this check zip()
# would silently truncate and pair tiles cut from different locations.
if img.shape[:2] != mask.shape[:2]:
    raise ValueError(
        f"Image and mask dimensions differ: {img.shape[:2]} vs {mask.shape[:2]}"
    )

# ---------- Step 2: Generate tiles ----------
# Report the configured geometry rather than hard-coded numbers so the log
# stays truthful when TILE_SIZE / STRIDE are changed.
print(f"\nGenerating tiles ({TILE_SIZE}×{TILE_SIZE}, stride {STRIDE})...")
img_tiles = generate_tiles(img, TILE_SIZE, STRIDE)
# The mask gets a trailing channel axis so its tiles are (size, size, 1).
mask_tiles = generate_tiles(mask[..., None], TILE_SIZE, STRIDE)
print(f"Generated {len(img_tiles)} tiles.")

# ---------- Step 3: Filter tiles ----------
print("\nFiltering tiles with no roads...")
filtered_X, filtered_Y = [], []
for x, y in tqdm(zip(img_tiles, mask_tiles), total=len(img_tiles)):
    # ROAD_THRESHOLD is a pixel count, so count road pixels instead of
    # summing intensities: with a 0/255 mask (assumed, TODO confirm) a sum
    # would let a single road pixel clear a threshold of 100. For a 0/1
    # mask both forms agree.
    if np.count_nonzero(y) > ROAD_THRESHOLD:
        filtered_X.append(x)
        filtered_Y.append(y)
print(f"Kept {len(filtered_X)} tiles after filtering.")

# Fail with an actionable message instead of an opaque splitter error.
if not filtered_X:
    raise ValueError(
        "No tiles passed the road filter; check MASK_PATH and ROAD_THRESHOLD."
    )

# ---------- Step 4: Train-test split ----------
# NOTE(review): with STRIDE < TILE_SIZE neighbouring tiles overlap, so a
# purely random split can place overlapping pixels in both train and test.
print("\nSplitting into train/test...")
X_train, X_test, Y_train, Y_test = train_test_split(
    filtered_X, filtered_Y, test_size=TEST_SPLIT, random_state=42
)
print(f"Train: {len(X_train)}, Test: {len(X_test)}")

# ---------- Step 5: Save tiles ----------
def save_tiles(tiles, out_dir, prefix):
    """Write every tile to its own ``.npy`` file inside *out_dir*.

    Files are named ``<prefix>_<index>.npy`` where the index is the tile's
    position in *tiles*, zero-padded to at least four digits. A progress
    bar labelled with *prefix* is shown while saving.

    Args:
        tiles: Sequence of arrays to store.
        out_dir: Destination folder; created first if it is missing.
        prefix: Leading part of each file name.
    """
    ensure_dir(out_dir)
    progress = tqdm(tiles, desc=f"Saving {prefix}")
    for index, tile in enumerate(progress):
        filename = f"{prefix}_{index:04d}.npy"
        np.save(os.path.join(out_dir, filename), tile)

# Per-tile output folders, laid out as <OUT_DIR>/<split>/<x|y>.
train_x_dir, train_y_dir = (
    os.path.join(OUT_DIR, "train", _kind) for _kind in ("x", "y")
)
test_x_dir, test_y_dir = (
    os.path.join(OUT_DIR, "test", _kind) for _kind in ("x", "y")
)

# One .npy file per tile, written in the order train x/y then test x/y.
for _tiles, _folder, _prefix in (
    (X_train, train_x_dir, "train_x"),
    (Y_train, train_y_dir, "train_y"),
    (X_test, test_x_dir, "test_x"),
    (Y_test, test_y_dir, "test_y"),
):
    save_tiles(_tiles, _folder, _prefix)

# ---------- Step 6: Save as .npy arrays ----------
# Each split is also stacked into a single array: image tiles are cast to
# float32 and mask tiles to uint8 before being written to OUT_DIR.
for _filename, _tiles, _dtype in (
    ("X_train_tiles.npy", X_train, np.float32),
    ("Y_train_tiles.npy", Y_train, np.uint8),
    ("X_test_tiles.npy", X_test, np.float32),
    ("Y_test_tiles.npy", Y_test, np.uint8),
):
    np.save(os.path.join(OUT_DIR, _filename), np.array(_tiles, dtype=_dtype))

print("\n✅ Done! All tiles and .npy arrays saved in:", OUT_DIR)


Loading images...
Image shape: (25000, 34336, 3) Mask shape: (25000, 34336)

Generating tiles (512×512, stride 256)...
Generated 12768 tiles.

Filtering tiles with no roads...


100%|██████████████████████████████████████████████████████████████████████████| 12768/12768 [00:05<00:00, 2215.58it/s]


Kept 783 tiles after filtering.

Splitting into train/test...
Train: 587, Test: 196


Saving train_x: 100%|████████████████████████████████████████████████████████████████| 587/587 [00:36<00:00, 16.17it/s]
Saving train_y: 100%|████████████████████████████████████████████████████████████████| 587/587 [00:13<00:00, 43.84it/s]
Saving test_x: 100%|█████████████████████████████████████████████████████████████████| 196/196 [00:11<00:00, 17.38it/s]
Saving test_y: 100%|█████████████████████████████████████████████████████████████████| 196/196 [00:04<00:00, 43.46it/s]



✅ Done! All tiles and .npy arrays saved in: F:\Exact Match Masking\geotile
