<a href="https://colab.research.google.com/github/ombuijabali/Coconut_Pedcition_Model/blob/main/Model_with_multiple_images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install package

In [None]:
!pip install geoai-py



# Import libraries

In [None]:
# Standard library
import hashlib
import json
import os
import shutil

# Third-party
import geoai
import torch

# Keep the GPU check as the last expression of the cell so its result is
# actually displayed; in the middle of the cell the value was discarded.
torch.cuda.is_available()

# Define directories

In [None]:
# Root folder that receives the exported tiles and the model artifacts.
# NOTE(review): the path lives under /content/drive — presumably Google Drive
# has to be mounted before the following cells run; confirm.
out_folder = "/content/drive/MyDrive/Elements/output"

# Everything model-related is kept in a "models" subfolder of out_folder.
model_dir = out_folder + "/models"

# Checkpoint kept between runs, and the JSON record of training-file hashes.
model_path = os.path.join(model_dir, "mask_rcnn_trained.pth")
hash_file = os.path.join(model_dir, "data_hashes.json")

# List of training raster and vector files (Add more as needed)

In [None]:

# Raster used for inference once the model is trained.
test_raster_url = "https://huggingface.co/datasets/giswqs/geospatial/resolve/main/naip_test.tif"

# Training rasters. Entry i is later paired with entry i of train_vector_urls,
# so both lists must stay the same length and in the same order.
train_raster_urls = [
    "https://huggingface.co/datasets/giswqs/geospatial/resolve/main/naip_train.tif",
    "https://huggingface.co/datasets/giswqs/geospatial/resolve/main/cars_7cm.tif",
]

# Label vectors, one per training raster.
train_vector_urls = [
    "https://huggingface.co/datasets/giswqs/geospatial/resolve/main/naip_train_buildings.geojson",
    "https://huggingface.co/datasets/construmgis/geospatial/resolve/main/HY.geojson",
]

# Function to compute file hash

In [None]:
def compute_file_hash(filepath):
    """Compute the SHA-256 digest of a file's contents.

    The file is opened in binary mode and consumed in 8192-byte chunks, so it
    is never loaded into memory as a whole.

    Args:
        filepath: Path of the file to hash.

    Returns:
        The digest as a hexadecimal string.
    """
    digest = hashlib.sha256()
    with open(filepath, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

# Function to check if training data has changed

In [None]:
def has_training_data_changed(train_raster_paths, train_vector_paths):
    """Report whether the training files differ from the recorded hashes.

    The recorded baseline is the JSON mapping stored at the module-level
    ``hash_file`` path (file path -> SHA-256 hex digest).

    Args:
        train_raster_paths: Sequence of training raster file paths.
        train_vector_paths: Sequence of training vector file paths.

    Returns:
        True when no usable baseline exists or when the current path -> hash
        mapping differs from it; False when they are equal.
    """
    # Without a baseline everything counts as new data; skip the hashing,
    # which would otherwise read every training file for nothing.
    if not os.path.exists(hash_file):
        return True

    try:
        with open(hash_file, "r") as f:
            previous_hashes = json.load(f)
    except ValueError:
        # json.JSONDecodeError is a ValueError: a corrupt record cannot vouch
        # for the data, so treat it like a missing one.
        return True

    current_hashes = {
        fp: compute_file_hash(fp)
        for fp in list(train_raster_paths) + list(train_vector_paths)
    }
    return previous_hashes != current_hashes

# Download all files

In [None]:
# Fetch each training raster and vector, then the test raster; the values
# returned by geoai.download_file are kept in URL order and used as file paths
# by the cells below.
train_raster_paths = list(map(geoai.download_file, train_raster_urls))
train_vector_paths = list(map(geoai.download_file, train_vector_urls))
test_raster_path = geoai.download_file(test_raster_url)

naip_train.tif: 100%|██████████| 12.1M/12.1M [00:00<00:00, 37.7MB/s]
cars_7cm.tif: 100%|██████████| 92.0M/92.0M [00:04<00:00, 20.6MB/s]
naip_train_buildings.geojson: 100%|██████████| 456k/456k [00:00<00:00, 13.2MB/s]
HY.geojson: 100%|██████████| 5.13k/5.13k [00:00<00:00, 8.94MB/s]
naip_test.tif: 100%|██████████| 19.7M/19.7M [00:00<00:00, 28.0MB/s]


# View first vector over a raster (optional)

In [None]:
# A notebook renders only the last expression of a cell, so the vector map is
# shown explicitly with display() (available by default in IPython kernels);
# otherwise its return value would be silently discarded.
# NOTE(review): assumes view_vector_interactive returns a displayable map
# object — confirm.
display(geoai.view_vector_interactive(train_vector_paths[0], tiles=train_raster_urls[0]))

# Last expression of the cell: rendered automatically.
geoai.view_raster(test_raster_url)

# Export tiles from multiple raster/vector pairs

In [None]:
# zip() silently drops unpaired entries, so fail loudly on a length mismatch.
if len(train_raster_paths) != len(train_vector_paths):
    raise ValueError(
        f"Raster/vector count mismatch: {len(train_raster_paths)} rasters "
        f"vs {len(train_vector_paths)} vectors"
    )

# Combined folders read by the training cell.
for subdir in ("images", "labels"):
    os.makedirs(os.path.join(out_folder, subdir), exist_ok=True)

# Tile file names appear to restart for every raster: the recorded run exported
# 36 + 544 tiles but training found only 544. Each raster is therefore exported
# into its own staging folder, and its tiles are then moved into the combined
# images/labels folders under a raster-specific prefix so nothing is overwritten.
# Tiles left in images/labels by earlier runs are not removed.
for pair_index, (train_raster, train_vector) in enumerate(
    zip(train_raster_paths, train_vector_paths)
):
    raster_stem = os.path.splitext(os.path.basename(train_raster))[0]
    # The index keeps prefixes unique even if two rasters share a file name.
    tile_prefix = f"{pair_index:02d}_{raster_stem}"
    staging_folder = os.path.join(out_folder, "tiles_by_raster", tile_prefix)

    # Start from an empty staging folder so stale tiles cannot leak in.
    shutil.rmtree(staging_folder, ignore_errors=True)
    os.makedirs(staging_folder, exist_ok=True)

    geoai.export_geotiff_tiles(
        in_raster=train_raster,
        out_folder=staging_folder,
        in_class_data=train_vector,
        tile_size=512,
        stride=256,
        buffer_radius=0,
    )

    # The same prefix is applied to an image and its label, so their relative
    # ordering within each folder stays aligned.
    for subdir in ("images", "labels"):
        staged_dir = os.path.join(staging_folder, subdir)
        for tile_name in sorted(os.listdir(staged_dir)):
            shutil.move(
                os.path.join(staged_dir, tile_name),
                os.path.join(out_folder, subdir, f"{tile_prefix}_{tile_name}"),
            )


Raster info for naip_train.tif:
  CRS: EPSG:26911
  Dimensions: 2503 x 1126
  Resolution: (0.5999999999999953, 0.5999999999996691)
  Bands: 4
  Bounds: BoundingBox(left=454780.8, bottom=5277567.0, right=456282.6, top=5278242.6)
Loaded 722 features from naip_train_buildings.geojson
Vector CRS: EPSG:4326
Reprojecting features from EPSG:4326 to EPSG:26911
Found 6 unique classes: ['apartments' None 'terrace' 'detached' 'house' 'shed']


Generated: 36, With features: 36: 100%|██████████| 36/36 [00:18<00:00,  1.92it/s]



------- Export Summary -------
Total tiles exported: 36
Tiles with features: 36 (100.0%)
Average feature pixels per tile: 45966.3
Output saved to: output

------- Georeference Verification -------

Raster info for cars_7cm.tif:
  CRS: EPSG:3857
  Dimensions: 8351 x 4463
  Resolution: (0.07464844311342499, 0.07464346097829833)
  Bands: 3
  Bounds: BoundingBox(left=-10622651.27699905, bottom=3462200.907507864, right=-10622027.88785061, top=3462534.04127421)
Loaded 17 features from HY.geojson
Vector CRS: EPSG:4326
Reprojecting features from EPSG:4326 to EPSG:3857


Generated: 544, With features: 262: 100%|██████████| 544/544 [00:34<00:00, 15.79it/s]


------- Export Summary -------
Total tiles exported: 544
Tiles with features: 262 (48.2%)
Average feature pixels per tile: 83563.1
Output saved to: output

------- Georeference Verification -------





# Train the model with multiple images

In [None]:
def _count_files(folder):
    """Count regular, non-hidden files in ``folder``; 0 when it is missing."""
    if not os.path.isdir(folder):
        return 0
    return sum(
        1
        for name in os.listdir(folder)
        if not name.startswith(".") and os.path.isfile(os.path.join(folder, name))
    )


images_dir = f"{out_folder}/images"
labels_dir = f"{out_folder}/labels"

# Step 1: Check if model exists (first time or not)
model_exists = os.path.exists(model_path)
is_first_run = not model_exists

# Step 2: Compute training hashes and load the previously recorded ones
current_hashes = {fp: compute_file_hash(fp) for fp in train_raster_paths + train_vector_paths}
previous_hashes = {}

if os.path.exists(hash_file):
    try:
        with open(hash_file, "r") as f:
            previous_hashes = json.load(f)
    except ValueError:
        # json.JSONDecodeError is a ValueError: a corrupt record is treated as
        # "no record", which forces a retrain instead of crashing the cell.
        previous_hashes = {}

# Step 3: Determine if data has changed
has_new_data = current_hashes != previous_hashes

# Step 4: Train only if needed or first run
if is_first_run or has_new_data:
    print("Preparing to train the model...")

    # Before training, check that image and label counts match. Hidden entries
    # and sub-directories are ignored, and a missing folder counts as empty so
    # the error below is raised instead of a bare FileNotFoundError.
    num_images = _count_files(images_dir)
    num_labels = _count_files(labels_dir)

    if num_images == 0 or num_labels == 0:
        raise ValueError("No training data found in images or labels folders.")

    if num_images != num_labels:
        raise ValueError(f"Image-label mismatch: {num_images} images vs {num_labels} labels")

    print("Training model (first time run)" if is_first_run else "Training model due to new data...")

    geoai.train_MaskRCNN_model(
        images_dir=images_dir,
        labels_dir=labels_dir,
        output_dir=model_dir,
        num_channels=3,
        pretrained=True,
        batch_size=4,
        num_epochs=10,
        learning_rate=0.0005,
        val_split=0.2,
        # Continue from the saved checkpoint when one already exists.
        pretrained_model_path=model_path if model_exists else None
    )

    # Check if training output exists
    trained_model_path = os.path.join(model_dir, "best_model.pth")
    if not os.path.exists(trained_model_path):
        raise FileNotFoundError("Training failed — 'best_model.pth' was not created.")

    # model_path may have been rebound to best_model.pth by a later cell;
    # copying a file onto itself raises shutil.SameFileError, so skip it then.
    if os.path.abspath(trained_model_path) != os.path.abspath(model_path):
        shutil.copy(trained_model_path, model_path)

    # Record the hashes only after a successful training run.
    with open(hash_file, "w") as f:
        json.dump(current_hashes, f)
    print(f"Model saved to {model_path}")
else:
    print(f"Using existing trained model from {model_path}")

Training model due to new data...
Using device: cpu
Found 544 image files and 544 label files
Training on 435 images, validating on 109 images


Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth
100%|██████████| 170M/170M [00:01<00:00, 142MB/s]


Epoch: 0, Batch: 0/109, Loss: 1.7445, Time: 75.97s


# Object detection on test raster

In [None]:
# Raster file that receives the predicted masks.
masks_path = "naip_test_prediction.tif"

# Use the checkpoint tracked by model_path (the training cell copies
# best_model.pth there, or reports that it is reusing it) instead of rebinding
# model_path here: rebinding changed what the training cell sees on a re-run
# and made it copy best_model.pth onto itself.
geoai.object_detection(
    test_raster_path,
    masks_path,
    model_path,
    window_size=512,
    overlap=256,
    confidence_threshold=0.5,
    batch_size=4,
    num_channels=3,
)

# Convert predicted masks to GeoJSON

In [None]:
# Destination of the vectorized predictions.
output_path = "naip_test_prediction.geojson"

# Convert the predicted mask raster to GeoJSON; the returned object is kept
# in gdf for further inspection.
gdf = geoai.orthogonalize(
    masks_path,
    output_path,
    epsilon=2,
)

# View results

In [None]:
# Show the predicted GeoJSON on top of the test raster tiles.
geoai.view_vector_interactive(
    output_path,
    tiles=test_raster_url,
)

# Create a split map for comparison

In [None]:
# Left pane: predictions drawn in red with a 20% fill opacity.
# Right pane and basemap: the test raster.
geoai.create_split_map(
    left_layer=output_path,
    right_layer=test_raster_url,
    left_args={
        "style": {
            "color": "red",
            "fillOpacity": 0.2,
        },
    },
    basemap=test_raster_url,
)