This notebook prepares the training/testing masks and split file names.

In [1]:
# Copyright (c) Microsoft Corporation. All rights reserved
# Licensed under the MIT License.
import sys
sys.path.append("..")
import os
import time
import subprocess

import pandas as pd

from multiprocessing import Pool
import rasterio
import fiona
import shapely
import shapely.geometry
import shapely.ops

from cafo import utils
os.environ.update(utils.RASTERIO_BEST_PRACTICES)

In [2]:
REPO_ROOT_DIRECTORY = os.path.dirname(os.getcwd())
BASE_OUTPUT_DIRECTORY = os.path.join(
    REPO_ROOT_DIRECTORY,
    "output/masks/"
) # This is the base directory where the different mask datasets will be generated

In [3]:
os.makedirs(os.path.join(
    REPO_ROOT_DIRECTORY,
    "data/splits/"
), exist_ok=True)

os.makedirs(BASE_OUTPUT_DIRECTORY, exist_ok=True)

In [4]:
def generate_masks(naip_paths, polygon_fn, output_dir):
    mask_fns = []
    for i, naip_path in enumerate(naip_paths):
        if i % 100 == 0:
            print(i, len(naip_paths))
        
        mask_fn = os.path.join(output_dir, naip_path)
        os.makedirs(os.path.dirname(mask_fn), exist_ok=True)
        mask_fns.append(mask_fn)

        naip_url = utils.NAIP_BLOB_ROOT + "/" + naip_path
        with rasterio.open(naip_url) as f:
            left, bottom, right, top = f.bounds
            crs = f.crs.to_string()
            height, width = f.height, f.width

        command = [
            "gdal_rasterize",
            "-ot", "Byte",
            "-burn", "1",
            "-of", "GTiff",
            "-te", str(left), str(bottom), str(right), str(top),
            "-ts", str(width), str(height),
            "-co", "COMPRESS=LZW",
            "-co", "BIGTIFF=YES",
            polygon_fn,
            mask_fn
        ]
        subprocess.call(command)
        
    return mask_fns

## Load all NAIP URLs

## Train-all

Uses NAIP imagery from every year for which we have data.

In [5]:
polygon_fn = "../data/delmarva_training_set_polygons.geojson"
tiles_fn = "../data/delmarva_training_set_tiles.geojson"
set_name = "train-all/"
output_dir = os.path.join(BASE_OUTPUT_DIRECTORY, set_name)
os.makedirs(output_dir, exist_ok=True)

naip_paths = []
with fiona.open(tiles_fn) as f:
    for row in f:
        naip_paths.append(row["properties"]["url"])

In [6]:
# This will rasterize the shapes in "polygon_fn" to the extent of each imagery tile in "naip_paths" and
# the resulting masks will be saved with the same directory structure / naming scheme under "output_dir"
output_fns = generate_masks(naip_paths, polygon_fn, output_dir)

0 1983
100 1983
200 1983
300 1983
400 1983
500 1983
600 1983
700 1983
800 1983
900 1983
1000 1983
1100 1983
1200 1983
1300 1983
1400 1983
1500 1983
1600 1983
1700 1983
1800 1983
1900 1983


In [7]:
df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in naip_paths],
    "label_fn": output_fns
})
df.to_csv("../data/splits/train-all.csv", index=False)

## Train-single

Uses NAIP imagery from only VA 2016, MD 2017, and DE 2017 (the layers that the polygons were created with).

In [8]:
new_naip_paths = []
new_output_fns = []
for i, fn in enumerate(output_fns):
    if ("de/2017/" in fn) or ("md/2017/" in fn) or ("va/2016/" in fn): 
        new_output_fns.append(fn)
        new_naip_paths.append(naip_paths[i])

df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in new_naip_paths],
    "label_fn": new_output_fns
})
df.to_csv("../data/splits/train-single.csv", index=False)

## Test-all

In [9]:
polygon_fn = "../data/delmarva_testing_set_polygons.geojson"
tiles_fn = "../data/delmarva_testing_set_tiles.geojson"
set_name = "test-all/"
output_dir = os.path.join(BASE_OUTPUT_DIRECTORY, set_name)
os.makedirs(output_dir, exist_ok=True)

naip_paths = []
with fiona.open(tiles_fn) as f:
    for row in f:
        naip_paths.append(row["properties"]["url"])

In [10]:
output_fns = generate_masks(naip_paths, polygon_fn, output_dir)

0 568
100 568
200 568
300 568
400 568
500 568


In [11]:
df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in naip_paths],
    "label_fn": output_fns
})
df.to_csv("../data/splits/test-all.csv", index=False)

## Test-single

In [12]:
new_naip_paths = []
new_output_fns = []
for i, fn in enumerate(output_fns):
    if ("de/2017/" in fn) or ("md/2017/" in fn) or ("va/2016/" in fn): 
        new_output_fns.append(fn)
        new_naip_paths.append(naip_paths[i])

df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in new_naip_paths],
    "label_fn": new_output_fns
})
df.to_csv("../data/splits/test-single.csv", index=False)

## All

In [13]:
polygon_fn = "../data/Delmarva_PL_House_Final2_epsg26918.geojson"
tiles_fn = "../data/delmarva_all_set_tiles.geojson"
set_name = "all/"
output_dir = os.path.join(BASE_OUTPUT_DIRECTORY, set_name)
os.makedirs(output_dir, exist_ok=True)

naip_urls = []
with fiona.open(tiles_fn) as f:
    for row in f:
        naip_urls.append(row["properties"]["url"])

In [14]:
output_fns = generate_masks(naip_paths, polygon_fn, output_dir)

0 568
100 568
200 568
300 568
400 568
500 568


In [15]:
df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in naip_paths],
    "label_fn": output_fns
})
df.to_csv("../data/splits/all.csv", index=False)

## All-single

In [16]:
new_naip_paths = []
new_output_fns = []
for i, fn in enumerate(output_fns):
    if ("de/2017/" in fn) or ("md/2017/" in fn) or ("va/2016/" in fn): 
        new_output_fns.append(fn)
        new_naip_paths.append(naip_paths[i])

df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in new_naip_paths],
    "label_fn": new_output_fns
})
df.to_csv("../data/splits/all-single.csv", index=False)

# Augment sets

In [17]:
test_tile_shapes = []
with fiona.open("../data/delmarva_testing_set_tiles.geojson") as f:
    for row in f:
        test_tile_shapes.append(shapely.geometry.shape(row["geometry"]))
test_area = shapely.ops.cascaded_union(test_tile_shapes)

In [18]:
def split_shapes_into_train_test(fn):
    shapes = []
    geoms = []
    with fiona.open(fn) as f:
        for row in f:
            shapes.append(shapely.geometry.shape(row["geometry"]))
            geoms.append(row)
            
    test_geoms = []
    train_geoms = []
    for i, shape in enumerate(shapes):
        if test_area.intersects(shape):
            test_geoms.append(geoms[i])
        else:
            train_geoms.append(geoms[i])
    
    return train_geoms, test_geoms

In [19]:
fns = [
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2011.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2012.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2013.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2014.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2015.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2016.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2017.geojson",
    "../data/poultry_barn_change_predictions/poultry_barns-64-200_predictions_2018.geojson",
]

for fn in fns:
    
    train_fn = fn.replace(".geojson", "_train.geojson")
    test_fn = fn.replace(".geojson", "_test.geojson")
    
    train_geoms, test_geoms = split_shapes_into_train_test(fn)
    
    with fiona.open(fn) as f:
        schema = f.schema.copy()
        crs = f.crs.copy()
        
    with fiona.open(train_fn, "w", driver="GeoJSON", schema=schema, crs=crs) as f:
        f.writerecords(train_geoms)
        
    with fiona.open(test_fn, "w", driver="GeoJSON", schema=schema, crs=crs) as f:
        f.writerecords(test_geoms)

## Train-augment

In [20]:
tiles_fn = "../data/delmarva_training_set_tiles.geojson"
set_name = "train-augment/"
output_dir = os.path.join(BASE_OUTPUT_DIRECTORY, set_name)
os.makedirs(output_dir, exist_ok=True)

polygon_fns_by_year = {
    year: "../data/poultry_barns-64-200_predictions_%d_train.geojson" % year
    for year in range(2011,2019)
}

naip_urls = []
with fiona.open(tiles_fn) as f:
    for row in f:
        naip_urls.append(row["properties"]["url"])

In [21]:
output_fns = []
for i, naip_path in enumerate(naip_paths):
    if i % 100 == 0:
        print(i, len(naip_paths))

    year = int(naip_path.split("/")[2])
    polygon_fn = polygon_fns_by_year[year]
        
    mask_fn = os.path.join(output_dir, naip_path)
    os.makedirs(os.path.dirname(mask_fn), exist_ok=True)
    output_fns.append(mask_fn)

    naip_url = utils.NAIP_BLOB_ROOT + "/" + naip_path
    with rasterio.open(naip_url) as f:
        left, bottom, right, top = f.bounds
        crs = f.crs.to_string()
        height, width = f.height, f.width

    command = [
        "gdal_rasterize",
        "-ot", "Byte",
        "-burn", "1",
        "-of", "GTiff",
        "-te", str(left), str(bottom), str(right), str(top),
        "-ts", str(width), str(height),
        "-co", "COMPRESS=LZW",
        "-co", "BIGTIFF=YES",
        polygon_fn,
        mask_fn
    ]
    subprocess.call(command)

0 568
100 568
200 568
300 568
400 568
500 568


In [22]:
df = pd.DataFrame.from_dict({
    "image_fn": [utils.NAIP_BLOB_ROOT + "/" + naip_path for naip_path in naip_paths],
    "label_fn": output_fns
})
df.to_csv("../data/splits/train-augment.csv", index=False)