In [None]:
#@title Author: Michael Evans { display-mode: "form" }
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Introduction

This notebook demonstrates methods used to acquire training data from Google Earth Engine that can be used to train a [fully convolutional neural network (FCNN)](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Long_Fully_Convolutional_Networks_2015_CVPR_paper.pdf), specifically [U-net](https://arxiv.org/abs/1505.04597), using TensorFlow. In this example, we extract 256x256 pixel image chips containing the 3 visible, 1 near-infrared, and 2 shortwave-infrared bands in Sentinel-2 imagery based on [hand-delineated solar array footprints in North Carolina](https://osf.io/ygbwj/). This relatively simple model is a mostly unmodified version of [this example](https://github.com/tensorflow/models/blob/master/samples/outreach/blogs/segmentation_blogpost/image_segmentation.ipynb) from the TensorFlow docs.

In [None]:
import json
from os.path import join
from sys import path

import ee
import folium
import numpy as np
import rasterio as rio
import tensorflow as tf
from google.cloud import storage

In [None]:
## Clone the repo containing preprocessing and prediction functions.
## NOTE: the clone lands in the current working directory; on a re-run git
## reports that the destination already exists, which is harmless here.
!git clone https://github.com/mjevans26/Satellite_ComputerVision.git

In [None]:
# Load the necessary modules from repo.
# The appended path matches the clone location when the working directory
# is /content; it must be on sys.path before the import below can succeed.
path.append('/content/Satellite_ComputerVision')
from utils.clouds import basicQA, maskTOA, maskSR

In [None]:
# Authenticate with Earth Engine, then initialize the client library.
# Both calls must run before any of the ee.* calls in the cells below.
ee.Authenticate()
ee.Initialize()

In [None]:
# Folium setup.

print(folium.__version__)


def add_ee_layer(self, ee_image_object, vis_params, name):
  """Draw an Earth Engine image on this folium map as a tile overlay.

  Parameters:
    ee_image_object: object passed to ee.Image() to be displayed
    vis_params (dict): visualization parameters passed to getMapId()
    name (str): layer name given to the folium tile layer
  """
  # Ask Earth Engine for a tile URL template for the styled image.
  tile_url = ee.Image(ee_image_object).getMapId(vis_params)['tile_fetcher'].url_format
  layer = folium.raster_layers.TileLayer(
    tiles = tile_url,
    attr = "Map Data © Google Earth Engine",
    name = name,
    overlay = True,
    control = True
  )
  layer.add_to(self)


# Attach the function to folium.Map so every map instance can call it as a method.
folium.Map.add_ee_layer = add_ee_layer

In [None]:
# Names and locations for outputs in Cloud Storage.
BUCKET = '{YOUR_GCS BUCKET HERE}'
BUCKET_PATH = join('gs://', BUCKET)

FOLDER = 'NC_solar'
PRED_BASE = 'data/predict'
TRAIN_BASE = 'data/training'
EVAL_BASE = 'data/eval'

# Model inputs (Sentinel-2 bands) and the response variable.
opticalBands = ['B2', 'B3', 'B4']
# NOTE(review): despite the name, B8/B11/B12 are Sentinel-2 near-infrared and
# shortwave-infrared bands, not thermal ones. The name is kept as-is because
# other cells may reference it.
thermalBands = ['B8', 'B11', 'B12']

BANDS = [*opticalBands, *thermalBands]
RESPONSE = 'landcover'
FEATURES = [*BANDS, RESPONSE]
SCENEID = 'SENSING_ORBIT_NUMBER'

# Size and shape of the patches expected by the model.
KERNEL_SIZE = 256
KERNEL_SHAPE = [KERNEL_SIZE] * 2
# One fixed-length float32 feature spec per entry in FEATURES,
# keyed by feature name in FEATURES_DICT.
COLUMNS = [
  tf.io.FixedLenFeature(shape=KERNEL_SHAPE, dtype=tf.float32) for _ in FEATURES
]
FEATURES_DICT = {feature: spec for feature, spec in zip(FEATURES, COLUMNS)}

# Imagery

Access and process the imagery to use for predictor variables using Google Earth Engine. This is a three-month, cloud-free Sentinel-2 composite corresponding to the latest date for which we have confirmed training data. Display it in the notebook for a sanity check.

In [None]:
# Use the Sentinel-2 top-of-atmosphere (Level-1C) collection.
S2 = ee.ImageCollection("COPERNICUS/S2")
# Grab a feature corresponding to our study area - North Carolina,
# buffered outward by 2500.
states = ee.FeatureCollection("TIGER/2016/States")
nc = states.filter(ee.Filter.eq('NAME', 'North Carolina')).geometry().buffer(2500)
begin = '2019-01-01'
end = '2020-03-01'

# Pre-filter the collection by date, study area and scene-level cloud cover.
# Per-image processing is done by basicQA when each composite is built
# (presumably cloud masking - see utils.clouds in the cloned repo).
filtered = S2.filterDate(begin, end)\
.filterBounds(nc)\
.filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))


def seasonal_composite(start, stop):
  """Build a median composite of the model bands for one date window.

  Parameters:
    start (str): first date to include (inclusive)
    stop (str): date at which the window ends (exclusive, as in filterDate)
  Returns:
    ee.Image: median of the basicQA-processed images, limited to BANDS and clipped to nc
  """
  return filtered.filterDate(start, stop).map(basicQA).median().select(BANDS).clip(nc)


# Create simple median composites, one per season. filterDate() excludes its
# end date, so each window ends on the first day of the following season;
# otherwise the last day of every season (and Feb 28-29, 2020) is dropped.
spring = seasonal_composite('2019-03-01', '2019-06-01')
summer = seasonal_composite('2019-06-01', '2019-09-01')
fall = seasonal_composite('2019-09-01', '2019-12-01')
winter = seasonal_composite('2019-12-01', '2020-03-01')

# Use folium to visualize the imagery.
rgbParams = {'bands': ['B4', 'B3', 'B2'],
             'min': 0,
             'max': 0.3}

nirParams = {'bands': ['B8', 'B11', 'B12'],
             'min': 0,
             'max': 0.3}

map = folium.Map(location=[35.402, -78.376])
map.add_ee_layer(spring, rgbParams, 'Color')
map.add_ee_layer(spring, nirParams, 'Thermal')

map.add_child(folium.LayerControl())
map

Prepare the response variable. This is the footprints of ground-mounted solar arrays as of 2019. These polygons have been loaded into GEE as a FeatureCollection asset, and coded into a background class [0] and a target class [1]. Display on the map to verify.

In [None]:
def set_landcover(ft):
  """
  Add a 'landcover' property to a feature and set it to 1
  Parameters:
    ft (ee.Feature): feature to have property added
  Returns:
    ee.Feature: input feature with 'landcover' property set to 1
  """
  return ft.set('landcover', 1)

# Load the solar footprint polygons from our GEE Asset and tag each
# polygon with the property 'landcover' equal to 1
NC_solar_footprints = ee.FeatureCollection(
    "users/defendersofwildlifeGIS/NC/NC_solar_footprints"
).map(set_landcover)
# Background image in which every pixel equals 0
blankimg = ee.Image.constant(0)
# Rasterize the footprints (band value is taken from 'landcover', i.e. 1)
solar_footprint = NC_solar_footprints.reduceToImage(
    ['landcover'],
    ee.Reducer.first()
)
# Replace background pixels with the footprint value wherever the footprint
# image is non-zero, and name the resulting band 'landcover'
labelimg = blankimg.where(solar_footprint, solar_footprint).rename('landcover')

solarParams = dict(bands = 'landcover', min = 0, max = 1)

map = folium.Map(location = [35.402, -78.376])
map.add_ee_layer(labelimg,  solarParams, 'Solar footprint')
map.add_child(folium.LayerControl())
map

Use some pre-made geometries to sample the stack in strategic locations.  We constrain sampling to occur within 10km of mapped solar arrays. Because our target features are small and sparse, relative to the landscape, we also guide sampling based on their centroids to ensure that we get training data for solar arrays.

In [None]:
def buff(ft):
  """Return the feature buffered by a distance of 10000 (the 10km sampling radius)."""
  buffered = ft.buffer(10000)
  return buffered

def centroid(ft):
  """Return the centroid of the given feature."""
  center = ft.centroid()
  return center

# Sampling area: the union of the buffers around every footprint
studyArea = NC_solar_footprints.map(buff).union()
# Paint the area as 1 on a byte image of 0s, then mask out everything else
studyImage = ee.Image(0).byte().paint(studyArea, 1)
studyImage = studyImage.updateMask(studyImage)
# Footprint centroids, each with a 'random' column attached
centroids = NC_solar_footprints.map(centroid).randomColumn('random')

aoiParams = dict(min = 0, max = 1, palette = ['red'])
map = folium.Map(location=[35.402, -78.376], zoom_start=8)
map.add_ee_layer(studyImage, aoiParams, 'Sampling area')
map.add_child(folium.LayerControl())
map

# Sampling

If the mapped data look reasonable, we use a 2-stage approach to sample 256x256 pixel image 'chips' for use in model training.
1.) sample from the centroid of each polygon to create 'positive' examples.
2.) sample the image at random points to generate 'negative' examples.

To sample chips we create an array image in which each pixel contains a nested list of the surrounding 256x256 pixel values. We can sample this array image at points, to get all the pixels in a 256x256 neighborhood at each point.  It's worth noting that to build the training and testing data for the FCNN, you export a single TFRecord file that contains patches of pixel values in each record.  You do NOT need to export each training/testing patch to a different image.  Since each record potentially contains a lot of data (especially with big patches or many input bands), some manual sharding of the computation is necessary to avoid the `computed value too large` error.  Specifically, the following code takes multiple (smaller) samples within each geometry, merging the results to get a single export.

In [None]:
def make_array_image(features, labels, aoi):
  """Stack predictor and label bands and convert the stack to an array image
  Parameters:
    features (ee.Image): image containing bands to be used as predictor variables in model
    labels (ee.Image): binary[0,1], single-band image indicating presence (1) and absence (0) of target features
    aoi (ee.Geometry): area the stacked image is clipped to
  Return:
    ee.Image: array image built from a KERNEL_SIZE x KERNEL_SIZE neighborhood around each pixel
  """
  # Square kernel of ones, KERNEL_SIZE cells on a side
  row = ee.List.repeat(1, KERNEL_SIZE)
  weights = ee.List.repeat(row, KERNEL_SIZE)
  window = ee.Kernel.fixed(KERNEL_SIZE, KERNEL_SIZE, weights)

  # Predictors and labels go into one image so they are sampled together
  stacked = ee.Image.cat([features, labels]).clip(aoi)
  return stacked.neighborhoodToArray(window)

First, we'll collect image patches from the centroids of known solar array locations.

In [None]:
# Count the centroids (getInfo() fetches the number to the client) and convert
# the collection to a list of that length so it can be sliced into batches
S = centroids.size().getInfo()
centroidList = centroids.toList(S)

In [None]:
#@title Centroids slicing
# Get samples from delineated features using slice() on a feature collection

x = 0

# set the number of samples to include in a single export. may need to experiment with this parameter to avoid memory issues
n = 25

while x < S:
  # select the next subset of (up to) n centroids
  subset = ee.FeatureCollection(centroidList.slice(x, x+n))
  # buffer those. A loop-local name is used on purpose: the global studyArea
  # (buffers around ALL footprints) is read again by the random sampling cell
  # and must not be overwritten with the buffer of a single batch.
  subsetArea = subset.map(buff).union()
  arrays = make_array_image(fall.select(BANDS), labelimg.select(RESPONSE), subsetArea)
  sample = arrays.sampleRegions(
      collection = subset.geometry(),
      scale = 10,
      tileScale = 12
  )
  x += n

  # assign a random number to samples and create a 70/30 train/test split
  sample = sample.randomColumn('random')
  training = sample.filter(ee.Filter.gte('random', 0.3))
  testing = sample.filter(ee.Filter.lt('random', 0.3))

  # Export both splits under the same name, into the training and evaluation
  # folders respectively. x has already been advanced, so names end in n, 2n, ...
  desc = 'UNET_' + str(KERNEL_SIZE) + '_centFall' + str(x)
  for split, base in ((training, TRAIN_BASE), (testing, EVAL_BASE)):
    task = ee.batch.Export.table.toCloudStorage(
      collection = split,
      description = desc,
      bucket = BUCKET,
      fileNamePrefix = join(FOLDER, base, desc),
      fileFormat = 'TFRecord',
      selectors = BANDS + [RESPONSE]
    )
    task.start()

Generate random samples within the buffered area

In [None]:
#@title Random sampling

# Define sample sizes for shards and chunks.
# These numbers determined experimentally.
n = 30 # Number of shards in each chunk.
N = 300 # Total sample size in each chunk.
C = 2# Number of chunks

# Build the full sampling area (buffers around every footprint) in this cell
# instead of relying on whatever value the global studyArea holds when the
# cell runs, so the random samples always cover the whole buffered area.
randomArea = NC_solar_footprints.map(buff).union()

arrays = make_array_image(fall.select(BANDS),
                          labelimg.select(RESPONSE),
                          randomArea)
for c in range(C):
  geomSample = ee.FeatureCollection([])

  for i in range(n):
    # Consecutive seeds 0, 1, 2, ... across all chunks, so that every shard
    # draws a different set of random points.
    seed = c * n + i
    sample = arrays.sample(
        region = randomArea,
        scale = 10,
        numPixels = N // n,  # shard size; integer division keeps it an int
        seed = seed,
        tileScale = 8
    )
    geomSample = geomSample.merge(sample)

  #divide samples into training and evaluation data (70/30 split)
  geomSample = geomSample.randomColumn('random')
  training = geomSample.filter(ee.Filter.gte('random', 0.3))
  testing = geomSample.filter(ee.Filter.lt('random', 0.3))

  # Export both splits under the same name, into the training and evaluation
  # folders respectively.
  desc = 'UNET_' + str(KERNEL_SIZE) + '_randFall' + str(c)
  for split, base in ((training, TRAIN_BASE), (testing, EVAL_BASE)):
    task = ee.batch.Export.table.toCloudStorage(
      collection = split,
      description = desc,
      bucket = BUCKET,
      fileNamePrefix = join(FOLDER, base, desc),
      fileFormat = 'TFRecord',
      selectors = BANDS + [RESPONSE]
    )
    task.start()