##### **Download Sentinel-2 data**

In [1]:
from tqdm.notebook import tqdm

from multiprocessing import Pool
from time import sleep

import numpy as np

import datetime
from pathlib import Path

import pandas as pd
import geopandas as gpd

import io

import ee

Authentication must be completed first. Replace the project name below with your own Earth Engine project.

For further details on ee, see https://developers.google.com/earth-engine/guides/auth and https://developers.google.com/earth-engine/apidocs/ee-image

In [5]:
# Earth Engine project used for initialisation -- replace with your own.
EE_PROJECT = 'sentinel-treeclassification'

ee.Authenticate()
ee.Initialize(project=EE_PROJECT)

Mask clouds in a Sentinel-2 image using the QA band and perform other pre-processing operations such as band selection, date filter, and collection median.

In [3]:
class SentinelGetter:
    """Build a cloud-masked, band-selected Sentinel-2 median composite."""

    def mask_s2_clouds(self, image):
        """Mask out pixels flagged as cloud or cirrus in the QA60 band.

        Parameters
        ----------
        image : object exposing ``select`` and ``updateMask`` (an ``ee.Image``
            when called through ``ImageCollection.map``).

        Returns
        -------
        The result of ``image.updateMask(mask)``, where ``mask`` is set only
        for pixels with both QA60 flags cleared.
        """
        # QA60: quality-assessment bitmask band.
        qa = image.select('QA60')
        # Bits 10 and 11 are clouds and cirrus, respectively.
        cloud_bit_mask = 1 << 10
        cirrus_bit_mask = 1 << 11
        # Both flags must be zero, indicating clear conditions.
        no_cloud = qa.bitwiseAnd(cloud_bit_mask).eq(0)
        no_cirrus = qa.bitwiseAnd(cirrus_bit_mask).eq(0)
        return image.updateMask(no_cloud.And(no_cirrus))

    def get_image(self, bbox, start_date, end_date):
        """Return the median composite of cloud-masked images, clipped to ``bbox``.

        Parameters
        ----------
        bbox : geometry used both to restrict the collection to granules that
            intersect the area and to clip the final composite.
        start_date, end_date : bounds forwarded to ``filterDate``.

        Returns
        -------
        The clipped median image holding the selected reflectance bands
        (B2-B8, B8A, B11, B12) and the true-colour bands (TCI_R/G/B).
        """
        selected_bands = [f'B{x}' for x in range(2, 9)] + ['B8A', 'B11', 'B12', 'TCI_R', 'TCI_G', 'TCI_B']
        image = (
            ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
            .filterDate(start_date, end_date)
            # Only keep granules that touch the area of interest. Granules
            # elsewhere contribute nothing inside ``bbox`` after clipping, so
            # this narrows the work without changing the composite.
            .filterBounds(bbox)
            # Per-pixel cloud/cirrus masking (no granule-level cloud filter is applied).
            .map(self.mask_s2_clouds)
            .select(selected_bands)
            .median()
            .clip(bbox)
        )
        return image

Read the tree list CSV, keeping only the coordinate, label, and load-date columns, and drop rows that have no label. Alternatively, the file can be read with GeoPandas directly.

In [3]:
# Label column; rows without a label are discarded below.
target = 'tree_name'
usecols = ['latitude', 'longitude', target, 'load_date']

df = (
    pd.read_csv(
        "data/Borough_tree_list_2021July.csv",
        usecols=usecols,
        parse_dates=['load_date'],
    )
    .dropna(subset=target)
)

In [6]:
# Point geometries from the lon/lat columns (EPSG:4326).
gdf = gpd.GeoDataFrame(
    df, geometry=gpd.points_from_xy(x=df['longitude'], y=df['latitude'], crs=4326)
)

# Encode the label column as integer category codes.
gdf[target] = gdf[target].astype('category').cat.codes

# Replace every point with a square-capped 30-unit buffer computed in
# EPSG:6933, then convert back to lon/lat.
# NOTE(review): EPSG:6933 is an equal-area projection, so the buffer is
# probably not exactly 30 m on the ground at this latitude -- confirm whether
# a local metric CRS would be more appropriate.
gdf.geometry = gdf.to_crs(epsg=6933).buffer(30, cap_style=3).to_crs(epsg=4326)

# total_bounds is (minx, miny, maxx, maxy). The margin must be subtracted
# from the minima and added to the maxima; adding it to all four values
# shifts the box instead of growing it and cuts off the west/south edges.
bbox_margin_deg = 0.01
west, south, east, north = (float(v) for v in gdf.geometry.total_bounds)
bbox = ee.Geometry.BBox(
    west - bbox_margin_deg,
    south - bbox_margin_deg,
    east + bbox_margin_deg,
    north + bbox_margin_deg,
)

In [8]:
# gdf.to_file('buffered_london_trees.geojson')

In [6]:
def download_npy(bbox, start_date, end_date, gdf, i):
    """Download one 6x6 Sentinel-2 patch per row of ``gdf`` and store them in a ``.npy`` file.

    Patches are accumulated in row order and written to
    ``london/london_trees_<YYYYMM>_<i>.npy``. If that file already exists, the
    patches it contains are taken to belong to the first rows of ``gdf`` and
    the download resumes from the next row, so ``gdf`` must be passed in the
    same order on every run.

    Parameters
    ----------
    bbox : geometry forwarded to ``SentinelGetter.get_image``.
    start_date, end_date : date range forwarded to ``SentinelGetter.get_image``.
        ``start_date`` must support ``strftime`` (it is used in the file name).
    gdf : table whose rows expose ``geometry.bounds``; one patch is requested
        per row.
    i : int
        Worker/chunk number. Used in the file name and to stagger start-up
        (``i * 2`` seconds) when several workers run in parallel.

    Returns
    -------
    numpy.ndarray
        All patches (previously saved ones followed by the new ones).
    """
    # Stagger start-up so parallel workers do not all start requesting at once.
    sleep_time = i * 2
    sleep(sleep_time)
    # Worker 0 has sleep_time == 0; never retry without a pause.
    retry_delay = max(1, sleep_time)
    checkpoint_every = 1000

    # Cloud masked, band selected, median image of the bbox area.
    sentinel_image = SentinelGetter().get_image(bbox, start_date, end_date)

    # For further options, see
    # https://developers.google.com/earth-engine/apidocs/ee-data-computepixels
    params = {'fileFormat': 'NPY'}

    save_path = Path('london').joinpath(f"london_trees_{start_date.strftime('%Y%m')}_{i}.npy")
    # Create the output folder up front so the first save cannot fail after
    # a long download.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    print(f'Downloading {start_date}')

    def save_checkpoint(patches):
        """Write ``patches`` to ``save_path`` via a temporary file and return the array."""
        # Convert first: if this raises, the existing file is left untouched.
        save_data = np.array(patches)
        # Write next to the target and swap, so an interrupted write cannot
        # leave a truncated file that would break the next resume.
        tmp_path = save_path.with_name(save_path.name + '.tmp')
        with open(tmp_path, 'wb') as f:
            np.save(f, save_data)
        tmp_path.replace(save_path)
        return save_data

    # Continue from a previous run, else start new.
    if save_path.is_file():
        # list() keeps the inner arrays as ndarrays (ndarray.tolist() would
        # convert the nested arrays too).
        all_data = list(np.load(save_path))
    else:
        all_data = []

    # Skip the rows that already have a saved patch.
    continue_gdf = gdf.iloc[len(all_data):]

    # Progress bar, tracks continuations.
    for _, row in tqdm(
        continue_gdf.iterrows(), total=gdf.shape[0], initial=len(all_data)):
        # Connection errors are common here and (so far) not fatal, so the
        # request is retried until it succeeds.
        attempts = 0
        while True:
            try:
                this_bbox = ee.Geometry.BBox(*row.geometry.bounds)
                params['expression'] = sentinel_image.clipToBoundsAndScale(
                    this_bbox, width=6, height=6)

                # There can be a delay before the URL becomes available,
                # in which case the loop simply retries (seems rare so far).
                pixels = ee.data.computePixels(params)
                data = np.load(io.BytesIO(pixels))
                break
            except Exception as e:
                attempts += 1
                # Report the first failure and every 10th after it, so a
                # permanent error is visible without flooding the output.
                if attempts % 10 == 1:
                    print(f'Retrying after error (attempt {attempts}): {e!r}')
                sleep(retry_delay)

        all_data.append(data)

        # Checkpoint on the number of patches held, not on the row's index
        # label (which can have gaps and is global across chunks).
        if len(all_data) % checkpoint_every == 0:
            save_checkpoint(all_data)

    # Save the remainder.
    return save_checkpoint(all_data)

Download samples at roughly 2 images per second in each process.

In [7]:
# Number of rows per chunk (despite the name, this is a chunk size, not a count).
n_chunks = 50000

# Consecutive positional slices of at most n_chunks rows; the last one may be shorter.
chunk_starts = range(0, gdf.shape[0], n_chunks)
chunks = [gdf.iloc[start:start + n_chunks] for start in chunk_starts]

In [8]:
# Every chunk covers the same window: 1 April to 1 August 2019.
date = datetime.datetime(2019, 1, 1)
start_date = date.replace(month=4)
end_date = start_date.replace(month=8)

# One task per chunk; the chunk number doubles as the worker index used by
# download_npy for its start-up delay and output file name.
map_inputs = [
    (bbox, start_date, end_date, chunk, chunk_no)
    for chunk_no, chunk in enumerate(chunks)
]

# One process per chunk. The with-block guarantees the workers are cleaned up
# even if a task raises; on success the pool is closed and joined normally.
with Pool(processes=len(chunks)) as pool:
    pool.starmap(download_npy, map_inputs)
    pool.close()
    pool.join()

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


 96%|#########5| 47767/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 50000/50000 [00:00<?, ?it/s]

Downloading 2019-04-01 00:00:00


100%|##########| 15916/15916 [00:00<?, ?it/s]

In [10]:
# Optionally shut the PC down once the downloads have finished.
# Tested on Windows Subsystem for Linux.
import subprocess

# "/s" requests a shutdown; "/a" aborts a pending one.
# shutdown_args = ["shutdown.exe", "/s"]
shutdown_args = ["shutdown.exe", "/a"]
subprocess.run(shutdown_args)

CompletedProcess(args=['shutdown.exe', '/a'], returncode=0)