##### **Download Sentinel-2 data**

In [1]:
from tqdm.notebook import tqdm

from multiprocessing import Pool
from time import sleep

import numpy as np

import datetime
from pathlib import Path

# For the below intake_geopandas is also required
# use "pip install intake_geopandas" or check github
import intake

import io

import ee

Authentication must be done beforehand. Replace the project name below with your own Earth Engine project.

For further details on ee, see https://developers.google.com/earth-engine/guides/auth and https://developers.google.com/earth-engine/apidocs/ee-image

In [2]:
# Earth Engine cloud project used for all requests; replace with your own.
EE_PROJECT = 'sentinel-treeclassification'

# Authenticate first, then initialise the client for the chosen project.
ee.Authenticate()
ee.Initialize(project=EE_PROJECT)

Mask clouds in a Sentinel-2 image using the QA band and perform other pre-processing operations such as band selection, date filter, and collection median.

In [3]:
class SentinelGetter:
    """Build cloud-masked Sentinel-2 median composites with Earth Engine."""

    def mask_s2_clouds(self, image):
        """Mask pixels that the QA60 band flags as cloud or cirrus.

        Args:
            image: Earth Engine image that contains a ``QA60`` band.

        Returns:
            ``image`` with its mask updated so that only pixels with both
            the cloud bit (10) and the cirrus bit (11) unset remain.
        """
        # NOTE(review): the Earth Engine catalog lists caveats on QA60
        # availability for some acquisition dates -- confirm the mask behaves
        # as intended over the whole download window.
        qa_band = image.select('QA60')

        # Bit 10 flags clouds, bit 11 flags cirrus.
        cloud_flag = 1 << 10
        cirrus_flag = 1 << 11

        # A pixel is kept only when both flags are zero (clear conditions).
        no_cloud = qa_band.bitwiseAnd(cloud_flag).eq(0)
        no_cirrus = qa_band.bitwiseAnd(cirrus_flag).eq(0)
        return image.updateMask(no_cloud.And(no_cirrus))

    def get_image(self, bbox, start_date, end_date):
        """Return the median composite of the selected bands, clipped to bbox.

        Args:
            bbox: Earth Engine geometry the composite is clipped to.
            start_date: Start of the date filter passed to ``filterDate``.
            end_date: End of the date filter passed to ``filterDate``.

        Returns:
            The per-pixel median of all cloud-masked images in the period
            whose ``CLOUDY_PIXEL_PERCENTAGE`` is below 20.
        """
        band_names = [
            'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8',
            'B8A', 'B11', 'B12',
            'TCI_R', 'TCI_G', 'TCI_B',
        ]

        collection = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
        in_period = collection.filterDate(start_date, end_date)
        # Pre-filter to drop the cloudiest granules before per-pixel masking.
        low_cloud = in_period.filter(
            ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', 20))
        cloud_masked = low_cloud.map(self.mask_s2_clouds)
        composite = cloud_masked.select(band_names).median()
        return composite.clip(bbox)

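Read the TreeSat polygons through the intake catalog, reproject them to EPSG:4326 and build a padded bounding box around all of them. Alternatively, read the file with GeoPandas directly.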

In [4]:
catalog = intake.open_catalog(Path('../catalog.yml'))
source = catalog.treesat
gdf = source.read()[source.metadata['usecols']]
# Geopandas does not seem to always recognise the crs, so declare it
# explicitly. allow_override covers the case where a (different) crs was
# detected after all.
gdf = gdf.set_crs(epsg=25832, allow_override=True)
gdf = gdf.to_crs(epsg=4326)

# Add some padding to avoid border polygons being cut off.
# total_bounds is (minx, miny, maxx, maxy): the minimum corner has to move
# outwards by subtracting the padding and the maximum corner by adding it.
# Adding the padding to all four values would only shift the box.
bbox_padding = 0.01  # degrees, as the frame is in EPSG:4326 here
min_x, min_y, max_x, max_y = gdf.geometry.total_bounds
bbox = ee.Geometry.BBox(
    min_x - bbox_padding,
    min_y - bbox_padding,
    max_x + bbox_padding,
    max_y + bbox_padding,
)

In [5]:
def _save_checkpoint(all_data, save_path):
    """Stack ``all_data`` into one array and write it to ``save_path``.

    The array is written to a temporary sibling file and then moved over
    ``save_path``, so an interrupted write cannot leave a truncated file in
    place of the previous checkpoint.

    Returns:
        The stacked array that was written.
    """
    # Convert before touching the disk: if stacking fails nothing is written.
    save_data = np.array(all_data)
    tmp_path = save_path.with_name(save_path.name + '.tmp')
    with open(tmp_path, 'wb') as f:
        np.save(f, save_data)
    tmp_path.replace(save_path)
    return save_data


def download_npy(bbox, start_date, end_date, gdf, sleep_time,
                 retry_delay=1.0, max_retries=None, save_every=1000):
    """Download one 6x6 pixel patch per row of ``gdf`` and save them as .npy.

    The patches are cut from the cloud-masked, band-selected median composite
    of ``bbox`` between ``start_date`` and ``end_date``. Progress is written to
    ``seasonal_median/treesat_<YYYYMM>.npy``; if that file already exists the
    download continues after the samples it contains.

    Args:
        bbox: Earth Engine geometry covering all samples.
        start_date: Start of the period (datetime); also names the output file.
        end_date: End of the period, passed to the date filter.
        gdf: GeoDataFrame with one geometry per sample.
        sleep_time: Seconds to wait before starting; staggers parallel workers.
        retry_delay: Seconds to wait after a failed request before retrying.
        max_retries: Maximum consecutive failures tolerated for one sample.
            ``None`` (default) retries forever, as connection errors are
            common and usually transient.
        save_every: Write a checkpoint each time this many samples have been
            collected in total.

    Returns:
        numpy array holding the patches of all samples collected so far.

    Raises:
        Exception: the last request error, once ``max_retries`` is exceeded.
            Progress is saved before it is raised.
    """
    # Sleep time helps with parallel processing,
    # if you're brave enough to try it
    sleep(sleep_time)

    # Cloud masked, band selected, median image of the bbox area.
    sentinel_image = SentinelGetter().get_image(bbox, start_date, end_date)

    # For further options, see
    # https://developers.google.com/earth-engine/apidocs/ee-data-computepixels
    params = {'fileFormat': 'NPY'}

    save_path = Path('seasonal_median').joinpath(
        f"treesat_{start_date.strftime('%Y%m')}.npy")
    # The first checkpoint would otherwise fail if the folder is missing.
    save_path.parent.mkdir(parents=True, exist_ok=True)
    print(f'Downloading {start_date}')

    # Continue from a previous run, else start new.
    if save_path.is_file():
        # Convert outer array to list for appending, avoid ndarray.tolist()
        # as that converts nested arrays to list as well.
        all_data = list(np.load(save_path))
    else:
        all_data = []

    # Skip the rows that are already stored. This is positional (iloc), so it
    # does not depend on the frame having a default 0..n-1 index.
    remaining_rows = gdf.iloc[len(all_data):]

    # Progress bar, tracks continuations
    for _, row in tqdm(
            remaining_rows.iterrows(),
            total=gdf.shape[0],
            initial=len(all_data)):
        # Not ideal but a lot of connection errors can occur here.
        # They are (so far) not program ending, simply retry.
        failures = 0
        while True:
            try:
                this_bbox = ee.Geometry.BBox(*row.geometry.bounds)
                params['expression'] = sentinel_image.clipToBoundsAndScale(
                    this_bbox, width=6, height=6)

                # There can be a delay before the URL becomes available,
                # in which case the loop simply retries (seems rare so far).
                pixels = ee.data.computePixels(params)
                data = np.load(io.BytesIO(pixels))
            except Exception as e:
                failures += 1
                if max_retries is not None and failures >= max_retries:
                    # Keep what has been downloaded before giving up.
                    _save_checkpoint(all_data, save_path)
                    raise
                # Transient errors normally clear within a few attempts;
                # surface the ones that do not instead of looping silently.
                if failures % 10 == 0:
                    print(f'{start_date}: {failures} consecutive failures '
                          f'({e!r}), still retrying')
                # Back off with a fixed delay. The start-up stagger must not
                # be reused here: it is 0 for the first worker, which would
                # retry without any pause.
                sleep(retry_delay)
            else:
                # Numpy ndarray being appended to a list of ndarrays.
                all_data.append(data)
                break

        # Periodic checkpoint, based on the number of samples collected.
        if len(all_data) % save_every == 0:
            _save_checkpoint(all_data, save_path)

    # Save the remainder
    return _save_checkpoint(all_data, save_path)

Download samples at roughly 2 images per second in each process. Each three-month period (one output file, named after its starting month) seems to take roughly 5-8 hours.

In [None]:
# Build one task per three-month period from March 2017 onwards; the last
# period starts in March 2024.
map_inputs = []
start_date = datetime.datetime(2017, 3, 1)

# Start-up delay of each task in seconds, so the workers do not all hit
# Earth Engine at the same moment.
stagger_seconds = 0
while start_date < datetime.datetime(2024, 4, 1):
    # Advance by three months, rolling over into the next year if needed.
    month_plus_three = start_date.month + 3
    if month_plus_three > 12:
        end_date = start_date.replace(
            year=start_date.year + 1, month=month_plus_three - 12)
    else:
        end_date = start_date.replace(month=month_plus_three)

    map_inputs.append((bbox, start_date, end_date, gdf, stagger_seconds))

    start_date = end_date
    stagger_seconds += 2

# The context manager shuts the workers down even if starmap raises or the
# cell is interrupted, so no stray processes are left behind in the kernel.
# starmap blocks until every task has finished, so nothing is cut short.
with Pool(processes=10) as pool:
    pool.starmap(download_npy, map_inputs)

Downloading 2017-03-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2017-06-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2017-09-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2017-12-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2018-03-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2018-06-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2018-09-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2018-12-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2019-03-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2019-06-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2019-09-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2019-12-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2020-03-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2020-06-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2020-09-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2020-12-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2021-03-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2021-06-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2021-09-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2021-12-01 00:00:00


100%|##########| 50381/50381 [00:00<?, ?it/s]

Downloading 2022-03-01 00:00:00


 93%|#########3| 47001/50381 [00:00<?, ?it/s]

Downloading 2022-06-01 00:00:00


 95%|#########5| 48001/50381 [00:00<?, ?it/s]

Downloading 2022-09-01 00:00:00


 91%|#########1| 46001/50381 [00:00<?, ?it/s]

Downloading 2022-12-01 00:00:00


 85%|########5 | 43001/50381 [00:00<?, ?it/s]

Downloading 2023-03-01 00:00:00


 81%|########1 | 41001/50381 [00:00<?, ?it/s]

Downloading 2023-06-01 00:00:00


 73%|#######3  | 37001/50381 [00:00<?, ?it/s]

Downloading 2023-09-01 00:00:00


 75%|#######5  | 38001/50381 [00:00<?, ?it/s]

Downloading 2023-12-01 00:00:00


 73%|#######3  | 37001/50381 [00:00<?, ?it/s]

Downloading 2024-03-01 00:00:00


 77%|#######7  | 39001/50381 [00:00<?, ?it/s]

In [None]:
# Optional: power off the machine once the downloads have completed.
# Tested on Windows Subsystem for Linux, where shutdown.exe is reachable.
import subprocess

shutdown_command = ["shutdown.exe", "/s"]
subprocess.run(shutdown_command)
# To abort a pending shutdown, run this instead:
# subprocess.run(["shutdown.exe", "/a"])