# Preprocessing Sentinel-2 images

The *Copernicus Sentinel-2* mission comprises a constellation of two polar-orbiting satellites placed in the same sun-synchronous orbit, phased at 180° to each other. It aims at monitoring variability in land surface conditions, and its wide swath width (290 km) and high revisit time (10 days at the equator with one satellite, and 5 days with 2 satellites under cloud-free conditions which results in 2-3 days at mid-latitudes) will support monitoring of Earth's surface changes.

Before starting this notebook, you should download a Sentinel-2 Level-2A product from the <a href="https://scihub.copernicus.eu/dhus/#/home" target="_blank">Copernicus Open Access Hub</a>. The L2A products are downloadable by tiles, which are 100x100 km² ortho-images in UTM/WGS84 projection.

The different preprocessing steps are as follows:

1. Resample images at 20m to 10m (if you want to work with Red, Green, Blue, NIR bands)
2. Clip images to the extent of Region of Interest (ROI)
3. Apply Scene Classification map (SCL) on reflectance images to mask invalid pixels

<img src="figures/prepro_S2.png" width="1000">

In [None]:
import glob, os
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
import rasterio.mask
from rasterio.enums import Resampling
from rasterio import plot
from rasterio.plot import show
import matplotlib
import matplotlib.pyplot as plt
from pathlib import Path

# Report the version of each main library used in this notebook,
# one "<name> : <version>" line per library.
for lib_label, lib_module in (('Numpy', np),
                              ('Pandas', pd),
                              ('GeoPandas', gpd),
                              ('Rasterio', rasterio)):
    print(f'{lib_label} : {lib_module.__version__}')

Numpy : 1.19.2
Pandas : 1.1.5
GeoPandas : 0.8.1
Rasterio : 1.1.0


## Set paths for input and output directories

In [2]:
# Root path under which the group folder is located (adapt it to your computer)
#computer_path = 'X:/'
computer_path = '/Volumes/nbdid-sst-lbrat2104/'

# Letter identifying the group folder (GROUP_<letter>)
grp_letter    = 'X'

# Every input and output of this notebook lives under this work directory
work_path = f'{computer_path}GROUP_{grp_letter}/WORK/'

# ----- #
# INPUT #
# ----- #

# Shapefile with the extent of the Region of Interest
# (keep only one of the two lines active)
ROI_file = f'{work_path}ROI/extent_roi_32631.shp'
#ROI_file = f'{work_path}ROI/extent_roi_32736.shp'

# Folder containing the downloaded L2A *.SAFE products
L2A_path = f'{work_path}L2A/'

# ------ #
# OUTPUT #
# ------ #

# One folder per preprocessing step stores the intermediary files:
# 1) resampled images, 2) images clipped to the ROI, 3) images masked with SCL
resampled_path = f'{work_path}1_L2A_RESAMPLED/'
clipped_path   = f'{work_path}2_L2A_CLIPPED/'
masked_path    = f'{work_path}3_L2A_MASKED/'

# Create the output folders (and missing parents); nothing happens if they already exist
for output_dir in (resampled_path, clipped_path, masked_path):
    Path(output_dir).mkdir(parents=True, exist_ok=True)


print(f'General work path is set to     : {work_path}')
print(f'L2A SAFE folders path is set to : {L2A_path}')

General work path is set to     : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/
L2A SAFE folders path is set to : /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/L2A/


## Select reflectance bands you want to work with

### 10 metre spatial resolution

- B2 - Blue
- B3 - Green
- B4 - Red
- B8 - Near Infra-Red (NIR)

### 20 metre spatial resolution

- B5 - Red-Edge
- B6 - Red-Edge
- B7 - Red-Edge
- B8a - Near Infra-Red (NIR)
- B11 - Short Wave Infra-Red (SWIR)
- B12 - Short Wave Infra-Red (SWIR)



In [11]:
# Bands taken directly from the 10m products (R10m folder); no resampling needed
band_10m_list = ['B04', 'B08']

# Bands only available at 20m (R20m folder); they are resampled to 10m in step 1.2
band_20m_list = ['B11']

## Set resampling and masking parameters

In [12]:
# ---------- #
# RESAMPLING #
# ---------- #

# Factor applied to both the height and the width of the raster when resampling.
# A factor of 2 doubles the number of rows and columns: 20m --> 10m
upscale_factor = 2

# As SCL is categorical data, we MUST use the "nearest neighbor" resampling method
# (interpolating between class codes could produce values that are not valid classes)
resampling_method_cat = Resampling.nearest

# As BOA reflectance is continuous data, other resampling methods can be used : nearest, bilinear, cubic
resampling_method_cont = Resampling.bilinear

# ------------- #
# MASK with SCL #
# ------------- #

# Value written in place of the pixels masked by the SCL;
# it is also stored as the nodata value in the metadata of the masked images
nodata_val = -10000


## Set the list of L2A that will be processed

In [13]:
# Collect every L2A product (*.SAFE folder) found in L2A_path.
# glob returns its matches in arbitrary order, so the list is sorted
# to process (and print) the products in the same order on every run.
list_L2A = sorted(glob.glob(f'{L2A_path}*.SAFE'))

print(f'{len(list_L2A)} L2A will be pre-processed \n')

# Print the folder name of each product that will be processed
for L2A_safe in list_L2A:
    L2A_name = os.path.basename(L2A_safe)
    print(L2A_name)

12 L2A will be pre-processed 

S2A_MSIL2A_20200417T104021_N0214_R008_T31UFS_20200417T112906.SAFE
S2A_MSIL2A_20200520T105031_N0214_R051_T31UFS_20200520T134332.SAFE
S2A_MSIL2A_20200719T105031_N0214_R051_T31UFS_20200719T134605.SAFE
S2A_MSIL2A_20200914T104031_N0214_R008_T31UFS_20200914T133417.SAFE
S2B_MSIL2A_20200116T105309_N0213_R051_T31UFS_20200116T122813.SAFE
S2B_MSIL2A_20200212T104049_N0214_R008_T31UFS_20200213T134833.SAFE
S2B_MSIL2A_20200316T104709_N0214_R051_T31UFS_20200316T135256.SAFE
S2B_MSIL2A_20200621T103629_N0214_R008_T31UFS_20200621T140338.SAFE
S2B_MSIL2A_20200813T104629_N0214_R051_T31UFS_20200813T133458.SAFE
S2B_MSIL2A_20201019T103959_N0214_R008_T31UFS_20201019T140516.SAFE
S2B_MSIL2A_20201118T104329_N0214_R008_T31UFS_20201118T123717.SAFE
S2B_MSIL2A_20201218T104349_N0214_R008_T31UFS_20201218T124059.SAFE


## 1. Resample images at 20m resolution to 10m

### 1.1 Resample Scene Classification map

Only if you are planning to work at 10m resolution. If you are planning to work at 20m resolution, you can skip this step.

In [14]:
# Resample the 20m Scene Classification map (SCL) of every L2A product to 10m
# and save the result as GeoTIFF in resampled_path.
for L2A_safe in list_L2A:

    # Locate the 20m SCL image inside the SAFE folder (the first match is used)
    im_file_20m = glob.glob(f'{L2A_safe}/GRANULE/*/IMG_DATA/R20m/*_SCL_20m.jp2')[0]
    im_file_20m = im_file_20m.replace('\\','/')

    # Output name: drop the trailing "20m.jp2" (7 characters) and append "10m.tif"
    im_file_10m = f'{resampled_path}{os.path.basename(im_file_20m)[:-7]}10m.tif'

    if not os.path.isfile(im_file_10m):

        # Open file (the context manager closes it even if an error occurs)
        with rasterio.open(im_file_20m, "r", driver='JP2OpenJPEG') as src:

            # Resample data to target shape
            # SCL is categorical --> resampling_method_cat
            resampled_data = src.read(out_shape=(src.count,
                                    int(src.height * upscale_factor),
                                    int(src.width * upscale_factor)),
                                    resampling=resampling_method_cat)

            # Scale image transform so that the pixel size matches the new shape
            new_transform = src.transform * src.transform.scale(
                (src.width / resampled_data.shape[-1]),
                (src.height / resampled_data.shape[-2])
            )

            # Copy metadata of the source image
            profile = src.profile

        # Update metadata: width and height are taken from the resampled array
        # so that they always agree with the data that is written
        profile.update(driver='GTiff',
                    width=resampled_data.shape[-1],
                    height=resampled_data.shape[-2],
                    transform=new_transform)

        # Write resampled image
        with rasterio.open(im_file_10m, "w", **profile) as dst:
            dst.write(resampled_data)

        print(f'--> A new resampled raster file is created : {im_file_10m}')

    else:
        print(f'--> {im_file_10m} - already exists')


--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200417T104021_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200520T105031_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200719T105031_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200914T104031_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200116T105309_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200212T104049_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200316T104709_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200621T103629_SCL_10m.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200813T104629_SCL

### 1.2 Resample Bottom-Of-Atmosphere corrected reflectance

If you work only with Blue (B02), Green (B03), Red (B04) and NIR (B08) bands, you don't have to do the resampling step because these reflectances are already available at 10m resolution.

This step is only necessary if you work with bands 5,6,7,8A,11,12 which are only available at 20m resolution.

In [15]:
# Resample every selected 20m reflectance band of every L2A product to 10m
# and save the result as GeoTIFF in resampled_path.
for L2A_safe in list_L2A:

    for band in band_20m_list:

        print(f'Band : {band}')

        # Locate the 20m image of this band inside the SAFE folder (the first match is used)
        im_file_20m = glob.glob(f'{L2A_safe}/GRANULE/*/IMG_DATA/R20m/*{band}_20m.jp2')[0]
        im_file_20m = im_file_20m.replace('\\','/')

        # Output name: drop the trailing "20m.jp2" (7 characters) and append "10m.tif"
        im_file_10m = f'{resampled_path}{os.path.basename(im_file_20m)[:-7]}10m.tif'

        if not os.path.isfile(im_file_10m):

            # Open file (the context manager closes it even if an error occurs)
            with rasterio.open(im_file_20m, "r", driver='JP2OpenJPEG') as src:

                # Resample data to target shape
                # Reflectance is continuous --> resampling_method_cont
                resampled_data = src.read(out_shape=(src.count,
                                        int(src.height * upscale_factor),
                                        int(src.width * upscale_factor)),
                                        resampling=resampling_method_cont)

                # Scale image transform so that the pixel size matches the new shape
                new_transform = src.transform * src.transform.scale(
                    (src.width / resampled_data.shape[-1]),
                    (src.height / resampled_data.shape[-2])
                )

                # Copy metadata of the source image
                profile = src.profile

            # Update metadata: width and height are taken from the resampled array
            # so that they always agree with the data that is written
            profile.update(driver='GTiff',
                        width=resampled_data.shape[-1],
                        height=resampled_data.shape[-2],
                        transform=new_transform)

            # Write resampled image
            with rasterio.open(im_file_10m, "w", **profile) as dst:
                dst.write(resampled_data)

            print(f'--> A new resampled raster file is created : {im_file_10m}')

        else:
            print(f'--> {im_file_10m} - already exists')


Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200417T104021_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200520T105031_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200719T105031_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200914T104031_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200116T105309_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200212T104049_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200316T104709_B11_10m.tif - already exists
Band : B11
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/1_L2A_RESAMPLED/T31UFS_20200621T103629_B11_10m.tif - already exists


## 2. Clip images to the extent of Region of Interest (ROI)

In [16]:
# Clip every 10m image (resampled GeoTIFFs and native 10m bands) to the
# extent of the ROI and save the result as GeoTIFF in clipped_path.

# Geometry of ROI

roi_geom = gpd.read_file(ROI_file).geometry

# Get list of image to clip (all images must be at same spatial resolution !)

# Images produced by the resampling step
list_im_to_clip = glob.glob(f'{resampled_path}*_10m.tif')

# Selected bands that are natively available at 10m in the SAFE folders
for band in band_10m_list:

    list_im_to_clip += glob.glob(f'{L2A_path}*.SAFE/GRANULE/*/IMG_DATA/R10m/*{band}_10m.jp2')


# CLIP
# ----

for im_file in list_im_to_clip:

    # Output name: input name without its 4-character extension + "_ROI.tif"
    im_file_roi = f'{clipped_path}{os.path.basename(im_file)[:-4]}_ROI.tif'

    if not os.path.isfile(im_file_roi):

        # Open file (the context manager closes it even if an error occurs)
        with rasterio.open(im_file, "r") as src:

            # Crop the raster to the extent of the shape
            out_image, out_transform = rasterio.mask.mask(src, roi_geom, crop=True)

            # Copy metadata of the source image
            profile = src.profile

        # Update metadata with the size and georeferencing of the cropped array
        profile.update(driver='GTiff',
                    width=out_image.shape[2],
                    height=out_image.shape[1],
                    transform=out_transform)

        # Write clipped image
        with rasterio.open(im_file_roi, "w", **profile) as dst:
            dst.write(out_image)

        print(f'A new raster file is created : {im_file_roi}')

    else:
        print(f'--> {im_file_roi} - already exists')


--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200116T105309_B05_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200116T105309_B11_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200116T105309_SCL_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200212T104049_B05_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200212T104049_B11_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200212T104049_SCL_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200316T104709_B05_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200316T104709_B11_10m_ROI.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/2_L2A_CLIPPED/T31UFS_20200

## 3. Apply Scene Classification map (SCL) on reflectance images to mask invalid pixels

In [17]:
# Mask the invalid pixels of every clipped reflectance image with the SCL
# of the same date and save the result as int16 GeoTIFF in masked_path.

# Get a list with all reflectance bands clipped on the ROI

list_im_ROI = glob.glob(f'{clipped_path}*_B*_ROI.tif')

# SCL classes kept as valid pixels:
#    4 = Vegetation, 5 = Not vegetated, 6 = Water, 7 = Unclassified
# All other values are masked:
#    0 = No data, 1 = Saturated or defective, 2 = Dark area pixels,
#    3 = Cloud shadows, 8 = Cloud medium probability,
#    9 = Cloud high probability, 10 = Thin cirrus, 11 = Snow
# Using a whitelist also masks any unexpected value, instead of
# multiplying the reflectance by it.
valid_scl_classes = [4, 5, 6, 7]


# APPLY SCL
# ---------

for im_file in list_im_ROI:

    # Get date of image: 15 characters starting at index 7 of the file name
    # (e.g. "20200116T105309" in "T31UFS_20200116T105309_B04_10m_ROI.tif")
    date = os.path.basename(im_file)[7:7+15]

    # Find SCL corresponding to the given reflectances image (the first match is used)
    scl_file = glob.glob(f'{clipped_path}*{date}*SCL_10m_ROI.tif')[0]
    scl_file = scl_file.replace('\\','/')

    im_file_scl = f'{masked_path}{os.path.basename(im_file)[:-4]}_SCL.tif'

    if not os.path.isfile(im_file_scl):

        # Open SCL and read it as numpy array
        # (the context manager closes the file even if an error occurs)
        with rasterio.open(scl_file, "r") as src:
            SCL = src.read(1)

        #print('Scene Classification map')
        #show(SCL, cmap='Set3')

        # Turn SCL into a multiplicative mask: 1 for valid pixels, NaN for invalid pixels
        SCL = np.where(np.isin(SCL, valid_scl_classes), 1.0, np.nan)

        # Open reflectance image, read it as numpy array and copy its metadata
        with rasterio.open(im_file, "r") as src:
            im = src.read(1)
            profile = src.profile

        # Update metadata
        profile.update(dtype=rasterio.int16,  # Set to int16 it is lighter than float
                       nodata=nodata_val,     # Set nodata value in metadata
                       compress='lzw')        # Compression option

        # Mask image reflectance with SCL (invalid pixels become NaN)
        im_SLC = im * SCL

        # Change numpy NaN by nodata_val (e.g. -10000)
        im_SLC[np.isnan(im_SLC)] = nodata_val

        # Change the array's type : from float to integer 16
        # NOTE(review): values above 32767 do not fit in int16 - confirm the
        # valid reflectances of your images stay below that limit
        im_SLC = im_SLC.astype(np.int16)

        # Write image
        with rasterio.open(im_file_scl, 'w', **profile) as dst:
            dst.write(im_SLC, 1)

        print(f'A new raster file is created : {im_file_scl}')

    else:
        print(f'--> {im_file_scl} - already exists')


--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200116T105309_B02_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200116T105309_B03_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200116T105309_B04_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200116T105309_B05_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200116T105309_B08_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200116T105309_B11_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200212T104049_B02_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_L2A_MASKED/T31UFS_20200212T104049_B03_10m_ROI_SCL.tif - already exists
--> /Volumes/nbdid-sst-lbrat2104/GROUP_X/WORK/3_