# SCRIPT 05: Extract Samples Values

This is the fifth script in the methodology. Here, the values for the selected samples are extracted and the sample files are generated. This can only be done after creating the grids of possible samples with SCRIPT 04 and manually selecting appropriate sample locations in GIS software by comparing the reference created in SCRIPT 03 with images and time series.

In the following cells, please refer to the comments in the code for further explanations of its functioning.

In [None]:
# importing packages
import rasterio as r
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from matplotlib.colors import ListedColormap

In [None]:
# sample geometry, used below as raster window sizes and offsets
chip_size = 254      # width and height of every sample
total_overlap = 186  # overlap between each sample (equal to the difference
                     # between the sample size and the prediction size)
side_overlap = total_overlap // 2           # overlap on each side
chip_util_size = chip_size - total_overlap  # the output size; depends on the network

# identifier inserted in the names of the saved sample files
samples_id = '5K'

In [None]:
# load the edited possible samples (they must all be merged in one file)
# the GPKG format is used here; shapefiles are also compatible, just change
# the 'driver' parameter
samples_location_file = '/home/bruno.matosak/Semiarido/MultiInput/samples/shp_possible_samples_edited/samples_location.gpkg'
shp = gpd.read_file(samples_location_file, driver='GPKG')
shp.head()

In [None]:
# keep only the rows whose 'to_use' column equals 1, i.e. the samples to extract
selected_rows = shp['to_use'] == 1
shp = shp.loc[selected_rows]
print(f'Total samples: {len(shp)}')

In [None]:
# shuffling samples
# `shuffle_seed` is passed to `random_state`: None (the default) gives a
# different order on every run, an integer makes the order repeatable.
# The rows keep their original index labels; the following cells read them by
# position through `.values[i]`.
shuffle_seed = None
shp = shp.sample(frac=1, random_state=shuffle_seed)
shp.head()

In [None]:
# acquiring reference data
# one (chip_util_size x chip_util_size) reference patch per sample
reference = np.zeros([len(shp), chip_util_size, chip_util_size], dtype=np.byte)

# looping through every sample and acquiring its reference
for i in tqdm(range(len(shp))):
    # the reference covers only the central part of the chip: the window
    # starts 'side_overlap' pixels after the sample origin in both directions
    window = r.windows.Window(shp.ori_col.values[i]+side_overlap,
                              shp.ori_row.values[i]+side_overlap,
                              chip_util_size,
                              chip_util_size)
    # open the reference file of the sample's tile; the 'with' block closes
    # the dataset again so file handles do not pile up over the loop
    with r.open(f'/home/bruno.matosak/Semiarido/MultiInput/segmentations/GEM_id{str(shp.tile_id.values[i]).zfill(3)}.tif') as raster_dataframe:
        # saving the first band to the variable 'reference'
        reference[i] = raster_dataframe.read(1, window=window)

In [None]:
# showing the distribution of the agriculture percentage in the samples

# percentage of agriculture per sample: the sum of each reference patch over
# its number of pixels
# NOTE(review): this is a percentage only if the reference holds 1 for
# agriculture and 0 elsewhere - confirm against the segmentation files
pixels_per_sample = chip_util_size**2
per = (100*reference.sum(axis=(1, 2))/pixels_per_sample).astype(np.float32)

plt.hist(per, bins=10)
plt.xlabel('Percentage (%)')
plt.ylabel('Qt. of Samples')
plt.title('Histogram of Agriculture Percentage Per Sample')
plt.show()

In [None]:
# extracting samples image data

# define placeholders for the samples
# yearly data: (sample, row, col, band)
samples_s1_y = np.zeros([len(shp), chip_size, chip_size, 2], dtype=np.int16)
samples_s2_y = np.zeros([len(shp), chip_size, chip_size, 6], dtype=np.int16)
# monthly data: (sample, 12, row, col, band)
samples_s1_m = np.zeros([len(shp), 12, chip_size, chip_size, 2], dtype=np.int16)
samples_s2_m = np.zeros([len(shp), 12, chip_size, chip_size, 6], dtype=np.int16)

# bands
bands_s1 = ['VV', 'VH']
bands_s2 = ['B2', 'B3', 'B4', 'B8', 'B11', 'B12']

# looping through every sample
for i in tqdm(range(len(shp))):
    # the sample origin
    ori_row = shp.ori_row.values[i]
    ori_col = shp.ori_col.values[i]

    # zero-padded tile id used in every file name, and the full-chip window;
    # both are the same for all the reads of this sample
    tile = str(shp.tile_id.values[i]).zfill(3)
    window = r.windows.Window(ori_col, ori_row, chip_size, chip_size)

    # acquiring yearly data
    # every dataset is opened in a 'with' block so it is closed right after
    # the read instead of staying open for the rest of the session;
    # moveaxis puts the band axis returned by read() last
    with r.open(f'/home/bruno.matosak/Semiarido/MultiInput/yearly_reduction_S1/Reduction_SAR_Year_id{tile}.tif') as src:
        samples_s1_y[i] = np.moveaxis(src.read(window=window), 0, -1)
    with r.open(f'/home/bruno.matosak/Semiarido/MultiInput/yearly_reduction_S2/Reduction_Optical_Year_id{tile}.tif') as src:
        samples_s2_y[i] = np.moveaxis(src.read(window=window), 0, -1)

    # acquiring monthly data: one file per band
    # NOTE(review): assumes each monthly file holds 12 bands (one per month),
    # matching the placeholder shape - confirm against the reduction files
    for ii, band in enumerate(bands_s1):
        with r.open(f'/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S1/Reduction_SAR_Months_id{tile}_{band}.tif') as src:
            samples_s1_m[i, :, :, :, ii] = src.read(window=window)

    for ii, band in enumerate(bands_s2):
        with r.open(f'/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{tile}_{band}.tif') as src:
            samples_s2_m[i, :, :, :, ii] = src.read(window=window)

In [None]:
# saving unmodified samples, one .npy file per array
raw_arrays = {
    'reference': reference,
    's1_y': samples_s1_y,
    's2_y': samples_s2_y,
    's1_m': samples_s1_m,
    's2_m': samples_s2_m,
}
for raw_name, raw_array in raw_arrays.items():
    np.save(f'/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data/{samples_id}_RAW_{raw_name}.npy', raw_array)

In [None]:
# preparing samples to be used by scaling them between 0 and 1.
# this is done using the 1% and 99% limits of the data (ignoring
# masked areas). the limits are stored in files that are needed later
# during the prediction in order to correctly scale data used to
# generate the full map.

# reference to categorical
# NOTE(review): no class count is passed, so to_categorical presumably infers
# it from the values present in 'reference' - confirm that every class occurs
# in the selected samples
reference_cat = to_categorical(reference)

# function to find the values below which 1% and 99% of the data lie
def get_limits(data):
    """Return the 1st and 99th percentiles of ``data``, ignoring NaN values."""
    lower_and_upper = np.nanpercentile(data, [1, 99])
    return lower_and_upper

# placeholders for the limits used to adjust the data:
# one (lower, upper) pair per band
limits_s1_y = np.zeros([len(bands_s1), 2], dtype=np.float32)
limits_s2_y = np.zeros([len(bands_s2), 2], dtype=np.float32)
limits_s1_m = np.zeros([len(bands_s1), 2], dtype=np.float32)
limits_s2_m = np.zeros([len(bands_s2), 2], dtype=np.float32)


def _to_float_with_nan(samples):
    """Return ``samples`` as float32 with every 0 replaced by NaN."""
    converted = np.asarray(samples, dtype=np.float32)
    converted[converted == 0] = np.nan
    return converted


def _rescale_band(samples, limits, band):
    """Store the limits of one band in ``limits`` and rescale that band in place.

    The band is taken on the last axis of ``samples``; the lower limit maps to
    0 and the upper limit to 1.
    """
    values = samples[..., band]
    limits[band] = get_limits(values)
    lower, upper = limits[band]
    samples[..., band] = (values - lower)/(upper - lower)


def _fill_and_clamp(samples, nan_fill):
    """Replace NaN by ``nan_fill`` and force all values into [0, 1], in place."""
    samples[np.isnan(samples)] = nan_fill
    np.clip(samples, 0, 1, out=samples)


# converting the type of the data; zeros become NaN so that they are left out
# of the percentiles computed by get_limits
samples_s1_y = _to_float_with_nan(samples_s1_y)
samples_s2_y = _to_float_with_nan(samples_s2_y)
samples_s1_m = _to_float_with_nan(samples_s1_m)
samples_s2_m = _to_float_with_nan(samples_s2_m)

# loop to scale Sentinel-1 data reductions
for band in tqdm(range(len(bands_s1))):
    _rescale_band(samples_s1_y, limits_s1_y, band)
    _rescale_band(samples_s1_m, limits_s1_m, band)

# loop to scale Sentinel-2 data reductions
for band in tqdm(range(len(bands_s2))):
    _rescale_band(samples_s2_y, limits_s2_y, band)
    _rescale_band(samples_s2_m, limits_s2_m, band)

# adjust remaining data: NaN is filled with 1 for Sentinel-1 and with 0 for
# Sentinel-2, then everything beyond 0 and 1 is clamped
print('Adjusting data...')
_fill_and_clamp(samples_s1_y, 1)
_fill_and_clamp(samples_s1_m, 1)
_fill_and_clamp(samples_s2_y, 0)
_fill_and_clamp(samples_s2_m, 0)

print('Done!')

In [None]:
# saving the prepared data, followed by the limits, which are needed again
# during the prediction phase
prepared_arrays = {
    'reference': reference_cat,
    's1_y': samples_s1_y,
    's2_y': samples_s2_y,
    's1_m': samples_s1_m,
    's2_m': samples_s2_m,
    's1_y_limits': limits_s1_y,
    's2_y_limits': limits_s2_y,
    's1_m_limits': limits_s1_m,
    's2_m_limits': limits_s2_m,
}
for prepared_name, prepared_array in prepared_arrays.items():
    np.save(f'/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data/{samples_id}_PRO_{prepared_name}.npy', prepared_array)

In [None]:
# cell to print some example samples

# defines some colors to plot the reference
colors = ["gray", 'orange']
cmap_custom = ListedColormap(colors)

# a lil message
print('Showing Some Samples')

# central part of a chip covered by its reference: the reference window
# starts 'side_overlap' pixels after the chip origin and is 'chip_util_size'
# pixels wide, so the image is cropped to exactly the same pixels
center = slice(side_overlap, side_overlap + chip_util_size)


def _s2_composite(sample_index):
    """Return the cropped 3-channel image of one yearly Sentinel-2 sample.

    The channels are bands 2, 3 and 1 of the last axis, i.e. B4, B8 and B3 in
    the order given by 'bands_s2'.
    """
    chip = samples_s2_y[sample_index, center, center, :]
    # the names 'red/green/blue' are used on purpose: a variable called 'r'
    # would overwrite the rasterio alias used by the extraction cells
    red = chip[:, :, 2]
    green = chip[:, :, 3]
    blue = chip[:, :, 1]
    return np.moveaxis(np.asarray([red, green, blue]), 0, -1)


# loop to plot up to 5 rows, each with two samples (image next to reference);
# fewer rows are drawn when there are less than 10 samples
rows_to_show = min(5, len(reference)//2)
for i in range(rows_to_show):
    fig, ax = plt.subplots(1, 4, sharey=True, sharex=True)
    fig.set_figwidth(10)
    fig.set_figheight(5)

    for pair, sample_index in enumerate((i*2, i*2+1)):
        ax[2*pair].imshow(_s2_composite(sample_index))
        ax[2*pair+1].imshow(reference[sample_index], cmap=cmap_custom, vmin=-0.5, vmax=1.5, interpolation='nearest')
    plt.show()

In [None]:
# print the limits for the curious mind:
# S1 yearly, S2 yearly, S1 monthly, S2 monthly
print('Limits:')
for band_limits in (limits_s1_y, limits_s2_y, limits_s1_m, limits_s2_m):
    print(band_limits)

In [None]:
# histogram of the samples data from the Sentinel-1 yearly reduction,
# one series per band

for band_index, band_name in enumerate(bands_s1):
    band_values = samples_s1_y[..., band_index].ravel()
    plt.hist(band_values, bins=100, alpha=.4, label=band_name)
plt.title('Histogram - S1 Yearly')
plt.legend()
plt.show()

In [None]:
# histogram of the samples data from the Sentinel-2 yearly reduction,
# one series per band

for band_index, band_name in enumerate(bands_s2):
    band_values = samples_s2_y[..., band_index].ravel()
    plt.hist(band_values, bins=100, alpha=.4, label=band_name)
plt.title('Histogram - S2 Yearly')
plt.legend()
plt.show()

In [None]:
# histogram of the samples data from the Sentinel-1 monthly reductions,
# one series per band (all months together)

for band_index, band_name in enumerate(bands_s1):
    band_values = samples_s1_m[..., band_index].ravel()
    plt.hist(band_values, bins=100, alpha=.4, label=band_name)
plt.title('Histogram - S1 Monthly')
plt.legend()
plt.show()

In [None]:
# histogram of the samples data from the Sentinel-2 monthly reductions,
# one series per band (all months together)

for band_index, band_name in enumerate(bands_s2):
    band_values = samples_s2_m[..., band_index].ravel()
    plt.hist(band_values, bins=100, alpha=.4, label=band_name)
plt.title('Histogram - S2 Monthly')
plt.legend()
plt.show()