# Create Samples for LSTM Training


## 1. Importing libraries

Reference for the libraries:

+ [numpy](https://numpy.org/install/)
+ [matplotlib](https://matplotlib.org/)
+ [tqdm](https://github.com/tqdm/tqdm)
+ [gdal](https://gdal.org/api/python.html)
+ [time](https://docs.python.org/3/library/time.html)
+ [os](https://docs.python.org/3/library/os.html)
+ [datetime](https://docs.python.org/3/library/datetime.html)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import gdal
import time
import os
import datetime

## 2. Changing working directory

In [None]:
# Move into the folder where all data is stored: the 'Data' directory located
# two levels above the current working directory.
# os.path is used instead of splitting on '/' so the cell does not depend on
# the platform's path separator.
# NOTE: the result depends on the current working directory, so running this
# cell a second time moves up again from inside 'Data'.
os.chdir(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Data'))

## 3. Opening Reference

In [None]:
# Cube cell to select the samples from.
cell = '089098'

# Approach number.
samples_id = 'appr1.BA'

# Platform, as in Landsat or Sentinel.
platform = 'Sentinel'

# Max. number of pixels per PRODES polygon to select.
number_pixels = 1800 # BA: landsat=200 and Sentinel=1800

# Year to select the samples from.
year = 2019

# ----------------------------
# PATHS FOR THE REFERENCE DATA
# All data must be rasterized with the same width and height as the cubes in pixels,
# same bounding box and same EPSG.

# One of the cubes to be used as reference.
path_band      = f'./cubes/filled/{str(year)}.{cell}.band4.tif'
# PRODES deforestation polygons for the reference year. Each polygon being
# rasterized with a unique int number. Available from <http://terrabrasilis.dpi.inpe.br/downloads/>
path_prodes    = f'./ref/Approach1_PRODES_{str(year)}.tif'
# All PRODES past deforestation from before the reference year. 0 not past deforestation,
# 1 for past deforestation. Available from <http://terrabrasilis.dpi.inpe.br/downloads/>
# from "Accumulated area mask of native vegetation suppression" and "Yearly increment in native
# vegetation suppression".
path_prodes_up = f'./ref/PRODES_up{str(year-1)}.tif'
# SRTM slope.
path_slope     = f'./ref/slope_{cell}.tif'
# ----------------------------

# Opening needed data. `data` stays open because later cells query its
# noData value again; `band` holds the pixels of its first raster band.
data      = gdal.Open(path_band)
band      = data.GetRasterBand(1).ReadAsArray()
prodes    = gdal.Open(path_prodes).ReadAsArray()
prodes_up = gdal.Open(path_prodes_up).ReadAsArray()
slope     = gdal.Open(path_slope).ReadAsArray()

# NaN can only be stored in a floating point array, so an integer slope raster
# is converted before it is masked.
if not np.issubdtype(slope.dtype, np.floating):
    slope = slope.astype(np.float32)

# Pixels without data in the reference band are removed from every layer:
# no deforestation polygon (0), flagged as past deforestation (1), and no
# slope value (NaN).
nodata       = data.GetRasterBand(1).GetNoDataValue()
no_data_mask = band == nodata

prodes[no_data_mask]    = 0
prodes_up[no_data_mask] = 1
slope[no_data_mask]     = np.nan

In [None]:
# Show the four reference rasters side by side on a shared y axis.
panels = [
    (band, 'Band'),
    (prodes, 'prodes'),
    (prodes_up, 'prodes_up'),
    (slope, 'slope'),
]

fig, axes = plt.subplots(1, 4, sharey=True)
fig.set_figwidth(15)

# One panel per raster, titled with the raster name.
for axis, (raster, title) in zip(axes, panels):
    axis.imshow(raster)
    axis.set_title(title)

plt.show()

## 4. Deforestation Samples

In [None]:
# Collect the distinct values of the PRODES raster and how many pixels carry
# each of them. The raster was created beforehand with one unique number
# written for each polygon. The noData value is 0.
unique, counts = np.unique(prodes, return_counts=True)
# Shown as the cell output: the number of distinct values found.
unique.shape

In [None]:
# Print the shapes of the unique polygon values and of their pixel counts.
# NOTE(review): an earlier comment here described replacing the original
# polygon numbers with sequential ones; this cell performs no such renumbering.
print(unique.shape, counts.shape)

In [None]:
# Line plot of how many pixel series each deforestation polygon offers.
# The first entry of `counts` is left out of the plot, and the y axis is
# clipped at the per-polygon selection limit.
polygon_positions = np.arange(1, len(counts))

plt.figure(figsize=(20, 3))
plt.plot(polygon_positions, counts[1:], linewidth=.3)
plt.ylim(0, number_pixels)
plt.xlim(left=0)
plt.show()

In [None]:
# Total number of deforestation samples: every polygon contributes its pixel
# count, capped at `number_pixels`. The first entry of `counts` is skipped.
capped_counts = np.minimum(counts, number_pixels)
total = np.sum(capped_counts[1:])
print('Half number of samples:', total)

In [None]:
# Randomly select the deforestation samples.
# `ind_def` holds one selected pixel per row: column 0 is the row index and
# column 1 the column index of the pixel in the `prodes` raster.
# The builtin `int` is used as dtype because the `np.int` alias was removed
# from NumPy (1.24).
ind_def = np.zeros([total, 2], dtype=int)
num = 0

# A single generator is enough for all polygons.
rng = np.random.default_rng()

# The first unique value is skipped, as in the other cells of this section.
for i in tqdm(unique[1:]):
    ind_i, ind_j = np.where(prodes == i)

    # Take every pixel of a small polygon, otherwise `number_pixels` of them,
    # always without repetition.
    n_selected = min(len(ind_i), number_pixels)
    indexes = rng.choice(len(ind_i), size=n_selected, replace=False)

    # Copy the whole selection at once instead of pixel by pixel.
    ind_def[num:num + n_selected, 0] = ind_i[indexes]
    ind_def[num:num + n_selected, 1] = ind_j[indexes]
    num += n_selected

In [None]:
# Draw the reference band and mark every deforestation training sample on it
# with a red dot (x = column index, y = row index).
figure, axis = plt.subplots(figsize=(10, 10))
axis.imshow(band, vmin=-50, vmax=3000)
axis.plot(ind_def[:, 1], ind_def[:, 0], '.r')
plt.show()

## 5. Non-Deforestation Samples

In [None]:
# Blank out (NaN) every slope pixel that cannot be a natural vegetation sample,
# then plot what is left. A pixel is excluded when the reference band has no
# data there, when it belongs to a deforestation polygon of the reference
# year, or when it is flagged as past deforestation.
excluded = (
    (band == data.GetRasterBand(1).GetNoDataValue())
    | (prodes != 0)
    | (prodes_up == 1)
)
slope[excluded] = np.nan

plt.imshow(slope)
plt.show()

In [None]:
# Creating the slope classes. Each valid pixel receives a class number,
# NaN pixels stay NaN:
#   1: slope < 4
#   2: 4 <= slope < 6
#   3: slope >= 6
class_masks = (
    (1, slope < 4),
    (2, (slope >= 4) & (slope < 6)),
    (3, slope >= 6),
)

# The masks are computed on `slope`, so writing the class numbers into the
# copy cannot influence the later masks.
slope_ref = slope.copy()
for class_id, in_class in class_masks:
    slope_ref[in_class] = class_id

In [None]:
# Histogram of the slope classes: how many pixels fall in each class.
class_values = slope_ref.ravel()
plt.hist(class_values)
plt.show()

In [None]:
# Getting the classes. Pixels without a class (NaN) are first moved to class 0
# so that they are counted as a single value.
masked_out = np.isnan(slope_ref)
slope_ref[masked_out] = 0

class_summary = np.unique(slope_ref, return_counts=True)
unique, counts = class_summary
# Shown as the cell output: the class values and their pixel counts.
unique, counts

In [None]:
# Number of pixels to draw from each slope class: the deforestation total is
# split evenly over the classes, one of the unique values not being counted
# as a class. This overwrites the per-polygon limit stored in `number_pixels`.
n_classes = len(unique) - 1
number_pixels = int(total / n_classes)
print('Number of pixels for each class:', number_pixels)

In [None]:
# Randomly select the natural vegetation samples.
# Class 0 marks the pixels that were excluded, so only the other class values
# are sampled. Iterating over the values themselves also works when a class
# is absent from the raster.
classes = unique[unique != 0]

# `ind_not_def` holds one selected pixel per row: column 0 is the row index and
# column 1 the column index of the pixel in `slope_ref`.
# The builtin `int` is used as dtype because the `np.int` alias was removed
# from NumPy (1.24).
ind_not_def = np.zeros([number_pixels * len(classes), 2], dtype=int)
num = 0

# A single generator is enough for all classes.
rng = np.random.default_rng()

for i in tqdm(classes):
    ind_i, ind_j = np.where(slope_ref == i)

    # Take every pixel of a small class, otherwise `number_pixels` of them,
    # always without repetition.
    n_selected = min(len(ind_i), number_pixels)
    indexes = rng.choice(len(ind_i), size=n_selected, replace=False)

    # Copy the whole selection at once instead of pixel by pixel.
    ind_not_def[num:num + n_selected, 0] = ind_i[indexes]
    ind_not_def[num:num + n_selected, 1] = ind_j[indexes]
    num += n_selected

# When a class has fewer pixels than requested, the rows that were never
# filled would stay at (0, 0) and be used as samples; drop them.
ind_not_def = ind_not_def[:num]

In [None]:
# Draw the reference band and mark every natural vegetation training sample on
# it with a red dot (x = column index, y = row index).
figure, axis = plt.subplots(figsize=(10, 10))
axis.imshow(band, vmin=-50, vmax=3000)
axis.plot(ind_not_def[:, 1], ind_not_def[:, 0], '.r')
plt.show()

In [None]:
# Difference between the number of deforestation and natural vegetation samples.
# `total` is the number of deforestation samples and `num` the counter left by
# the natural vegetation selection loop, so a positive value means fewer
# natural vegetation pixels were selected than deforestation ones.

total-num

## 6. Merge the Samples Indices

In [None]:
# Merging the deforestation and natural vegetation samples indices in the same array.
# Each row of `samples_ind` is (row index, column index, label), where the label
# is 1 for deforestation and 0 for natural vegetation.

n_def = len(ind_def)
n_not_def = len(ind_not_def)

# The builtin `int` is used as dtype because the `np.int` alias was removed
# from NumPy (1.24).
samples_ind = np.zeros([n_def + n_not_def, 3], dtype=int)

# Deforestation samples come first, natural vegetation samples after them.
samples_ind[:n_def, :2] = ind_def
samples_ind[:n_def, 2] = 1

samples_ind[n_def:, :2] = ind_not_def
samples_ind[n_def:, 2] = 0

print(samples_ind[-1,:], '\n')

# Shuffle the rows so that both labels are mixed. A single shuffle already
# yields a uniformly random order; a second one adds nothing.
np.random.shuffle(samples_ind)
print(samples_ind[-10:])

Populating...

## 7. Populate Series

In [None]:
# Acquiring the pixel time series for the training samples.
# `samples` has one entry per training sample, per raster band of the cube
# file and per cube name, in that axis order.

t1 = time.time()

if platform == 'Sentinel':
    cubes = ['band2', 'band3', 'band4', 'band8a', 'band11', 'band12', 'NDVI', 'EVI']
elif platform == 'Landsat':
    cubes = ['band2', 'band3', 'band4', 'band5', 'band6', 'band7', 'NDVI', 'EVI']
else:
    # Without this branch an unknown platform only fails later with a
    # NameError on `cubes`.
    raise ValueError(f"Unsupported platform {platform!r}: expected 'Sentinel' or 'Landsat'")

# The reference cube gives the length of the series (its number of raster bands).
n_steps = gdal.Open(path_band).RasterCount

samples = np.zeros([len(samples_ind), n_steps, len(cubes)], dtype=np.float32)

# Row and column of every sample, in the order of `samples_ind`.
rows = samples_ind[:, 0]
cols = samples_ind[:, 1]

for cube_ind, cube_name in enumerate(cubes):
    print('---- Populate '+cube_name+' ----')
    cube_path = f'./cubes/filled/{str(year)}.{cell}.{cube_name}.tif'
    cube = gdal.Open(cube_path).ReadAsArray()

    # cube[:, rows, cols] reads all sample pixels at once and has shape
    # (series length, number of samples); it is transposed to match `samples`.
    # NOTE(review): the division by 10000 presumably undoes the integer
    # scaling of the stored values - confirm against the cube specification.
    samples[:, :, cube_ind] = cube[:, rows, cols].T / 10000

    # Free the cube before the next one is loaded.
    del cube

print('---- Done! ----')
t2 = time.time()
print('Elapsed time: %.3f minutes' % ((t2-t1)/60))

In [None]:
# Show the first ten training samples as images. The title of each panel is
# the label of the sample: 0 is natural vegetation and 1 is deforestation.

f, ax = plt.subplots(ncols=10, sharey=True)
f.set_figwidth(20)
f.set_figheight(10)

for sample_no, panel in enumerate(ax):
    label = samples_ind[sample_no, 2]
    panel.imshow(samples[sample_no], vmin=0, vmax=.8, interpolation='nearest')
    panel.title.set_text(str(label))

plt.show()

Saving the samples.

In [None]:
# Saving the training samples in a file. The reference values for the samples
# (the label column of `samples_ind`) are saved in a separate file.
output_dir = './training_samples/LSTM'

# exist_ok=True lets the cell be run again when the folder already exists
# instead of failing with FileExistsError.
os.makedirs(output_dir, exist_ok=True)

np.save(f'{output_dir}/{samples_id}.{year}.samples.npy', samples)
np.save(f'{output_dir}/{samples_id}.{year}.truth.npy', samples_ind[:,2])

In [None]:
# Report where the samples file and the truth file were written.
saved_paths = (
    f'./training_samples/LSTM/{samples_id}.{year}.samples.npy',
    f'./training_samples/LSTM/{samples_id}.{year}.truth.npy',
)
for saved_path in saved_paths:
    print('Saved in: ', saved_path)

## END