# SCRIPT 08: Prediction

This is the eighth script in the methodology. Here, a model is used to generate the final classification map over all tiles, which can take a long time to complete. The script lets the user define which tiles are predicted, so that if the prediction stops before all tiles are completed, it can be restarted with only the remaining tiles. It also allows predicting only a few test tiles before committing the process to the whole study area.

In the following cells, please refer to the comments in the code for further explanations of its functioning.

In [None]:
# import packages
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
import rasterio as r
import os

In [None]:
# identifiers of the model and of the training iteration used for prediction
model_num, iteration = 5, 3

# destination folder of the predictions for this model/iteration
predictions_folder = f'/home/bruno.matosak/Semiarido/MultiInput/predictions/model_{str(model_num).zfill(2)}/iteration_{str(iteration).zfill(2)}'

# the folder is created when missing; an already existing folder is not an error
os.makedirs(predictions_folder, exist_ok=True)

In [None]:
# limits saved together with the training samples. they are used further on to
# scale the data before prediction, so they must be the very same limits that
# were applied to the samples the model was trained with.
# order: yearly S1, yearly S2, monthly S1, monthly S2.
limits_s1_y, limits_s2_y, limits_s1_m, limits_s2_m = (
    np.load(limits_path)
    for limits_path in (
        '/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data/5K_PRO_s1_y_limits.npy',
        '/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data/5K_PRO_s2_y_limits.npy',
        '/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data/5K_PRO_s1_m_limits.npy',
        '/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data/5K_PRO_s2_m_limits.npy',
    )
)

In [None]:
# path of the saved model for the chosen model number and iteration
model_path = f'/home/bruno.matosak/Semiarido/MultiInput/trainings/model_{str(model_num).zfill(2)}/iteration_{str(iteration).zfill(2)}/Model.h5'

# reading the model from disk and printing its layers
model = tf.keras.models.load_model(model_path)
model.summary()

In [None]:
# in this cell, the prediction is made

# ExitStack closes every raster opened for a tile, even when an error occurs
from contextlib import ExitStack

# defining the tiles to be predicted upon
tiles = [  9,  10,  11,  13,  14,  15,  23,  24,  25,  26,
          27,  28,  29,  30,  31,  32,  38,  39,  40,  41,
          42,  43,  44,  45,  46,  47,  48,  51,  52,  53,
          54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
          65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
          75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
          85,  86,  87,  88,  89,  90,  91,  92,  93,  94,
          95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
         105, 106, 107, 108, 109, 110, 111, 113, 114, 115,
         116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
         126, 127, 130, 131, 132, 133, 134, 135, 136, 137,
         138, 139, 147, 148, 149, 150, 151, 152, 153, 154,
         163, 164, 165, 166, 167, 168, 169, 179, 180, 181,
         182, 183]

# defining samples predefinitions. must match values used previously.
chip_size = 254                            # side of the chips read from the rasters
total_overlap = 186                        # chip pixels (both sides summed) not written to the result
side_overlap = int(total_overlap/2)        # margin discarded on each side of a chip
chip_util_size = chip_size-total_overlap   # side of the central area written to the result

# the number of samples to predict at every model.predict call.
batch_size = 1000


def _scale_bands(chips, limits):
    """Scale each band of `chips` in place and clip the result to [0, 1].

    chips: float array whose last axis indexes the bands.
    limits: array where limits[band, 0] and limits[band, 1] are the lower
        and upper limits of that band, as defined during training.

    The number of bands is taken from `chips` itself, so the yearly and the
    monthly arrays are each scaled over their own band count.
    """
    for band in range(chips.shape[-1]):
        low = limits[band, 0]
        high = limits[band, 1]
        chips[..., band] = (chips[..., band]-low)/(high-low)
    # values beyond the training limits are brought back to the [0, 1] range
    np.clip(chips, 0, 1, out=chips)


# looping through all tiles
for tile in tqdm(tiles):
    result_path = os.path.join(predictions_folder, f'result_id{str(tile).zfill(3)}.tif')

    # do the prediction only if the result file still does not exist, so that an
    # interrupted run can be restarted without redoing finished tiles.
    if os.path.exists(result_path):
        continue

    # every raster opened below is closed when this block is left
    with ExitStack() as stack:
        # load tile data
        s1_y = stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/yearly_reduction_S1/Reduction_SAR_Year_id{str(tile).zfill(3)}.tif"))
        s2_y = stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/yearly_reduction_S2/Reduction_Optical_Year_id{str(tile).zfill(3)}.tif"))
        s1_m = [stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S1/Reduction_SAR_Months_id{str(tile).zfill(3)}_VV.tif")),
                stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S1/Reduction_SAR_Months_id{str(tile).zfill(3)}_VH.tif"))]
        s2_m = [stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{str(tile).zfill(3)}_B2.tif")),
                stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{str(tile).zfill(3)}_B3.tif")),
                stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{str(tile).zfill(3)}_B4.tif")),
                stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{str(tile).zfill(3)}_B8.tif")),
                stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{str(tile).zfill(3)}_B11.tif")),
                stack.enter_context(r.open(f"/home/bruno.matosak/Semiarido/MultiInput/monthly_reduction_S2/Reduction_Optical_Months_id{str(tile).zfill(3)}_B12.tif"))]

        # reference band (first band of the yearly S1 raster): gives the tile
        # dimensions and tells which chips contain data
        ref_reduced = s1_y.read(1)

        # rows and columns of chips that fit in the tile
        i_size = int((ref_reduced.shape[0]-2*side_overlap)/chip_util_size)
        j_size = int((ref_reduced.shape[1]-2*side_overlap)/chip_util_size)

        # chips origins (row, column of the upper left pixel). chips in which
        # every reference pixel is 0 are left out.
        origins = []
        for i in range(i_size):
            for j in range(j_size):
                row, col = i*chip_util_size, j*chip_util_size
                if np.any(ref_reduced[row:row+chip_size, col:col+chip_size] != 0):
                    origins.append((row, col))

        # placeholder for prediction result. it is created directly as uint8
        # zeros, so pixels that receive no prediction are exactly 0 regardless
        # of the values (e.g. NaN) found in the reference band.
        result = np.zeros(ref_reduced.shape, dtype=np.uint8)

        # process data in batches
        for start in range(0, len(origins), batch_size):
            # origins for samples in batch
            batch_origins = origins[start:start+batch_size]
            bs = len(batch_origins)

            # creates a placeholder to store the data to be predicted upon
            chips_s1_y = np.zeros([bs, chip_size, chip_size, s1_y.count], dtype=np.float32)
            chips_s2_y = np.zeros([bs, chip_size, chip_size, s2_y.count], dtype=np.float32)
            chips_s1_m = np.zeros([bs, s1_m[0].count, chip_size, chip_size, len(s1_m)], dtype=np.float32)
            chips_s2_m = np.zeros([bs, s2_m[0].count, chip_size, chip_size, len(s2_m)], dtype=np.float32)

            # filling the placeholders - iterates through every sample origin
            for ii, (row, col) in enumerate(batch_origins):
                # creates the window to obtain data (column offset comes first)
                w = r.windows.Window(col, row, chip_size, chip_size)
                # filling S1 data
                for j, src in enumerate(s1_m):
                    chips_s1_m[ii, :, :, :, j] = src.read(window=w)
                chips_s1_y[ii, :, :, :] = np.moveaxis(s1_y.read(window=w), 0, -1)
                # filling S2 data
                for j, src in enumerate(s2_m):
                    chips_s2_m[ii, :, :, :, j] = src.read(window=w)
                chips_s2_y[ii, :, :, :] = np.moveaxis(s2_y.read(window=w), 0, -1)

            # scaling data according to limits defined during training and
            # correcting data limits to [0, 1]
            _scale_bands(chips_s1_y, limits_s1_y)
            _scale_bands(chips_s1_m, limits_s1_m)
            _scale_bands(chips_s2_y, limits_s2_y)
            _scale_bands(chips_s2_m, limits_s2_m)

            # doing the prediction
            batch_predict = [chips_s1_y, chips_s2_y, chips_s1_m, chips_s2_m]
            pred = model.predict(batch_predict, batch_size=25)
            # index of the highest score along the last axis, converted to
            # numpy once per batch
            pred = tf.argmax(pred, -1).numpy()

            # only the central area of every chip is written to the result
            for (row, col), chip_pred in zip(batch_origins, pred):
                row_0 = row+side_overlap
                col_0 = col+side_overlap
                result[row_0:row_0+chip_util_size, col_0:col_0+chip_util_size] = chip_pred

        # for the new file's profile, we start with the profile of the source
        profile = s2_y.profile

    # Register GDAL format drivers and configuration options with a
    # context manager.
    with r.Env():

        # change the band count to 1, set the dtype to uint8 and use 0 as
        # nodata. no compression option is set here.
        profile.update(
            dtype=r.uint8,
            count=1,
            nodata=0)

        # the data is first written to a temporary file, which is renamed to the
        # final name only after the writing is complete. this way an interrupted
        # run never leaves an incomplete file that would be skipped on restart.
        root, ext = os.path.splitext(result_path)
        tmp_path = f'{root}.tmp{ext}'

        # opening file and writing data to it
        with r.open(tmp_path, 'w', **profile) as dst:
            dst.write(result, 1)
        os.replace(tmp_path, result_path)