# SCRIPT 04: Create SHP with Possible Samples

This is the fourth script used in the methodology. Here, a shapefile containing a grid of possible training samples is created for each tile. This grid is used afterwards, together with the reference created in the previous script, to manually select appropriate training samples.

In the following cells, please refer to the comments in the code for further explanation of how it works.

In [None]:
# importing packages
import rasterio as r
import glob
import numpy as np
import geopandas as gpd
from shapely.geometry import Polygon
from tqdm import tqdm

In [None]:
# collects, in sorted order, the files used to define the grid.
# these are the rasterized reference files (one GeoTIFF per tile).
files = sorted(glob.glob('/home/bruno.matosak/Semiarido/MultiInput/segmentations/GEM_id*.tif'))

In [None]:
# in this cell the possible samples grid is created: for every tile, a regular
# grid of candidate sample footprints is built and saved as a shapefile.

# the grid parameters are the same for every tile, so they are defined once,
# outside the loop
chip_size = 254  # the width and height of every sample
total_overlap = 186  # the overlap between each sample (equals to the difference
                     # between the sample size and the prediction size)
side_overlap = total_overlap // 2  # overlap on each side
chip_util_size = chip_size - total_overlap  # the output size. depends on the network.

# iterating through all tiles
for file in tqdm(files):
    # opens the reference file; the context manager guarantees the dataset is
    # closed again, so file handles do not pile up over many tiles
    with r.open(file) as ref_dataset:
        # reads first band from reference dataset
        ref = ref_dataset.read(1)
        # keeps the georeferencing needed after the dataset is closed
        ref_transform = ref_dataset.transform
        ref_crs = ref_dataset.crs

    # gets the number of lines and columns of the grid (a margin of
    # side_overlap pixels is left out on every border of the tile)
    i_size = max(0, (ref.shape[0] - 2 * side_overlap) // chip_util_size)
    j_size = max(0, (ref.shape[1] - 2 * side_overlap) // chip_util_size)

    # list to receive all origins of samples
    origins = []
    # list to save the reference of the samples. used to study the references later
    chips_refs = []
    # loops to create the grid
    for i in range(i_size):
        for j in range(j_size):
            # pixel position (row, col) of the sample origin
            row = i * chip_util_size
            col = j * chip_util_size
            # appends the origin to origins list
            origins.append([row, col])
            # appends the reference of the useful (central) area of the sample.
            # the slice must use the pixel offsets (row, col), not the grid
            # indices (i, j), to match the geometry created below
            chips_refs.append(ref[row + side_overlap:row + side_overlap + chip_util_size,
                                  col + side_overlap:col + side_overlap + chip_util_size])

    # a tile smaller than one sample yields no grid; skip it instead of
    # failing when indexing the empty origins array
    if not origins:
        tqdm.write(f"skipping {file}: tile too small for a {chip_size}x{chip_size} sample")
        continue

    # converts list to array
    origins = np.asarray(origins)
    # list to save geometries
    geometries = []
    # iterates through origins array
    for origin in origins:
        # borders (in pixels) of the useful area of the sample
        top = origin[0] + side_overlap
        left = origin[1] + side_overlap
        bottom = top + chip_util_size
        right = left + chip_util_size
        # defines the sample geometry coordinates as a closed ring of (row, col)
        square = np.asarray([[top, left],
                             [top, right],
                             [bottom, right],
                             [bottom, left],
                             [top, left]])
        # transform coordinates from (row, col) to (x, y) in the dataset CRS
        # NOTE(review): xy() uses offset='center' by default, so the vertices
        # fall on pixel centers and not on pixel corners -- confirm that this
        # half-pixel shift is intended before changing it.
        coords = np.asarray(r.transform.xy(transform=ref_transform, rows=square[:, 0], cols=square[:, 1])).T
        # add polygon to list
        geometries.append(Polygon(coords))

    # tile id is the number that follows the last 'd' (from 'GEM_id') in the path
    tile_id = int(file.split('d')[-1].split('.')[0])

    # creates GeoDataFrame with geopandas to save it later to shapefile
    shp = gpd.GeoDataFrame({'ori_row': origins[:, 0],
                            'ori_col': origins[:, 1],
                            'tile_id': [tile_id] * len(geometries),
                            'geometry': geometries},
                           geometry='geometry')
    # assign correct projection
    shp.crs = ref_crs
    # finally, saves the shapefile to a file
    shp.to_file(f"/home/bruno.matosak/Semiarido/MultiInput/samples/shp_possible_samples/possible_samples_{file.split('_')[-1].split('.')[0]}.shp")