In [1]:
# Dataset generation

In [2]:
# Import libraries
import numpy as np
from pathlib import Path
import rasterio as rio
from rasterio.warp import calculate_default_transform, reproject, Resampling
from rasterio.crs import CRS
from rasterio.io import MemoryFile
from rasterio.enums import Resampling
import affine
import os
from shapely.geometry import box
from rasterio.coords import BoundingBox
from rasterio.mask import mask as masker
import pystac

# RAM check
import psutil

In [4]:
# RAM check: resident set size (RSS) of the process handle created here,
# printed in units of 10**6 bytes. `process` is reused by later memory checks.
process = psutil.Process()
print(process.memory_info().rss / 10**6)

121.663488


In [5]:
# Input paths of the three rasters used below (relative to the notebook's
# working directory). They are opened and reprojected in a later cell.
load_vv = Path("../data/input/vv/vv.tif")
load_mask = Path("../data/input/mask/merged/mask.tif")
load_hand = Path("../data/input/hand/hand.tif")

In [6]:
# Settings
# NOTE(review): neither constant is referenced in the cells shown here; the
# generator is later called with literal height=256, width=256 — confirm
# whether these are meant to drive that call.
OUTPUT_SIZE = (256, 256)
DROPNA = True

In [7]:
# Reproject a raster into a target CRS without saving the result to disk.
# The reprojected dataset is kept in RAM (rasterio MemoryFile).


def reproject_crs(file_path, target_crs):
    """Reproject band 1 of a GeoTIFF into ``target_crs`` and keep it in RAM.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path of the raster to open with ``rio.open``.
    target_crs : rasterio.crs.CRS
        Coordinate reference system of the returned dataset.

    Returns
    -------
    dataset
        Open dataset created by ``MemoryFile().open(...)`` holding the
        reprojected band 1. It is returned open; closing it is the caller's
        responsibility.

    Notes
    -----
    Only band 1 is reprojected, using nearest-neighbour resampling.
    """
    # The context manager guarantees the source file is closed even when
    # computing the transform or reprojecting raises.
    with rio.open(file_path) as src:
        print(type(src))

        # Grid (transform and size) of the source extent in the target CRS
        transform, width, height = calculate_default_transform(
            src.crs, target_crs, src.width, src.height, *src.bounds
        )

        # Output metadata: same as the source except for the new grid/CRS
        kwargs = src.meta.copy()
        kwargs.update(
            {
                "crs": target_crs,
                "transform": transform,
                "width": width,
                "height": height,
            }
        )

        # Create the in-memory dataset; release it again if anything fails so
        # a failed reprojection does not leave the buffer allocated.
        memfile = MemoryFile()
        dst = None
        try:
            dst = memfile.open(**kwargs)
            reproject(
                source=rio.band(src, 1),
                destination=rio.band(dst, 1),
                src_transform=src.transform,
                src_crs=src.crs,
                dst_transform=transform,
                dst_crs=target_crs,
                resampling=Resampling.nearest,
            )
        except Exception:
            if dst is not None:
                dst.close()
            memfile.close()
            raise
    return dst

In [8]:
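# Reproject a raster into a target CRS without saving the result to disk.
# Same as reproject_crs, but opens the source with a context manager.
# The reprojected dataset is kept in RAM (rasterio MemoryFile).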


def reproject_crs2(file_path, target_crs):
    """Open a GeoTIFF and return band 1 reprojected to ``target_crs``.

    The result is an open dataset created through ``MemoryFile`` (nothing is
    written to disk). The source file is closed when the function returns.
    Band 1 is reprojected with nearest-neighbour resampling.
    """
    with rio.open(file_path) as source:
        print(type(source))

        # Target grid covering the source bounds in the new CRS
        new_transform, new_width, new_height = calculate_default_transform(
            source.crs, target_crs, source.width, source.height, *source.bounds
        )

        # Source metadata with the grid-related entries overridden
        out_meta = {
            **source.meta,
            "crs": target_crs,
            "transform": new_transform,
            "width": new_width,
            "height": new_height,
        }

        # In-memory destination dataset
        memfile = MemoryFile()
        reprojected = memfile.open(**out_meta)

        # Warp band 1 of the source into band 1 of the destination
        reproject(
            source=rio.band(source, 1),
            destination=rio.band(reprojected, 1),
            src_transform=source.transform,
            src_crs=source.crs,
            dst_transform=new_transform,
            dst_crs=target_crs,
            resampling=Resampling.nearest,
        )
    return reprojected

In [9]:
# Load all three rasters reprojected to EPSG:32632 (used as the example CRS).
# Each call returns an open in-memory dataset; reproject_crs2 also prints the
# type of the opened source, which is the output shown below.
target_crs = CRS.from_epsg(32632)
vv = reproject_crs2(load_vv, target_crs)
mask = reproject_crs2(load_mask, target_crs)
hand = reproject_crs2(load_hand, target_crs)

<class 'rasterio.io.DatasetReader'>
<class 'rasterio.io.DatasetReader'>
<class 'rasterio.io.DatasetReader'>


In [10]:
# Load check: shape (rows, cols) of band 1 of the reprojected mask
mask.read(1).shape

(10951, 14135)

In [11]:
# Memory check after loading the three reprojected rasters (10**6 bytes)
print(process.memory_info().rss / 10**6)

1610.6496


In [12]:
# Compare metadata of the three datasets (dtype, nodata, size, transform)
print(vv.meta, mask.meta, hand.meta, sep="\n")

{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 14105, 'height': 10932, 'count': 1, 'crs': CRS.from_epsg(32632), 'transform': Affine(20.0, 0.0, 153265.79316470574,
       0.0, -20.0, 1000962.9311765691)}
{'driver': 'GTiff', 'dtype': 'uint8', 'nodata': 255.0, 'width': 14135, 'height': 10951, 'count': 1, 'crs': CRS.from_epsg(32632), 'transform': Affine(19.96365218701852, 0.0, 153227.4398530922,
       0.0, -19.96365218701852, 1000925.7648684348)}
{'driver': 'GTiff', 'dtype': 'float32', 'nodata': -9999.0, 'width': 12093, 'height': 12140, 'count': 1, 'crs': CRS.from_epsg(32632), 'transform': Affine(91.7998855124291, 0.0, 56178.489241547475,
       0.0, -91.7998855124291, 1667057.241646418)}


In [13]:
# Compare the spatial extents; hand covers a much larger area than vv/mask
print(vv.bounds, mask.bounds, hand.bounds, sep="\n")

BoundingBox(left=153265.79316470574, bottom=782322.9311765691, right=435365.79316470574, top=1000962.9311765691)
BoundingBox(left=153227.4398530922, bottom=782303.8097683949, right=435413.663516599, top=1000925.7648684348)
BoundingBox(left=56178.489241547475, bottom=552606.6315255289, right=1166314.5047433525, top=1667057.241646418)


In [14]:
# Getting the overlap bounding box between the files,
# so that only the parts of the data that are needed are kept in RAM.

left = [vv.bounds.left, mask.bounds.left, hand.bounds.left]
bottom = [vv.bounds.bottom, mask.bounds.bottom, hand.bounds.bottom]
right = [vv.bounds.right, mask.bounds.right, hand.bounds.right]
top = [vv.bounds.top, mask.bounds.top, hand.bounds.top]

# Intersection of the three extents: the innermost edge on every side.
# NOTE(review): there is no check that the result is non-empty
# (left < right and bottom < top).
overlap_bounds = BoundingBox(
    left=max(left), bottom=max(bottom), right=min(right), top=min(top)
)

In [15]:
# Convert bounds to a shapely polygon; it is passed as the crop shape below
overlap_polygon = box(*overlap_bounds)
print(overlap_polygon)

POLYGON ((435365.79316470574 782322.9311765691, 435365.79316470574 1000925.7648684348, 153265.79316470574 1000925.7648684348, 153265.79316470574 782322.9311765691, 435365.79316470574 782322.9311765691))


In [16]:
# Crop the HAND raster to the overlap polygon (`masker` is rasterio.mask.mask).
# Returns the cropped array and the transform of the cropped window.
crop_img, crop_transform = masker(hand, shapes=[overlap_polygon], crop=True)

In [17]:
# Cropped image shape: (bands, rows, cols)
crop_img.shape

(1, 2382, 3074)

In [18]:
# Create an in-memory dataset out of crop_img:
# start from the full HAND profile and override size and transform with the
# values of the cropped window.
profile = hand.profile.copy()
profile.update(
    driver="GTiff",
    height=crop_img.shape[1],
    width=crop_img.shape[2],
    transform=crop_transform,
)

# The dataset is left open; it is read again when it is rescaled later.
memfile = MemoryFile()
cropped_hand = memfile.open(**profile)
cropped_hand.write(crop_img)

In [19]:
# Metadata of the cropped HAND dataset (size and transform of the crop)
cropped_hand.meta

{'driver': 'GTiff',
 'dtype': 'float32',
 'nodata': -9999.0,
 'width': 3074,
 'height': 2382,
 'count': 1,
 'crs': CRS.from_epsg(32632),
 'transform': Affine(91.7998855124291, 0.0, 153210.96822818503,
        0.0, -91.7998855124291, 1000957.2723682326)}

In [20]:
# Transform returned by the crop; matches the transform in cropped_hand.meta
crop_transform

Affine(91.7998855124291, 0.0, 153210.96822818503,
       0.0, -91.7998855124291, 1000957.2723682326)

In [21]:
# Cropped array shape again (same check as in the earlier cell)
crop_img.shape

(1, 2382, 3074)

In [22]:
# Transform of the uncropped HAND dataset, for comparison with crop_transform
hand.transform

Affine(91.7998855124291, 0.0, 56178.489241547475,
       0.0, -91.7998855124291, 1667057.241646418)

In [23]:
# Shape (rows, cols) of the uncropped HAND dataset
hand.shape

(12140, 12093)

In [24]:
# Pixel count of the full HAND raster versus the cropped one
bigger = hand.shape[0] * hand.shape[1]
smaller = crop_img.shape[1] * crop_img.shape[2]

# How many times fewer pixels the cropped raster holds
bigger / smaller

20.049664939879282

In [25]:
# Function to resample an open dataset to a desired resolution (in memory)


def rescale_image(input_file, scale_factor, resampling=None):
    """Resample an open dataset by ``scale_factor`` into a new in-memory dataset.

    Parameters
    ----------
    input_file : open dataset
        Dataset providing ``read``, ``count``, ``height``, ``width``,
        ``transform`` and ``profile``.
    scale_factor : float
        Multiplier applied to the pixel dimensions (values above 1 produce
        more, smaller pixels). Must be positive and large enough that the
        output has at least one row and one column.
    resampling : rasterio.enums.Resampling, optional
        Resampling method used while reading. Defaults to
        ``Resampling.bilinear`` (the previous hard-coded behaviour). Pass
        ``Resampling.nearest`` for categorical data such as masks, where
        interpolation would create values that are not valid classes.

    Returns
    -------
    (scaled_dataset, profile)
        The open in-memory dataset holding the resampled data and the profile
        it was created with.

    Raises
    ------
    ValueError
        If ``scale_factor`` is not positive or yields an empty output shape.
    """
    # `not (x > 0)` also rejects NaN, which `x <= 0` would let through
    if not scale_factor > 0:
        raise ValueError(f"scale_factor must be positive, got {scale_factor!r}")

    src = input_file
    out_height = int(src.height * scale_factor)
    out_width = int(src.width * scale_factor)
    if out_height < 1 or out_width < 1:
        raise ValueError(
            f"scale_factor {scale_factor!r} gives an empty output shape "
            f"({out_height}, {out_width})"
        )

    if resampling is None:
        resampling = Resampling.bilinear

    # Read the data directly at the target shape; resampling happens on read
    data = src.read(
        out_shape=(src.count, out_height, out_width),
        resampling=resampling,
    )

    # Scale the pixel size by the ratio of old to new dimensions so the
    # dataset keeps its origin with the new pixel grid
    transform = src.transform * src.transform.scale(
        (src.width / data.shape[-1]), (src.height / data.shape[-2])
    )

    # Source profile with the new size and transform
    profile = src.profile
    profile.update(
        driver="GTiff",
        height=data.shape[1],
        width=data.shape[2],
        transform=transform,
    )

    memfile = MemoryFile()
    scaled_dataset = memfile.open(**profile)
    scaled_dataset.write(data)

    return scaled_dataset, profile

In [26]:
# Set resolution to standard: target pixel size, in the units of the CRS
RES = 20
# Scale factor = current pixel size / target pixel size.
# Only the x resolution (res[0]) is used for each dataset.
mask_refactor = mask.res[0] / RES
hand_refactor = cropped_hand.res[0] / RES
print(mask_refactor, hand_refactor)

0.9981826093509261 4.589994275621455


In [27]:
# Resample mask and cropped HAND to the target resolution; the returned
# profile is not needed here.
# NOTE(review): rescale_image reads with bilinear resampling, which is also
# applied to the uint8 mask (nodata 255) — confirm this is intended for a
# categorical layer.
mask_scaled, _ = rescale_image(mask, mask_refactor)
hand_scaled, _ = rescale_image(cropped_hand, hand_refactor)

In [28]:
# Check that the rescaled mask now has (almost) the same pixel size as vv
print(vv.meta, mask_scaled.meta, sep="\n")

{'driver': 'GTiff', 'dtype': 'float32', 'nodata': None, 'width': 14105, 'height': 10932, 'count': 1, 'crs': CRS.from_epsg(32632), 'transform': Affine(20.0, 0.0, 153265.79316470574,
       0.0, -20.0, 1000962.9311765691)}
{'driver': 'GTiff', 'dtype': 'uint8', 'nodata': 255.0, 'width': 14109, 'height': 10931, 'count': 1, 'crs': CRS.from_epsg(32632), 'transform': Affine(20.000441113013455, 0.0, 153227.4398530922,
       0.0, -20.00017885829657, 1000925.7648684348)}


In [29]:
# Memory check after rescaling (10**6 bytes).
# NOTE(review): psutil is already imported in the imports cell and `process`
# is already created in the first RAM-check cell; both lines are redundant.
import psutil

process = psutil.Process()
print(process.memory_info().rss / 10**6)

2473.590784


In [30]:
# x, y = vv.transform * (120, 100)
# print(x, y)

In [31]:
class DatasetGenerator:
    """Cut a stack of co-located rasters into fixed-size tiles and save them.

    Typical use: ``add()`` every raster, choose the reference grid with
    ``set_ref_image()``, then call ``run()``.

    Internally a clipped tile is passed around as a plain list with the layout
    ``[band, crs, transform, height, width, dtype, nodata, name]``; the private
    helpers index into that list by position.
    """

    def __init__(self, collection_id):
        """Create an empty generator.

        ``collection_id`` is stored and used as the id of the STAC collection
        built by ``_create_collection``.
        """
        self.images = []
        # Becomes True once set_ref_image() has selected a reference image
        self.ref_flag = False
        self.clipped_addresses = []
        # Start position (in reference-image pixels) used by run()
        self.row = 0
        self.col = 0
        self.collection_id = collection_id

    def add(self, image, name: str, set_nodata: int = 0):
        """Register an open dataset under ``name``.

        If the dataset has no nodata value, ``set_nodata`` is assigned to it
        (this modifies the dataset passed in). Band 1 is read once and kept
        alongside the dataset so tiles can be sliced from memory.
        """
        # Correcting nodata (identity check: nodata may legitimately be 0)
        if image.nodata is None:
            image.nodata = set_nodata

        self.images.append(
            {
                "name": name,
                "image": image,
                "band": image.read(1),
            }
        )

    def set_ref_image(self, name: str = "vv"):
        """Select the image whose pixel grid all tiles are clipped against.

        Input: str -> name of a previously added image
        Output: bool -> True if done, False if the name is not in the list
        """
        matches = [entry for entry in self.images if entry["name"] == name]
        if not matches:
            return False

        # If a name was added more than once, the last one added wins
        ref = matches[-1]["image"]
        self.ref_name = name
        self.ref_image = ref
        self.ref_crs = ref.crs
        self.ref_res = ref.res
        self.ref_shape = ref.shape

        self.ref_flag = True
        # Misspelled attribute kept for code that may still read it
        self.refrence_flag = True
        return True

    def _create_clipped_image(self, image, band, row, col, height, width, name):
        """Slice a ``height`` x ``width`` window out of ``band`` at (row, col).

        Returns the tile as a list
        ``[band, crs, transform, height, width, dtype, nodata, name]``.
        The window is truncated at the bottom/right edge of ``band``; a window
        that starts at a negative row or column yields an empty band.
        """
        if row < 0 or col < 0:
            # A negative start would be interpreted by numpy as an offset from
            # the end of the array and silently return pixels from the
            # opposite edge. Return an empty band so the tile is rejected.
            clipped_band = np.empty((0, 0), dtype=band.dtype)
        else:
            # np.array() copies, so the tile does not keep a view on `band`
            clipped_band = np.array(band[row : row + height, col : col + width])

        # Positioning: map coordinates of the window's upper-left corner,
        # combined with the pixel size/rotation terms of the source transform
        x_origin, y_origin = image.transform * (col, row)
        source_transform = image.transform
        new_transform = affine.Affine(
            source_transform.a,
            source_transform.b,
            x_origin,
            source_transform.d,
            source_transform.e,
            y_origin,
        )

        return [
            clipped_band,
            image.crs,
            new_transform,
            clipped_band.shape[0],
            clipped_band.shape[1],
            image.dtypes[0],
            image.nodata,
            name,
        ]

    def _check_complete(self, images, height, width):
        """Return True only if every tile is full-size and free of nodata.

        ``images`` is a list of tiles as produced by ``_create_clipped_image``.
        A tile fails if its band is not exactly ``(height, width)`` or if any
        pixel equals the tile's nodata value.
        """
        for tile in images:
            tile_band, tile_nodata = tile[0], tile[6]
            if tile_band.shape != (height, width):
                return False
            # Vectorised check; also safe for empty bands
            if np.any(tile_band == tile_nodata):
                return False
        return True

    def _save_image(self, save_path_format, image, col, row, mask_coverage):
        """Write one tile to a GeoTIFF and record its path.

        ``save_path_format`` is formatted with ``name``, ``col``, ``row`` and
        ``mask_coverage``; missing parent directories are created. Returns the
        path that was written.
        """
        file_name = Path(
            save_path_format.format(
                name=image[7], col=col, row=row, mask_coverage=mask_coverage
            )
        )

        # Works for bare file names too (the parent is then the current dir)
        file_name.parent.mkdir(parents=True, exist_ok=True)

        with rio.open(
            file_name,
            "w",
            driver="GTiff",
            height=image[3],
            width=image[4],
            count=1,
            dtype=image[5],
            crs=image[1],
            transform=image[2],
            # Carry the nodata value over so it is not lost in the tile
            nodata=image[6],
        ) as dst:
            dst.write(image[0], 1)
        self.clipped_addresses.append(file_name)
        return file_name

    def _xy_from_row_col(self, image, row, col):
        """Returns the coordinate of a pixel in one image from its (row, col)"""
        x, y = image.xy(row, col)
        return x, y

    def _row_col_from_xy(self, image, x, y):
        """Returns the (row, col) position of a pixel from its coordinate"""
        row, col = image.index(x, y)
        return row, col

    def run(self, height: int = 256, width: int = 256, only_complete: bool = True):
        """Walk the reference image in ``height`` x ``width`` steps and save tiles.

        For every step the matching window is clipped from each added image.
        With ``only_complete=True`` (default) a location is saved only if every
        tile is full-size and contains no nodata. With ``only_complete=False``
        incomplete locations are saved as well, except those where some image
        has no pixels at all (nothing could be written for it).

        Generation starts at the position set through
        ``set_row_col_for_generator``; the start column applies to the first
        row only, later rows start at column 0.

        Returns the list of all paths written so far.

        Raises
        ------
        RuntimeError
            If no reference image has been selected with ``set_ref_image``.
        ValueError
            If ``height`` or ``width`` is not positive.
        """
        if not self.ref_flag:
            raise RuntimeError(
                "No reference image selected; call set_ref_image() before run()"
            )
        if height <= 0 or width <= 0:
            # A non-positive step would never advance the loops below
            raise ValueError("height and width must be positive")

        # NOTE: the directory name labels the row as x and the column as y
        save_path_format = "../data/dataset/x{row}_y{col}_{mask_coverage}/{name}.tif"

        total_rows, total_cols = self.ref_shape[0], self.ref_shape[1]

        row = self.row
        is_first_row = True
        while row < total_rows:
            # Resume in the middle of the first row only
            col = self.col if is_first_row else 0
            is_first_row = False

            while col < total_cols:
                # Map coordinate of this reference pixel; the same for every
                # image, so it is computed once per location
                x, y = self._xy_from_row_col(self.ref_image, row=row, col=col)

                mask_coverage = 0
                clipped_images = []
                for img in self.images:
                    name = img["name"]
                    image = img["image"]

                    # Converting the map coordinate to row, col of this image
                    trow, tcol = self._row_col_from_xy(image=image, x=x, y=y)

                    clipped_image = self._create_clipped_image(
                        image, img["band"], trow, tcol, height, width, name
                    )
                    clipped_images.append(clipped_image)

                    # Mask coverage for the area of interest
                    if name == "mask":
                        mask_coverage = self._get_mask_coverage(clipped_image)

                if only_complete:
                    should_save = self._check_complete(clipped_images, height, width)
                else:
                    should_save = all(tile[0].size > 0 for tile in clipped_images)

                if should_save:
                    # _save_image also appends the path to clipped_addresses
                    for tile in clipped_images:
                        self._save_image(
                            save_path_format, tile, col, row, mask_coverage
                        )

                col = col + width

            row = row + height

        print("Tiles saved successfully")
        return self.clipped_addresses

    def set_row_col_for_generator(self, row, col):
        """This method is used when a half-built collection is loaded,
        to continue generating clips from the given row, col.
        """
        self.row = row
        self.col = col

    def _create_collection(self):
        """Create ``self.collection`` as a pystac Collection.

        TODO: only the id is filled in; every other field is still an empty
        placeholder and has to be provided before the collection is usable.
        """
        self.collection = pystac.Collection(
            id=self.collection_id,
            description="",
            extent="",
            title="",
            href="",
            extra_fields={},
            catalog_type="",
            license="",
        )

    # -------------- later ------------
    def _get_mask_coverage(self, mask):
        """Returns the mask coverage in the scene.

        ``mask`` is a tile list whose first element is the band array.
        0 is considered as non covered area in input file
        1 is considered as covered area in input file
        Returns the truncated percentage of pixels equal to 1 as an integer
        from 0 to 100, or -1 if the band has no pixels.
        """
        data = mask[0]
        image_count = data.size
        if image_count == 0:
            return -1

        mask_count = np.count_nonzero(data == 1)
        return int((mask_count / image_count) * 100)

    def _create_asset(self, file_href):
        """Return a pystac Asset of GeoTIFF media type pointing at ``file_href``."""
        return pystac.Asset(
            href=file_href,
            media_type=pystac.MediaType.GEOTIFF,
        )

    def _create_item(
        self,
        id,
        geometry,
        bbox,
        datetime=None,
        properties=None,
    ):
        """Build and return a pystac Item.

        ``id``, ``geometry`` and ``bbox`` are passed through unchanged.
        ``datetime`` and ``properties`` are optional here but forwarded to
        ``pystac.Item``, which expects either a datetime or start/end
        datetimes in the properties.

        TODO: planned item fields that are not filled yet — href, collection,
        assets, and the water coverage (percent) / minimum mask value, to go
        into ``properties`` or ``extra_fields``.
        """
        return pystac.Item(
            id=id,
            geometry=geometry,
            bbox=bbox,
            datetime=datetime,
            properties={} if properties is None else properties,
        )

    def _add_to_stac(self):
        """Placeholder: adding saved tiles to the STAC is not implemented yet."""
        pass

In [32]:
# Scratch check: np.array(None) has size 1 (see output), and `size` is never
# None, so the `else 0` fallback below is never taken.
data = np.array(None)
image_count = data.size if data.size is not None else 0
image_count

1

In [33]:
class StacHandler:
    """Skeleton for a STAC catalog handler; neither branch is implemented yet."""

    def __init__(self, catalog_ref):
        """Accept a catalog reference; currently performs no work.

        A truthy ``catalog_ref`` is meant to load an existing catalog, a falsy
        one to create a new catalog.
        """
        if not catalog_ref:
            # create stac catalog
            return

        # load catalog from ref (if is valid)
        return

In [34]:
# Build the tile dataset: register the three layers, clip everything against
# the vv grid and write 256 x 256 tiles.
ds_generator = DatasetGenerator(collection_id="randomId")
# vv has nodata=None (see its metadata above), so add() sets it to 0 here
ds_generator.add(vv, name="vv", set_nodata=0)
# The name "mask" is what run() looks for when computing mask coverage
ds_generator.add(mask_scaled, name="mask")
ds_generator.add(hand_scaled, name="HAND")

ds_generator.set_ref_image("vv")
paths = ds_generator.run(height=256, width=256)

Tiles saved successfully


In [39]:
# Memory usage after tile generation (10**6 bytes)
print(process.memory_info().rss / 10**6)

4756.852736
