# Stratified random sampling from NDWI mosaic

## Load packages

In [1]:
%matplotlib inline

import os
import xarray as xr
import numpy as np
import pandas as pd
import geopandas as gpd
from matplotlib import pyplot as plt

import sys
sys.path.append('../Scripts')
from deafrica_spatialtools import xr_rasterize


In [10]:
# Name of the area (AEZ) to process. It is interpolated into the input and
# output paths used by the cells of this notebook.
area_name = "Eastern"

## Convert to one mosaic (only do it once)

In [3]:
# Build one compressed, tiled GeoTIFF mosaic from the per-tile NDWI rasters.
# Skipped when the mosaic already exists, so it only has to run once.
import subprocess
from pathlib import Path

mosaic_dir = Path('NDWI_composite')
mosaic_stem = f"{area_name.lower()}_NDWI_mosaic"

if not (mosaic_dir / f"{mosaic_stem}.tif").exists():
    # Expand the tile wildcard here rather than relying on a shell; sorted so
    # the tile order passed to gdalbuildvrt is deterministic.
    tiles = sorted(p.name for p in mosaic_dir.glob(f"{area_name.lower()}_NDWI_tile*.tif"))
    if not tiles:
        raise FileNotFoundError(
            f"No tiles matching {area_name.lower()}_NDWI_tile*.tif in {mosaic_dir}")

    # cwd= runs GDAL inside NDWI_composite (so the VRT references the tiles by
    # bare file name, as before) without changing the notebook's own working
    # directory. check=True stops at the first failing command instead of
    # silently continuing with a missing or partial output.
    subprocess.run(
        ["gdalbuildvrt", f"{mosaic_stem}.vrt", *tiles],
        cwd=str(mosaic_dir), check=True)
    subprocess.run(
        ["gdal_translate",
         "-co", "BIGTIFF=YES",
         "-co", "COMPRESS=DEFLATE",
         "-co", "ZLEVEL=9",
         "-co", "PREDICTOR=1",
         "-co", "TILED=YES",
         "-co", "BLOCKXSIZE=1024",
         "-co", "BLOCKYSIZE=1024",
         f"{mosaic_stem}.vrt", f"{mosaic_stem}.tif"],
        cwd=str(mosaic_dir), check=True)

## Load NDWI mosaic and clip to AEZ (TODO: use AEZ-large_water_bodies)

In [None]:
# Create the per-area output folder. exist_ok=True makes re-running the cell a
# no-op and avoids the gap between checking for the folder and creating it.
os.makedirs(area_name, exist_ok=True)

In [None]:
# Open the NDWI mosaic for this area and drop any length-1 dimensions.
mosaic_path = f"NDWI_composite/{area_name.lower()}_NDWI_mosaic.tif"
ds = xr.open_rasterio(mosaic_path).squeeze()

In [None]:
# Read the AEZ boundary for this area (the "ExcludeLargeWB" variant, going by
# the file name). Earlier boundary source, kept for reference:
#   gpd.read_file(f'../../shapes/simplified_AEZs/{area_name}.shp')
aez_shapefile = f'../../shapes/AEZs_ExcludeLargeWB/AEZs_ExcludeLargeWB_update_{area_name}.shp'
gdf = gpd.read_file(aez_shapefile)

# Rasterize the shapefile, passing the mosaic as the template array.
mask = xr_rasterize(gdf=gdf, da=ds)

# Keep only the pixels selected by the mask, then blank out pixels equal to 0.
# NOTE(review): presumably 0 is the mosaic's nodata value - confirm.
ds = ds.where(mask)
ds = ds.where(ds != 0)

In [None]:
# Wrap the masked mosaic in a Dataset under the variable name 'ndwi'; the
# class labels are stored next to it in a later cell.
dataset = ds.to_dataset(name='ndwi')

In [None]:

#ds.plot.imshow();

In [None]:
# Drop the reference to the rasterized mask so its memory can be reclaimed.
del mask

## Check NDWI distribution and determine thresholds

In [11]:
# Water-detection frequency thresholds; k thresholds split the range into
# k + 1 classes (5 here).
freq_thresh = [0.1, 0.3, 0.6, 0.9]
n_class = 1 + len(freq_thresh)

# Share of the total sample allotted to each class, in class order
# (one entry per class).
frac_sample = [0.1, 0.1, 0.2, 0.3, 0.3]

In [None]:
# Cache the cumulative NDWI distribution as a CSV; skipped when it already exists.
ndwi_hist_csv = f'{area_name}/ndwi_{area_name}.csv'
if not os.path.exists(ndwi_hist_csv):
    # 100-bin normalised cumulative histogram: histy holds the cumulative
    # fraction per bin, histx the bin edges (one more edge than bins).
    histy, histx, tmp = dataset.ndwi.plot.hist(bins=100, cumulative=True, density=True)
    # One row per bin: upper bin edge, cumulative fraction.
    np.savetxt(ndwi_hist_csv, np.column_stack((histx[1:], histy)), fmt='%.3f', delimiter=',')

In [None]:
# Translate the WOfS frequency thresholds into NDWI thresholds by matching
# percentiles between the two cumulative distributions.
wofs_csv = f'wofs_summary_aez/wofs_{area_name}.csv'
# NOTE(review): assumes the three columns are frequency, cumulative fraction
# and a third column that is not used here - confirm against the CSV writer.
x, y, t = np.loadtxt(wofs_csv, delimiter=',', unpack=True)

# Percentile reached at each frequency threshold.
perc = np.interp(freq_thresh, x, y)
print('percentile for ephemeral and permanent water', perc)

# Columns written by the histogram cell: bin upper edge, cumulative fraction.
ndwi_hist_csv = f'{area_name}/ndwi_{area_name}.csv'
histx, histy = np.loadtxt(ndwi_hist_csv, delimiter=',', unpack=True)

# Read the NDWI value at each of those percentiles off the NDWI distribution.
thresh = np.interp(perc, histy, histx)
print('Thresholds', thresh)

Southern Thresholds [-0.071488 -0.048836 -0.000376  0.033232]
Eastern Thresholds [-0.0627   -0.043991 -0.035081  0.030288]

## Classify into bins of different water detection frequencies

In [None]:
# Give every pixel a class id in 1..n_class from its NDWI value. Pixels that
# satisfy none of the comparisons (e.g. NaN) keep the initial label 0.
ndwi_values = dataset.ndwi.values
label = np.zeros_like(ndwi_values, dtype=np.uint8)

# Class 1: below the first threshold.
label[ndwi_values < thresh[0]] += 1

# Classes 2 .. n_class-1: between consecutive thresholds, lower bound inclusive.
for i in range(2, n_class):
    label[(ndwi_values >= thresh[i-2]) & (ndwi_values < thresh[i-1])] += i

# Class n_class: at or above the last threshold.
label[ndwi_values >= thresh[-1]] += n_class

# Do not leave an extra notebook-level reference to the NDWI array behind.
del ndwi_values

# Store the labels on the same (y, x) grid and carry over the NDWI attributes.
dataset['label'] = (('y', 'x'), label)
dataset['label'].attrs = dataset.ndwi.attrs

In [None]:
# Write the class raster to disk so that later runs can load it instead of
# repeating the classification.
from datacube.utils.cog import write_cog

label_tif = f'{area_name}/{area_name}_label.tif'
write_cog(dataset.label, label_tif)

## If the labels are already saved, read the labels

In [12]:
#
# Reload the saved class raster (dropping any length-1 dimensions) and rebuild
# `dataset` with a single 'label' variable.
label_tif = f'{area_name}/{area_name}_label.tif'
data = xr.open_rasterio(label_tif).squeeze()
dataset = data.to_dataset(name='label')

## plot classified ndwi

In [None]:
#dataset.label.plot.imshow(figsize=(10,10));
#plt.savefig(f'{area_name}_ndwi_classes.png')

## sample from array

In [None]:
# Release the NDWI mosaic before sampling. The notebook can also be entered at
# the "read the labels" cell, in which case `ds` was never created, so a
# missing name is not treated as an error.
try:
    del ds
except NameError:
    pass

In [None]:
# this will take a while, and we already know only class 1 (dry) is dominant

#class_sizes =[]
#for class_id in np.arange(1, n_class+1):
#    class_sizes.append((dataset.label==class_id).sum().values)

#class_sizes = np.array(class_sizes)
#print(class_sizes)
#print(class_sizes/class_sizes.sum())

In [13]:
# Total number of sample points: 500 for the four listed areas, 300 otherwise.
if area_name in ('Western', 'Eastern', 'Southern', 'Central'):
    n_sample = 500
else:
    n_sample = 300

# Distribute the points across classes according to frac_sample. Round before
# casting: plain truncation turns a floating-point product such as
# 0.29 * 100 == 28.999999999999996 into 28 and silently drops a sample.
n_sample_class = np.rint(n_sample * np.array(frac_sample)).astype(int)
print(n_sample_class)

[ 50  50 100 150 150]


In [None]:
# Draw n_sample_class[k-1] random pixels for every class k and save their
# coordinates. label_picked maps class id -> (row indices, column indices).
label_picked = {}
# Read the label array once instead of once per class.
label_values = dataset.label.values

for class_id in np.arange(1, n_class+1):
    n_wanted = n_sample_class[class_id-1]
    if class_id == 1:
        # Class 1 is treated as the dominant class: rather than enumerating all
        # of its pixels, draw 5x the needed number of random positions and keep
        # those that fall on the class.
        n_sample_over = 5*n_wanted
        random_x = np.random.choice(np.arange(len(dataset.x)), n_sample_over, replace=False)
        random_y = np.random.choice(np.arange(len(dataset.y)), n_sample_over, replace=False)
        match = label_values[random_y, random_x] == class_id
        random_y, random_x = random_y[match], random_x[match]
        if len(random_y) < n_wanted:
            # Fail here instead of leaving label_picked incomplete, which would
            # only surface later as a KeyError in the cells that consume it.
            raise RuntimeError("Not enough points are picked, try increase the number of random points")
        pick = np.random.choice(np.arange(len(random_y)), n_wanted, replace=False)
        y, x = random_y[pick], random_x[pick]
    else:
        # flatnonzero always returns a 1-D index array (argwhere().squeeze()
        # collapses to 0-d when exactly one pixel matches, which
        # np.random.choice would then treat as an integer population size).
        index = np.flatnonzero(label_values == class_id)
        picked = np.random.choice(index, n_wanted, replace=False)
        # convert back to x, y
        y, x = np.unravel_index(picked, label_values.shape)
    label_picked[class_id] = (y, x)
    # NOTE(review): fmt='%d' writes the coordinates as integers, which
    # truncates any fractional part - confirm this is intended for the CRS.
    np.savetxt(f'{area_name}/{area_name}_class_{class_id}.csv', np.vstack((dataset.y[y].values, dataset.x[x].values)).transpose(),fmt='%d', delimiter=',')

In [None]:
# Gather the sampled coordinates of every class into one table with columns
# 'y', 'x' and 'class'.
frames = []
for class_id in np.arange(1, n_class+1):
    y, x = label_picked[class_id]
    # Translate row/column indices into coordinate values.
    df = pd.DataFrame({'y': dataset.y[y].values, 'x':dataset.x[x].values})
    # Alternative source: the per-class CSV written by the sampling cell.
    #df = pd.read_csv(f'{area_name}/{area_name}_class_{class_id}.csv', header=None, names=['y','x'] )
    df['class'] = class_id
    frames.append(df)

# DataFrame.append was removed in pandas 2.0; a single concat gives the same
# result and avoids re-copying the table on every iteration.
dfs = pd.concat(frames, ignore_index=True)

len(dfs)

In [None]:
# Turn the sampled coordinates into point geometries, tagged with the CRS
# stored on the label raster.
sample_points = gpd.points_from_xy(dfs.x, dfs.y)
gdf = gpd.GeoDataFrame(dfs, geometry=sample_points, crs=dataset.label.crs)
# reset_index() keeps the previous row index as an 'index' column.
gdf = gdf.reset_index()

# The coordinates now live in the geometry column.
gdf = gdf.drop(columns=['x', 'y'])

gdf.to_file(f'{area_name}/{area_name}_samples.shp')

In [14]:
%%time

from skimage.morphology import disk

# Stratified sampling with a minimum-distance constraint: after points are
# picked, every pixel within min_dist pixels of them is removed from the
# candidate pool used for later picks.
min_dist = 1000 # buffer radius in x, y index (pixel) units; noted by the author as 15 km

# Row/column offsets, relative to a centre pixel, of all pixels inside a disk
# of radius min_dist.
offset_y, offset_x = np.where(disk(min_dist)==1)
offset_y, offset_x = offset_y-min_dist, offset_x-min_dist

label_values = dataset.label.values
da_shape = label_values.shape
# Working copy of the labels as a flat array; flatten() already returns a copy,
# so the labels stored in `dataset` are never modified.
da = label_values.flatten()


def _blank_buffer(flat_labels, shape, row, col):
    """Set flat_labels to 0 for every in-bounds pixel of the disk centred on (row, col)."""
    buffer_y, buffer_x = row+offset_y, col+offset_x
    # within boundary
    inside = (buffer_y>=0) & (buffer_x>=0) & (buffer_y<shape[0]) & (buffer_x<shape[1])
    flat_labels[np.ravel_multi_index((buffer_y[inside], buffer_x[inside]), shape)] = 0


label_picked = {}

# Class 1 is treated as the dominant class: draw 5x the needed number of random
# positions and keep those that fall on the class.
class_id = 1
n_wanted = n_sample_class[class_id-1]
n_sample_over = 5*n_wanted
random_x = np.random.choice(np.arange(len(dataset.x)), n_sample_over, replace=False)
random_y = np.random.choice(np.arange(len(dataset.y)), n_sample_over, replace=False)
match = label_values[random_y, random_x] == class_id
random_y, random_x = random_y[match], random_x[match]
if len(random_y) < n_wanted:
    # Stop here: carrying on would store whatever `y` and `x` happen to hold
    # from an earlier cell (or fail with a NameError).
    raise RuntimeError("Not enough points are picked, try increase the number of random points")
pick = np.random.choice(np.arange(len(random_y)), n_wanted, replace=False)
y, x = random_y[pick], random_x[pick]
label_picked[class_id] = (y, x)
for row, col in zip(y, x):
    _blank_buffer(da, da_shape, row, col)

for class_id in np.arange(2, n_class+1):
    n_wanted = int(n_sample_class[class_id-1])

    # Pick in batches of about 5 points, re-reading the candidate pool between
    # batches. Same batch count as before (n_wanted // 5), but at least one
    # batch, and any remainder is spread over the first batches so the total is
    # always exactly n_wanted.
    n_batch = max(n_wanted // 5, 1)
    base, extra = divmod(n_wanted, n_batch)
    batch_sizes = [base + 1]*extra + [base]*(n_batch - extra)

    batches_y, batches_x = [], []
    for n_points in batch_sizes:
        # flatnonzero always yields a 1-D index array, even for a single match.
        index = np.flatnonzero(da == class_id)
        if index.size < n_points:
            raise ValueError(
                f"Only {index.size} class {class_id} pixels remain outside the "
                f"{min_dist}-pixel buffers, cannot pick {n_points} more; "
                "reduce min_dist or the number of samples")
        picked = np.random.choice(index, n_points, replace=False)
        # convert back to x, y
        y, x = np.unravel_index(picked, da_shape)
        for row, col in zip(y, x):
            _blank_buffer(da, da_shape, row, col)
        batches_y.append(y)
        batches_x.append(x)

    # Newest batch first, which keeps the point order of the previous
    # concatenate-in-front implementation.
    sample_y = np.concatenate(batches_y[::-1])
    sample_x = np.concatenate(batches_x[::-1])
    label_picked[class_id] = (sample_y, sample_x)
    

CPU times: user 5min 36s, sys: 1min 49s, total: 7min 26s
Wall time: 7min 26s


In [15]:
# Gather the sampled coordinates of every class into one table with columns
# 'y', 'x' and 'class'.
frames = []
for class_id in np.arange(1, n_class+1):
    y, x = label_picked[class_id]
    # Translate row/column indices into coordinate values.
    df = pd.DataFrame({'y': dataset.y[y].values, 'x':dataset.x[x].values})
    # Alternative source: the per-class CSV written by the sampling cell.
    #df = pd.read_csv(f'{area_name}/{area_name}_class_{class_id}.csv', header=None, names=['y','x'] )
    df['class'] = class_id
    frames.append(df)

# DataFrame.append was removed in pandas 2.0; a single concat gives the same
# result and avoids re-copying the table on every iteration.
dfs = pd.concat(frames, ignore_index=True)

len(dfs)

500

In [16]:
# Turn the sampled coordinates into point geometries, tagged with the CRS
# stored on the label raster.
sample_points = gpd.points_from_xy(dfs.x, dfs.y)
gdf = gpd.GeoDataFrame(dfs, geometry=sample_points, crs=dataset.label.crs)
# reset_index() keeps the previous row index as an 'index' column.
gdf = gdf.reset_index()

# The coordinates now live in the geometry column.
gdf = gdf.drop(columns=['x', 'y'])

gdf.to_file(f'{area_name}/{area_name}_samples_batched.shp')