In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import rasterio
import json
import random
import numpy as np
import shapely
from tqdm import tqdm
import matplotlib.pyplot as plt
import geojson
import rasterio.features
random.seed(42) # Set a fixed seed for reproducibility

img_width, img_height = 256, 256

In [2]:
# Open the tif file containing the orthographic map of Denmark 
path = './ortodata_2014_res_2_crop.tif'
dataset = rasterio.open(path)

# Get some statistics about the data, such as the bounds of the coordinates
left, bottom, right, top = np.array(dataset.bounds).astype(int)
print(left, bottom, right, top)
print(dataset.shape)
print(dataset.crs)

440000 6070000 610000 6340000
(135000, 85000)
EPSG:25832


In [3]:
%%time
# This file contains annotations for different types of nature phenomena (e.g. lakes, forests)
path_to_file = './naturtyper_layer.geojson'

with open(path_to_file, 'r') as f:
    gj = geojson.load(f)
print(len(gj['features']), gj.keys())

# Filter to the annotations of the lakes around Denmark
gj_features = []
for feature in gj['features']:
    if feature['properties']['Natyp_kode'] == 6: # Code for lakes is 6
        gj_features.append(feature)
print(len(gj_features))
gj_features[0]

312325 dict_keys(['type', 'name', 'crs', 'features'])
152920
CPU times: user 1min 14s, sys: 5.62 s, total: 1min 20s
Wall time: 1min 20s


{"geometry": {"coordinates": [[[[501332.248, 6224773.935], [501334.244, 6224779.934], [501334.244, 6224784.933], [501333.246, 6224790.932], [501327.243, 6224789.932], [501317.249, 6224783.933], [501316.243, 6224780.934], [501319.244, 6224774.935], [501324.25, 6224771.936], [501329.247, 6224770.936], [501332.248, 6224773.935]]]], "type": "MultiPolygon"}, "properties": {"Aendr_kode": 0, "Aendrbegr": "Ikke udfyldt", "Besig_dato": null, "Bruger_id": "00000000-0000-0000-0000-000000000000", "CVR_kode": 29189919, "CVR_navn": "Herning kommune", "Gl_sys_ref": null, "Journalnr": null, "Link": null, "Natyp_kode": 6, "Natyp_navn": "Sø", "Objekt_id": "0460cd7c-5353-11e2-af2b-00155d01e765", "Off_kode": 1, "Offentlig": "Synlig for alle", "Oprettet": "2006-12-31T01:00:00", "Oprindelse": "Ikke udfyldt", "Oprindkode": 0, "Sagsbeh": null, "Shape_area": 252.94599999301087, "Shape_length": 0.0, "Status": "Gældende / Vedtaget", "Statuskode": 3, "Systid_fra": "2006-12-31T01:00:00", "Systid_til": null, "Temak

In [4]:
# Get dimensions and transform of the original raster
out_shape = (dataset.height, dataset.width)
transform = dataset.transform

# Initialize an empty list to store the geometries
geometries = []

failed, skipped = 0, 0

# Loop over the annotations
for i, feature in tqdm(enumerate(gj_features)):
    geometry = feature['geometry']
    coords = geometry['coordinates']
    try:
        poly = shapely.geometry.shape(geometry)
        if not poly.is_valid: # Skip invalid shapes
            skipped += 1
            continue
    
        # Get the coordinates of the lake in the form [(x1, y1), (x2, y2), ...]
        coords_xy = np.array(coords).reshape(-1, 2)#.astype(int)
            
        # Split x and y coordinates
        x, y = coords_xy[:, 0], coords_xy[:, 1]
        
        # Transform the polygon's coordinates to indices within the orthographic raster
        x_trans, y_trans = rasterio.transform.rowcol(transform, x, y)

        # Convert to a polygon and add to our list of shapes
        poly = shapely.Polygon(zip(x_trans, y_trans))
        
        # Convert to the GeoJSON format for rasterio
        geometries.append((shapely.geometry.mapping(poly), 1))
        
    except Exception as e:
        if failed == 0:
            print(f"Failed to convert annotation {i}: {e}")
        failed += 1
        continue

print('Creating mask...')
# Rasterize all shapes onto a single map
mask = rasterio.features.rasterize(
    geometries, out_shape=out_shape, fill=0, 
    default_value=1, dtype=rasterio.uint8
)

print(f'Failed to convert {failed} annotations')
print(f'Skipped {skipped} annotations')

174it [00:00, 1735.64it/s]

Failed to convert annotation 7: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (1, 2) + inhomogeneous part.


152920it [01:01, 2474.06it/s]


Creating mask...
Failed to convert 3760 annotations
Skipped 34 annotations


In [5]:
# Prepare folders to store the annotated data in
base_folder = 'denmark_data'
os.makedirs(base_folder, exist_ok = True)
# Create the 'splits' - here we only use the test split as we only evaluate on the Danish data
for split in ['test']: #['train', 'val', 'test']:
    os.makedirs(f'{base_folder}/{split}', exist_ok = True)
    os.makedirs(f'{base_folder}/{split}/msk', exist_ok = True)
    os.makedirs(f'{base_folder}/{split}/img', exist_ok = True)

In [6]:
%%time
img_rgb  = dataset.read((1,2,3))#.astype(np.float16)
print(img_rgb.shape)

(3, 135000, 85000)
CPU times: user 9.74 s, sys: 1min 23s, total: 1min 33s
Wall time: 1min 40s


In [7]:
I, J = img_rgb.shape[1:]
success = 0
idx = 0
chosen_splits = []
for i in tqdm(range(0, I-img_width, img_width)):
    for j in range(0, J, img_height):
        x_start, y_start = i, j

        # Define the bounds of the image and mask, then crop them
        x_end, y_end = i + img_width, j + img_height
        mask_crop = mask[x_start:x_end, y_start:y_end]
        img_crop = img_rgb[:, x_start:x_end, y_start:y_end]

        # Ensure we have the right shape
        if mask_crop.shape != (img_width, img_height):
            continue
        # Only keep images with some amount of water in them for simplicity
        if np.where(mask_crop == 1)[0].shape == (0,):
            continue
        success += 1
        # Choose which split to assign the image and mask to, using weighted randomness
        # Since we just use the Danish data as test data, I have set it to 100% test set
        splits = ['test'] # * 25 + ['val'] * 25 + ['train'] * 50
        chosen_split = random.choice(splits)
        chosen_splits.append(chosen_split)

        # Save the mask as a numpy array
        np.save(f'{base_folder}/{chosen_split}/msk/{idx}.npy', mask_crop)

        # Make a new TIF file with the right properties in terms of bounds, transforms, etc.
        x_min, y_min = rasterio.transform.xy(dataset.transform, x_start, y_start)
        x_max, y_max = rasterio.transform.xy(dataset.transform, x_end, y_end)
        transform = rasterio.transform.from_bounds(x_min, y_min, x_max, y_max, x_end - x_start, y_end - y_start)
    
        with rasterio.open(f'{base_folder}/{chosen_split}/img/{idx}.tif', 'w', driver = 'GTiff', width = img_crop.shape[2], height = img_crop.shape[1],
                            count = 3,  dtype = img_crop.dtype, crs = dataset.crs, transform = transform) as f:
            for c in range(3): # Write the color channels to the tif file
                f.write(img_crop[c], c + 1)
        idx += 1
print(success)

100%|█████████████████████████████████████████████████████████████████████████████████| 527/527 [20:24<00:00,  2.32s/it]

40270





In [8]:
from collections import Counter
Counter(chosen_splits)

Counter({'test': 40270})

In [9]:
import sys
sys.path.append('./UNet_code')
from data_loader import CustomDataLoader
from torch.utils.data import DataLoader

batch_size = 4
base_folder = 'denmark_data'
image_path = base_folder + '/{}/img/*'
mask_path = base_folder + '/{}/msk/*'

test_dataset = CustomDataLoader(image_path.format('test'), mask_path.format('test'), channels = 'r.g.b')
test_loader = DataLoader(dataset = test_dataset, batch_size = batch_size, shuffle = False, num_workers = 4)

# To test that it works :)
# for batch in train_loader:
#     print(batch)
#     break