This notebook contains code for collecting land-use data from the Copernicus CORINE Land Cover (LULC) dataset for the generated grids and then calculating an entropy value (a measure of land-use mix) for each 100 m x 100 m grid cell. The code relies on Google Earth Engine for processing and on Google Cloud Storage for intermediate storage of the data.

In [None]:
import ee
import geopandas as gpd
import rasterio
from rasterio.windows import from_bounds
import os
import numpy as np
import logging
from multiprocess import Pool
import glob
from functools import partial

### data collection

In [None]:
# glob pattern (relative to the working directory) matching the grid files
grid_path = 'data/*.parquet'

# collect every parquet grid file matching the pattern; glob.glob already
# returns a list, and sorting makes the processing order reproducible
grids_list = sorted(glob.glob(grid_path))

# authenticate and initialize Google Earth Engine
ee.Authenticate()
ee.Initialize()

In [None]:
# Earth Engine asset holding the Copernicus CORINE land cover image, and the
# name of the band that is used from it
corine_asset_id = 'COPERNICUS/CORINE/V20/100m/2018'
landcover_band = 'landcover'

lc_dataset = ee.Image(corine_asset_id)
lc_img = lc_dataset.select(landcover_band)

In [None]:
# land cover codes to keep and the simplified class assigned to each of them;
# the two lists are matched position by position
source_codes = [111, 112, 121, 122, 123, 124, 141, 142, 211, 212, 213, 221, 222, 223, 231, 241, 242, 243, 244, 311, 312, 313, 321, 322, 323, 324, 333, 511, 512]
simplified_classes = [1, 1, 2, 2, 2, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]

# remap with 0 as the default value, then mask out the pixels that ended up as 0
mask = lc_img.remap(source_codes, simplified_classes, 0)
is_nonzero = mask.neq(0)
masked_lc_img = mask.updateMask(is_nonzero)

In [None]:
# entropy of the masked land cover image, computed once per square-kernel radius
entropy_imgs = [
    masked_lc_img.entropy(ee.Kernel.square(radius=kernel_radius))
    for kernel_radius in (5, 10, 15)
]

# keep the individual images available under their own names as well
entropy_5, entropy_10, entropy_15 = entropy_imgs

In [None]:
def export_entropy(grid_path, img, radius):
    """
    Start an Earth Engine task that exports an image, limited to the extent of one grid file,
    to a Google Cloud Storage bucket as a cloud-optimized GeoTIFF.

    Args:
    - grid_path (str): Path to the parquet grid file. The part of the file name after the last
      underscore and before the following dot is used as the grid number.
    - img (ee.Image): Image to export.
    - radius (str): Radius label; it is only used to build the name of the export.

    Returns:
    - The export task, already started, so callers can monitor it if they want to.
    """
    # parse the file name only, so underscores or dots in directory names cannot
    # leak into the grid number
    grid_number = os.path.basename(grid_path).split('_')[-1].split('.')[0]
    export_name = f'ent_100m_{grid_number}_{radius}'

    # bounding box of the whole grid in lon/lat, used as the export region
    grid_data = gpd.read_parquet(grid_path)
    min_x, min_y, max_x, max_y = grid_data.to_crs('epsg:4326').total_bounds

    ee_bounds = ee.Geometry.Polygon([
        [[min_x, min_y], [min_x, max_y], [max_x, max_y], [max_x, min_y]]
    ])

    export_params = {
        'image': img,
        'bucket': 'cog-bucket-test',
        'description': export_name,
        'fileNamePrefix': export_name,
        'scale': 100,
        'region': ee_bounds.getInfo()['coordinates'],
        'crs': 'EPSG:3035',
        'fileFormat': 'GeoTIFF',
        'formatOptions': {'cloudOptimized': True},
        'maxPixels': 1e12,
    }

    task = ee.batch.Export.image.toCloudStorage(**export_params)
    task.start()
    return task

In [None]:
# sequential: start one export task per (entropy image, grid file) pair.
# The labels are paired with entropy_imgs by position, so they must be listed
# in the same order as the images.
radius_labels = ['5', '10', '15']
for radius, img in zip(radius_labels, entropy_imgs):
    # the loop variable is not called grid_path, so the module-level glob
    # pattern of that name is not overwritten
    for grid_file in grids_list:
        export_entropy(grid_file, img, radius)

### calculate land use mix for 100m x 100m grids

In [None]:
# set the environment variable for Google Cloud credentials
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/your/credentials.json'

# paths of the parquet grid files; glob.glob already returns a list, and
# sorting makes the processing order reproducible between runs
grids_list = sorted(glob.glob('data/*.parquet'))

# # configure logging (recommended if you monitor processing over a lot of files);
# # while this stays commented out, logging keeps its default WARNING level, so
# # messages sent with logging.info are not shown
# # NOTE(review): the log file name looks carried over from another notebook - confirm
# log_path = 'logs/slope.log'

# # ensure log directory exists
# log_dir = os.path.dirname(log_path)
# if not os.path.exists(log_dir):
#     os.makedirs(log_dir)

# logging.basicConfig(filename=log_path, level=logging.INFO,
#                     format='%(asctime)s:%(levelname)s:%(message)s', force=True)

In [None]:
def calculate_ent_row(row, src):
    """
    Read the raster value under a single row (geometry) of a GeoDataFrame.

    The bounding box of the row's geometry is converted to a raster window; a value is only
    returned when that window covers exactly one pixel.

    Args:
    - row (GeoSeries): A GeoDataFrame row representing a polygon.
    - src (rasterio.io.DatasetReader): Open raster source to read data from.

    Returns:
    - The value of the first raster band at that pixel, or None when the window does not have
      shape (1, 1) (for example when it is empty or spans several pixels).
    """
    left, bottom, right, top = row.geometry.bounds
    window = from_bounds(left, bottom, right, top, src.transform)
    # only the first band is used, so read just that band (gives a 2-D array)
    img = src.read(1, window=window)

    if img.shape != (1, 1):
        # the check also fails for empty windows, so report the actual shape
        logging.info(f'Expected exactly one pixel in the window, got shape {img.shape}')
        return None  # Consider returning NaN or another value as appropriate
    return img[0, 0]

def process_grid(grid_path, radius):
    """
    Process a single grid file to calculate and update entropy data.

    The grid is read from grid_path, one value per row is read from the entropy raster that
    belongs to this grid and radius, the values are stored in the column 'ent_<radius>', and the
    grid file is overwritten. Nothing is done when that column already exists. Any error is
    logged (with traceback) instead of being raised, so one failing grid does not stop a batch.

    Args:
    - grid_path (str): Path to the grid file. The part of the file name after the last underscore
      and before the following dot is used as the grid number.
    - radius (str): The radius parameter for which entropy was calculated.
    """
    try:
        # parse the file name only, so underscores or dots in directory names cannot
        # leak into the grid number
        grid_number = os.path.basename(grid_path).split('_')[-1].split('.')[0]
        grid_gdf = gpd.read_parquet(grid_path)
        column = f'ent_{radius}'

        if column in grid_gdf.columns:
            logging.info(f'Skipping grid {grid_number} for radius {radius} as entropy already calculated')
            return

        cog_path = f'./eurostat_grid/grids_100/entropy_data/entropy_{radius}/ent_100m_{grid_number}_{radius}.tif'
        logging.info(f'Started processing grid {grid_number} with {cog_path}')

        with rasterio.open(cog_path) as src:
            # read the raster through a reprojected copy when the CRS differs, so the
            # grid written back below keeps its original CRS and geometries
            if grid_gdf.crs != src.crs:
                sampling_gdf = grid_gdf.to_crs(src.crs)
            else:
                sampling_gdf = grid_gdf
            calculate_with_src = partial(calculate_ent_row, src=src)
            entropy_values = sampling_gdf.apply(calculate_with_src, axis=1)

        # to_crs keeps the index, so the values line up with the original rows
        grid_gdf[column] = entropy_values
        grid_gdf.to_parquet(grid_path)
        logging.info(f'Successfully processed grid {grid_path} for {radius} radius')
    except Exception as e:
        # logging.exception records the traceback in addition to the message
        logging.exception(f'Error processing grid {grid_path} for {radius} radius: {e}')

In [None]:
# radius labels to process
radii = ['5', '10', '15']

# sequential: every radius of one grid file is handled before the next file
grid_radius_pairs = [(grid_file, radius) for grid_file in grids_list for radius in radii]
for grid_file, radius in grid_radius_pairs:
    process_grid(grid_file, radius)

# # parallel
# for radius in radii:
#     num_processes = 7
#     process_with_radius = partial(process_grid, radius=radius)
#     with Pool(processes=num_processes) as pool:
#         pool.map(process_with_radius, grids_list)