## Population projections


Notebook to convert the population projection TIF files into CF-compliant Cloud Optimized GeoTIFFs (COGs).

In [2]:
# Optional: code formatter, installed as a JupyterLab extension
#%load_ext lab_black
# Optional: code formatter, installed as a Jupyter Notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

### Configure OS independent paths

In [1]:
# Import standard packages
import os
import glob
import pathlib
import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from dotenv import load_dotenv
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import rioxarray as rio
#load_dotenv()

# Import custom functionality
from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy
from coastmonitor.io.utils import name_block

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FULLTRACK_DATA")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
# The location below is installation specific: change it to the udunits2.xml file dir in your
# Python installation. The variable is only set when that file really exists, so an already
# valid UDUNITS2_XML_PATH (other machine / other OS) is never replaced by a dead path.
udunits2_xml = pathlib.Path.home().joinpath(
    r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
)
if udunits2_xml.is_file():
    os.environ["UDUNITS2_XML_PATH"] = str(udunits2_xml)

# use local or remote data dir
use_local_data = False
ds_dirname = "WP5"

if use_local_data:
    ds_dir = pathlib.Path.home().joinpath("data", "tmp", ds_dirname)
else:
    ds_dir = coclico_data_dir.joinpath(ds_dirname)

# Stop early: every later cell reads from or writes below ds_dir
if not ds_dir.exists():
    raise FileNotFoundError("Directory with data does not exist.")

# directory to export result (make if not exists)
cog_dir = ds_dir.joinpath("single_cog_test")  # for checking CF compliancy
cog_dirs = ds_dir.joinpath("cogs")  # for making all files CF compliant
cog_dir.mkdir(parents=True, exist_ok=True)
cog_dirs.mkdir(parents=True, exist_ok=True)


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# Project paths & files (manual input)
# test_path: raster opened by the single-file test; CF_dir: working directory handed to the CF checker
test_path = ds_dir / "data" / "population_v2" / "SSP1_2010_EU_UK.tif"
CF_dir = ds_dir / "CF"

In [3]:
# NetCDF attribute alterations by means of metadata template
import json

# Read the template inside a context manager so the file handle is closed again after loading
with open(ds_dir.joinpath("metadata", "metadata_population.json"), "r") as meta_json:
    meta_data = json.load(meta_json)

# NOTE: the json module cannot handle trailing commas; these have been deleted manually from the data

In [4]:
# Run gdal info to check the no_data_value, which xarray shows as _FillValue
# For this data it is: -2147483647
pp_test = xr.open_dataset(
    test_path,
    engine="rasterio",
    mask_and_scale=False,
)


In [5]:
# Test for single .tif-file
pp_test = xr.open_dataset(test_path, engine="rasterio", mask_and_scale=False)

# Label the bands B01, B02, ... The band count is read from `.sizes`, because looking it up
# through `Dataset.dims[...]` raises a FutureWarning in recent xarray versions.
pp_test = pp_test.assign_coords(
    band=("band", [f"B{k+1:02}" for k in range(pp_test.sizes["band"])])
)
# Turn every band into a data variable of its own
pp_test = pp_test["band_data"].to_dataset("band")

for var in pp_test:

    da = pp_test[var]

    # Swap the original nodata value (-2147483647) for -9999 and register it as nodata.
    # NOTE(review): astype("int32") drops the fractional part of the values — confirm that
    # integer output is intended for this data.
    da = (
        da.where(da != -2147483647, -9999)
        .astype("int32")
        .rio.write_nodata(-9999)
        .rio.set_spatial_dims(x_dim="x", y_dim="y")
    )

    # File name for the exported raster of this band
    item_name = name_block(
        da,
        storage_prefix="",
        name_prefix="",
        include_band=da.name,
        time_dim=False,
        x_dim="x",
        y_dim="y",
    )

    # convert to dataset
    dad = da.to_dataset()

    # add all attributes (again)
    for attr_name, attr_val in meta_data.items():
        if attr_name == 'PROVIDERS':
            # stored as a JSON string instead of the raw (nested) value
            attr_val = json.dumps(attr_val)
        if attr_name == "MEDIA_TYPE":  # change media type to tiff, leave the rest as is
            attr_val = "IMAGE/TIFF"
        dad.attrs[attr_name] = attr_val

    dad.attrs['Conventions'] = "CF-1.8"

    # export file
    outpath = cog_dir.joinpath(item_name)
    dad.rio.to_raster(outpath, driver='COG')



  pp_test = pp_test.assign_coords(band=("band", [f"B{k+1:02}" for k in range(pp_test.dims["band"])]))


In [6]:
# Check newly created .tif
pp_test_new = xr.open_dataset(outpath, engine="rasterio", mask_and_scale=False)

# Pull the raw arrays once and reuse them for both statistics
cog_values = pp_test_new['band_data'].values
tif_values = pp_test['B01'].values

# For min no data values are excluded (only cells above zero are taken into account)
cog_min = np.min(cog_values[cog_values > 0])
tif_min = np.min(tif_values[tif_values > 0])

print('min of created  COG: ' + str(cog_min))
print('min of original TIF: ' + str(tif_min))
print('max of created  COG: ' + str(np.max(cog_values)))
print('max of original TIF: ' + str(np.max(tif_values)))
print('Values should be the same')


min of created  COG: 1
min of original TIF: 1.8026876e-06
max of created  COG: 28445
max of original TIF: 28445.998
Values should be the same


In [7]:
# export to nc for quick CF compliancy check..
nc_name = item_name.replace(".tif", ".nc")
dad.to_netcdf(path=cog_dir.joinpath(nc_name))

# Show the working directory used by the compliancy check
CF_dir

WindowsPath('P:/11207608-coclico/FULLTRACK_DATA/WP5/CF')

In [8]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile=cog_dir.joinpath(item_name.replace(".tif", ".nc")), working_dir=CF_dir)

In [9]:
# save original CF compliancy
save_compliancy(
    cap,
    testfile=cog_dir.joinpath(item_name.replace(".tif", ".nc")),
    working_dir=CF_dir,
)



In [12]:
def get_paths(folder_structure, base_dir=''):
    """Flatten a nested folder description into a list of relative paths.

    Parameters
    ----------
    folder_structure : dict
        Maps a folder name to either a dict (nested sub-structure) or a list of
        sub-folder names. Values of any other type are ignored.
    base_dir : str, optional
        Path that is prepended to every generated path.

    Returns
    -------
    list of str
        One path per non-empty list item (empty strings are skipped); an empty
        list contributes the folder of its key itself.
    """
    collected = []
    for name, content in folder_structure.items():
        if isinstance(content, dict):
            # descend, using this folder as the new base
            collected += get_paths(content, os.path.join(base_dir, name))
        elif isinstance(content, list):
            if not content:
                # no sub-folders listed: the folder itself is the path
                collected.append(os.path.join(base_dir, name))
            else:
                collected += [
                    os.path.join(base_dir, name, leaf)
                    for leaf in content
                    if leaf != ""
                ]
    return collected

# List all tif files present in the data folder.
# glob returns its matches in arbitrary order, while the conversion loop pairs this list
# positionally with path_list; sort it so the pairing is deterministic.
tif_list = sorted(glob.glob(str(ds_dir.joinpath("data", "population_v2", "*.tif"))))

# List the desired folder structure as a dict
# NOTE: make sure the resulting path_list (based on folder structure) matches the sorted tif_list
# NOTE: every year is listed once per scenario, i.e. one tif per scenario/year combination is expected
folder_structure = {
    "SSP1": ["2010", "2030", "2050", "2100"],
    "SSP2": ["2010", "2030", "2050", "2100"],
    "SSP3": ["2010", "2030", "2050", "2100"],
    "SSP4": ["2010", "2030", "2050", "2100"],
    "SSP5": ["2010", "2030", "2050", "2100"],
}

# Get list of paths for the folder structure
path_list = get_paths(folder_structure)

In [13]:
# Iterate over the original tif files
# path_list and tif_list are paired by position: entry i of path_list is the output folder
# (relative to cog_dirs) for entry i of tif_list.

for cur_path, cur_tif in zip(path_list, tif_list):

    # Convert current paths to pathlib
    cur_tif = pathlib.Path(cur_tif)
    cur_dir = pathlib.Path(os.path.join(cog_dirs, cur_path))
    # Mkdir if not existing
    cur_dir.mkdir(parents=True, exist_ok=True)
    # Print what we're working on
    print('currently working on: ' + str(cur_tif.name))

    # Open original dataset
    pp = xr.open_dataset(cur_tif, engine="rasterio", mask_and_scale=False)
    # Label the bands B01, B02, ... The band count is read from `.sizes`, because looking it
    # up through `Dataset.dims[...]` raises a FutureWarning in recent xarray versions.
    pp = pp.assign_coords(band=("band", [f"B{k+1:02}" for k in range(pp.sizes["band"])]))
    # Turn every band into a data variable of its own
    pp = pp["band_data"].to_dataset("band")

    # NOTE(review): the output file name does not contain the band, so a tif with more than
    # one band would write every band to the same file — confirm the inputs are single-band.
    for var in pp:

        da = pp[var]

        # Swap the original nodata value (-2147483647) for -9999 and register it as nodata
        da = (
            da.where(da != -2147483647, -9999)
            .astype("int32")
            .rio.write_nodata(-9999)
            .rio.set_spatial_dims(x_dim="x", y_dim="y")
        )

        # convert to dataset
        dad = da.to_dataset()

        # add all attributes (again)
        for attr_name, attr_val in meta_data.items():
            if attr_name == 'PROVIDERS':
                # stored as a JSON string instead of the raw (nested) value
                attr_val = json.dumps(attr_val)
            if attr_name == "MEDIA_TYPE":  # change media type to tiff, leave the rest as is
                attr_val = "IMAGE/TIFF"
            dad.attrs[attr_name] = attr_val

        dad.attrs['Conventions'] = "CF-1.8"

        # export file (same name as the source tif, inside the folder created above)
        outpath = cur_dir.joinpath(cur_tif.name)
        dad.rio.to_raster(outpath, driver='COG')

currently working on: SSP1_2010_EU_UK.tif
currently working on: SSP1_2030_EU_UK.tif
currently working on: SSP1_2050_EU_UK.tif
currently working on: SSP1_2100_EU_UK.tif
currently working on: SSP2_2010_EU_UK.tif
currently working on: SSP2_2030_EU_UK.tif
currently working on: SSP2_2050_EU_UK.tif
currently working on: SSP2_2100_EU_UK.tif
currently working on: SSP3_2010_EU_UK.tif
currently working on: SSP3_2030_EU_UK.tif
currently working on: SSP3_2050_EU_UK.tif
currently working on: SSP3_2100_EU_UK.tif
currently working on: SSP4_2010_EU_UK.tif
currently working on: SSP4_2030_EU_UK.tif
currently working on: SSP4_2050_EU_UK.tif
currently working on: SSP4_2100_EU_UK.tif
currently working on: SSP5_2010_EU_UK.tif
currently working on: SSP5_2030_EU_UK.tif
currently working on: SSP5_2050_EU_UK.tif
currently working on: SSP5_2100_EU_UK.tif


In [16]:
# Inspect the most recently built dataset
dad

<IPython.core.display.Javascript object>