## Population projections statistics

Notebook environment to migrate TIF files to CF compliant CoG

In [1]:
# Import standard packages
import os
import glob
import pathlib
import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from dotenv import load_dotenv
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import rioxarray as rio
from itertools import product
#load_dotenv()

# Import custom functionality
from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy
from coastmonitor.io.utils import name_block

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FULLTRACK_DATA")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
    )
)

# use local or remote data dir
use_local_data = False
ds_dirname = "WP6"

if use_local_data: 
    ds_dir = pathlib.Path().home().joinpath("data", ds_dirname)
else: 
    ds_dir = coclico_data_dir.joinpath(ds_dirname, "data", "bgrm_delivery_18feb25")

if not ds_dir.exists():
    raise FileNotFoundError("Directory with data does not exist.")

# set ouput directory for front/end files
fe_output_dir = ds_dir.parent.parent.joinpath("front_end_data")



import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
parquet_file = ds_dir.joinpath("pop200225.parquet")

pop_stats = gpd.read_parquet(parquet_file)

# Change the direction of slashes
pop_stats = pop_stats.rename(columns=lambda x: x.replace('/', '\\'))

pop_stats

Unnamed: 0,GISCO_ID,CNTR_CODE,nuts_2,nuts_3,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,...,UNDEFENDED_MAPS\static\SSP585\2100\rel_affected,UNDEFENDED_MAPS\static\SSP585\2100\rel_unaffected,HIGH_DEFENDED_MAPS\static\SSP585\2100\abs_affected,HIGH_DEFENDED_MAPS\static\SSP585\2100\abs_unaffected,HIGH_DEFENDED_MAPS\static\SSP585\2100\rel_affected,HIGH_DEFENDED_MAPS\static\SSP585\2100\rel_unaffected,LOW_DEFENDED_MAPS\static\SSP585\2100\abs_affected,LOW_DEFENDED_MAPS\static\SSP585\2100\abs_unaffected,LOW_DEFENDED_MAPS\static\SSP585\2100\rel_affected,LOW_DEFENDED_MAPS\static\SSP585\2100\rel_unaffected
0,DE_03361001,DE,DE93,DE93B,03361001,"Achim, Stadt",31923.0,471.398771,67.719735,2020,...,0.000007,0.999993,0.221546,30784.194354,0.000007,0.999993,0.221546,30784.194354,0.000007,0.999993
1,DE_03361002,DE,DE93,DE93B,03361002,Blender,2885.0,75.034640,38.448908,2020,...,0.000000,1.000000,0.000000,3042.486282,0.000000,1.000000,0.000000,3042.486282,0.000000,1.000000
2,DE_03361003,DE,DE93,DE93B,03361003,Dörverden,9009.0,107.940908,83.462333,2020,...,,,,,,,,,,
3,DE_03361004,DE,DE93,DE93B,03361004,Emtinghausen,1464.0,68.573846,21.349247,2020,...,,,,,,,,,,
4,DE_03361005,DE,DE93,DE93B,03361005,Kirchlinteln,9911.0,56.675823,174.871744,2020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,FR_97614,FR,FRY5,FRY50,97614,Ouangani,10203.0,555.838786,18.356042,2020,...,,,,,,,,,,
11961,FR_97616,FR,FRY5,FRY50,97616,Sada,11156.0,1012.643086,11.016715,2020,...,,,,,,,,,,
11962,FR_97617,FR,FRY5,FRY50,97617,Tsingoni,13934.0,406.428213,34.284037,2020,...,,,,,,,,,,
11963,FR_97608,FR,FRY5,FRY50,97608,Dzaoudzi,17831.0,2797.974566,6.372824,2020,...,,,,,,,,,,


In [6]:
# Check if column names contain "None"
for col in pop_stats.columns:
    if "High_end" in col:
        print(col)

In [None]:
# # Store the data in a new parquet file
pop_stats.to_parquet(parquet_file.parent.parent.parent.joinpath("front_end_data", "pop_stats.parquet"))

In [11]:
str(parquet_file.parent.parent.joinpath("front_end_data", "pop_stats.parquet"))

'P:\\11207608-coclico\\FULLTRACK_DATA\\WP6\\data\\front_end_data\\pop_stats.parquet'

In [14]:
for col in pop_stats.columns:
    print(col)

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
UNDEFENDED_MAPS\static\SSP126\2010\abs_affected
UNDEFENDED_MAPS\static\SSP126\2010\abs_unaffected
UNDEFENDED_MAPS\static\SSP126\2010\rel_affected
UNDEFENDED_MAPS\static\SSP126\2010\rel_unaffected
HIGH_DEFENDED_MAPS\static\SSP126\2010\abs_affected
HIGH_DEFENDED_MAPS\static\SSP126\2010\abs_unaffected
HIGH_DEFENDED_MAPS\static\SSP126\2010\rel_affected
HIGH_DEFENDED_MAPS\static\SSP126\2010\rel_unaffected
LOW_DEFENDED_MAPS\static\SSP126\2010\abs_affected
LOW_DEFENDED_MAPS\static\SSP126\2010\abs_unaffected
LOW_DEFENDED_MAPS\static\SSP126\2010\rel_affected
LOW_DEFENDED_MAPS\static\SSP126\2010\rel_unaffected
UNDEFENDED_MAPS\1\SSP126\2010\abs_affected
UNDEFENDED_MAPS\1\SSP126\2010\abs_unaffected
UNDEFENDED_MAPS\1\SSP126\2010\rel_affected
UNDEFENDED_MAPS\1\SSP126\2010\rel_unaffected
UNDEFENDED_MAPS\100\SSP126\2010\abs_affected
UNDEFENDED_MAPS\100\SSP126\2010\abs_unaffected
UNDEFENDED_MAPS\100\SSP126\2

In [None]:
# For the front-end the there is a maximum amount of rows (255) that is allowed. For that reason we rework the dataset.
# The variables map_type, return_period, scenario and time will be added in a column. Strechting the dataset in the vertical direction.
df = pop_stats.copy()

# Identify base columns that remain unchanged
base_columns = [
    "GISCO_ID", "CNTR_CODE", "nuts_2", "nuts_3", "LAU_ID", "LAU_NAME", 
    "POP_2020", "POP_DENS_2", "AREA_KM2", "YEAR", "FID", "geometry"
]

# Identify dynamic columns (all non-base columns)
variable_columns = [col for col in df.columns if col not in base_columns]

# Extract components from column names
df_extracted = pd.DataFrame([col.split('\\') for col in variable_columns], 
                            columns=["map_type", "return_period", "scenario", "time", "impact_type"])
df_extracted["original_column"] = variable_columns  # Keep track of original names

# Melt dataframe to long format
df_long = df.melt(id_vars=base_columns, value_vars=variable_columns, var_name="original_column", value_name="value")

# Merge extracted components directly into the long dataframe
df_long = df_long.merge(df_extracted, on="original_column").drop(columns=["original_column"])

# Pivot to create separate columns for impact types
df_wide = df_long.pivot(index=base_columns + ["map_type", "return_period", "scenario", "time"], 
                        columns="impact_type", 
                        values="value").reset_index()

# Remove column index name
df_wide.columns.name = None


pop_stats_gpkg = df_wide.copy()



KeyboardInterrupt: 

In [None]:
# Convert to geodataframe
pop_stats_gpkg = gpd.GeoDataFrame(pop_stats_gpkg, geometry="geometry")

# Rename FID to LAU_FID
pop_stats_gpkg.rename(columns={'FID': 'LAU_FID'}, inplace=True)

pop_stats_gpkg

In [None]:
# Write to geopackage
pop_stats_gpkg.to_file(fe_output_dir.joinpath("pop_stats.gpkg"), driver="GPKG")

In [8]:
# List all columns containing "MAPS"
maps_columns = [str(Path(col).parent) for col in pop_stats.columns if "MAPS" in col]
print(len(maps_columns)/4)

144.0


In [265]:
# Write pop_stats to gpkg

pop_stats_gpkg = pop_stats.copy()

pop_stats_gpkg.rename(columns={'FID': 'LAU_FID'}, inplace=True)

pop_stats_gpkg.to_file(ds_dir.joinpath("pop_fp_LAU.gpkg"), driver="GPKG")

In [None]:
ds_dir.joinpath("pop_fp_LAU.gpkg")

WindowsPath('P:/11207608-coclico/FULLTRACK_DATA/WP6/data/bgrm_delivery_18feb25/pop_stats.gpkg')

In [20]:
# Only get unique map_columns
maps_columns = list(set([str(Path(col).parent) for col in pop_stats.columns if "MAPS" in col]))

for map_column in maps_columns:
    
    print(map_column)
    # Select all column names containing the current map column
    cur_stats = pop_stats.loc[:, pop_stats.columns.str.startswith(map_column)]

    # split column names at backslashes, only keep last element
    cur_stats.columns = cur_stats.columns.str.split("\\").str[-1]

    # Add the cur stats to the first 11 columns of pp_stats
    pp_gpkg = pd.concat([pop_stats.iloc[:, :12], cur_stats], axis=1)

    # Set FID to LAU_FID for storing as geopackage
    pp_gpkg.rename(columns={'FID': 'LAU_FID'}, inplace=True)

    # Set geopackage file name
    scen_name = map_column.replace("\\", "_")

    vars = map_column.split("\\")
    map_type = vars[0]
    rp = vars[1]
    scen = vars[2]
    year = vars[3]

    gpkg_fp = ds_dir.parent.parent.joinpath("front_end_data","map_stats", map_type,rp,scen)
    gpkg_fp.mkdir(parents=True, exist_ok=True)

    gpkg_file = gpkg_fp.joinpath(f"pop_stats_{scen_name}.gpkg")

    # Write to geopackage
    pp_gpkg.to_file(str(gpkg_file), layer=f'pop_stats_{scen_name}', driver='GPKG')

    # Write to parquet
    pp_gpkg.to_parquet(gpkg_fp.joinpath(f"pop_stats_{scen_name}.parquet"))

LOW_DEFENDED_MAPS\1000\SSP245\2030
LOW_DEFENDED_MAPS\1\SSP126\2050
LOW_DEFENDED_MAPS\100\SSP585\2010
HIGH_DEFENDED_MAPS\1\SSP245\2100
UNDEFENDED_MAPS\1000\SSP585\2010
UNDEFENDED_MAPS\1\SSP126\2010
UNDEFENDED_MAPS\1000\SSP585\2050
LOW_DEFENDED_MAPS\static\SSP245\2100
LOW_DEFENDED_MAPS\100\SSP585\2050
LOW_DEFENDED_MAPS\static\SSP585\2050
LOW_DEFENDED_MAPS\100\SSP245\2050
LOW_DEFENDED_MAPS\1\SSP126\2010
HIGH_DEFENDED_MAPS\static\SSP126\2030
UNDEFENDED_MAPS\1\SSP245\2030
LOW_DEFENDED_MAPS\1000\SSP126\2100
HIGH_DEFENDED_MAPS\static\SSP585\2050
HIGH_DEFENDED_MAPS\100\SSP585\2030
HIGH_DEFENDED_MAPS\1\SSP126\2030
UNDEFENDED_MAPS\1\SSP126\2050
UNDEFENDED_MAPS\1000\SSP585\2100
UNDEFENDED_MAPS\static\SSP585\2030
UNDEFENDED_MAPS\100\SSP245\2050
UNDEFENDED_MAPS\100\SSP585\2030
HIGH_DEFENDED_MAPS\1\SSP126\2100
LOW_DEFENDED_MAPS\static\SSP126\2050
UNDEFENDED_MAPS\100\SSP245\2010
UNDEFENDED_MAPS\1\SSP245\2100
UNDEFENDED_MAPS\1000\SSP126\2050
HIGH_DEFENDED_MAPS\1000\SSP585\2050
HIGH_DEFENDED_MAPS\1000\

In [50]:
maps_columns = list(set([str(Path(col).parent) for col in pop_stats.columns if "MAPS" in col]))
maps_columns

['UNDEFENDED_MAPS\\1000\\SSP126\\2100',
 'UNDEFENDED_MAPS\\100\\SSP126\\2010',
 'LOW_DEFENDED_MAPS\\100\\SSP126\\2100',
 'HIGH_DEFENDED_MAPS\\static\\SSP245\\2050',
 'HIGH_DEFENDED_MAPS\\1000\\SSP585\\2030',
 'UNDEFENDED_MAPS\\100\\SSP245\\2100',
 'HIGH_DEFENDED_MAPS\\100\\SSP126\\2010',
 'LOW_DEFENDED_MAPS\\100\\SSP126\\2010',
 'HIGH_DEFENDED_MAPS\\1000\\SSP126\\2010',
 'UNDEFENDED_MAPS\\1000\\SSP585\\2010',
 'LOW_DEFENDED_MAPS\\1\\SSP245\\2030',
 'LOW_DEFENDED_MAPS\\1\\SSP245\\2010',
 'HIGH_DEFENDED_MAPS\\100\\SSP126\\2100',
 'LOW_DEFENDED_MAPS\\1\\SSP126\\2050',
 'LOW_DEFENDED_MAPS\\100\\SSP245\\2010',
 'UNDEFENDED_MAPS\\1000\\SSP126\\2010',
 'LOW_DEFENDED_MAPS\\100\\SSP126\\2030',
 'LOW_DEFENDED_MAPS\\static\\SSP245\\2050',
 'LOW_DEFENDED_MAPS\\1000\\SSP245\\2100',
 'HIGH_DEFENDED_MAPS\\static\\SSP126\\2050',
 'LOW_DEFENDED_MAPS\\1000\\SSP585\\2100',
 'HIGH_DEFENDED_MAPS\\100\\SSP245\\2010',
 'LOW_DEFENDED_MAPS\\static\\SSP126\\2010',
 'HIGH_DEFENDED_MAPS\\1\\SSP245\\2050',
 'HIGH_

In [19]:
pp_gpkg

Unnamed: 0,GISCO_ID,CNTR_CODE,nuts_2,nuts_3,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,LAU_FID,geometry,abs_affected,abs_unaffected,rel_affected,rel_unaffected
0,DE_03361001,DE,DE93,DE93B,03361001,"Achim, Stadt",31923.0,471.398771,67.719735,2020,DE_03361001,"POLYGON ((4258927.295 3320011.519, 4259193.583...",1.106465,29489.192522,3.751963e-05,0.999962
1,DE_03361002,DE,DE93,DE93B,03361002,Blender,2885.0,75.034640,38.448908,2020,DE_03361002,"POLYGON ((4264556.862 3317435.819, 4265677.354...",0.059352,2918.771304,2.033413e-05,0.999980
2,DE_03361003,DE,DE93,DE93B,03361003,Dörverden,9009.0,107.940908,83.462333,2020,DE_03361003,"POLYGON ((4275479.269 3295636.790, 4275328.930...",0.008317,9357.887071,8.887267e-07,0.999999
3,DE_03361004,DE,DE93,DE93B,03361004,Emtinghausen,1464.0,68.573846,21.349247,2020,DE_03361004,"POLYGON ((4250468.027 3309420.666, 4250432.180...",,,,
4,DE_03361005,DE,DE93,DE93B,03361005,Kirchlinteln,9911.0,56.675823,174.871744,2020,DE_03361005,"POLYGON ((4272425.673 3310568.711, 4274136.202...",,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,FR_97614,FR,FRY5,FRY50,97614,Ouangani,10203.0,555.838786,18.356042,2020,FR_97614,"POLYGON ((8727685.023 -2785490.190, 8727139.84...",,,,
11961,FR_97616,FR,FRY5,FRY50,97616,Sada,11156.0,1012.643086,11.016715,2020,FR_97616,"POLYGON ((8727114.505 -2787670.745, 8725517.75...",,,,
11962,FR_97617,FR,FRY5,FRY50,97617,Tsingoni,13934.0,406.428213,34.284037,2020,FR_97617,"POLYGON ((8730964.436 -2773602.954, 8730972.75...",,,,
11963,FR_97608,FR,FRY5,FRY50,97608,Dzaoudzi,17831.0,2797.974566,6.372824,2020,FR_97608,"MULTIPOLYGON (((8745852.102 -2769772.861, 8742...",,,,


In [None]:
import shapely

# Open parquet file from google bucket URL
# URL of the Parquet file
url = "https://storage.googleapis.com/coclico-data-public/coclico/pp_stats/pop_fp_LAU_rel.parquet"

# Read the Parquet file
pp_stats = pd.read_parquet(url, engine='pyarrow')

# Because we load with regular pandas the polygon data needs to be converted from WKB - Well-Known Binary to shapely.Polygon
pp_stats['geometry'] = pp_stats['geometry'].apply(shapely.wkb.loads)

# Now convert to geopandas
pp_stats = gpd.GeoDataFrame(pp_stats, geometry='geometry', crs='EPSG:3035')

pp_stats