## Population projections statistics

Notebook environment to migrate TIF files to CF compliant CoG

In [69]:
# Import standard packages
import os
import glob
import pathlib
import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from dotenv import load_dotenv
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import rioxarray as rio
#load_dotenv()

# Import custom functionality
from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy
from coastmonitor.io.utils import name_block

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FULLTRACK_DATA")

# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
    )
)

# use local or remote data dir
use_local_data = False
ds_dirname = "WP5"

if use_local_data: 
    ds_dir = pathlib.Path().home().joinpath("data", ds_dirname)
else: 
    ds_dir = coclico_data_dir.joinpath(ds_dirname, "data", "pop_fp_statistics")

if not ds_dir.exists():
    raise FileNotFoundError("Directory with data does not exist.")


In [70]:
parquet_file = ds_dir.joinpath("pop_fp_LAU.parquet")

pop_stats = gpd.read_parquet(parquet_file)

pop_stats

Unnamed: 0,GISCO_ID,CNTR_CODE,nuts_2,nuts_3,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,...,UNDEFENDED_MAPS\1\High_End\2150\population_SSP5,UNDEFENDED_MAPS\SLR_High_end\2150\population_SSP1,UNDEFENDED_MAPS\SLR_High_end\2150\population_SSP2,UNDEFENDED_MAPS\SLR_High_end\2150\population_SSP5,UNDEFENDED_MAPS\static\High_End\2150\population_SSP1,UNDEFENDED_MAPS\static\High_End\2150\population_SSP2,UNDEFENDED_MAPS\static\High_End\2150\population_SSP5,tot_pop\2150\population_SSP1,tot_pop\2150\population_SSP2,tot_pop\2150\population_SSP5
0,DE_03361001,DE,DE93,DE93B,03361001,"Achim, Stadt",31923.0,471.398771,67.719735,2020,...,,0.144375,0.179375,,,,0.752165,14295.223125,17457.186875,63640.814859
1,DE_03361002,DE,DE93,DE93B,03361002,Blender,2885.0,75.034640,38.448908,2020,...,,,,,,,,,,
2,DE_03361003,DE,DE93,DE93B,03361003,Dörverden,9009.0,107.940908,83.462333,2020,...,,,,,,,,,,
3,DE_03361004,DE,DE93,DE93B,03361004,Emtinghausen,1464.0,68.573846,21.349247,2020,...,,,,,,,,,,
4,DE_03361005,DE,DE93,DE93B,03361005,Kirchlinteln,9911.0,56.675823,174.871744,2020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,FR_97614,FR,FRY5,FRY50,97614,Ouangani,10203.0,555.838786,18.356042,2020,...,,,,,,,,,,
11961,FR_97616,FR,FRY5,FRY50,97616,Sada,11156.0,1012.643086,11.016715,2020,...,,,,,,,,,,
11962,FR_97617,FR,FRY5,FRY50,97617,Tsingoni,13934.0,406.428213,34.284037,2020,...,,,,,,,,,,
11963,FR_97608,FR,FRY5,FRY50,97608,Dzaoudzi,17831.0,2797.974566,6.372824,2020,...,,,,,,,,,,


In [71]:
# Print all column names in for loop
for col in pop_stats.columns:
    print(col)

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP5
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP1
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP2
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP5
LOW_DEFENDED_MAPS\100\None\2010\population_SSP1
LOW_DEFENDED_MAPS\100\None\2010\population_SSP2
LOW_DEFENDED_MAPS\100\None\2010\population_SSP5
LOW_DEFENDED_MAPS\1

In [72]:
# Get rid of the 'B01' element
pop_stats.columns = pop_stats.columns.str.replace('\\B01', '', regex=False)

# Print all column names in for loop
for col in pop_stats.columns:
    print(col)

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP5
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP1
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP2
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP5
LOW_DEFENDED_MAPS\100\None\2010\population_SSP1
LOW_DEFENDED_MAPS\100\None\2010\population_SSP2
LOW_DEFENDED_MAPS\100\None\2010\population_SSP5
LOW_DEFENDED_MAPS\1

In [73]:
pop_stats_red = pop_stats

# There are columns that represent combinations of population SSP and Flood map SSP that can be dropped
for col in pop_stats_red.columns:
    
    print(col)
    # split col at backslash
    vars = col.split("\\")

    # Find outlying naming of columns
    if 'MAPS' in vars[0] and len(vars) < 5:
                
        # Drop column
        pop_stats_red.drop(columns=[col], inplace=True)

        continue

    # Match floodmap ssp with population ssp
    if 'MAPS' in vars[0] and not vars[-1].split('_')[-1] in vars[2] and not (vars[2] == 'None' and "SSP2" in vars[-1]) and not (vars[2] == 'High_End' and "SSP5" in vars[-1]):

        # Drop column
        pop_stats_red.drop(columns=[col], inplace=True)

# Print all column names in for loop
for col in pop_stats_red.columns:
    print(col)

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP5
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP1
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP5
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP1
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP2
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP5
LOW_DEFENDED_MAPS\100\None\2010\population_SSP1
LOW_DEFENDED_MAPS\100\None\2010\population_SSP2
LOW_DEFENDED_MAPS\100\None\2010\population_SSP5
LOW_DEFENDED_MAPS\1

In [74]:
# Move all columns containing "tot_pop" to column position 11
# Identify columns starting with 'tot_pop'
tot_pop_cols = [col for col in pop_stats_red.columns if col.startswith('tot_pop')]

# Identify remaining columns
other_cols = [col for col in pop_stats_red.columns if col not in tot_pop_cols]

# New column order: insert tot_pop columns at the specified location (e.g., index 11)
# Adjust the insertion index (e.g., 11) based on the size of your DataFrame
insert_index = min(12, len(other_cols))  # Ensures the index is valid
new_column_order = other_cols[:insert_index] + tot_pop_cols + other_cols[insert_index:]

# Reorder columns
pop_stats_red = pop_stats_red[new_column_order]

for col in pop_stats_red.columns:
    print(col)  

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
tot_pop\2010\population_SSP1
tot_pop\2010\population_SSP2
tot_pop\2010\population_SSP5
tot_pop\2030\population_SSP1
tot_pop\2030\population_SSP2
tot_pop\2030\population_SSP5
tot_pop\2050\population_SSP1
tot_pop\2050\population_SSP2
tot_pop\2050\population_SSP5
tot_pop\2100\population_SSP1
tot_pop\2100\population_SSP2
tot_pop\2100\population_SSP5
tot_pop\2150\population_SSP1
tot_pop\2150\population_SSP2
tot_pop\2150\population_SSP5
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\1\None\2010\population_SSP2
HIGH_DEFENDED_MAPS\static\None\2010\population_SSP2
LOW_DEFENDED_MAPS\1000\None\2010\population_SSP2
LOW_DEFENDED_MAPS\100\None\2010\population_SSP2
LOW_DEFENDED_MAPS\1\None\2010\population_SSP2
LOW_DEFENDED_MAPS\static\None\2010\population_SSP2
UNDEFENDED_MAPS\1000\None\2010\population_SSP2
UNDEFENDED_MAPS\100\None\2010\

In [76]:
# Add columns for the population statistics
# Suffixes to append
suffixes = ["\\abs_affected", "\\rel_affected", "\\abs_unaffected", "\\rel_unaffected"]

# Identify columns containing "MAPS"
maps_columns = [col for col in pop_stats_red.columns if "MAPS" in col]

# Generate expanded columns for "MAPS"
expanded_columns = [f"{col}{suffix}" for col in maps_columns for suffix in suffixes]

# Retain original columns (non-MAPS)
non_maps_columns = [col for col in pop_stats_red.columns if "MAPS" not in col]

# Create new DataFrame structure
new_columns = non_maps_columns + expanded_columns
pop_stats_rel = pd.DataFrame(columns=new_columns)

# Copy original non-MAPS data to the new DataFrame
pop_stats_rel[non_maps_columns] = pop_stats_red[non_maps_columns]

for col in pop_stats_rel.columns:
    print(col)  

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
tot_pop\2010\population_SSP1
tot_pop\2010\population_SSP2
tot_pop\2010\population_SSP5
tot_pop\2030\population_SSP1
tot_pop\2030\population_SSP2
tot_pop\2030\population_SSP5
tot_pop\2050\population_SSP1
tot_pop\2050\population_SSP2
tot_pop\2050\population_SSP5
tot_pop\2100\population_SSP1
tot_pop\2100\population_SSP2
tot_pop\2100\population_SSP5
tot_pop\2150\population_SSP1
tot_pop\2150\population_SSP2
tot_pop\2150\population_SSP5
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\abs_affected
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\rel_affected
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\abs_unaffected
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\rel_unaffected
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2\abs_affected
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2\rel_affected
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2\abs_unaffected
HIGH_DEFENDED_MAPS\100\No

In [80]:
# Iterate over all columns of pop_stats_rel containing "MAPS"	
for col in maps_columns:
    # Iterate over all columns containing "tot_pop"
    for tot_pop_col in tot_pop_cols:

        # Get column name variables
        vars = col.split("\\")
        tot_pop_vars = tot_pop_col.split("\\")
    
        if vars[-1].split('_')[-1] in tot_pop_vars[-1] and vars[3] in tot_pop_vars[1]:

            # Extract the total population
            tot_pop_cur = pop_stats_red[tot_pop_col].values

            # Extract population affected
            pop_aff_cur = pop_stats_red[col].values

            # Compute the absolute number of affected people
            abs_affected = pop_aff_cur

            # Compute the relative number of affected people
            rel_affected = pop_aff_cur / tot_pop_cur

            # Compute the absolute number of unaffected people
            abs_unaffected = tot_pop_cur - pop_aff_cur

            # Compute the relative number of unaffected people
            rel_unaffected = abs_unaffected / tot_pop_cur

            # Append the computed values to the new DataFrame
            pop_stats_rel[col + suffixes[0]] = abs_affected
            pop_stats_rel[col + suffixes[1]] = rel_affected
            pop_stats_rel[col + suffixes[2]] = abs_unaffected
            pop_stats_rel[col + suffixes[3]] = rel_unaffected

# Convert the DataFrame to a GeoDataFrame
pop_stats_rel = gpd.GeoDataFrame(pop_stats_rel, geometry=pop_stats_red.geometry)

  rel_affected = pop_aff_cur / tot_pop_cur
  rel_unaffected = abs_unaffected / tot_pop_cur


In [81]:
# Check if pop_stat_rel is still a geodataframe
if isinstance(pop_stats_rel, gpd.GeoDataFrame):
    print("pop_stats_rel is a geodataframe")

pop_stats_rel is a geodataframe


In [48]:
# Write parquet file
# parquet_file = ds_dir.joinpath("pop_fp_LAU_rel.parquet")
# pop_stats_rel.to_parquet(parquet_file)

In [82]:
# Rename the FID column to 'LAU_FID'
pop_stats_rel.rename(columns={'FID': 'LAU_FID'}, inplace=True)

# Also write to gpkg for front-end purposes
gpkg_file = ds_dir.joinpath("pop_fp_LAU.gpkg")
pop_stats_rel.to_file(str(gpkg_file), layer='pp_LAU', driver='GPKG')

In [60]:
for col in pop_stats_rel.select_dtypes(include=['object']).columns:
    print(col, pop_stats_rel[col].apply(type).value_counts())

GISCO_ID GISCO_ID
<class 'str'>    11965
Name: count, dtype: int64
CNTR_CODE CNTR_CODE
<class 'str'>    11965
Name: count, dtype: int64
nuts_2 nuts_2
<class 'str'>    11965
Name: count, dtype: int64
nuts_3 nuts_3
<class 'str'>    11965
Name: count, dtype: int64
LAU_ID LAU_ID
<class 'str'>    11965
Name: count, dtype: int64
LAU_NAME LAU_NAME
<class 'str'>    11965
Name: count, dtype: int64
FID FID
<class 'str'>    11965
Name: count, dtype: int64


In [37]:
# Write to geopackage
gpkg_file = ds_dir.joinpath("pop_fp_LAU.gpkg")
pop_stats_test.to_file(str(gpkg_file), layer='pp_LAU_rel', driver='GPKG')

AttributeError: 'DataFrame' object has no attribute 'to_file'

In [25]:
# Print all column names
for col in pop_stats_rel.columns:
    print(col)

GISCO_ID
CNTR_CODE
nuts_2
nuts_3
LAU_ID
LAU_NAME
POP_2020
POP_DENS_2
AREA_KM2
YEAR
FID
geometry
tot_pop\2010\population_SSP1
tot_pop\2010\population_SSP2
tot_pop\2010\population_SSP5
tot_pop\2030\population_SSP1
tot_pop\2030\population_SSP2
tot_pop\2030\population_SSP5
tot_pop\2050\population_SSP1
tot_pop\2050\population_SSP2
tot_pop\2050\population_SSP5
tot_pop\2100\population_SSP1
tot_pop\2100\population_SSP2
tot_pop\2100\population_SSP5
tot_pop\2150\population_SSP1
tot_pop\2150\population_SSP2
tot_pop\2150\population_SSP5
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\abs_affected
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\rel_affected
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\abs_unaffected
HIGH_DEFENDED_MAPS\1000\None\2010\population_SSP2\rel_unaffected
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2\abs_affected
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2\rel_affected
HIGH_DEFENDED_MAPS\100\None\2010\population_SSP2\abs_unaffected
HIGH_DEFENDED_MAPS\100\No

In [27]:
# Open parquet file from google bucket URL
# URL of the Parquet file
url = "https://storage.googleapis.com/coclico-data-public/coclico/pp_stats/pop_fp_LAU_rel.parquet"

# Read the Parquet file
df = pd.read_parquet(url, engine='pyarrow')

df

Unnamed: 0,GISCO_ID,CNTR_CODE,nuts_2,nuts_3,LAU_ID,LAU_NAME,POP_2020,POP_DENS_2,AREA_KM2,YEAR,...,UNDEFENDED_MAPS\100\High_End\2150\population_SSP5\abs_unaffected,UNDEFENDED_MAPS\100\High_End\2150\population_SSP5\rel_unaffected,UNDEFENDED_MAPS\1\High_End\2150\population_SSP5\abs_affected,UNDEFENDED_MAPS\1\High_End\2150\population_SSP5\rel_affected,UNDEFENDED_MAPS\1\High_End\2150\population_SSP5\abs_unaffected,UNDEFENDED_MAPS\1\High_End\2150\population_SSP5\rel_unaffected,UNDEFENDED_MAPS\static\High_End\2150\population_SSP5\abs_affected,UNDEFENDED_MAPS\static\High_End\2150\population_SSP5\rel_affected,UNDEFENDED_MAPS\static\High_End\2150\population_SSP5\abs_unaffected,UNDEFENDED_MAPS\static\High_End\2150\population_SSP5\rel_unaffected
0,DE_03361001,DE,DE93,DE93B,03361001,"Achim, Stadt",31923.0,471.398771,67.719735,2020,...,,,,,,,0.752165,0.000012,63640.062694,0.999988
1,DE_03361002,DE,DE93,DE93B,03361002,Blender,2885.0,75.034640,38.448908,2020,...,,,,,,,,,,
2,DE_03361003,DE,DE93,DE93B,03361003,Dörverden,9009.0,107.940908,83.462333,2020,...,,,,,,,,,,
3,DE_03361004,DE,DE93,DE93B,03361004,Emtinghausen,1464.0,68.573846,21.349247,2020,...,,,,,,,,,,
4,DE_03361005,DE,DE93,DE93B,03361005,Kirchlinteln,9911.0,56.675823,174.871744,2020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11960,FR_97614,FR,FRY5,FRY50,97614,Ouangani,10203.0,555.838786,18.356042,2020,...,,,,,,,,,,
11961,FR_97616,FR,FRY5,FRY50,97616,Sada,11156.0,1012.643086,11.016715,2020,...,,,,,,,,,,
11962,FR_97617,FR,FRY5,FRY50,97617,Tsingoni,13934.0,406.428213,34.284037,2020,...,,,,,,,,,,
11963,FR_97608,FR,FRY5,FRY50,97608,Dzaoudzi,17831.0,2797.974566,6.372824,2020,...,,,,,,,,,,
