## Notebook to process CBA from csv-file to parquet

In [1]:
# Load software
import os
import pathlib
import sys
import shapely
import pystac_client
import pandas as pd
from shapely import Polygon, geometry
from affine import Affine
from rasterio.features import shapes
import json
import itertools
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from dotenv import load_dotenv
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import rioxarray as rio

# Import custom functionality
from coclicodata.drive_config import p_drive

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FULLTRACK_DATA")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [17]:
# Set path to csv data
CBA_dir = coclico_data_dir.joinpath('WP6', 'data', 'CBA_D6.4')

# List all csv files (first focus on country files)
CBA_files = CBA_dir.glob('*country*.csv')

In [4]:
# Load first file in pandas
CBA_file = next(CBA_files)
CBA_df = pd.read_csv(CBA_file)
CBA_df

Unnamed: 0,country,nuts,coast_length,protection_2050,retreat_2050,acc_2050,protect_retreat_2050,no_adaptation_2050,protection_2100,retreat_2100,...,protect_retreat_2100,no_adaptation_2100,protection_2150,retreat_2150,acc_2150,protect_retreat_2150,no_adaptation_2150,costs_adaptation,costs_damage,costs
0,Aaland,,3886.0,0.0,4.6,0.7,0.0,94.7,0.0,4.6,...,0.0,94.7,0.0,4.6,0.7,0.0,94.7,0.3,0.0,0.3
1,Albania,AL,753.0,5.3,0.0,49.1,0.0,45.6,22.3,2.0,...,0.0,45.5,40.0,14.5,0.0,0.0,45.5,1.3,24.8,26.1
2,Belgium,BE,319.0,96.5,2.8,0.0,0.0,0.7,96.5,2.8,...,0.0,0.7,96.5,2.8,0.0,0.0,0.7,12.5,2.4,14.9
3,Bulgaria,BG,466.0,0.4,0.0,23.1,0.0,76.5,0.4,0.0,...,0.0,75.3,1.6,2.0,21.3,0.0,75.1,0.1,0.9,0.9
4,Croatia,HR,5309.0,0.5,7.3,4.6,0.0,87.7,1.7,10.6,...,0.0,87.7,1.7,10.6,0.0,0.1,87.6,1.1,0.5,1.6
5,Cyprus,CY,641.0,0.0,0.0,11.3,0.0,88.7,0.0,0.0,...,0.0,87.8,3.3,8.4,0.3,0.8,87.2,0.0,0.2,0.2
6,Denmark,DK,6527.0,4.2,69.7,0.2,0.0,25.9,4.2,70.1,...,0.0,25.7,4.2,70.2,0.1,0.0,25.6,28.3,4.9,33.3
7,Estonia,EE,2559.0,0.0,48.5,0.1,0.0,51.4,0.0,49.0,...,0.0,51.0,0.0,49.0,0.1,0.0,51.0,3.2,0.0,3.2
8,Faroe Islands,,919.0,0.2,13.4,2.7,0.0,83.6,0.6,15.8,...,0.0,83.6,0.6,15.8,0.0,0.0,83.6,0.5,0.6,1.0
9,Finland,FI,17922.0,0.7,33.1,0.0,0.4,65.8,0.7,33.1,...,0.4,65.8,0.7,33.1,0.0,0.4,65.8,16.0,3.1,19.0


In [6]:
# Read NUTS from the stac

# Setup the URL to STAC catalog in Google Cloud
catalog = pystac_client.Client.open(
    "https://storage.googleapis.com/coclico-data-public/coclico/coclico-stac/catalog.json"
)

NUTS = catalog.get_child('NUTS')

# Get the href to the NUTS data
cloud_NUTS_path = NUTS.assets['geoparquet-stac-items'].href

# Retrieve actual data using regular pandas, loading with geopandas is very slow
NUTS_data = pd.read_parquet(cloud_NUTS_path)

# Because we load with regular pandas the polygon data needs to be converted from WKB - Well-Known Binary to shapely.Polygon
NUTS_data['geometry'] = NUTS_data['geometry'].apply(shapely.wkb.loads)

# Now convert to geopandas
NUTS_data = gpd.GeoDataFrame(NUTS_data, geometry='geometry', crs='EPSG:3035')

NUTS_data

c:\SnapVolumesTemp\MountPoints\{45c63495-0000-0000-0000-100000000000}\{548BD435-0C48-4314-A26F-734F82A3D043}\SVROOT\Users\kras\AppData\Local\mambaforge\envs\coclico\Lib\site-packages\pystac_client\client.py:187: NoConformsTo: Server does not advertise any conformance classes.


Unnamed: 0,NUTS_ID,LEVL_CODE,CNTR_CODE,NAME_LATN,NUTS_NAME,MOUNT_TYPE,URBN_TYPE,COAST_TYPE,geometry
0,AL,0,AL,b'5368716970eb726961',b'5368716970eb726961',0.0,0,0,"MULTIPOLYGON (((5121233.536 2221719.441, 51208..."
1,CZ,0,CZ,b'?esko',b'?esko',0.0,0,0,"POLYGON ((4624843.654 3112209.741, 4625546.618..."
2,DE,0,DE,b'Deutschland',b'Deutschland',0.0,0,0,"MULTIPOLYGON (((4355225.365 2715902.993, 43541..."
3,DK,0,DK,b'Danmark',b'Danmark',0.0,0,0,"MULTIPOLYGON (((4650502.736 3591342.844, 46503..."
4,CY,0,CY,b'4bfd70726f73',b'??????',0.0,0,0,"MULTIPOLYGON (((6527040.718 1762367.593, 65267..."
...,...,...,...,...,...,...,...,...,...
2005,NO0B1,3,NO,b'Jan Mayen',b'Jan Mayen',3.0,3,1,"POLYGON ((3623747.621 5400386.841, 3624031.138..."
2006,EE009,3,EE,b'Kesk-Eesti',b'Kesk-Eesti',4.0,3,1,"MULTIPOLYGON (((5216227.688 4159212.769, 52172..."
2007,NO0,1,NO,b'Norge',b'Norge',0.0,0,0,"MULTIPOLYGON (((4961367.759 5413266.131, 49622..."
2008,NO0B,2,NO,b'Jan Mayen and Svalbard',b'Jan Mayen and Svalbard',,0,0,"MULTIPOLYGON (((4744650.828 6379141.635, 47446..."


In [7]:
# Only get NUTS0 level
NUTS_data = NUTS_data[NUTS_data['LEVL_CODE'] == 0]
NUTS_data.shape

(37, 9)

In [8]:
# List all csv files (first focus on country files)
import re
CBA_files = CBA_dir.glob('*country*.csv')

# Initialize empty CBA dataframe
CBA = []

for idx, CBA_file in enumerate(CBA_files):
    print(idx, CBA_file)

    # Open csv file
    cur_CBA = pd.read_csv(CBA_file)

    # Retrieve the geometry from NUTS data
    cur_CBA = cur_CBA.merge(NUTS_data[['NUTS_ID', 'geometry']], 
                              left_on='nuts', 
                              right_on='NUTS_ID', 
                              how='left')

    # Convert to a geodataframe
    cur_CBA = gpd.GeoDataFrame(cur_CBA, geometry='geometry')

    # Extract the scenario (e.g., ssp126, ssp245, or ssp585) from the file name
    scenario = "SSP126" if "SSP126" in str(CBA_file) else (
        "SSP245" if "SSP245" in str(CBA_file) else "SSP585"
    )

    # Add the scenario as a new column
    #cur_CBA['scenarios'] = scenario
    basecols = ["country", "nuts", "geometry", "coast_length", "NUTS_ID"]
    collist = cur_CBA.columns
    collist_adj = []
    for item in collist:
        # Check if there's a number in the string
        if any(char.isdigit() for char in item) and item not in basecols:
            # Split the string at the number
            split_item = [''.join(filter(str.isalpha, part)) for part in item.split('_') if part]
            number = ''.join(filter(str.isdigit, item))
            modified_item = '_'.join(split_item).rstrip('_')
            collist_adj.append("%s\%s\%s"%(modified_item, scenario, number))
        elif not any(char.isdigit() for char in item) and item not in basecols:
            collist_adj.append("%s\%s"%(item, scenario))
        else:
            collist_adj.append(item)
    cur_CBA.columns = collist_adj

    if idx > 0: # rename column
        cur_CBA.rename(columns={"geometry": "geometry_%s"%idx}, inplace=True)

    # Append to the list
    CBA.append(cur_CBA)

#CBA = pd.concat(CBA, ignore_index=True)
combined_df = pd.concat(CBA, axis=1)

# Remove duplicate columns by checking for identical data
deduplicated_df = combined_df.loc[:, ~combined_df.columns.duplicated()]
deduplicated_df.drop(["nuts", "geometry_1", "geometry_2"], axis=1, inplace=True)

# print(deduplicated_df.keys())
# print(deduplicated_df.shape)

# Write as parquet
#CBA.to_parquet(str(CBA_file.parent.joinpath('GCF.open.CBA_country.all.parquet')))
deduplicated_df.to_parquet(str(CBA_file.parent.joinpath('GCF.open.CBA_country.all.parquet')))

0 P:\11207608-coclico\FULLTRACK_DATA\WP6\data\CBA_D6.4\GCF.open.CBA_country.SSP126.csv
1 P:\11207608-coclico\FULLTRACK_DATA\WP6\data\CBA_D6.4\GCF.open.CBA_country.SSP245.csv
2 P:\11207608-coclico\FULLTRACK_DATA\WP6\data\CBA_D6.4\GCF.open.CBA_country.SSP585.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  deduplicated_df.drop(["nuts", "geometry_1", "geometry_2"], axis=1, inplace=True)


In [9]:
# write gpkg
deduplicated_df.to_file(str(CBA_dir.joinpath('GCF_open_CBA_country_all_EPSG3035.gpkg')), layer='GCF_open_CBA_country_all', driver='GPKG')

Cannot find header.dxf (GDAL_DATA is not defined)


In [19]:
CBA = gpd.read_parquet(str(CBA_dir.joinpath('GCF.open.CBA_country.all.parquet')))
CBA

Unnamed: 0,country,coast_length,protection\SSP126\2050,retreat\SSP126\2050,acc\SSP126\2050,protect_retreat\SSP126\2050,no_adaptation\SSP126\2050,protection\SSP126\2100,retreat\SSP126\2100,acc\SSP126\2100,...,protect_retreat\SSP585\2100,no_adaptation\SSP585\2100,protection\SSP585\2150,retreat\SSP585\2150,acc\SSP585\2150,protect_retreat\SSP585\2150,no_adaptation\SSP585\2150,costs_adaptation\SSP585,costs_damage\SSP585,costs\SSP585
0,Aaland,3886.0,0.0,4.6,0.7,0.0,94.7,0.0,4.6,0.7,...,0.0,94.5,0.0,5.5,0.0,0.0,94.5,0.3,0.0,0.3
1,Albania,753.0,5.3,0.0,49.1,0.0,45.6,22.3,2.0,30.3,...,0.0,45.5,40.2,14.3,0.0,0.0,45.5,2.0,25.2,27.1
2,Belgium,319.0,96.5,2.8,0.0,0.0,0.7,96.5,2.8,0.0,...,0.0,0.7,96.5,1.7,0.0,1.1,0.7,13.5,2.3,15.8
3,Bulgaria,466.0,0.4,0.0,23.1,0.0,76.5,0.4,0.0,24.4,...,0.0,75.1,8.8,15.5,0.0,0.6,75.1,0.1,0.9,1.0
4,Croatia,5309.0,0.5,7.3,4.6,0.0,87.7,1.7,10.6,0.0,...,0.1,87.6,1.7,10.5,0.0,0.3,87.5,1.4,0.4,1.8
5,Cyprus,641.0,0.0,0.0,11.3,0.0,88.7,0.0,0.0,12.2,...,0.3,86.9,2.6,10.0,0.0,0.5,86.9,0.1,0.2,0.3
6,Denmark,6527.0,4.2,69.7,0.2,0.0,25.9,4.2,70.1,0.0,...,0.2,25.3,6.4,68.2,0.1,0.2,25.2,31.8,4.6,36.4
7,Estonia,2559.0,0.0,48.5,0.1,0.0,51.4,0.0,49.0,0.1,...,0.0,49.9,0.0,50.3,0.1,0.0,49.6,3.2,0.0,3.3
8,Faroe Islands,919.0,0.2,13.4,2.7,0.0,83.6,0.6,15.8,0.0,...,0.0,83.6,0.6,15.7,0.0,0.1,83.6,0.5,0.5,1.0
9,Finland,17922.0,0.7,33.1,0.0,0.4,65.8,0.7,33.1,0.0,...,1.6,65.6,2.0,30.7,0.0,1.6,65.6,15.0,2.5,17.5


### CUT the file above up into pieces for the F/E map

In [20]:
adap_strategy = ["protection", "retreat", "acc", "protect_retreat", "no_adaptation"]
scenario = ["SSP126", "SSP245", "SSP585"]
time = ["2050", "2100", "2150"]
#keepcols = ["country", "coast_length", "costs_adaptation", "costs_damage", "costs", "NUTS_ID", "geometry", "scenarios"]
keepcols = ["country", "coast_length", "NUTS_ID", "geometry"]

# create folder structure & split files
for adap in adap_strategy:
    for scen in scenario:
        for t in time:

            # combine the variables to span to column name
            #comb_col = adap+"_"+t
            comb_col = "%s\%s\%s"%(adap, scen, t)
            keepcols.append(comb_col) # add the variable column 
            comb_col_var = ["costs_adaptation", "costs_damage", "costs"]
            for i in comb_col_var:
                keepcols.append("%s\%s"%(i, scen))
            col_rename = [i.replace("\\", "_") for i in keepcols] # replace "\" to "_" in the column names
            #print(keepcols)
            #print(col_rename)
            #print(adap, scen, t, comb_col)

            # filter the original dataframe
            fil_DF = CBA[keepcols] # filter base columns
            # fil_DF = fil_DF[fil_DF['scenarios'] == scen] # filter scenario
            # fil_DF.drop("scenarios", axis=1, inplace=True) # drop scenario column
            #print(fil_DF.keys())

            # remove the added variable column for the next iteration
            keepcols.remove(comb_col) # remove the variable column
            for i in comb_col_var:
                keepcols.remove("%s\%s"%(i, scen)) # remove the variable columns
            #print(keepcols)

            # rename the columns from "\" to "_"
            fil_DF.columns = col_rename
            #print(fil_DF.keys())

            # make the folder structure and write as parquet & gpkg
            CBA_dir.joinpath("maps", adap, scen).mkdir(parents=True, exist_ok=True)
            fil_DF.to_parquet(str(CBA_dir.joinpath("maps", adap, scen, "GCF_open_CBA_country_%s_%s_%s.parquet"%(adap, scen, t))))
            fil_DF.to_file(str(CBA_dir.joinpath("maps", adap, scen, "GCF_open_CBA_country_%s_%s_%s.gpkg"%(adap, scen, t))), layer='CBA_%s_%s_%s'%(adap, scen, t), driver='GPKG')