## Notebook to process CBA from csv-file to parquet

In [2]:
# Load software
import os
import pathlib
import sys
import shapely
import pystac_client
import pandas as pd
from shapely import Polygon, geometry
from affine import Affine
from rasterio.features import shapes
import json
import itertools
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from dotenv import load_dotenv
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
import rioxarray as rio

# Import custom functionality
from coclicodata.drive_config import p_drive

# Define (local and) remote drives
coclico_data_dir = p_drive.joinpath("11207608-coclico", "FULLTRACK_DATA")


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [3]:
# Set path to csv data
CBA_dir = coclico_data_dir.joinpath('WP6', 'data', 'CBA')

# List all csv files (first focus on country files)
CBA_files = list(CBA_dir.glob('*CBA.SSP*.csv'))

CBA_files

[WindowsPath('P:/11207608-coclico/FULLTRACK_DATA/WP6/data/CBA/GCF.open.CBA.SSP126.csv'),
 WindowsPath('P:/11207608-coclico/FULLTRACK_DATA/WP6/data/CBA/GCF.open.CBA.SSP245.csv'),
 WindowsPath('P:/11207608-coclico/FULLTRACK_DATA/WP6/data/CBA/GCF.open.CBA.SSP585.csv')]

In [4]:
# Load the first file in pandas
CBA_file = CBA_files[0]
CBA_df = pd.read_csv(CBA_file)
CBA_df


Unnamed: 0,fpid,year,protection_height,protection_level,retreat_height,accommodation,total_costs,total_adaptation_costs,total_flood_damages,current_adaptation_cost,annual_flood_damage
0,0,2020,2.6,1,2.2,0,108390904,58310104,50080799,34729450,3254453
1,32680,2020,0.0,1,0.0,0,1279086,1138430,140656,0,5047
2,32680,2030,0.0,1,0.0,0,1497673,109933,1387740,1,7866
3,32680,2040,0.0,1,0.0,1,1729771,1595764,134006,1595125,13331
4,32680,2050,0.0,1,2.3,0,1623,844,779,0,13
...,...,...,...,...,...,...,...,...,...,...,...
295578,31543,2110,1.7,134,0.0,0,11649590,6635715,5013875,0,117680
295579,31543,2120,1.7,95,0.0,0,10867700,4677369,6190331,0,179673
295580,31543,2130,1.7,73,0.0,0,9158890,5355772,3803119,0,252544
295581,31543,2140,1.7,43,0.0,0,6187575,1557487,4630087,0,463009


In [17]:
# Load the first file in pandas
CBA_file = CBA_files[1]
CBA_df = pd.read_csv(CBA_file)
CBA_df


Unnamed: 0,fpid,year,protection_height,protection_level,retreat_height,accommodation,total_costs,total_adaptation_costs,total_flood_damages,current_adaptation_cost,annual_flood_damage
0,0,2020,2.6,1,2.2,0,108390904,58310104,50080799,34729450,3254453
1,15654,2020,0.0,1,0.0,0,875714,343274,532440,343274,53244
2,15654,2030,0.0,1,1.6,0,0,0,0,0,0
3,15654,2040,0.0,1,1.6,0,0,0,0,0,0
4,15654,2050,0.0,1,1.6,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
295578,35975,2110,0.0,1,0.9,0,0,0,0,0,0
295579,35975,2120,0.0,1,0.9,0,0,0,0,0,0
295580,35975,2130,0.0,1,0.9,0,0,0,0,0,0
295581,35975,2140,0.0,1,0.9,0,0,0,0,0,0


In [5]:
SSPs = []

# Convert all files to parquet
for CBA_file in CBA_files:
    CBA_df = pd.read_csv(CBA_file)
    CBA_df.to_parquet(CBA_file.with_suffix('.parquet'))

    # Subtract SSP from file
    SSPs.append(CBA_file.stem.split('.')[-1])

SSPs

['SSP126', 'SSP245', 'SSP585']

In [None]:
# Compare which columns are the same in the different CBA files

CBA_SSP126 = pd.read_parquet(CBA_files[0].with_suffix('.parquet'))
CBA_SSP245 = pd.read_parquet(CBA_files[1].with_suffix('.parquet'))
CBA_SSP585 = pd.read_parquet(CBA_files[2].with_suffix('.parquet'))

# Print the length of all dataframes
print(len(CBA_SSP126))
print(len(CBA_SSP245))
print(len(CBA_SSP585))

# Check if column fpid is the same
if CBA_SSP126['fpid'].equals(CBA_SSP245['fpid']):
    print("The fpid columns are the same")
else:
    print(str(sum(CBA_SSP126['fpid'].values == CBA_SSP245['fpid'].values)) + ' instances of fpid are the same')

# Check if all instances of fpid exist in all dataframes


295583
295583
295583
1429 instances of fpid are the same
The year columns are the same


In [None]:
# Join all SSP dataframes

for CBA_file in CBA_files:
    CBA_df = pd.read_parquet(CBA_file.with_suffix('.parquet'))
    CBA_df['SSP'] = CBA_file.stem.split('.')[-1]
    CBA_df.to_parquet(CBA_file.with_suffix('.parquet'))

In [None]:
# Get all unique values for year
years = CBA_df['year'].unique()
years

array([2020, 2030, 2040, 2050, 2060, 2070, 2080, 2090, 2100, 2110, 2120,
       2130, 2140, 2150], dtype=int64)

In [20]:
# Get unique values for fpid
fpid = CBA_df['fpid'].unique()
len(fpid)

21114

In [22]:
len(years) * len(fpid)

295596

In [24]:
# List non-unique values of fpid
fpid = CBA_df['fpid']
fpid.duplicated()


0         False
1         False
2          True
3          True
4          True
          ...  
295578     True
295579     True
295580     True
295581     True
295582     True
Name: fpid, Length: 295583, dtype: bool

In [15]:
# Make one combined file for all data with the SSPs and years in the column titles

CBA_all = pd.DataFrame()

for CBA_file in CBA_files:
    for year in years:
        
        # Subtract SSP from file
        SSP = CBA_file.stem.split('.')[-1]

        # Load the data
        CBA_df = pd.read_csv(CBA_file)

        # Filter the data
        CBA_df_yr = CBA_df[CBA_df['year'] == year]

        # Drop the year column
        CBA_df_yr = CBA_df_yr.drop(columns=['year'])

        # Rename the columns
        CBA_df_yr.columns = [f'{SSP}\{year}\{col}' for col in CBA_df_yr.columns]

        # Add the data to the combined dataframe
        CBA_all = pd.concat([CBA_all, CBA_df_yr], axis=1)


In [18]:
CBA_df[CBA_df['year'] == year]

Unnamed: 0,fpid,year,protection_height,protection_level,retreat_height,accommodation,total_costs,total_adaptation_costs,total_flood_damages,current_adaptation_cost,annual_flood_damage
14,28144,2150,0.0,1,3.2,0,0,0,0,0,0
28,15654,2150,0.0,1,1.6,0,0,0,0,0,0
42,32680,2150,0.0,1,1.6,0,0,0,0,0,4388
56,25251,2150,0.0,1,0.9,0,0,0,0,0,491
70,17597,2150,0.0,1,1.3,0,0,0,0,0,46423
...,...,...,...,...,...,...,...,...,...,...,...
295526,35973,2150,0.0,1,1.7,0,0,0,0,0,1022163
295540,35974,2150,0.0,1,1.7,0,0,0,0,0,1565
295554,34864,2150,0.0,1,6.7,0,0,0,0,0,111911
295568,35975,2150,0.0,1,0.9,0,0,0,0,0,0


In [None]:
# List all unique 

Unnamed: 0,SSP126\2020\fpid,SSP126\2020\protection_height,SSP126\2020\protection_level,SSP126\2020\retreat_height,SSP126\2020\accommodation,SSP126\2020\total_costs,SSP126\2020\total_adaptation_costs,SSP126\2020\total_flood_damages,SSP126\2020\current_adaptation_cost,SSP126\2020\annual_flood_damage,...,SSP585\2150\fpid,SSP585\2150\protection_height,SSP585\2150\protection_level,SSP585\2150\retreat_height,SSP585\2150\accommodation,SSP585\2150\total_costs,SSP585\2150\total_adaptation_costs,SSP585\2150\total_flood_damages,SSP585\2150\current_adaptation_cost,SSP585\2150\annual_flood_damage
0,0.0,2.600000,1.0,2.2,0.0,108390904.0,58310104.0,50080799.0,34729450.0,3254453.0,...,,,,,,,,,,
1,32680.0,0.000000,1.0,0.0,0.0,1279086.0,1138430.0,140656.0,0.0,5047.0,...,,,,,,,,,,
15,28144.0,1.040843,31.0,0.0,0.0,716266.0,704695.0,11572.0,200961.0,1157.0,...,,,,,,,,,,
29,15654.0,0.000000,1.0,0.0,0.0,875758.0,343274.0,532485.0,343274.0,53248.0,...,,,,,,,,,,
43,17597.0,1.288138,25.0,0.0,0.0,1377262.0,1345645.0,31617.0,213183.0,1413.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295526,,,,,,,,,,,...,35973.0,0.0,1.0,1.7,0.0,0.0,0.0,0.0,0.0,1022163.0
295540,,,,,,,,,,,...,35974.0,0.0,1.0,1.7,0.0,0.0,0.0,0.0,0.0,1565.0
295554,,,,,,,,,,,...,34864.0,0.0,1.0,6.7,0.0,0.0,0.0,0.0,0.0,111911.0
295568,,,,,,,,,,,...,35975.0,0.0,1.0,0.9,0.0,0.0,0.0,0.0,0.0,0.0
