# Dataset for Zenodo repository

In [None]:
# basic
import os
import numpy as np
import pandas as pd
from glob import glob

# spatial
import xarray as xr
import geopandas as gpd
import shapely.geometry
import xesmf as xe
import regionmask

# climate
from xclim.indicators import atmos
from xclim import core 

# others
from tqdm.notebook import tqdm

# Working directory of the project: every relative path used below
# ("GIS South/...", "MS2 Results/...") is resolved against it.
os.chdir('/home/rooda/Dropbox/Patagonia')

## "basins_boundaries.zip" file

Contains the polygons (in .shp format) of the studied catchments. Each catchment is identified by its "basin_id".

In [None]:
# Catchment polygons, indexed by the "ID" field of the shapefile
basins = gpd.read_file("GIS South/Basins_Patagonia_ice.shp").set_index("ID")

# Only catchments with basin_area above 5000 receive a name. The names are assigned
# positionally, so the list has to follow the row order of the selected catchments.
names = ["Yelcho", "Baker", "Santa Cruz", "Palena", "Grey", "Puelo", "Cisnes", "Aysen", "Pascua"]
is_large = basins.basin_area > 5000
basins.loc[is_large, "Name"] = names

# Numeric zone codes -> zone labels
zone_labels = {1:'PPY', 2:'PCA', 3:'NPI-E', 4:'NPI-W', 5:'SPI-N', 6:'SPI-C', 7:'SPI-S', 8:'GCN', 9:'CDI'}
basins = basins.replace({"Zone": zone_labels})

# Keep the published columns only and give them their final names
published_names = {"Name": "basin_name", "Zone": "basin_zone"}
basins = basins[["Name", "Zone", "basin_area", "geometry"]].rename(columns = published_names)

basins.index.name = 'basin_id'

In [None]:
# Table preview (a notebook only renders a bare expression when it is the last statement of a cell)
basins
# Write the catchment polygons and their attributes to the shapefile shipped in "basins_boundaries.zip"
basins.to_file("MS2 Results/zenodo/basins_boundaries.shp")

## "dataset_historical.csv" file

Summarises the historical conditions of each glacier at the basin scale (area, volume and reference climate). The climate variables are estimated using only the glacierized grid cells (1980-2019). 
- `basin_id`: unique identifier
- `basin_name`: basin name (only for catchments with area over 5000 km2)
- `basin_zone`: hydrological zone 
- `basin_area`: area of the basin in km2	
- `n_RGIX`: number of glaciers according to RGI6 and RGI7 inventories
- `area_RGIX`: glacier area in km2 according to inventories RGI6 and RGI7		
- `vol_X`: glacier volume in km3 estimated from Farinotti et al. (2019) [F19] and Millan et al. (2022) [M22]	
- `PP_X`: annual mean precipitation according to PMET v1.0, ERA5, MSWEP v2.8 and CR2MET v2.5
- `PRSN_X`: annual mean solid precipitation according to PMET v1.0, ERA5, MSWEP v2.8 and CR2MET v2.5
- `T2M_X`: annual mean temperature according to PMET v1.0, ERA5, MSWEP v2.8 and CR2MET v2.5
- `PPD_X`: annual mean positive degree-day sum according to PMET v1.0, ERA5, MSWX and CR2MET v2.5

In [None]:
# Glacier inventories: number of glaciers, area and volume aggregated per basin
RGI6 = gpd.read_file("GIS South/Glaciers/RGI6_v2.shp")
RGI7 = gpd.read_file("GIS South/Glaciers/RGI7_v2.shp")

# Outlines of both inventories, buffered by 0.05 (in CRS units); used as the mask for the baseline climate
glaciers = pd.concat([RGI6.geometry, RGI7.geometry]).buffer(0.05)

# Per-basin totals.
# NOTE(review): the number of glaciers is obtained by summing "O2Region", which presumably
# holds 1 for every glacier in these "_v2" files - confirm, otherwise a count is needed.
RGI6_sum = (
    RGI6.groupby("ID_basin")[["O2Region", "area_km2", "vol_F19", "vol_M22"]]
    .sum()
    .rename(columns = {"O2Region": "n_RGI6", "area_km2": "area_RGI6"})
)
RGI7_sum = (
    RGI7.groupby("ID_basin")[["O2Region", "area_km2"]]
    .sum()
    .rename(columns = {"O2Region": "n_RGI7", "area_km2": "area_RGI7"})
)

# Column-wise join on the basin id
basins = pd.concat([basins, RGI6_sum, RGI7_sum], axis=1)

# Basins that do not appear in an inventory get zeros instead of NaN
fillc = ["n_RGI6", "area_RGI6", "vol_F19", "vol_M22", "n_RGI7", "area_RGI7"]
basins[fillc] = basins[fillc].fillna(0)

In [None]:
# Reference climate (1980-2019) on the native grid of each product.
# Each file is opened once and the needed variables are taken from the same handle.
pmet_file = xr.open_dataset("/home/rooda/OGGM_results/PMET_OGGM_1980_2019m.nc")
pp_pmet, t2m_pmet, dem_005 = pmet_file.prcp, pmet_file.temp, pmet_file.hgt

cr2met_file = xr.open_dataset("/home/rooda/OGGM_results/CR2MET_OGGM_1980_2019m.nc")
pp_cr2met, t2m_cr2met = cr2met_file.prcp, cr2met_file.temp

mswep_file = xr.open_dataset("/home/rooda/OGGM_results/MSWEP_OGGM_1980_2019m.nc")
pp_mswep, t2m_mswep = mswep_file.prcp, mswep_file.temp
# MSWEP elevation moved onto the PMET grid (nearest neighbour); needed for the lapse-rate correction
regridder = xe.Regridder(mswep_file.hgt, dem_005, "nearest_s2d")
dem_010   = regridder(mswep_file.hgt)

era5_file = xr.open_dataset("/home/rooda/OGGM_results/ERA5_OGGM_1980_2019m.nc")
pp_era5, t2m_era5 = era5_file.prcp, era5_file.temp
# ERA5 elevation moved onto the PMET grid (nearest neighbour)
regridder = xe.Regridder(era5_file.hgt, dem_005, "nearest_s2d")
dem_025   = regridder(era5_file.hgt)

In [None]:
# regrid (PMET as the reference grid; 0.05º)

# Precipitation: nearest-neighbour (source-to-destination) remapping onto the PMET grid.
# `regridder` is rebuilt for every product and only valid for the line that follows it.
regridder  = xe.Regridder(pp_era5,   pp_pmet, "nearest_s2d")
pp_era5    = regridder(pp_era5)
regridder  = xe.Regridder(pp_cr2met, pp_pmet, "nearest_s2d")
pp_cr2met  = regridder(pp_cr2met)
regridder  = xe.Regridder(pp_mswep,  pp_pmet, "nearest_s2d")
pp_mswep   = regridder(pp_mswep)

# Temperature change per unit of elevation difference
# (presumably degC per metre, i.e. 6.5 degC km-1 - confirm against the units of `hgt`)
lapse_rate = 0.0065 

# Temperature of the coarser products: nearest-neighbour remapping first, then an elevation
# correction. `factor` is positive where the product's own terrain is higher than the PMET
# terrain, so those cells are warmed, and negative (cooling) where it is lower.
regridder  = xe.Regridder(t2m_era5,   t2m_pmet, "nearest_s2d")
t2m_era5   = regridder(t2m_era5) # fake high res
factor     = (dem_025 - dem_005)*lapse_rate
t2m_era5   =  t2m_era5 + factor # "real" high res

regridder  = xe.Regridder(t2m_mswep,   t2m_pmet, "nearest_s2d")
t2m_mswep  = regridder(t2m_mswep) # fake high res
factor     = (dem_010 - dem_005)*lapse_rate
t2m_mswep  =  t2m_mswep + factor # "real" high res

# CR2MET temperature: bilinear remapping and no elevation correction
regridder  = xe.Regridder(t2m_cr2met, t2m_pmet, "bilinear")
t2m_cr2met = regridder(t2m_cr2met) # simple case (same resolution)

In [None]:
# mask: only glacierized area (grid cells of the PMET grid assigned to a buffered glacier outline)
mask = regionmask.mask_geopandas(glaciers, pp_pmet) >= 0

# Cells outside the mask become NaN; rows/columns that end up empty are dropped
pp_pmet, pp_era5, pp_cr2met, pp_mswep = (
    precip.where(mask, drop = True)
    for precip in (pp_pmet, pp_era5, pp_cr2met, pp_mswep)
)

t2m_pmet, t2m_era5, t2m_cr2met, t2m_mswep = (
    temp.where(mask, drop = True)
    for temp in (t2m_pmet, t2m_era5, t2m_cr2met, t2m_mswep)
)

In [None]:
# Calculate more variables

# xclim needs the units
for precip in (pp_pmet, pp_era5, pp_cr2met, pp_mswep):
    precip.attrs['units'] = "mm month-1"
for temp in (t2m_pmet, t2m_era5, t2m_cr2met, t2m_mswep):
    temp.attrs['units'] = "C"

# Positive degree-day sum (PDD): keep only the values at or above the -1 ºC threshold
ppd_pmet, ppd_era5, ppd_cr2met, ppd_mswep = (
    temp.where(temp >= -1)
    for temp in (t2m_pmet, t2m_era5, t2m_cr2met, t2m_mswep)
)


def _snowfall(precip, temp):
    """Solid precipitation (mm month-1) from total precipitation and temperature.

    Uses xclim's snowfall approximation with the 'brown' method and a 0 degC threshold.
    """
    solid = atmos.snowfall_approximation(precip, temp, method='brown', thresh='0 degC')
    return core.units.convert_units_to(solid, target = 'mm month-1', context = "hydro")


# snowfall component
prsn_pmet   = _snowfall(pp_pmet,   t2m_pmet)
prsn_era5   = _snowfall(pp_era5,   t2m_era5)
prsn_cr2met = _snowfall(pp_cr2met, t2m_cr2met)
prsn_mswep  = _snowfall(pp_mswep,  t2m_mswep)

In [None]:
# annual value
def _annual_total(field, skipna = False):
    """Mean over all years of the yearly sum of *field*."""
    return field.resample(time='1Y').sum(skipna = skipna).mean(dim="time")


def _annual_average(field):
    """Mean over all years of the yearly mean of *field*."""
    return field.resample(time='1Y').mean(skipna = False).mean(dim="time")


def _annual_pdd(excess):
    """Annual positive degree-day sum from temperatures already filtered at -1 ºC."""
    # the +1 is due to the threshold of -1 ºC
    total = _annual_total(excess + 1, skipna = True)
    # cells whose total is not positive are set to NaN;
    # the factor 30 goes from monthly to daily (doesnt change anything)
    return total.where(total > 0) * 30


pp_pmet    = _annual_total(pp_pmet)
pp_era5    = _annual_total(pp_era5)
pp_cr2met  = _annual_total(pp_cr2met)
pp_mswep   = _annual_total(pp_mswep)

prsn_pmet   = _annual_total(prsn_pmet)
prsn_era5   = _annual_total(prsn_era5)
prsn_cr2met = _annual_total(prsn_cr2met)
prsn_mswep  = _annual_total(prsn_mswep)

t2m_pmet   = _annual_average(t2m_pmet)
t2m_era5   = _annual_average(t2m_era5)
t2m_cr2met = _annual_average(t2m_cr2met)
t2m_mswep  = _annual_average(t2m_mswep)

ppd_pmet   = _annual_pdd(ppd_pmet)
ppd_era5   = _annual_pdd(ppd_era5)
ppd_cr2met = _annual_pdd(ppd_cr2met)
ppd_mswep  = _annual_pdd(ppd_mswep)

In [None]:
# mean value for each catchment
averager = xe.SpatialAverager(pp_pmet, basins.geometry, geom_dim_name="avg")

# Output column -> gridded field; the insertion order is the column order of the table
reference_fields = {
    "PP_PMET":     pp_pmet,
    "PP_ERA5":     pp_era5,
    "PP_CR2MET":   pp_cr2met,
    "PP_MSWEP":    pp_mswep,
    "PRSN_PMET":   prsn_pmet,
    "PRSN_ERA5":   prsn_era5,
    "PRSN_CR2MET": prsn_cr2met,
    "PRSN_MSWEP":  prsn_mswep,
    "T2M_PMET":    t2m_pmet,
    "T2M_ERA5":    t2m_era5,
    "T2M_CR2MET":  t2m_cr2met,
    "T2M_MSWEP":   t2m_mswep,
    "PPD_PMET":    ppd_pmet,
    "PPD_ERA5":    ppd_era5,
    "PPD_CR2MET":  ppd_cr2met,
    "PPD_MSWEP":   ppd_mswep,
}

for column, field in reference_fields.items():
    # one value per polygon, in the row order of `basins`
    basin_mean = averager(field, skipna=True)
    basin_mean = basin_mean.assign_coords(avg=xr.DataArray(basins.index, dims=("avg",)))
    basins[column] = basin_mean.values

In [None]:
# Export the per-basin table without the polygons.
# `DataFrame.to_csv` returns None when it is given a path, so its result must not be bound
# to `basins`: the table (including its geometry) is still needed for the future-climate
# basin averages computed further down.
basins.drop(columns = "geometry").to_csv("MS2 Results/zenodo/dataset_historical.csv",  index_label='basin_id')

# Table preview (rendered by the notebook because it is the last expression of the cell)
basins

## "dataset_future.csv" file

Summarises the future glacier climate drivers and their impacts at the catchment scale: 
- `basin_id`: unique identifier 
- `basin_name`: basin name (only for catchments with area over 5000 km2)
- `PPc_sspX`: relative precipitation change between the periods 1980-2015 and 2070-2099. Each column represents a different SSP scenario.
- `PPc_spread`: number of GCMs that agree on the direction of change, based on the SSP 2-4.5 scenario
- `T2Mc_sspX`: absolute temperature change between the periods 1980-2015 and 2070-2099. Each column represents a different SSP scenario.
- `mass_loss_sspX`: mean mass loss in 2100 for each catchment. Each column represents a different SSP scenario

In [None]:
# Common 0.5º grid on which every GCM is evaluated
lat_coords = np.arange(-56,-40, 0.5)
lon_coords = np.arange(-76,-67, 0.5)

# Periods compared. The series are aggregated to year-start time stamps before slicing and
# label-based slices include both end points, so a year stamped exactly at the upper bound
# (2015 / 2100) is part of the period.
# NOTE(review): the file description quotes 2070-2099 for the future period - confirm whether
# the year 2100 should be included.
baseline_period = slice("1980-01-01", "2015-01-01")
future_period   = slice("2070-01-01", "2100-01-01")

gcm_list  = ["ACCESS-CM2", "BCC-CSM2-MR", "CMCC-ESM2", "FGOALS-f3-L", "GFDL-ESM4", "CMCC-CM2-SR5", "KACE-1-0-G", "MPI-ESM1-2-HR", "MRI-ESM2-0", "MIROC6"]
ssp_list  = ['ssp126', 'ssp245', 'ssp370', 'ssp585']

results_pp = []
results_t2m = []

for ssp in tqdm(ssp_list):

    results_gcm_pp  = []
    results_gcm_t2m = []

    for gcm in gcm_list:

        # precipitation: annual totals, then relative change (future / baseline - 1)
        pp_model_ssp = xr.open_dataset("/home/rooda/OGGM_results/Future_climate/PP_" + gcm + "_" + ssp + ".nc")["pr"]
        pp_model_ssp = pp_model_ssp.interp(lat = lat_coords, lon = lon_coords)
        pp_model_ssp = core.units.convert_units_to(pp_model_ssp, target = 'mm month-1', context = "hydro").resample(time = "YS").sum()
        pp_change    = (pp_model_ssp.sel(time = future_period).mean(dim="time") / pp_model_ssp.sel(time = baseline_period).mean(dim="time"))-1
        results_gcm_pp.append(pp_change)

        # temperature: annual means, then absolute change (future - baseline)
        t2m_model_ssp = xr.open_dataset("/home/rooda/OGGM_results/Future_climate/T2M_" + gcm + "_" + ssp + ".nc")["tas"]
        t2m_model_ssp = t2m_model_ssp.interp(lat = lat_coords, lon = lon_coords)
        t2m_model_ssp = t2m_model_ssp.resample(time='YS').mean()
        t2m_change    = t2m_model_ssp.sel(time = future_period).mean(dim="time") - t2m_model_ssp.sel(time = baseline_period).mean(dim="time")
        results_gcm_t2m.append(t2m_change)

    # stack the models of this scenario along a new "gcm" dimension
    results_gcm_pp  = xr.concat(results_gcm_pp,  dim='gcm')
    results_gcm_t2m = xr.concat(results_gcm_t2m, dim='gcm')
    results_pp.append(results_gcm_pp)
    results_t2m.append(results_gcm_t2m)

# variables "pr" and "tas"; the "ssp" dimension follows the order of `ssp_list`
dataset = xr.merge([xr.concat(results_pp,  dim='ssp'),
                    xr.concat(results_t2m, dim='ssp')])

# GCM uncertainty (SSP 245) > 80% of the models should agree the direction
# Every model votes +1 (change >= 0) or -1 (change < 0) and the votes are summed over the
# models, so the result is (models projecting an increase) - (models projecting a decrease).
# The scenario is looked up by name: the previous expression mixed the ssp370 and ssp245
# slices and did not yield a +1/-1 vote. Missing values do not vote.
pr_ssp245  = dataset.pr.isel(ssp = ssp_list.index("ssp245"))
gcm_spread = xr.where(pr_ssp245 >= 0, 1, -1).where(pr_ssp245.notnull()).sum(dim = "gcm")

In [None]:
## regrid to the (already masked) PMET grid using bilinear interpolation
# NOTE(review): this comment used to mention ESMF.RegridMethod.NEAREST_STOD, but the call
# requests "bilinear" - confirm which method is intended. With bilinear weights the integer
# model votes in `gcm_spread` are interpolated as well and become fractional.
regridder  = xe.Regridder(dataset,  pp_pmet, "bilinear")
dataset    = regridder(dataset)
gcm_spread = regridder(gcm_spread)

# only glacier area
mask    = regionmask.mask_geopandas(glaciers, dataset)   >= 0
dataset = dataset.where(mask, drop = True)
gcm_spread = gcm_spread.where(mask, drop = True)

# multi-model mean
dataset = dataset.mean(dim = "gcm")

In [None]:
# mean change for each catchment (glacierized cells only)
savg  = xe.SpatialAverager(dataset,  basins.geometry, geom_dim_name="avg")


def _basin_average(field):
    """Return one spatially averaged value of *field* per row of `basins`."""
    averaged = savg(field, skipna=True)
    return averaged.assign_coords(avg=xr.DataArray(basins.index, dims=("avg",))).values


# position of each scenario along the first axis of `dataset.pr` / `dataset.tas`
scenarios = ("ssp126", "ssp245", "ssp370", "ssp585")

# relative precipitation change, expressed in %
for position, scenario in enumerate(scenarios):
    basins["PPc_" + scenario] = _basin_average(dataset.pr[position])*100
basins["PPc_spread"] = _basin_average(gcm_spread)

# absolute temperature change
for position, scenario in enumerate(scenarios):
    basins["T2Mc_" + scenario] = _basin_average(dataset.tas[position])

In [None]:
# volume change
# Lookup table glacier id -> basin id, restricted to glaciers larger than 1 km2 in each inventory
RGI6_ids = RGI6[RGI6.area_km2 > 1][["RGIId", "ID_basin"]]
RGI7_ids = RGI7[RGI7.area_km2 > 1]
# NOTE(review): `utils` is not imported anywhere in the visible source, so this call raises a
# NameError as written; presumably it is OGGM's `oggm.utils` module - add the import once confirmed.
# The call is expected to return a frame with an "RGIId" column and the "ID_basin" column carried over.
RGI7_ids = utils.cook_rgidf(RGI7_ids, o1_region='17', o2_region='02', bgndate= RGI7_ids.src_date, 
                            version = "70", assign_column_values= {'ID_basin' : 'ID_basin'})
RGI7_ids = RGI7_ids[["RGIId", "ID_basin"]]

# merge both datasets
ids = pd.concat([RGI6_ids, RGI7_ids]).set_index("RGIId")

def preprocess(ds):
    """Return the 'volume' variable of *ds* after dropping the calendar helper variables."""
    calendar_vars = ['hydro_year', 'hydro_month', 'calendar_year', 'calendar_month']
    trimmed = ds.drop_vars(calendar_vars)
    return trimmed['volume']

# every entry under ".../new" is treated as the output directory of one model setup
gdirs = glob("/home/rooda/OGGM_results/new/*", recursive = True)

ds    = []
for gdir in tqdm(gdirs):

    # read historical run (glacier volume in 2015)
    model_hist   = xr.open_mfdataset(gdir + "/run_outputs_*.nc", preprocess = preprocess)
    model_hist   = model_hist.sel(time=2015).volume # check NAs

    paths = glob(gdir + "/run_output_*ssp*.nc", recursive = True)
    for path in tqdm(paths, leave = False):

        # read future run (glacier volume in 2100) and concatenate
        model_future = xr.open_dataset(path)
        model_future = preprocess(model_future).sel(time=2100)
        model   = xr.concat([model_hist, model_future], dim = "time").load()

        # add basin ID to each glacier ID (RGI_ID)
        # The basin is looked up by glacier id, in the order of `model.rgi_id`. A positional
        # assignment from a filtered copy of `ids` keeps the row order of `ids`, which is not
        # guaranteed to match the order of the glaciers in the model output. A glacier that is
        # missing from `ids` raises a KeyError.
        basin_of_glacier = ids.loc[model.rgi_id.values, "ID_basin"]
        model = model.assign_coords(rgi_id = basin_of_glacier.tolist())
        model = model.groupby('rgi_id').sum()
        # fraction of the 2015 volume lost by 2100, per basin
        model = 1 - (model.sel(time = 2100) / model.sel(time = 2015))

        # ID of the setup
        # assumes the fourth underscore-separated token of the file name is exactly the
        # scenario label (e.g. "ssp126") - TODO confirm against the file naming
        experiment_id = pd.Series(data = {'SSP':     os.path.basename(path).split("_")[3]})
        ds_model = pd.DataFrame(pd.concat([experiment_id, model.to_pandas()]), columns=['mass_loss']).transpose()
        ds.append(ds_model)

# mean over all runs of each scenario; one row per basin, one column per scenario
ds = pd.concat(ds)
ds = ds.groupby("SSP").mean()
ds = ds.transpose()
ds = ds.rename(columns = {'ssp126': 'mass_loss_ssp126', 'ssp245': 'mass_loss_ssp245',
                          'ssp370': 'mass_loss_ssp370', 'ssp585': 'mass_loss_ssp585'})
basins = pd.concat([basins, ds], axis=1)

In [None]:
# Columns published in "dataset_future.csv", in their final order
future_columns = [
    "basin_name",
    'PPc_ssp126', 'PPc_ssp245', 'PPc_ssp370', 'PPc_ssp585', "PPc_spread",
    'T2Mc_ssp126', 'T2Mc_ssp245', 'T2Mc_ssp370', 'T2Mc_ssp585',
    'mass_loss_ssp126', 'mass_loss_ssp245', 'mass_loss_ssp370', 'mass_loss_ssp585',
]
basins = basins[future_columns]

In [None]:
# Write the future drivers and impacts per basin.
# `DataFrame.to_csv` returns None when it is given a path, so its result is not bound to
# `basins`; the table stays available and the cell can be run again.
basins.to_csv("MS2 Results/zenodo/dataset_future.csv",  index_label='basin_id')

# Table preview (rendered by the notebook because it is the last expression of the cell)
basins

## "dataset_signatures.csv" file

Summarises the  glacio-hydrological signatures for each catchment. The metrics are calculated for the variables "glacier runoff (GR)" and "glacier melt (GM)". The mean was calculated using the full ensemble of projections (n = 1920). The main source of uncertainty (SoU) in each catchment was the source that accumulated most RMSE loss. Details of the different metrics can be found in Table 1. The file presents the following columns: 

- `basin_id`: unique identifier for each catchment 
- `variable_interannual_var`: Inter-annual variability mean (mm yr-1). 
- `variable_lt_change`: Long-term change mean (%).  
- `variable_lt_trend`: Long-term trend mean (% dec-1). 
- `variable_peak_water_duration`:	Peak water duration mean (years). 
- `variable_peak_water_magnitude`: Peak water magnitude mean (mm yr-1).
- `variable_peak_water_year`: Peak water year mean (year). 
- `variable_ref_magnitude`: Reference magnitude mean (mm yr-1).
- `variable_seasonal_cont`: Reference seasonal contribution mean (%).
- `variable_seasonal_shift`: Seasonal shift mean (%). 
- `variable_seasonal_var`: Reference seasonal variability mean (%).
- `SoU_variable_interannual_var`: Main SoU of the inter-annual variability for each catchment.
- `SoU_variable_lt_change`:  Main SoU of the long-term change for each catchment.
- `SoU_variable_lt_trend`:  Main SoU of the long-term trend for each catchment.
- `SoU_variable_peak_water_duration`:  Main SoU of the peak water duration for each catchment.
- `SoU_variable_peak_water_magnitude`:  Main SoU of the peak water magnitude for each catchment.
- `SoU_variable_peak_water_year`:  Main SoU of the peak water year for each catchment.
- `SoU_variable_ref_magnitude`:  Main SoU of the reference magnitude for each catchment.
- `SoU_variable_seasonal_cont`:  Main SoU of the reference seasonal contribution for each catchment.
- `SoU_variable_seasonal_shift`:  Main SoU of the seasonal shift for each catchment.
- `SoU_variable_seasonal_var`:  Main SoU of the reference seasonal variability for each catchment.

In [None]:
# glacio-hydrological signature
# Mean of every signature over all rows that share the same pair of grouping labels.
metrics = pd.read_csv("MS2 Results/dataset_hydro_signatures.csv")
# the columns describing the setup of each run are not needed for the mean
metrics = metrics.drop(columns = ["Outline", "Climate", "Volume", "GCM", "SSP", "BCM"])
# note the naming: the unnamed first column becomes "variable" and "Variable" becomes "metric"
metrics = metrics.rename(columns = {"Unnamed: 0": "variable", "Variable": "metric"})
# after the transpose the remaining (per-catchment) columns are the rows and the
# ("metric", "variable") pairs are the columns
metrics = metrics.groupby(["metric", "variable"]).mean().transpose()
# keep only the "variable" level of the column labels
metrics = metrics.droplevel(0, axis=1) 
# Relabel by position: the first 10 labels get the "GM_" prefix and the same 10 labels are
# reused with the "GR_" prefix.
# assumes exactly 20 columns, with the first 10 belonging to glacier melt and both halves
# listing the signatures in the same order - TODO confirm
metrics.columns = np.concatenate(("GM_" + metrics.columns[0:10].values, "GR_" + metrics.columns[0:10].values), axis=0)
# row labels cast to integers (presumably the basin ids - they are exported as "basin_id")
metrics.index = metrics.index.astype("int64")

In [None]:
# Main source of uncertainty (SoU): for every row, the name of the source column with the largest value
metrics_su = pd.read_csv("MS2 Results/feature_importance_rmse.csv", index_col = 0)
metrics_su['Most_important'] = metrics_su[["Outline","Climate", "Volume", "GCM", "SSP", "BCM"]].idxmax(axis=1)
metrics_su = metrics_su.drop(columns = ["Outline", "Climate", "Volume", "GCM", "SSP", "BCM"])
# wide table: the existing index stays as rows, one column per ("Variable", "Metric") pair
metrics_su = metrics_su.pivot(columns = ["Variable", "Metric"], values = "Most_important")
# keep only the "Metric" level of the column labels
metrics_su = metrics_su.droplevel(0, axis=1)
# Relabel by position, as done for the signature means.
# assumes exactly 20 columns, with the first 10 belonging to glacier melt and both halves
# listing the metrics in the same order - TODO confirm
metrics_su.columns = np.concatenate(("SoU_GM_" + metrics_su.columns[0:10].values, "SoU_GR_" + metrics_su.columns[0:10].values), axis=0)

# signature means and their main source of uncertainty side by side, joined on the row index
metrics_hydro = pd.concat([metrics, metrics_su], axis=1)
metrics_hydro.to_csv("MS2 Results/zenodo/dataset_signatures.csv",  index_label='basin_id')