# Subsidence

Notebook to migrate XLSX files to CF-compliant NetCDF files.

In [None]:
# Optional: code formatter, installed as a JupyterLab extension
#%load_ext lab_black
# Optional: code formatter, installed as a Jupyter Notebook extension
%load_ext nb_black

### Configure OS dependent paths

In [91]:
# Import standard packages
import os
import pathlib

# import numpy as np
# import geopandas as gpd
import pandas as pd
# import matplotlib.pyplot as plt
import xarray as xr
import json
# import math
# import itertools
# import glob

# Import custom functionality
from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives: dataset root folder on the project drive
gca_data_dir = p_drive.joinpath(
    "11208003-latedeo2022",
    "020_InternationalDeltaPortfolio",
    "datasets",
)

# Workaround to the Windows OS (10) udunits error after installation of cfchecker:
# https://github.com/SciTools/iris/issues/404
# The path below only exists in a Windows Anaconda installation, so the variable
# is set on Windows only; on other systems it would point UDUNITS2 at a
# non-existent file.
if os.name == "nt":
    # change to the udunits2.xml file dir in your Python installation
    os.environ["UDUNITS2_XML_PATH"] = str(
        pathlib.Path.home().joinpath(
            r"Anaconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml"
        )
    )

In [83]:
# Project paths & files (manual input)

# folder of this dataset inside the dataset root
dataset_dir = gca_data_dir.joinpath("00_mapping_global_threat_of_land_subsidence")

# NetCDF file that this notebook writes and checks
dataset_dir_path = dataset_dir.joinpath("abb8549_data_s3_red.nc")

# directory to save output CF check files
CF_dir = gca_data_dir.joinpath("CF")

### Write XLSX to NetCDF

In [85]:
# write xlsx to netcdf

# The XLSX source shares the NetCDF file's folder and base name (without the
# "_red" suffix). Only the file name is rewritten, never the folder names.
xlsx_path = dataset_dir_path.with_name(
    dataset_dir_path.name.replace("_red.nc", ".xlsx")
)

# open the XLSX dataset as pandas dataframe (first row holds the column names)
df = pd.read_excel(xlsx_path, index_col=None, header=0)

# Rename columns that cause errors: "/" in a column name is replaced by "per"
# (presumably because "/" is not accepted in NetCDF variable names — confirm).
# Non-string labels (e.g. numeric headers) cannot contain "/" and are skipped.
key_list_corr = {
    col: col.replace("/", "per")
    for col in df.columns
    if isinstance(col, str) and "/" in col
}
df = df.rename(columns=key_list_corr)

# Select relevant columns (info from Gilles --> decreases the datasets)
columns = [
    'Country',
    'Potential exposed population in 2010 (Million)',
    'Exposed GDP (EGDP) (Billion US$)',
    'Potential global subsidence index in 2010 (PGSI)',
    'Potential exposed population in 2040 (Million)',
    'Exposed GDP (EGDP) (Billion US$) in 2040',
    'Potential subsidence index 2040',
]
df = df[columns]

# Convert the pandas dataframe to an xarray dataset
ds = xr.Dataset.from_dataframe(df)

# Write the xarray dataset to a netCDF file
ds.to_netcdf(dataset_dir_path)

### Check CF compliancy of the original NetCDF files

In [86]:
# open datasets
# load_dataset reads the file into memory and closes it again, so no open
# handle keeps the file locked when the NetCDF file is rewritten on a re-run
ds = xr.load_dataset(dataset_dir_path)

# check original dataset
ds

In [87]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile=dataset_dir_path, 
                 working_dir=CF_dir
                 )

In [88]:
# save original CF compliancy: hand over the output captured in `cap`
save_compliancy(
    cap,
    testfile=dataset_dir_path,
    working_dir=CF_dir,
)



### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [107]:
# open datasets
# load_dataset reads the file into memory and closes it again, so the
# alterations below work on an in-memory copy and no file handle stays open
ds = xr.load_dataset(dataset_dir_path)

# check original dataset
ds

In [108]:
# NetCDF attribute alterations by means of metadata template

# Read the global attributes from the JSON metadata template; the context
# manager closes the file as soon as it has been parsed.
with open(
    dataset_dir.joinpath("metadata_subsidence.json"), encoding="utf-8"
) as f_global:
    meta_global = json.load(f_global)

# Copy every template entry onto the dataset as a global attribute
for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        # stored as a JSON string (presumably because the value is a nested
        # structure that cannot be written as a NetCDF attribute — confirm)
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

# Declare the CF version the file is meant to comply with
ds.attrs['Conventions'] = "CF-1.8"

In [111]:
# Name the row dimension created from the dataframe index "stations"
dsn = ds.rename_dims({"index": "stations"})

# merge variables for 2010 and 2040 into one variable
# Each entry maps the new variable name to its (2010 column, 2040 column).
# NOTE(review): the first GDP column carries no year in its name; it is
# presumably the 2010 value — confirm against the source data.
merge_map = {
    "exp_pop": (
        "Potential exposed population in 2010 (Million)",
        "Potential exposed population in 2040 (Million)",
    ),
    "exp_gdp": (
        "Exposed GDP (EGDP) (Billion US$)",
        "Exposed GDP (EGDP) (Billion US$) in 2040",
    ),
    "subsidence_index": (
        "Potential global subsidence index in 2010 (PGSI)",
        "Potential subsidence index 2040",
    ),
}

# Passing a named pandas index as `dim` makes xr.concat create the new "time"
# dimension and label its two entries with the years.
years = pd.Index([2010, 2040], name="time")
for new_name, (name_2010, name_2040) in merge_map.items():
    dsn[new_name] = xr.concat([dsn[name_2010], dsn[name_2040]], dim=years)
    # the per-year columns are now redundant
    dsn = dsn.drop_vars([name_2010, name_2040])

# rename variables, if necessary
dsn = dsn.rename_vars({"Country": "country"})

# TODO: decide whether the leftover "index" coordinate should be dropped
dsn


In [None]:
# TODO: make CF compliant...
# NUTS2 regions
# COUNTRY code names..? check SM data
# different variables with different units and dimensions?
# how to make a multidimensional xarray file?

In [99]:
# write to NetCDF file to check compliancy

# prevent file locking, see: https://github.com/pydata/xarray/issues/2376
# (`os` is already imported in the first code cell)
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'

# Derive "<name>_CF.nc" from the file name only, so a ".nc" elsewhere in the
# path (e.g. in a folder name) is left untouched.
cf_path = dataset_dir_path.with_name(f"{dataset_dir_path.stem}_CF.nc")

# NOTE(review): this writes `ds` (global attributes updated), not `dsn`
# (renamed dimension/variables) — confirm which dataset is meant to be checked.
ds.to_netcdf(path=str(cf_path))

### Check CF compliancy of the altered NetCDF files

In [100]:
%%capture cap --no-stderr
# check altered CF compliancy

check_compliancy(testfile=str(dataset_dir_path).replace(".nc", "_CF.nc"), working_dir=CF_dir)

In [101]:
# save altered CF compliancy: hand over the output captured in `cap`
# "<name>_CF.nc" is derived from the file name only, so a ".nc" elsewhere in
# the path (e.g. in a folder name) is left untouched.
save_compliancy(
    cap,
    testfile=str(dataset_dir_path.with_name(f"{dataset_dir_path.stem}_CF.nc")),
    working_dir=CF_dir,
)

