# World Protected Areas Indicator

Notebook environment to migrate UN Inter-agency Group for Child Mortality csv file to CF compliant zarr

In [40]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

### Configure OS independent paths

In [41]:
#%pip install tqdm

<IPython.core.display.Javascript object>

In [42]:
# Import standard packages
import os
import pathlib

import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import math
from tqdm import tqdm 

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives
gca_data_dir = r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets"


# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)

<IPython.core.display.Javascript object>

In [43]:
# Project paths & files (manual input)
dataset_dir = pathlib.Path().joinpath(gca_data_dir,"03_Vulnerability","World Database on Protected Areas")
dataset_dir_path = dataset_dir.joinpath("World_Protected_Areas_original.nc")
CF_dir = dataset_dir.joinpath("CF")  # directory to save output CF check files
template_path= pathlib.Path().joinpath(gca_data_dir,r"04_Auxillary_files\Arjen_Vector_Template.csv")
dataset_dir_path

WindowsPath('P:/11209197-018-global-coastal-atlas/MSc_students/ClenmarRowe/Data/All_Datasets/Orig_Datasets/03_Vulnerability/World Database on Protected Areas/World_Protected_Areas_original.nc')

<IPython.core.display.Javascript object>

In [44]:
df_template=pd.read_csv(template_path)
df_template

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat
0,BOX_028_183_0,CHL,South America,Chile,-74.386310,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.388940
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425
3,BOX_028_183_3,CHL,South America,Chile,-74.373950,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348
...,...,...,...,...,...,...,...,...,...,...
1739821,BOX_211_067_149,RUS,Europe,Russia,39.929937,64.701462,39.935198,64.698350,39.940460,64.695238
1739822,BOX_211_067_150,RUS,Europe,Russia,39.933577,64.702586,39.938839,64.699474,39.944100,64.696363
1739823,BOX_211_067_151,RUS,Europe,Russia,39.935546,64.703502,39.942003,64.700833,39.948460,64.698164
1739824,BOX_211_067_152,RUS,Europe,Russia,39.937050,64.704370,39.944697,64.702356,39.952343,64.700341


<IPython.core.display.Javascript object>

In [45]:
# Data from other dataset input here
df_WPA=pd.read_csv(dataset_dir.joinpath("World_Protected_Areas_Arjen_Vector_Template_buffered.csv"),sep=",")
df_WPA.head()

  df_WPA=pd.read_csv(dataset_dir.joinpath("World_Protected_Areas_Arjen_Vector_Template_buffered.csv"),sep=",")


Unnamed: 0,fid,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,...,OWN_TYPE,MANG_AUTH,MANG_PLAN,VERIF,METADATAID,SUB_LOC,PARENT_ISO3,ISO3,SUPP_INFO,CONS_OBJ
0,225765,BOX_064_286_142,NZL,Oceania,New Zealand,174.411328,-36.266288,174.415842,-36.27255,174.420356,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
1,225766,BOX_064_286_143,NZL,Oceania,New Zealand,174.415212,-36.264468,174.419725,-36.27073,174.424239,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
2,225767,BOX_064_286_144,NZL,Oceania,New Zealand,174.417553,-36.263412,174.423528,-36.26882,174.429502,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
3,225769,BOX_064_286_146,NZL,Oceania,New Zealand,174.422956,-36.258309,174.429341,-36.263405,174.435725,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
4,225801,BOX_064_286_179,NZL,Oceania,New Zealand,174.435656,-36.280972,174.431478,-36.274561,174.427299,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable


<IPython.core.display.Javascript object>

In [46]:
df_WPA.columns

Index(['fid', 'transect_id', 'country_id', 'continent', 'country_name',
       'Start_lon', 'Start_lat', 'Intersect_lon', 'Intersect_lat', 'End_lon',
       'End_lat', 'OBJECTID', 'WDPAID', 'WDPA_PID', 'PA_DEF', 'NAME',
       'ORIG_NAME', 'DESIG', 'DESIG_ENG', 'DESIG_TYPE', 'IUCN_CAT', 'INT_CRIT',
       'MARINE', 'REP_M_AREA', 'GIS_M_AREA', 'REP_AREA', 'GIS_AREA', 'NO_TAKE',
       'NO_TK_AREA', 'STATUS', 'STATUS_YR', 'GOV_TYPE', 'OWN_TYPE',
       'MANG_AUTH', 'MANG_PLAN', 'VERIF', 'METADATAID', 'SUB_LOC',
       'PARENT_ISO3', 'ISO3', 'SUPP_INFO', 'CONS_OBJ'],
      dtype='object')

<IPython.core.display.Javascript object>

In [47]:
# df_WPA_fin=df_WPA["STATUS"].dropna()
# df_WPA_fin


<IPython.core.display.Javascript object>

In [48]:


for col in df_WPA.columns:
    if isinstance(df_WPA[col].values[0], str):
        df_WPA[col] = df_WPA[col].replace(np.nan, 'Not Applicable')
    elif isinstance(df_WPA[col].values[0], (int, float)):
        df_WPA[col] = df_WPA[col].replace('Not Applicable',np.nan)
        pass
df_WPA

Unnamed: 0,fid,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,...,OWN_TYPE,MANG_AUTH,MANG_PLAN,VERIF,METADATAID,SUB_LOC,PARENT_ISO3,ISO3,SUPP_INFO,CONS_OBJ
0,225765,BOX_064_286_142,NZL,Oceania,New Zealand,174.411328,-36.266288,174.415842,-36.272550,174.420356,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
1,225766,BOX_064_286_143,NZL,Oceania,New Zealand,174.415212,-36.264468,174.419725,-36.270730,174.424239,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
2,225767,BOX_064_286_144,NZL,Oceania,New Zealand,174.417553,-36.263412,174.423528,-36.268820,174.429502,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
3,225769,BOX_064_286_146,NZL,Oceania,New Zealand,174.422956,-36.258309,174.429341,-36.263405,174.435725,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
4,225801,BOX_064_286_179,NZL,Oceania,New Zealand,174.435656,-36.280972,174.431478,-36.274561,174.427299,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1739821,1512940,BOX_188_000_39,POL,Europe,Poland,18.663036,54.410465,,,18.661677,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable
1739822,1566998,BOX_208_104_214,GBR,Europe,United Kingdom,-0.992502,60.700568,,,-0.988006,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable
1739823,1635683,BOX_209_264_164,SWE,Europe,Sweden,16.724558,58.365630,,,16.717210,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable
1739824,1635686,BOX_209_264_167,SWE,Europe,Sweden,16.702362,58.358494,,,16.711060,...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable


<IPython.core.display.Javascript object>

In [49]:
np.unique(df_WPA["STATUS"].values)

array(['Adopted', 'Designated', 'Established', 'Inscribed',
       'Not Applicable', 'Not Reported', 'Proposed'], dtype=object)

<IPython.core.display.Javascript object>

In [50]:
np.unique(df_WPA["GOV_TYPE"].values)

array(['Collaborative governance',
       'Federal or national ministry or agency',
       'For-profit organisations', 'Government-delegated management',
       'Indigenous peoples', 'Individual landowners', 'Joint governance',
       'Local communities', 'Non-profit organisations', 'Not Applicable',
       'Not Reported', 'Sub-national ministry or agency'], dtype=object)

<IPython.core.display.Javascript object>

In [51]:
# df_WPA.columns[13]
# df_WPA[df_WPA.columns[13]].values

<IPython.core.display.Javascript object>

In [52]:
# df_WPA[df_WPA.columns[13]].apply(lambda x:float(x))
# df_WPA.columns[13]

<IPython.core.display.Javascript object>

In [53]:

# df_WPA.to_csv(dataset_dir.joinpath("World_Protected_Areas_Arjen_Vector_Template_dtype_correct.csv"),index=False)

<IPython.core.display.Javascript object>

Warning Removed

In [54]:
df=pd.read_csv(dataset_dir.joinpath("World_Protected_Areas_Arjen_Vector_Template_dtype_correct.csv"))
df.head()

  df=pd.read_csv(dataset_dir.joinpath("World_Protected_Areas_Arjen_Vector_Template_dtype_correct.csv"))


Unnamed: 0,fid,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,...,OWN_TYPE,MANG_AUTH,MANG_PLAN,VERIF,METADATAID,SUB_LOC,PARENT_ISO3,ISO3,SUPP_INFO,CONS_OBJ
0,225765,BOX_064_286_142,NZL,Oceania,New Zealand,174.411328,-36.266288,174.415842,-36.27255,174.420356,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
1,225766,BOX_064_286_143,NZL,Oceania,New Zealand,174.415212,-36.264468,174.419725,-36.27073,174.424239,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
2,225767,BOX_064_286_144,NZL,Oceania,New Zealand,174.417553,-36.263412,174.423528,-36.26882,174.429502,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
3,225769,BOX_064_286_146,NZL,Oceania,New Zealand,174.422956,-36.258309,174.429341,-36.263405,174.435725,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable
4,225801,BOX_064_286_179,NZL,Oceania,New Zealand,174.435656,-36.280972,174.431478,-36.274561,174.427299,...,Not Reported,Department of Conservation,Not Reported,State Verified,1773.0,NZ-AUK,NZL,NZL,Not Applicable,Not Applicable


<IPython.core.display.Javascript object>

In [55]:
# Convert the pandas dataframe to an xarray dataset
ds = xr.Dataset.from_dataframe(df)
ds


<IPython.core.display.Javascript object>

In [56]:
ds=ds.drop_vars([ 'WDPA_PID'])
ds

<IPython.core.display.Javascript object>

In [57]:

os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
# Write the xarray dataset to a netCDF file
ds.to_netcdf(dataset_dir_path)

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [58]:
# # open datasets
# ds = xr.open_dataset(dataset_dir_path)

# # check original dataset
# ds

<IPython.core.display.Javascript object>

In [59]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile= dataset_dir_path, 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [60]:
# save original CF compliancy
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [61]:
# open original datasets
ds 

<IPython.core.display.Javascript object>

In [62]:
# combine start and end coordinates into a transect
from shapely.geometry import LineString

start_lons = ds["Start_lon"].values
start_lats = ds["Start_lat"].values
end_lons = ds["End_lon"].values
end_lats = ds["End_lat"].values
coords_temporal = zip(zip(start_lons, start_lats), zip(end_lons, end_lats))

ds["transect_geom"] = (
    ["index"],
    [str(LineString(line)) for line in coords_temporal],
)
ds["transect_geom"].attrs["long_name"] = "Transect Geometry"

<IPython.core.display.Javascript object>

In [63]:

# set some data variables to coordinates to *avoid duplication* of dimensions in later stage
ds = ds.set_coords(['transect_id', 'country_name', 'continent', 'Intersect_lon', 'Intersect_lat', 'transect_geom','country_id'])
ds

<IPython.core.display.Javascript object>

In [64]:
# keep_vars = []
# allvars = list(ds_temporal.keys())
# delete_vars = list(set(allvars).difference(set(keep_vars)))

# ds_temporal = ds_temporal.drop_vars(delete_vars)
# ds_temporal

<IPython.core.display.Javascript object>

In [65]:
# NetCDF variable and dimension alterations

# rename or swap dimension names, the latter in case the name already exists as coordinate
ds = ds.rename_dims({"index": "nstations"})
ds


<IPython.core.display.Javascript object>

In [66]:
ds=ds.drop_vars(["index"])
ds

<IPython.core.display.Javascript object>

In [67]:
import json

# NetCDF attribute alterations
f_global = open(dataset_dir.joinpath("metadata_world_database_on_protected_areas.json"))
meta_global = json.load(f_global)

for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs['Conventions'] = "CF-1.8"
ds

<IPython.core.display.Javascript object>

In [68]:
# change dtypes from unsigned to signed
object_vars = list(ds.variables)
for i in object_vars:
    if isinstance(ds[i].values[0],str) and (i not in ["NAME","ORIG_NAME","MANG_AUTH","MANG_PLAN","SUB_LOC","SUPP_INFO",'DESIG','DESIG_ENG'] ) :
        ds[i] = ds[i].astype('S')
ds

<IPython.core.display.Javascript object>

In [69]:

# rename variables, if necessary
ds = ds.rename_vars(
       {"Intersect_lon": "lon", "Intersect_lat": "lat",
        "country_name": "country" ,
       }
  )
ds

<IPython.core.display.Javascript object>

In [70]:


# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries
dataset_attributes = {
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    "transect_id": { "long_name": "Transect Identity", "units": "1"},
    "continent": { "long_name": "Continent", "units": "1"},
    "country": { "long_name": "Country", "units": "1"},
    "country_id": { "long_name": "Country Identification", "units": "1"}
}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_attributes.items():
    try:
        ds[k].attrs = dataset_attributes[k]
    except:
        continue


ds

<IPython.core.display.Javascript object>

In [71]:
# List of keys to add
keys_to_add = [ 'PA_DEF', 'NAME', 'ORIG_NAME', 'DESIG', 'DESIG_ENG', 'DESIG_TYPE', 
               'IUCN_CAT', 'INT_CRIT', 'MARINE', 'REP_M_AREA', 'GIS_M_AREA', 'REP_AREA', 'GIS_AREA', 
               'NO_TAKE', 'NO_TK_AREA', 'STATUS', 'STATUS_YR', 'GOV_TYPE', 'OWN_TYPE', 'MANG_AUTH', 
               'MANG_PLAN', 'VERIF', 'METADATAID', 'SUB_LOC', 'PARENT_ISO3', 'ISO3', 'SUPP_INFO', 
               'CONS_OBJ']

# Add attributes for each key
for key in keys_to_add:
    try:
        ds[key].attrs = {"long_name": key, "units": ""}
    except KeyError:
        print(f"Variable {key} not found in the dataset.")
ds


<IPython.core.display.Javascript object>

In [72]:
ds=ds.drop_vars([ 'Start_lon', 'Start_lat', 'End_lon', 'End_lat'])
ds

<IPython.core.display.Javascript object>

In [73]:
unsigned_string_and_IDs=["NAME","ORIG_NAME","MANG_AUTH","MANG_PLAN","SUB_LOC","SUPP_INFO",'DESIG','DESIG_ENG',"fid","OBJECTID","WDPAID"]
ds=ds.drop_vars(unsigned_string_and_IDs)
ds

<IPython.core.display.Javascript object>

In [74]:
# Write the xarray dataset to a netCDF file
#Compliant netcdf
dataset_dir_path_CF=str(dataset_dir_path).replace("original","final")


ds.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy altered NetCDF files

In [75]:
# # open datasets (only first file, rest is the same)
# ds = xr.open_dataset(dataset_dir_path_CF)

# # check original dataset
# ds

<IPython.core.display.Javascript object>

In [76]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [77]:
# save original CF compliancy (for first file)
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### write data to Zarr files

In [79]:
# export to zarr in write mode (to overwrite if exists)
from dask.diagnostics import ProgressBar
with ProgressBar():
    ds.to_zarr(str(dataset_dir_path).replace("_original.nc", ".zarr"), mode="w")

<IPython.core.display.Javascript object>