# Infant Mortality Rate Indicator

Notebook environment to migrate UN Inter-agency Group for Child Mortality csv file to CF compliant zarr

In [1]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

<IPython.core.display.Javascript object>

### Configure OS independent paths

In [2]:
#%pip install tqdm

<IPython.core.display.Javascript object>

In [3]:
# Import standard packages
import os
import pathlib

import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import math
from tqdm import tqdm 

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives
gca_data_dir = r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets"


# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)

<IPython.core.display.Javascript object>

In [4]:
# Project paths & files (manual input)
dataset_dir = pathlib.Path().joinpath(gca_data_dir,"03_Vulnerability","Child_Mortality")
dataset_dir_path = dataset_dir.joinpath("Child_Mortality_original.nc")
CF_dir = dataset_dir.joinpath("CF")  # directory to save output CF check files
template_path= pathlib.Path().joinpath(gca_data_dir,r"04_Auxillary_files\Arjen_Vector_Template")
dataset_dir_path

WindowsPath('P:/11209197-018-global-coastal-atlas/MSc_students/ClenmarRowe/Data/All_Datasets/Orig_Datasets/03_Vulnerability/Child_Mortality/Child_Mortality_original.nc')

<IPython.core.display.Javascript object>

In [5]:
df_template=pd.read_csv(template_path)
df_template

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat
0,BOX_028_183_0,CHL,South America,Chile,-74.386310,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.388940
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425
3,BOX_028_183_3,CHL,South America,Chile,-74.373950,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348
...,...,...,...,...,...,...,...,...,...,...
1739821,BOX_211_067_149,RUS,Europe,Russia,39.929937,64.701462,39.935198,64.698350,39.940460,64.695238
1739822,BOX_211_067_150,RUS,Europe,Russia,39.933577,64.702586,39.938839,64.699474,39.944100,64.696363
1739823,BOX_211_067_151,RUS,Europe,Russia,39.935546,64.703502,39.942003,64.700833,39.948460,64.698164
1739824,BOX_211_067_152,RUS,Europe,Russia,39.937050,64.704370,39.944697,64.702356,39.952343,64.700341


<IPython.core.display.Javascript object>

In [6]:

unique_temp=df_template["country_name"].unique()
unique_temp_code=df_template["country_id"].unique()
unique_temp_code

array(['CHL', 'unknown', 'ARG', 'FLK', 'SGS', 'ATF', 'NZL', 'URY', 'BRA',
       'SHN', 'ZAF', 'AUS', 'TON', 'FJI', 'PYF', 'COK', 'PER', 'NAM',
       'MOZ', 'MDG', 'FRA', 'MUS', 'NCL', 'VUT', 'WSM', 'ASM', 'WLF',
       'KIR', 'ECU', 'GAB', 'AGO', 'COG', 'COD', 'TZA', 'KEN', 'COM',
       'SOM', 'SYC', 'IOT', 'MDV', 'IDN', 'TLS', 'PNG', 'SLB', 'MEX',
       'GTM', 'SLV', 'PAN', 'COL', 'BLZ', 'HND', 'CRI', 'NIC', 'VEN',
       'GUY', 'TTO', 'KNA', 'SUR', 'NLD', 'ATG', 'MSR', 'CUW', 'LCA',
       'ABW', 'BRB', 'DOM', 'VCT', 'GRD', 'DMA', 'CPV', 'GHA', 'GNB',
       'SEN', 'LBR', 'GMB', 'SLE', 'GIN', 'CIV', 'MRT', 'NGA', 'CMR',
       'BEN', 'GNQ', 'TGO', 'SOL', 'SAU', 'ERI', 'YEM', 'DJI', 'OMN',
       'LKA', 'IND', 'MMR', 'MYS', 'SGP', 'VNM', 'THA', 'KHM', 'PHL',
       'BRN', 'PLW', 'FSM', 'MNP', 'GUM', 'MHL', 'USA', 'HTI', 'BHS',
       'CUB', 'JAM', 'TCA', 'CYM', 'VGB', 'PRI', 'VIR', 'BLM', 'AIA',
       'BMU', 'MAF', 'ESP', 'MAR', 'PRT', 'LBY', 'TUN', 'EGY', 'LBN',
       'ISR', 'J

<IPython.core.display.Javascript object>

In [7]:
# Data from other dataset input here
df_CM=pd.read_csv(dataset_dir.joinpath("Mortality rate per 1000 live births\API_SP.DYN.IMRT.IN_DS2_en_csv_v2_6508440.csv"),sep=",",skiprows=list(range(0,4)))
df_CM.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Unnamed: 67
0,Aruba,ABW,"Mortality rate, infant (per 1,000 live births)",SP.DYN.IMRT.IN,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,"Mortality rate, infant (per 1,000 live births)",SP.DYN.IMRT.IN,,,,,,,...,50.858298,49.416164,48.047765,46.638627,45.268284,44.08135,43.027778,42.004211,,
2,Afghanistan,AFG,"Mortality rate, infant (per 1,000 live births)",SP.DYN.IMRT.IN,,,,228.9,225.1,221.2,...,55.0,53.0,51.1,49.4,47.8,46.3,44.8,43.4,,
3,Africa Western and Central,AFW,"Mortality rate, infant (per 1,000 live births)",SP.DYN.IMRT.IN,,,,,,,...,69.98855,68.760967,67.571981,66.373973,64.945255,63.556011,62.165177,60.749633,,
4,Angola,AGO,"Mortality rate, infant (per 1,000 live births)",SP.DYN.IMRT.IN,,,,,,,...,60.5,57.9,55.7,53.8,52.0,50.4,48.7,47.2,,


<IPython.core.display.Javascript object>

In [8]:
# Original dataset dependent

#df_CM=df_CM[[df_CM.columns[0],df_CM.columns[1],df_CM.columns[-3],df_CM.columns[-4],df_CM.columns[-5]]]
# country_list_RS=df_RS[country_column_RS]

df_CM=df_CM.drop(["Indicator Name","Indicator Code","Unnamed: 67"],axis=1)


<IPython.core.display.Javascript object>

In [9]:
df_CM["3_yr_Average"]= (df_CM["2021"]+df_CM["2020"]+df_CM["2019"])/3
df_CM.head()

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,3_yr_Average
0,Aruba,ABW,,,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,,,,,,,,,...,50.858298,49.416164,48.047765,46.638627,45.268284,44.08135,43.027778,42.004211,,43.037779
2,Afghanistan,AFG,,,,228.9,225.1,221.2,217.4,213.5,...,55.0,53.0,51.1,49.4,47.8,46.3,44.8,43.4,,44.833333
3,Africa Western and Central,AFW,,,,,,,,,...,69.98855,68.760967,67.571981,66.373973,64.945255,63.556011,62.165177,60.749633,,62.156941
4,Angola,AGO,,,,,,,,,...,60.5,57.9,55.7,53.8,52.0,50.4,48.7,47.2,,48.766667


<IPython.core.display.Javascript object>

In [10]:
unique_CM=df_CM["Country Name"].unique()
unique_CM_code=df_CM["Country Code"].unique()
unique_CM_code

array(['ABW', 'AFE', 'AFG', 'AFW', 'AGO', 'ALB', 'AND', 'ARB', 'ARE',
       'ARG', 'ARM', 'ASM', 'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL',
       'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ',
       'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN',
       'CEB', 'CHE', 'CHI', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG',
       'COL', 'COM', 'CPV', 'CRI', 'CSS', 'CUB', 'CUW', 'CYM', 'CYP',
       'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'EAP', 'EAR',
       'EAS', 'ECA', 'ECS', 'ECU', 'EGY', 'EMU', 'ERI', 'ESP', 'EST',
       'ETH', 'EUU', 'FCS', 'FIN', 'FJI', 'FRA', 'FRO', 'FSM', 'GAB',
       'GBR', 'GEO', 'GHA', 'GIB', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC',
       'GRD', 'GRL', 'GTM', 'GUM', 'GUY', 'HIC', 'HKG', 'HND', 'HPC',
       'HRV', 'HTI', 'HUN', 'IBD', 'IBT', 'IDA', 'IDB', 'IDN', 'IDX',
       'IMN', 'IND', 'INX', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA',
       'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR', 'KNA',
       'KOR', 'KWT',

<IPython.core.display.Javascript object>

In [11]:
#Create a list of matching countries

good_list=[]

for i in range(0,len(unique_temp)):
    for j in range(0,len(unique_CM)):
        if unique_temp[i]==unique_CM[j]:
            good_list.append(unique_temp[i])
good_list
            
len(good_list)


137

<IPython.core.display.Javascript object>

In [12]:
#Create a missing list

bad_list_temp=set(unique_temp).difference(set(good_list))
bad_list_CM=set(unique_CM).difference(set(good_list))
len(bad_list_CM)

129

<IPython.core.display.Javascript object>

In [13]:


#to dataframe
bad_list_CM_eye=pd.DataFrame({"child mortality": list(bad_list_CM)})
bad_list_temp_eye=pd.DataFrame({"Arjen Template2": list(bad_list_temp)})

#to excel for eye check
bad_list_CM_eye.to_excel(dataset_dir.joinpath("eye_checker_CM.xlsx"))
bad_list_temp_eye.to_excel(dataset_dir.joinpath("eye_checker_temp_code.xlsx"))

<IPython.core.display.Javascript object>

In [14]:
# Switch names that are similar but problematic

name_mapping = {

"Hong Kong SAR, China"	:"Hong Kong S.A.R.",
"Korea, Rep.":	"South Korea",
"St. Vincent and the Grenadines":	"Saint Vincent and the Grenadines",
"Bahamas, The":	"The Bahamas",
"Congo, Rep.":"Republic of Congo",
"Venezuela, RB"	:"Venezuela",
"St. Lucia":	"Saint Lucia",
"St. Kitts and Nevis":	"Saint Kitts and Nevis",
"Micronesia, Fed. Sts."	:"Federated States of Micronesia",
"Congo, Dem. Rep.":	"Democratic Republic of the Congo",
"Syrian Arab Republic":	"Syria",
"Cabo Verde":	"Cape Verde",
"Gambia, The":	"Gambia",
"Egypt, Arab Rep.":	"Egypt",
"Turkiye":	"Turkey",
"Virgin Islands (U.S.)"	:"United States Virgin Islands",
"Viet Nam"	:"Vietnam",
"Cote d'Ivoire"	:"Ivory Coast",
"Korea, Dem. People's Rep."	:"North Korea",
"United States":"United States of America",
"Iran, Islamic Rep."	:"Iran",
"Brunei Darussalam"	:"Brunei",
"Yemen, Rep."	:"Yemen",
"Tanzania"	:"United Republic of Tanzania",
"St. Martin (French part)"	:"Saint Martin",
"Sint Maarten (Dutch part)"	:"Saint Martin",
"Guinea-Bissau":"Guinea Bissau",
"Russian Federation"	:"Russia"


}

<IPython.core.display.Javascript object>

In [15]:
df_CM["Country Name"]=df_CM["Country Name"].replace(name_mapping)
unique_CM_new= df_CM["Country Name"].unique()
unique_CM_new

array(['Aruba', 'Africa Eastern and Southern', 'Afghanistan',
       'Africa Western and Central', 'Angola', 'Albania', 'Andorra',
       'Arab World', 'United Arab Emirates', 'Argentina', 'Armenia',
       'American Samoa', 'Antigua and Barbuda', 'Australia', 'Austria',
       'Azerbaijan', 'Burundi', 'Belgium', 'Benin', 'Burkina Faso',
       'Bangladesh', 'Bulgaria', 'Bahrain', 'The Bahamas',
       'Bosnia and Herzegovina', 'Belarus', 'Belize', 'Bermuda',
       'Bolivia', 'Brazil', 'Barbados', 'Brunei', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada',
       'Central Europe and the Baltics', 'Switzerland', 'Channel Islands',
       'Chile', 'China', 'Ivory Coast', 'Cameroon',
       'Democratic Republic of the Congo', 'Republic of Congo',
       'Colombia', 'Comoros', 'Cape Verde', 'Costa Rica',
       'Caribbean small states', 'Cuba', 'Curacao', 'Cayman Islands',
       'Cyprus', 'Czechia', 'Germany', 'Djibouti', 'Dominica', 'Denmark',
       'Dominican Republic

<IPython.core.display.Javascript object>

In [16]:
#Create a list of matching countries

good_list=[]

for i in range(0,len(unique_temp)):
    for j in range(0,len(unique_CM_new)):
        if unique_temp[i]==unique_CM_new[j]:
            good_list.append(unique_temp[i])
good_list
            
len(good_list)


164

<IPython.core.display.Javascript object>

In [17]:
#Create a missing list

bad_list_temp=set(unique_temp).difference(set(good_list))
bad_list_CM=set(unique_CM_new).difference(set(good_list))
len(bad_list_CM)

101

<IPython.core.display.Javascript object>

In [18]:
df_CM.head()

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,3_yr_Average
0,Aruba,ABW,,,,,,,,,...,,,,,,,,,,
1,Africa Eastern and Southern,AFE,,,,,,,,,...,50.858298,49.416164,48.047765,46.638627,45.268284,44.08135,43.027778,42.004211,,43.037779
2,Afghanistan,AFG,,,,228.9,225.1,221.2,217.4,213.5,...,55.0,53.0,51.1,49.4,47.8,46.3,44.8,43.4,,44.833333
3,Africa Western and Central,AFW,,,,,,,,,...,69.98855,68.760967,67.571981,66.373973,64.945255,63.556011,62.165177,60.749633,,62.156941
4,Angola,AGO,,,,,,,,,...,60.5,57.9,55.7,53.8,52.0,50.4,48.7,47.2,,48.766667


<IPython.core.display.Javascript object>

In [19]:
# df_CM=df_CM[["Country Name","3_yr_Average"]]
# df_CM

<IPython.core.display.Javascript object>

In [20]:
df_template.head()
# len(df_template)

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat
0,BOX_028_183_0,CHL,South America,Chile,-74.38631,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.38894
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425
3,BOX_028_183_3,CHL,South America,Chile,-74.37395,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348


<IPython.core.display.Javascript object>

In [21]:


# Merge df_template with RS_counter on the "Country" column
merged_df = pd.merge(df_template, df_CM, left_on="country_name", right_on="Country Name", how="left")

# Rename the merged column to "Recent_Shocks (Deaths)"
merged_df.rename(columns={"3_yr_Average": "Infant Mortality Rate (3_yr_Average)"}, inplace=True)

# Fill NAs with "N/A" in the "Recent_Shocks (Deaths)" column
#merged_df["Child Mortality Rate (3_yr_Average)"] = merged_df["Child Mortality Rate (3_yr_Average))"].fillna(0).astype('int64')
merged_df["Infant Mortality Rate (3_yr_Average)"] = merged_df["Infant Mortality Rate (3_yr_Average)"].fillna("N/A")

# Now merged_df_RS is the DataFrame with the appended total death data
merged_df=merged_df.drop("Country Name",axis=1)
merged_df


Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Infant Mortality Rate (3_yr_Average)
0,BOX_028_183_0,CHL,South America,Chile,-74.386310,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456,...,6.9,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.388940,...,6.9,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425,...,6.9,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667
3,BOX_028_183_3,CHL,South America,Chile,-74.373950,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574,...,6.9,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348,...,6.9,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1739914,BOX_211_067_149,RUS,Europe,Russia,39.929937,64.701462,39.935198,64.698350,39.940460,64.695238,...,7.2,6.6,6.1,5.6,5.1,4.7,4.4,4.1,,4.4
1739915,BOX_211_067_150,RUS,Europe,Russia,39.933577,64.702586,39.938839,64.699474,39.944100,64.696363,...,7.2,6.6,6.1,5.6,5.1,4.7,4.4,4.1,,4.4
1739916,BOX_211_067_151,RUS,Europe,Russia,39.935546,64.703502,39.942003,64.700833,39.948460,64.698164,...,7.2,6.6,6.1,5.6,5.1,4.7,4.4,4.1,,4.4
1739917,BOX_211_067_152,RUS,Europe,Russia,39.937050,64.704370,39.944697,64.702356,39.952343,64.700341,...,7.2,6.6,6.1,5.6,5.1,4.7,4.4,4.1,,4.4


<IPython.core.display.Javascript object>

In [22]:

#merged_df.to_csv(dataset_dir.joinpath("Child_mortality_mapped_to_transect.csv"),index=False)

<IPython.core.display.Javascript object>

In [23]:
df=pd.read_csv(dataset_dir.joinpath("Child_mortality_mapped_to_transect.csv"))
df.head()

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat,...,2015,2016,2017,2018,2019,2020,2021,2022,Infant Mortality Rate (3_yr_Average),Child Mortality Rate (3_yr_Average)
0,BOX_028_183_0,CHL,South America,Chile,-74.38631,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456,...,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667,5.866667
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.38894,...,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667,5.866667
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425,...,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667,5.866667
3,BOX_028_183_3,CHL,South America,Chile,-74.37395,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574,...,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667,5.866667
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348,...,6.8,6.7,6.5,6.3,6.1,5.9,5.6,,5.866667,5.866667


<IPython.core.display.Javascript object>

In [24]:
# Convert the pandas dataframe to an xarray dataset
ds = xr.Dataset.from_dataframe(df)
ds


<IPython.core.display.Javascript object>

In [25]:

os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
# Write the xarray dataset to a netCDF file
ds.to_netcdf(dataset_dir_path)

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [26]:
# open datasets
ds = xr.open_dataset(dataset_dir_path)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [27]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile= dataset_dir_path, 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [28]:
# save original CF compliancy
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [29]:
# open original datasets
ds = xr.open_dataset(dataset_dir_path)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [30]:
#Check columns to know which years to get
df.columns

Index(['transect_id', 'country_id', 'continent', 'country_name', 'Start_lon',
       'Start_lat', 'Intersect_lon', 'Intersect_lat', 'End_lon', 'End_lat',
       'Country Code', '1960', '1961', '1962', '1963', '1964', '1965', '1966',
       '1967', '1968', '1969', '1970', '1971', '1972', '1973', '1974', '1975',
       '1976', '1977', '1978', '1979', '1980', '1981', '1982', '1983', '1984',
       '1985', '1986', '1987', '1988', '1989', '1990', '1991', '1992', '1993',
       '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002',
       '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
       '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', 'Infant Mortality Rate (3_yr_Average)',
       'Child Mortality Rate (3_yr_Average)'],
      dtype='object')

<IPython.core.display.Javascript object>

In [31]:
df.iloc[:,11:-2].columns

Index(['1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
      dtype='object')

<IPython.core.display.Javascript object>

In [32]:
#make a dataframe of all the times and convert them to a numpy array
df_times=(df.iloc[:,11:-2].values)
df_times.shape


(1739919, 63)

<IPython.core.display.Javascript object>

In [33]:
# Define the start and end years
start_year =  1960
end_year =  2022

# Use date_range to create a DatetimeIndex with yearly frequency
date_range = pd.date_range(start=f'{start_year}-01-01', end=f'{end_year}-01-01', freq='AS')

# Convert the DatetimeIndex to a list if needed
date_list = date_range.tolist()
len(date_list)

63

<IPython.core.display.Javascript object>

In [34]:
ds_temporal=ds
ds_temporal

<IPython.core.display.Javascript object>

In [35]:
# combine start and end coordinates into a transect
from shapely.geometry import LineString

start_lons = ds_temporal["Start_lon"].values
start_lats = ds_temporal["Start_lat"].values
end_lons = ds_temporal["End_lon"].values
end_lats = ds_temporal["End_lat"].values
coords_temporal = zip(zip(start_lons, start_lats), zip(end_lons, end_lats))

ds_temporal["transect_geom"] = (
    ["index"],
    [str(LineString(line)) for line in coords_temporal],
)
ds_temporal["transect_geom"].attrs["long_name"] = "Transect Geometry"

<IPython.core.display.Javascript object>

In [36]:

# set some data variables to coordinates to *avoid duplication* of dimensions in later stage
ds_temporal = ds_temporal.set_coords(['transect_id', 'country_name', 'continent', 'Intersect_lon', 'Intersect_lat', 'transect_geom','Infant Mortality Rate (3_yr_Average)'])
ds_temporal

<IPython.core.display.Javascript object>

In [37]:
keep_vars = []
allvars = list(ds_temporal.keys())
delete_vars = list(set(allvars).difference(set(keep_vars)))

ds_temporal = ds_temporal.drop_vars(delete_vars)
ds_temporal

<IPython.core.display.Javascript object>

In [38]:
# Once coordinates are set, we can add another time dimension that will not duplicate 
ds_temporal=ds_temporal.expand_dims(dim={"time": date_list})
ds_temporal


<IPython.core.display.Javascript object>

In [39]:
# assign a new variable that canges both in time and in location(country)
ds_reshaped=ds_temporal.assign(Infant_mortality=(["index","time"],df_times))
ds_reshaped

<IPython.core.display.Javascript object>

In [40]:
# NetCDF variable and dimension alterations

# rename or swap dimension names, the latter in case the name already exists as coordinate
ds = ds_reshaped.rename_dims({"index": "nstations"})
ds


<IPython.core.display.Javascript object>

In [41]:
ds=ds.drop_vars(["index"])
ds

<IPython.core.display.Javascript object>

In [42]:
import json

# NetCDF attribute alterations
f_global = open(dataset_dir.joinpath("metadata_infant_mortality.json"))
meta_global = json.load(f_global)

for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs['Conventions'] = "CF-1.8"
ds

<IPython.core.display.Javascript object>

In [43]:
# change dtypes from unsigned to signed
object_vars = ['transect_id', 'country_name', 'continent', 'transect_geom']
for i in object_vars:
    ds[i] = ds[i].astype('S')
ds

<IPython.core.display.Javascript object>

In [44]:

# rename variables, if necessary
ds = ds.rename_vars(
       {"Intersect_lon": "lon", "Intersect_lat": "lat",
        "country_name": "country" ,
        "Infant Mortality Rate (3_yr_Average)" : "Infant_mort_3yr_mean"
       }
  )
ds

<IPython.core.display.Javascript object>

In [45]:


# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries
dataset_attributes = {
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    "transect_id": { "long_name": "Transect Identity", "units": "1"},
    "continent": { "long_name": "Continent", "units": "1"},
    "country": { "long_name": "Country", "units": "1"},
    "Infant_mortality": { "long_name": "Infant Mortality rate", "units": "1"},
    "Infant_mort_3yr_mean": { "long_name": "Infant Mortality rate (3 year Average)", "units": "1"},
    "time": { "long_name": "julian day (UT)"},
}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_attributes.items():
    try:
        ds[k].attrs = dataset_attributes[k]
    except:
        continue


ds

<IPython.core.display.Javascript object>

In [46]:
# Write the xarray dataset to a netCDF file
#Compliant netcdf
dataset_dir_path_CF=str(dataset_dir_path).replace("original","final")


ds.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy altered NetCDF files

In [47]:
# open datasets (only first file, rest is the same)
ds = xr.open_dataset(dataset_dir_path_CF)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [48]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [49]:
# save original CF compliancy (for first file)
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### write data to Zarr files

In [50]:
# export to zarr in write mode (to overwrite if exists)
ds.to_zarr(str(dataset_dir_path).replace("original","final").replace(".nc", ".zarr"), mode="w")

<xarray.backends.zarr.ZarrStore at 0x1d773960ec0>

<IPython.core.display.Javascript object>