# Mean Years of schooling Vulnerability Indicator

Notebook environment to migrate UNDP csv file to CF compliant zarr

In [160]:
# Optional; code formatter, installed as jupyter lab extension
#%load_ext lab_black
# Optional; code formatter, installed as jupyter notebook extension
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

### Configure OS independent paths

In [161]:
#%pip install tqdm

<IPython.core.display.Javascript object>

In [162]:
# Import standard packages
import os
import pathlib

import sys
import numpy as np
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
import math
from tqdm import tqdm 

from coclicodata.drive_config import p_drive
from coclicodata.etl.cf_compliancy_checker import check_compliancy, save_compliancy

# Define (local and) remote drives
gca_data_dir = r"P:\11209197-018-global-coastal-atlas\MSc_students\ClenmarRowe\Data\All_Datasets\Orig_Datasets"


# Workaround to the Windows OS (10) udunits error after installation of cfchecker: https://github.com/SciTools/iris/issues/404
os.environ["UDUNITS2_XML_PATH"] = str(
    pathlib.Path().home().joinpath(  # change to the udunits2.xml file dir in your Python installation
        r"AppData\Local\miniconda3\pkgs\udunits2-2.2.28-h892ecd3_0\Library\share\udunits\udunits2.xml" ###Changed from anaconda to miniconda  - added the new package udunits file
    )
)

<IPython.core.display.Javascript object>

In [163]:
# Project paths & files (manual input)
dataset_dir = pathlib.Path().joinpath(gca_data_dir,"03_Vulnerability","Mean_Years_of_Schooling")
dataset_dir_path = dataset_dir.joinpath("Mean_Years_of_Schooling_original.nc")
CF_dir = dataset_dir.joinpath("CF")  # directory to save output CF check files
template_path= pathlib.Path().joinpath(gca_data_dir,"04_Auxillary_files\Arjen_Vector_Template")
dataset_dir_path

WindowsPath('P:/11209197-018-global-coastal-atlas/MSc_students/ClenmarRowe/Data/All_Datasets/Orig_Datasets/03_Vulnerability/Mean_Years_of_Schooling/Mean_Years_of_Schooling_original.nc')

<IPython.core.display.Javascript object>

In [164]:
df_template=pd.read_csv(template_path)
df_template

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat
0,BOX_028_183_0,CHL,South America,Chile,-74.386310,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.388940
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425
3,BOX_028_183_3,CHL,South America,Chile,-74.373950,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348
...,...,...,...,...,...,...,...,...,...,...
1739821,BOX_211_067_149,RUS,Europe,Russia,39.929937,64.701462,39.935198,64.698350,39.940460,64.695238
1739822,BOX_211_067_150,RUS,Europe,Russia,39.933577,64.702586,39.938839,64.699474,39.944100,64.696363
1739823,BOX_211_067_151,RUS,Europe,Russia,39.935546,64.703502,39.942003,64.700833,39.948460,64.698164
1739824,BOX_211_067_152,RUS,Europe,Russia,39.937050,64.704370,39.944697,64.702356,39.952343,64.700341


<IPython.core.display.Javascript object>

In [165]:
country_temp=df_template.columns[3]
unique_temp=df_template[country_temp].unique()
unique_temp

array(['Chile', 'unknown', 'Argentina', 'Falkland Islands',
       'South Georgia and South Sandwich Islands',
       'French Southern and Antarctic Lands', 'New Zealand', 'Uruguay',
       'Brazil', 'Saint Helena', 'South Africa', 'Australia', 'Tonga',
       'Fiji', 'French Polynesia', 'Cook Islands', 'Peru', 'Namibia',
       'Mozambique', 'Madagascar', 'France', 'Mauritius', 'New Caledonia',
       'Vanuatu', 'Samoa', 'American Samoa', 'Wallis and Futuna',
       'Kiribati', 'Ecuador', 'Gabon', 'Angola', 'Republic of Congo',
       'Democratic Republic of the Congo', 'United Republic of Tanzania',
       'Kenya', 'Comoros', 'Somalia', 'Seychelles',
       'British Indian Ocean Territory', 'Maldives', 'Indonesia',
       'East Timor', 'Papua New Guinea', 'Solomon Islands', 'Mexico',
       'Guatemala', 'El Salvador', 'Panama', 'Colombia', 'Belize',
       'Honduras', 'Costa Rica', 'Nicaragua', 'Venezuela', 'Guyana',
       'Trinidad and Tobago', 'Saint Kitts and Nevis', 'Suriname',


<IPython.core.display.Javascript object>

In [166]:
# Data from other dataset input here
df_MYS=pd.read_excel(dataset_dir.joinpath("HDI","HDR21-22_Statistical_Annex_HDI_Table.xlsx"),skiprows=4)
df_MYS

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Human Development Index (HDI),Unnamed: 3,Life expectancy at birth,Unnamed: 5,Expected years of schooling,Unnamed: 7,Mean years of schooling,Unnamed: 9,Gross national income (GNI) per capita,Unnamed: 11,GNI per capita rank minus HDI rank,Unnamed: 13,HDI rank
0,HDI rank,Country,Value,,(years),,(years),,(years),,(2017 PPP $),,,,
1,,,2021,,2021,,2021,a,2021,a,2021,,2021,b,2020
2,,VERY HIGH HUMAN DEVELOPMENT,,,,,,,,,,,,,
3,1,Switzerland,0.962,,83.9872,,16.500299,,13.85966,,66933.00454,,5,,3
4,2,Norway,0.961,,83.2339,,18.1852,c,13.00363,,64660.10622,,6,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,,Column 2: UNDESA (2022a).,,,,,,,,,,,,,
268,,"Column 3: CEDLAS and World Bank (2022), ICF Ma...",,,,,,,,,,,,,
269,,"Column 4: Barro and Lee (2018), ICF Macro Demo...",,,,,,,,,,,,,
270,,"Column 5: IMF (2022), UNDESA (2022b), United N...",,,,,,,,,,,,,


<IPython.core.display.Javascript object>

In [167]:
# Original dataset dependent

df_MYS=df_MYS[[df_MYS.columns[1],df_MYS.columns[8]]]
#country_list_RS=df_RS[country_column_RS]
df_MYS


Unnamed: 0,Unnamed: 1,Mean years of schooling
0,Country,(years)
1,,2021
2,VERY HIGH HUMAN DEVELOPMENT,
3,Switzerland,13.85966
4,Norway,13.00363
...,...,...
267,Column 2: UNDESA (2022a).,
268,"Column 3: CEDLAS and World Bank (2022), ICF Ma...",
269,"Column 4: Barro and Lee (2018), ICF Macro Demo...",
270,"Column 5: IMF (2022), UNDESA (2022b), United N...",


<IPython.core.display.Javascript object>

In [168]:
df_MYS=df_MYS.dropna()
df_MYS

Unnamed: 0,Unnamed: 1,Mean years of schooling
0,Country,(years)
3,Switzerland,13.85966
4,Norway,13.00363
5,Iceland,13.76717
6,"Hong Kong, China (SAR)",12.22621
...,...,...
217,Sub-Saharan Africa,6.001639
219,Least developed countries,5.21532
220,Small island developing states,9.08964
222,Organisation for Economic Co-operation and Dev...,12.267755


<IPython.core.display.Javascript object>

In [169]:
df_MYS=df_MYS.iloc[1:192,:]
df_MYS

Unnamed: 0,Unnamed: 1,Mean years of schooling
3,Switzerland,13.85966
4,Norway,13.00363
5,Iceland,13.76717
6,"Hong Kong, China (SAR)",12.22621
7,Australia,12.72682
...,...,...
192,Burundi,3.129267
193,Central African Republic,4.334
194,Niger,2.116717
195,Chad,2.573774


<IPython.core.display.Javascript object>

In [170]:
df_MYS.rename(columns={"Unnamed: 1":"Country"}, inplace=True)
df_MYS

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_MYS.rename(columns={"Unnamed: 1":"Country"}, inplace=True)


Unnamed: 0,Country,Mean years of schooling
3,Switzerland,13.85966
4,Norway,13.00363
5,Iceland,13.76717
6,"Hong Kong, China (SAR)",12.22621
7,Australia,12.72682
...,...,...
192,Burundi,3.129267
193,Central African Republic,4.334
194,Niger,2.116717
195,Chad,2.573774


<IPython.core.display.Javascript object>

In [171]:
df_MYS["Country"].unique()

array(['Switzerland', 'Norway', 'Iceland', 'Hong Kong, China (SAR)',
       'Australia', 'Denmark', 'Sweden', 'Ireland', 'Germany',
       'Netherlands', 'Finland', 'Singapore', 'Belgium', 'New Zealand',
       'Canada', 'Liechtenstein', 'Luxembourg', 'United Kingdom', 'Japan',
       'Korea (Republic of)', 'United States', 'Israel', 'Malta',
       'Slovenia', 'Austria', 'United Arab Emirates', 'Spain', 'France',
       'Cyprus', 'Italy', 'Estonia', 'Czechia', 'Greece', 'Poland',
       'Bahrain', 'Lithuania', 'Saudi Arabia', 'Portugal', 'Latvia',
       'Andorra', 'Croatia', 'Chile', 'Qatar', 'San Marino', 'Slovakia',
       'Hungary', 'Argentina', 'Türkiye', 'Montenegro', 'Kuwait',
       'Brunei Darussalam', 'Russian Federation', 'Romania', 'Oman',
       'Bahamas', 'Kazakhstan', 'Trinidad and Tobago', 'Costa Rica',
       'Uruguay', 'Belarus', 'Panama', 'Malaysia', 'Georgia', 'Mauritius',
       'Serbia', 'Thailand', 'Albania', 'Bulgaria', 'Grenada', 'Barbados',
       'Antigua an

<IPython.core.display.Javascript object>

In [172]:
# First Round Screening; Switch names that are similar but problematic

name_mapping = {

    "Saint Martin (French Part)":	"Saint Martin",
    "Viet Nam"	:"Vietnam",
    "Guinea-Bissau"	:"Guinea Bissau",
    "Bahamas"	:"The Bahamas",
    "Taiwan (Province of China)"	:"Taiwan",
    "Saint Barthélemy":	"Saint Barthelemy",
    "United Kingdom of Great Britain and Northern Ireland"	:"United Kingdom",
    "Republic of Korea"	:"South Korea",
    "Cabo Verde"	:"Cape Verde",
    "Netherlands (Kingdom of the)"	:"Netherlands",
    "China, Hong Kong Special Administrative Region"	:"Hong Kong S.A.R.",
    "Türkiye"	:"Turkey",
    "Micronesia (Federated States of)":	"Federated States of Micronesia",
    "Russian Federation":	"Russia",
    "Iran (Islamic Republic of)":	"Iran",
    "Democratic People's Republic of Korea"	:"North Korea",
    "Congo":	"Republic of Congo",
    "Wallis and Futuna Islands":	"Wallis and Futuna",
    "Venezuela (Bolivarian Republic of)":	"Venezuela",
    "Syrian Arab Republic":	"Syria",
    "Timor-Leste"	:"East Timor",
    "Côte d'Ivoire"	:"Ivory Coast",

    "Hong Kong SAR, China"	:"Hong Kong S.A.R.",
    "Korea, Rep.":	"South Korea",
    "St. Vincent and the Grenadines":	"Saint Vincent and the Grenadines",
    "Bahamas, The":	"The Bahamas",
    "Congo, Rep.":"Republic of Congo",
    "Venezuela, RB"	:"Venezuela",
    "St. Lucia":	"Saint Lucia",
    "St. Kitts and Nevis":	"Saint Kitts and Nevis",
    "Micronesia, Fed. Sts."	:"Federated States of Micronesia",
    "Congo, Dem. Rep.":	"Democratic Republic of the Congo",
    "Syrian Arab Republic":	"Syria",
    "Cabo Verde":	"Cape Verde",
    "Gambia, The":	"Gambia",
    "Egypt, Arab Rep.":	"Egypt",
    "Turkiye":	"Turkey",
    "Virgin Islands (U.S.)"	:"United States Virgin Islands",
    "Viet Nam"	:"Vietnam",
    "Cote d'Ivoire"	:"Ivory Coast",
    "Korea, Dem. People's Rep."	:"North Korea",
    "United States":"United States of America",
    "Iran, Islamic Rep."	:"Iran",
    "Brunei Darussalam"	:"Brunei",
    "Yemen, Rep."	:"Yemen",
    "Tanzania"	:"United Republic of Tanzania",
    "St. Martin (French part)"	:"Saint Martin",
    "Sint Maarten (Dutch part)"	:"Saint Martin",
    "Guinea-Bissau":"Guinea Bissau",
    "Russian Federation"	:"Russia",

    #Added for this specific MYS eye check
    "Korea (Republic of)":	"South Korea",
    "Hong Kong, China (SAR)"	:"Hong Kong S.A.R.",
    "Tanzania (United Republic of)":	"United Republic of Tanzania",
    "Congo (Democratic Republic of the)":	"Democratic Republic of the Congo",


}

<IPython.core.display.Javascript object>

In [173]:
df_MYS["Country"]=df_MYS["Country"].replace(name_mapping)
unique_MYS= df_MYS["Country"].unique()
unique_MYS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_MYS["Country"]=df_MYS["Country"].replace(name_mapping)


array(['Switzerland', 'Norway', 'Iceland', 'Hong Kong S.A.R.',
       'Australia', 'Denmark', 'Sweden', 'Ireland', 'Germany',
       'Netherlands', 'Finland', 'Singapore', 'Belgium', 'New Zealand',
       'Canada', 'Liechtenstein', 'Luxembourg', 'United Kingdom', 'Japan',
       'South Korea', 'United States of America', 'Israel', 'Malta',
       'Slovenia', 'Austria', 'United Arab Emirates', 'Spain', 'France',
       'Cyprus', 'Italy', 'Estonia', 'Czechia', 'Greece', 'Poland',
       'Bahrain', 'Lithuania', 'Saudi Arabia', 'Portugal', 'Latvia',
       'Andorra', 'Croatia', 'Chile', 'Qatar', 'San Marino', 'Slovakia',
       'Hungary', 'Argentina', 'Turkey', 'Montenegro', 'Kuwait', 'Brunei',
       'Russia', 'Romania', 'Oman', 'The Bahamas', 'Kazakhstan',
       'Trinidad and Tobago', 'Costa Rica', 'Uruguay', 'Belarus',
       'Panama', 'Malaysia', 'Georgia', 'Mauritius', 'Serbia', 'Thailand',
       'Albania', 'Bulgaria', 'Grenada', 'Barbados',
       'Antigua and Barbuda', 'Seychelles

<IPython.core.display.Javascript object>

In [174]:
#Create a present list
good_list=[]

for i in range(0,len(unique_temp)):
    for j in range(0,len(unique_MYS)):
        if unique_temp[i]==unique_MYS[j]:
            good_list.append(unique_temp[i])
good_list
            
len(good_list)

147

<IPython.core.display.Javascript object>

In [175]:
#Create a missing list
bad_list_temp=set(unique_temp).difference(set(good_list))
bad_list_MYS=set(unique_MYS).difference(set(good_list))
len(bad_list_MYS)

44

<IPython.core.display.Javascript object>

In [176]:


#to dataframe
bad_list_MYS_eye=pd.DataFrame({"Mean Years Schooling": list(bad_list_MYS)})
bad_list_temp_eye=pd.DataFrame({"Arjen Template": list(bad_list_temp)})

#to excel for eye check
bad_list_MYS_eye.to_excel(dataset_dir.joinpath("eye_checker_MYS.xlsx"))

bad_list_temp_eye.to_excel(dataset_dir.joinpath("eye_checker_temp.xlsx"))

<IPython.core.display.Javascript object>

In [177]:
df_MYS

Unnamed: 0,Country,Mean years of schooling
3,Switzerland,13.85966
4,Norway,13.00363
5,Iceland,13.76717
6,Hong Kong S.A.R.,12.22621
7,Australia,12.72682
...,...,...
192,Burundi,3.129267
193,Central African Republic,4.334
194,Niger,2.116717
195,Chad,2.573774


<IPython.core.display.Javascript object>

In [178]:
df_template.head()
# len(df_template)

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat
0,BOX_028_183_0,CHL,South America,Chile,-74.38631,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.38894
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425
3,BOX_028_183_3,CHL,South America,Chile,-74.37395,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348


<IPython.core.display.Javascript object>

In [179]:


# Merge df_template with RS_counter on the "Country" column
merged_df_MYS = pd.merge(df_template, df_MYS, left_on="country_name", right_on="Country", how="left")

# Rename the merged column to "Recent_Shocks (Deaths)"
# merged_df_RS.rename(columns={"Total Deaths": "Recent_Shocks (Deaths)"}, inplace=True)

# Fill NAs with "N/A" in the "Recent_Shocks (Deaths)" column
# merged_df_RS["Recent_Shocks (Deaths)"] = merged_df_RS["Recent_Shocks (Deaths)"].fillna(0).astype('int64')
# merged_df_RS["Recent_Shocks (Deaths)"] = merged_df_RS["Recent_Shocks (Deaths)"].fillna("N/A")

# Now merged_df_RS is the DataFrame with the appended total death data
merged_df_MYS=merged_df_MYS.drop(merged_df_MYS.columns[-2],axis=1)
merged_df_MYS


Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat,Mean years of schooling
0,BOX_028_183_0,CHL,South America,Chile,-74.386310,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456,10.934966
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.388940,10.934966
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425,10.934966
3,BOX_028_183_3,CHL,South America,Chile,-74.373950,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574,10.934966
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348,10.934966
...,...,...,...,...,...,...,...,...,...,...,...
1739821,BOX_211_067_149,RUS,Europe,Russia,39.929937,64.701462,39.935198,64.698350,39.940460,64.695238,12.774288
1739822,BOX_211_067_150,RUS,Europe,Russia,39.933577,64.702586,39.938839,64.699474,39.944100,64.696363,12.774288
1739823,BOX_211_067_151,RUS,Europe,Russia,39.935546,64.703502,39.942003,64.700833,39.948460,64.698164,12.774288
1739824,BOX_211_067_152,RUS,Europe,Russia,39.937050,64.704370,39.944697,64.702356,39.952343,64.700341,12.774288


<IPython.core.display.Javascript object>

In [181]:

merged_df_MYS.to_csv(dataset_dir.joinpath("Mean_years_schooling_mapped_to_transect.csv"),index=False)

<IPython.core.display.Javascript object>

In [182]:
df=pd.read_csv(dataset_dir.joinpath("Mean_years_schooling_mapped_to_transect.csv"))
df

Unnamed: 0,transect_id,country_id,continent,country_name,Start_lon,Start_lat,Intersect_lon,Intersect_lat,End_lon,End_lat,Mean years of schooling
0,BOX_028_183_0,CHL,South America,Chile,-74.386310,-50.377659,-74.390966,-50.382558,-74.395623,-50.387456,10.934966
1,BOX_028_183_1,CHL,South America,Chile,-74.382469,-50.379144,-74.387125,-50.384042,-74.391782,-50.388940,10.934966
2,BOX_028_183_2,CHL,South America,Chile,-74.378628,-50.380629,-74.383284,-50.385527,-74.387941,-50.390425,10.934966
3,BOX_028_183_3,CHL,South America,Chile,-74.373950,-50.382583,-74.379517,-50.387079,-74.385083,-50.391574,10.934966
4,BOX_028_183_4,CHL,South America,Chile,-74.370425,-50.384358,-74.375991,-50.388853,-74.381558,-50.393348,10.934966
...,...,...,...,...,...,...,...,...,...,...,...
1739821,BOX_211_067_149,RUS,Europe,Russia,39.929937,64.701462,39.935198,64.698350,39.940460,64.695238,12.774288
1739822,BOX_211_067_150,RUS,Europe,Russia,39.933577,64.702586,39.938839,64.699474,39.944100,64.696363,12.774288
1739823,BOX_211_067_151,RUS,Europe,Russia,39.935546,64.703502,39.942003,64.700833,39.948460,64.698164,12.774288
1739824,BOX_211_067_152,RUS,Europe,Russia,39.937050,64.704370,39.944697,64.702356,39.952343,64.700341,12.774288


<IPython.core.display.Javascript object>

In [183]:
# Convert the pandas dataframe to an xarray dataset
ds = xr.Dataset.from_dataframe(df)
ds


<IPython.core.display.Javascript object>

In [184]:

os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'
# Write the xarray dataset to a netCDF file
ds.to_netcdf(dataset_dir_path)

<IPython.core.display.Javascript object>

### Check CF compliancy original NetCDF files

In [185]:
# open datasets
ds = xr.open_dataset(dataset_dir_path)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [186]:
%%capture cap --no-stderr
# check original CF compliancy

check_compliancy(testfile= dataset_dir_path, 
                 working_dir=CF_dir
                 )


<IPython.core.display.Javascript object>

In [187]:
# save original CF compliancy
save_compliancy(cap, testfile=dataset_dir_path, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### Make CF compliant alterations to the NetCDF files (dataset dependent)

In [188]:
# open original datasets
ds = xr.open_dataset(dataset_dir_path)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [189]:
import json

# NetCDF attribute alterations
f_global = open(dataset_dir.joinpath("metadata_mean_years_of_schooling.json"))
meta_global = json.load(f_global)

for attr_name, attr_val in meta_global.items():
    if attr_name == 'PROVIDERS':
        attr_val = json.dumps(attr_val)
    ds.attrs[attr_name] = attr_val

ds.attrs['Conventions'] = "CF-1.8"
ds

<IPython.core.display.Javascript object>

In [190]:
# combine start and end coordinates into a transect
from shapely.geometry import LineString

start_lons = ds["Start_lon"].values
start_lats = ds["Start_lat"].values
end_lons = ds["End_lon"].values
end_lats = ds["End_lat"].values
coords = zip(zip(start_lons, start_lats), zip(end_lons, end_lats))

ds["transect_geom"] = (
    ["index"],
    [str(LineString(line)) for line in coords],
)
ds["transect_geom"].attrs["long_name"] = "Transect Geometry"

<IPython.core.display.Javascript object>

In [191]:
keep_vars = ['transect_id', 'country_name', 'continent', 'Intersect_lon', 'Intersect_lat', 'transect_geom', 'Mean years of schooling']
allvars = list(ds.keys())
delete_vars = list(set(allvars).difference(set(keep_vars)))

ds = ds.drop_vars(delete_vars)
ds

<IPython.core.display.Javascript object>

In [192]:
# NetCDF variable and dimension alterations

# rename or swap dimension names, the latter in case the name already exists as coordinate
ds = ds.rename_dims({"index": "nstations"})
ds


<IPython.core.display.Javascript object>

In [193]:
# change dtypes from unsigned to signed
object_vars = ['transect_id', 'country_name', 'continent', 'transect_geom']
for i in object_vars:
    ds[i] = ds[i].astype('S')
ds

<IPython.core.display.Javascript object>

In [194]:

# rename variables, if necessary
ds = ds.rename_vars(
       {"Intersect_lon": "lon", "Intersect_lat": "lat",
        "country_name": "country" ,
        "Mean years of schooling" : "mean_years_schooling"
       }
  )
ds

<IPython.core.display.Javascript object>

In [195]:


# add or change certain variable / coordinate attributes
### dataset attributes is a dictionary of dictionaries
dataset_attributes = {
    "lon": {"standard_name": "longitude", "long_name": "longitude", "units": "degrees_east"},
    "lat": {"standard_name": "latitude", "long_name": "latitude", "units": "degrees_north"},
    "transect_id": { "long_name": "Transect Identity", "units": "1"},
    "continent": { "long_name": "Continent", "units": "1"},
    "country": { "long_name": "Country", "units": "1"},
    "mean_years_schooling": { "long_name": "Mean years of schooling", "units": "1"}

}  # specify custom (CF convention) attributes

 # add / overwrite attributes
for k, v in dataset_attributes.items():
    try:
        ds[k].attrs = dataset_attributes[k]
    except:
        continue


ds

<IPython.core.display.Javascript object>

In [196]:
#Drop index from coordinates list
ds=ds.drop_vars(["index"])
ds


<IPython.core.display.Javascript object>

In [197]:

# set some data variables to coordinates to avoid duplication of dimensions in later stage if more dims are expanded
ds = ds.set_coords(["lon", "lat","transect_id", "country", "continent", "transect_geom"])
ds

<IPython.core.display.Javascript object>

In [198]:
# Write the xarray dataset to a netCDF file
#Compliant netcdf
dataset_dir_path_CF=str(dataset_dir_path).replace("original","final")


ds.to_netcdf(path=dataset_dir_path_CF)

<IPython.core.display.Javascript object>

### Check CF compliancy altered NetCDF files

In [199]:
# open datasets (only first file, rest is the same)
ds = xr.open_dataset(dataset_dir_path_CF)

# check original dataset
ds

<IPython.core.display.Javascript object>

In [200]:
%%capture cap --no-stderr
# check original CF compliancy (for first file)

check_compliancy(testfile=dataset_dir_path_CF, 
                 working_dir=CF_dir
                 )

<IPython.core.display.Javascript object>

In [201]:
# save original CF compliancy (for first file)
save_compliancy(cap, testfile=dataset_dir_path_CF, working_dir=CF_dir)



<IPython.core.display.Javascript object>

### write data to Zarr files

In [202]:
# export to zarr in write mode (to overwrite if exists)
ds.to_zarr(str(dataset_dir_path).replace("original","final").replace(".nc", ".zarr"), mode="w")

<xarray.backends.zarr.ZarrStore at 0x173c78c0840>

<IPython.core.display.Javascript object>