In [1]:
import os

# Root of the repository checkout, supplied through the REPO_DIR environment
# variable before the notebook is launched.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would otherwise raise.
    raise RuntimeError(
        "The REPO_DIR environment variable is not set; "
        "point it at the root of the repository before running this notebook."
    )

code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")

# Work from the code directory (presumably so the local `mosaiks` package
# imported below can be resolved -- TODO confirm).
os.chdir(code_dir)

import geopandas as gpd
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

import os

import sys


from mosaiks.label_utils.utils import geopandas_shape_grid, box_grid, assign_grid_points_to_gpdFile, get_dense_grid_for_gpdf_file
from mosaiks.label_utils.plotting_utils import plot_label_map_hist

# HDI data (Smits et al)

**Data Download:**

*Shapefiles and tabular data are separate downloads*

Tabular data:
https://globaldatalab.org/mygdl/downloads/

We are using the SHDI V4 data in this analysis. Version history is [here](https://globaldatalab.org/shdi/archive/).


Shapefiles:
https://globaldatalab.org/shdi/shapefiles/

We use the newest shapefile available when this project began, `GDL Shapefiles V4`. This file is NOT included in the GitHub repository and must be downloaded to replicate our data cleaning.



**Data Citation**

Smits, J., Permanyer, I. The Subnational Human Development Database. Sci Data 6, 190038 (2019). https://doi.org/10.1038/sdata.2019.38

**Corresponding paper:**

https://www.nature.com/articles/sdata201938


**Abstract**

In this paper we describe the Subnational Human Development Database. This database contains for the period 1990–2017 for 1625 regions within 161 countries the national and subnational values of the Subnational Human Development Index (SHDI), for the three dimension indices on the basis of which the SHDI is constructed – education, health and standard of living --, and for the four indicators needed to create the dimension indices -- expected years of schooling, mean years of schooling, life expectancy and gross national income per capita. The subnational values of the four indicators were computed using data from statistical offices and from the Area Database of the Global Data Lab, which contains indicators aggregated from household surveys and census datasets. Values for missing years were estimated by interpolation and extrapolation from real data. By normalizing the population-weighted averages of the indicators to their national levels in the UNDP-HDI database, values of the SHDI and its dimension indices were obtained that at national level equal their official versions of the UNDP.


**Data sources**

Three major data sources were used to create our SHDI database. We approached statistical offices, including Eurostat, the statistical office of the European Union (https://ec.europa.eu/eurostat), by email communication or visiting their websites to obtain data. We downloaded data from the Area Database of the Global Data Lab (https://www.globaldatalab.org). And we downloaded data from the HDI website of the Human Development Report Office of the United Nations Development Program (http://hdr.undp.org). In the ‘SHDI Start’ data file (Data Citation 1), for each country information is provided on the data source(s) used for the subnational values of the indicators. In this file also for each country the years for which data is available, the number of subnational regions and the population size is presented. Below we discuss the three main data sources in more detail.

**Local paths**

Tabular data:

`raw/GDL_HDI/GDL-Indicators-(2018)-data.csv`

`raw/GDL_HDI/GDL-Indices-(2018)-data.csv`

Shapefiles:

`raw/GDL_HDI/GDL_Shapefiles_V4`

## Read in shape files

In [4]:
# Raw GDL downloads are read from `directory`; cleaned outputs are written to `out_directory`.
directory = data_dir + "/raw/GDL_HDI/"
out_directory = data_dir + "/int/GDL_HDI/"

# Create the output folder up front: `to_pickle` does not create missing parent
# directories, so the later write cells would fail on a fresh checkout.
os.makedirs(out_directory, exist_ok=True)

## This file MUST be downloaded from the link above, unzipped, and placed in the correct subdirectory
shapefile_path = directory+"GDL_Shapefiles_V4/GDL_Shapefiles_V4.shp"
if not os.path.exists(shapefile_path):
    # The shapefile is not shipped with the repository, so say how to obtain it
    # rather than surfacing a low-level reader error.
    raise FileNotFoundError(
        "GDL shapefile not found at " + shapefile_path + ". Download "
        "'GDL Shapefiles V4' from https://globaldatalab.org/shdi/shapefiles/ "
        "and unzip it into " + directory
    )
gpdf = gpd.read_file(shapefile_path)

In [5]:
# Number of rows read from the shapefile.
len(gpdf)

1745

In [6]:
# Patch the iso_code of two national records and discard rows without a GDLcode.
iso_code_fixes = {
    "BHRt": "BHR",  # Fix weird anomaly in shapefile
    "MLTt": "MLT",  # Fix weird anomaly in shapefile
}

# Index by GDLcode temporarily so each record can be addressed by its code.
gpdf = gpdf.set_index("GDLcode")
for gdl_code, iso_code in iso_code_fixes.items():
    gpdf.loc[gdl_code, "iso_code"] = iso_code

# Restore GDLcode as a regular column, then drop the weird NAs in that column.
gpdf = gpdf.reset_index().dropna(subset=["GDLcode"])

### Let's make and save a country aggregated version of this shapefile -- it will be useful later

In [7]:
#gpdf_country = gpdf.dissolve("iso_code")

In [8]:
#gpdf_country.to_pickle(out_directory + "/HDI_ADM0_dissolved_shapefile.p")

## Read and clean data files

The data includes a set of indicator variables and a set of indices. First we will merge these dataframes.

The data also come with national-level rows (where `Region` is "Total"). We drop these when we have subnational data for a country.

[If we decide to update with newer data, use code that is commented out in this markdown cell]

<!-- # data = pd.read_csv(directory + "SHDI-SGDI-Total 5.0.csv", low_memory=False)
# data = data[data["year"] == 2019].copy()
# rename_dictionary = {"shdi" : "Sub-national HDI",
#                     "msch": "Mean years schooling",
#                     "esch":"Expected years schooling",
#                     "lifexp":"Life expectancy",
#                     "gnic": "GNI per capita in thousands of US$ (2011 PPP)" }

# data.rename(columns = rename_dictionary, inplace=True)

# unneeded_cols = ['sgdi', 'shdif', 'shdim',
#        'healthindex', 'healthindexf', 'healthindexm', 'incindex', 'incindexf',
#        'incindexm', 'edindex', 'edindexf', 'edindexm', 'eschf',
#        'eschm', 'mschf', 'mschm', 'gnicf',
#        'gnicm',  'mfsel', "lgnic", "lgnicf", "lgnicm", "lifexpf", "lifexpm"]

# data.drop(unneeded_cols, inplace=True)
 -->

In [9]:
# Load the two GDL tabular downloads (indicators first, then indices) from the raw data folder.
indicators, indices = (
    pd.read_csv(directory + csv_name)
    for csv_name in ('GDL-Indicators-(2018)-data.csv', 'GDL-Indices-(2018)-data.csv')
)

In [10]:
# Merge indices and indicators so we have both.
# The join below is purely positional: the leading identifier columns of
# `indices` are discarded and the remaining columns are pasted next to
# `indicators` row by row. Check that those identifier columns really describe
# the same rows in the same order, otherwise values would be silently
# attached to the wrong region.
n_id_cols = 5
indicator_ids = pd.DataFrame(indicators.iloc[:, :n_id_cols].values)
index_ids = pd.DataFrame(indices.iloc[:, :n_id_cols].values)
if not indicator_ids.equals(index_ids):
    raise ValueError(
        "The first %d identifier columns of the indicators and indices files "
        "do not match row by row; they cannot be concatenated positionally."
        % n_id_cols
    )

data = pd.concat([indicators, indices.iloc[:, n_id_cols:]], axis=1)

In [11]:
## Countries represented by a single row only have national data; stack those
## rows on top of the subnational rows of every other country.
rows_by_country = data.groupby("ISO_Code")

# True for ISO codes that appear exactly once in the table.
national_data_only_indices = rows_by_country["Country"].count() == 1
national_data_only = (
    rows_by_country.first()
    .loc[national_data_only_indices]
    .reset_index()
)

# Every row that is not a country-wide "Total" entry.
is_subnational = data["Region"] != "Total"
subnational_data_only = data[is_subnational]

df = pd.concat([national_data_only, subnational_data_only])



### Let's inspect the set of countries that do not have subnational province observations

In [12]:
# Uncomment the next line to display every row of the table instead of a truncated preview.
#pd.set_option('display.max_rows', None)
print("Countries that do not have ADM1 child regions:")
# Rows for the countries that have a single entry in `data`.
national_data_only

Countries that do not have ADM1 child regions:


Unnamed: 0,ISO_Code,Country,Level,GDLCODE,Region,Life expectancy,GNI per capita in thousands of US$ (2011 PPP),Expected years schooling,Mean years schooling,Population size in millions,Sub-national HDI,Health index,Income index,Educational index
0,AND,Andorra,National,ANDt,Total,81.79,48.64,13.3,10.2,0.08,0.857,0.951,0.935,0.708
1,ARE,United Arab Emirates,National,AREt,Total,77.81,66.91,13.6,11.0,9.63,0.866,0.889,0.983,0.744
2,ATG,Antigua and Barbuda,National,ATGt,Total,76.89,22.2,12.5,9.26,0.1,0.776,0.875,0.816,0.655
3,BHR,Bahrain,National,BHRt,Total,77.16,40.4,15.3,9.41,1.57,0.838,0.879,0.907,0.738
4,BHS,Bahamas,National,BHSt,Total,73.75,28.4,12.8,11.5,0.39,0.805,0.827,0.853,0.74
5,BRN,Brunei Darussalam,National,BRNt,Total,75.72,76.39,14.4,9.1,0.43,0.845,0.857,1.0,0.703
6,CYP,Cyprus,National,CYPt,Total,80.83,33.1,14.7,12.1,1.19,0.873,0.936,0.876,0.811
7,DMA,Dominica,National,DMAt,Total,78.12,9.245,13.0,7.8,0.07,0.724,0.894,0.684,0.62
8,FSM,Micronesia (Federated States of),National,FSMt,Total,67.76,3.7,11.6,7.72,0.11,0.614,0.735,0.545,0.578
9,GRD,Grenada,National,GRDt,Total,72.38,12.68,16.6,8.8,0.11,0.763,0.806,0.732,0.754


These are all very small countries and this appears to be reasonable.

### The shapefile is not a perfect match for the tabular data

Let's analyze what is missing


#### First, let's inspect the set of countries that cannot be linked to a shapefile primary key

In [13]:
# National-only rows whose GDLCODE has no counterpart among the shapefile's GDLcode values.
nat_code_in_shapefile = national_data_only["GDLCODE"].isin(gpdf["GDLcode"])
nats_dropped = national_data_only[~nat_code_in_shapefile]
nats_dropped

Unnamed: 0,ISO_Code,Country,Level,GDLCODE,Region,Life expectancy,GNI per capita in thousands of US$ (2011 PPP),Expected years schooling,Mean years schooling,Population size in millions,Sub-national HDI,Health index,Income index,Educational index
12,KIR,Kiribati,National,KIRt,Total,68.12,3.917,11.8,7.87,0.12,0.623,0.74,0.554,0.59
13,KNA,Saint Kitts and Nevis,National,KNA,Total,74.56,26.77,13.6,8.5,0.05,0.776,0.839,0.844,0.661
14,LCA,Saint Lucia,National,LCAt,Total,76.06,11.53,13.9,8.49,0.18,0.745,0.862,0.717,0.668


These are very small countries. Excluding these from our analysis seems reasonable.

#### Second, let's inspect the set of ADM1 observations that cannot be linked to a shapefile primary key

In [14]:
# Subnational rows whose GDLCODE has no counterpart among the shapefile's GDLcode values.
subnat_code_in_shapefile = subnational_data_only["GDLCODE"].isin(gpdf["GDLcode"])
subnats_dropped = subnational_data_only[~subnat_code_in_shapefile]
subnats_dropped

Unnamed: 0,Country,ISO_Code,Level,GDLCODE,Region,Life expectancy,GNI per capita in thousands of US$ (2011 PPP),Expected years schooling,Mean years schooling,Population size in millions,Sub-national HDI,Health index,Income index,Educational index
374,China,CHN,Subnat,CHNr133,Taiwan,81.76,44.44,16.7,9.48,2.79,0.88,0.95,0.921,0.779
494,Czech Republic,CZE,Subnat,CZEr101,Praha,80.7,66.64,21.0,14.0,1.29,0.961,0.934,0.982,0.967
588,Fiji,FJI,Subnat,FJIr108,Ba,66.02,12.1,14.0,10.9,0.24,0.728,0.708,0.724,0.753
589,Fiji,FJI,Subnat,FJIr106,"Cakaudrove, Bua",68.71,4.335,13.3,9.93,0.07,0.668,0.749,0.569,0.7
590,Fiji,FJI,Subnat,FJIr105,"Kadavu, Lau, Lomaiviti, Rotuma",67.37,4.209,14.0,10.1,0.04,0.669,0.729,0.565,0.726
591,Fiji,FJI,Subnat,FJIr107,Macuata,67.1,7.913,14.3,10.2,0.08,0.706,0.725,0.66,0.736
592,Fiji,FJI,Subnat,FJIr109,Nadroga or Navosa,68.01,7.139,13.4,10.6,0.06,0.701,0.739,0.645,0.724
593,Fiji,FJI,Subnat,FJIr101,Naitasiri,67.8,11.5,15.1,11.4,0.17,0.749,0.735,0.717,0.797
594,Fiji,FJI,Subnat,FJIr110,Ra,66.62,5.037,13.5,10.2,0.03,0.672,0.717,0.592,0.716
595,Fiji,FJI,Subnat,FJIr102,Rewa,68.26,12.52,15.8,11.7,0.11,0.765,0.742,0.73,0.828


Dropping these 43 subnational observations is unfortunate, but without a way to match them to an administrative polygon there is little we can do.

### Now let's see if there is any data in the shapefile that is missing from the tabular data

In [15]:
# Show the shapefile rows whose GDLcode is absent from the combined tabular data.
print("Shape file obs that don't match tabular data")

shape_code_in_table = gpdf["GDLcode"].isin(df["GDLCODE"])
gpdf[~shape_code_in_table]

Shape file obs that don't match tabular data


Unnamed: 0,GDLcode,constant,iso_code,country,region,shdi,geometry
1069,MLTt,World,MLT,,,,"MULTIPOLYGON (((14.41097 35.78847, 14.41097 35..."
1146,,World,,,,,"MULTIPOLYGON (((169.12677 -52.61396, 169.12759..."
1487,SYRr101,World,SYR,Syria,Damascus,0.702796,"POLYGON ((36.40147 33.56197, 36.40089 33.51022..."
1488,SYRr102,World,SYR,Syria,Rural Damascus,0.656747,"MULTIPOLYGON (((36.60928 33.18481, 36.60882 33..."
1489,SYRr103,World,SYR,Syria,Homs,0.664678,"POLYGON ((39.96092 33.93422, 39.90065 33.90559..."
1490,SYRr104,World,SYR,Syria,Hamaa,0.649091,"POLYGON ((36.30488 34.91472, 36.29511 34.91587..."
1491,SYRr105,World,SYR,Syria,Tartous,0.684428,"MULTIPOLYGON (((35.85986 34.85950, 35.85986 34..."
1492,SYRr106,World,SYR,Syria,Al Latakia,0.688736,"POLYGON ((35.94403 35.23540, 35.94373 35.23547..."
1493,SYRr107,World,SYR,Syria,Edlab Idleb,0.613406,"POLYGON ((37.09616 35.28223, 37.08421 35.28244..."
1494,SYRr108,World,SYR,Syria,Halab - Aleppo,0.609183,"POLYGON ((37.77172 35.42765, 37.75378 35.42831..."


This is just two countries. Less than ideal, but we can move on without them. The NA shape is a strange anomaly. It appears to contain small islands as well as North Korea and Taiwan.

In [16]:
# Total number of tabular rows (national plus subnational) that have no matching shape.
n_dropped = sum(len(unmatched) for unmatched in (nats_dropped, subnats_dropped))

## Let's go ahead and subset both of these files to the matching set of indices

In [17]:
# Key both tables by their GDL region code so they can be aligned on it.
df = df.set_index("GDLCODE")
gpdf = gpdf.set_index("GDLcode")

In [18]:
# Region codes from the tabular data that also exist in the shapefile index.
code_has_shape = df.index.isin(gpdf.index)
matching_locs = df.index[code_has_shape]

In [19]:
# Keep only the shared region codes, in the same order, in both tables.
df, gpdf = df.loc[matching_locs], gpdf.loc[matching_locs]

In [20]:
# Persist the aligned tabular data and shapes as pickles in the output folder.
for frame, file_name in (
    (df, "/HDI_indicators_and_indices_clean.p"),
    (gpdf, "/HDI_ADM1_shapefile_clean.p"),
):
    frame.to_pickle(out_directory + file_name)

#### Also write the national level data

In [21]:
# Select the rows labelled "National", key them by ISO code and save them.
is_national = data["Level"] == "National"
nat_data = data.loc[is_national].set_index("ISO_Code")
nat_data.to_pickle(out_directory + "/HDI_indicators_and_indices_adm0_clean.p")

In [23]:
# Number of GDL codes present in both the tabular data and the shapefile.
len(matching_locs)

1729

In [24]:
# Share of tabular observations that could not be matched to a shape, as a
# percentage with one decimal. Round the percentage itself: multiplying an
# already-rounded fraction by 100 can print float artefacts
# (e.g. 0.07 * 100 == 7.000000000000001).
pct_dropped = round(100 * n_dropped / (len(matching_locs) + n_dropped), 1)
print(pct_dropped, "% of HDI data dropped")

2.6 % of HDI data dropped


##  Transform shapefile to .01 x .01 degree grid

This is the form needed for aggregating features in the existing pipeline

In [None]:
#dense_grid = get_dense_grid_for_gpdf_file(gpdf.reset_index(), columns=["GDLCODE", "iso_code","country"])

In [None]:
#dense_grid.head()

In [None]:
#outpath = data_dir + "/features/prepared_labels/GDL_HDI_polygon_coords_for_featurization.p"
#dense_grid["constant"] = 1
#dense_grid.to_pickle(outpath)
#dense_grid = pd.read_pickle(outpath)

In [None]:
#dense_grid.head()

Check to see if any polygon observations were dropped. This would occur if they are very small and don't overlay any grid centroids.

In [None]:
#len(dense_grid["GDLCODE"].unique()) != len(matching_locs)

None are dropped in this procedure