In [1]:
import numpy as np
import pandas as pd

In [2]:
# Source of the spreadsheet and background reading:
#   https://hdr.undp.org/en/content/download-data
#   https://hdr.undp.org/en
#
# The workbook is read from the current working directory; everything below
# derives from this frame.
hdi_data_raw = pd.read_excel(io='hdi_data.xlsx')

In [244]:
# Round each indicator to the number of decimals used on the HDI data sheet.
# The columns are overwritten in place on hdi_data_raw; rounding is idempotent,
# so re-running this cell leaves the frame unchanged.
decimals_by_column = {
    'life_expectancy_at_birth': 1,
    'expected_years_of_schooling': 1,
    'mean_years_of_schooling': 1,
    'gross_national_income_per_capita': 0,
}

for column_name, decimals in decimals_by_column.items():
    hdi_data_raw[column_name] = hdi_data_raw[column_name].round(decimals)

In [245]:
# Preview the first ten rows after rounding.
hdi_data_raw.head(n=10)

Unnamed: 0,country,human_development_index_value,life_expectancy_at_birth,expected_years_of_schooling,mean_years_of_schooling,gross_national_income_per_capita,development
0,Norway,0.957,82.4,18.1,12.9,66494.0,very_high_development
1,Ireland,0.955,82.3,18.7,12.7,68371.0,very_high_development
2,Switzerland,0.955,83.8,16.3,13.4,69394.0,very_high_development
3,"Hong Kong, China (SAR)",0.949,84.9,16.9,12.3,62985.0,very_high_development
4,Iceland,0.949,83.0,19.1,12.8,54682.0,very_high_development
5,Germany,0.947,81.3,17.0,14.2,55314.0,very_high_development
6,Sweden,0.945,82.8,19.5,12.5,54508.0,very_high_development
7,Australia,0.944,83.4,22.0,12.7,48085.0,very_high_development
8,Netherlands,0.944,82.3,18.5,12.4,57707.0,very_high_development
9,Denmark,0.94,80.9,18.9,12.6,58662.0,very_high_development


In [246]:
# Keep only the four raw indicators that feed the dimension indices.
hdi_condensed = hdi_data_raw.loc[:, [
    'life_expectancy_at_birth',
    'expected_years_of_schooling',
    'mean_years_of_schooling',
    'gross_national_income_per_capita',
]]

In [247]:
# Dimension indices. Each one is (value - minimum) / (maximum - minimum) and is
# capped at 1, so a country that exceeds the maximum cannot score above 1.
# Minimum / maximum values used below:
#   life_expectancy_at_birth            20 .. 85
#   expected_years_of_schooling          0 .. 18
#   mean_years_of_schooling              0 .. 15
#   gross_national_income_per_capita   100 .. 75000 (compared on a natural-log scale)

# Life expectancy index.
life_expectancy_index_col = ((hdi_condensed['life_expectancy_at_birth'] - 20) / (85 - 20)).clip(upper=1)

# Education index: each schooling component is scaled by its own maximum and
# capped at 1 *before* the arithmetic mean of the two is taken.
education_index_col = (
    (hdi_condensed[['expected_years_of_schooling', 'mean_years_of_schooling']] / np.array([18, 15]))
    .clip(upper=1)
    .mean(axis=1)
)

# Income index: log-scaled position of GNI between 100 and 75000.
# The cap applies to the index itself, so it must be 1. Capping at 75000 (a GNI
# value, not an index value) never triggers and lets any country whose GNI
# exceeds 75000 receive an index greater than 1.
income_index_col = (
    (np.log(hdi_condensed['gross_national_income_per_capita']) - np.log(100))
    / (np.log(75000) - np.log(100))
).clip(upper=1)

In [248]:
# Per-country table: identifying columns first, then the three dimension
# indices, then the development tier. The edge-building cell relies on this
# column order (it slices columns positionally), so keep it unchanged.
formatted_df = hdi_data_raw[['country', 'human_development_index_value']].assign(
    life_expectancy_index=life_expectancy_index_col,
    education_index=education_index_col,
    income_index=income_index_col,
    development=hdi_data_raw['development'],
)

In [249]:
# Preview the first ten rows of the index table.
formatted_df.head(n=10)

Unnamed: 0,country,human_development_index_value,life_expectancy_index,education_index,income_index,development
0,Norway,0.957,0.96,0.93,0.981816,very_high_development
1,Ireland,0.955,0.958462,0.923333,0.986021,very_high_development
2,Switzerland,0.955,0.981538,0.899444,0.988265,very_high_development
3,"Hong Kong, China (SAR)",0.949,0.998462,0.879444,0.973627,very_high_development
4,Iceland,0.949,0.969231,0.926667,0.952273,very_high_development
5,Germany,0.947,0.943077,0.945556,0.954009,very_high_development
6,Sweden,0.945,0.966154,0.916667,0.951792,very_high_development
7,Australia,0.944,0.975385,0.923333,0.932853,very_high_development
8,Netherlands,0.944,0.958462,0.913333,0.960407,very_high_development
9,Denmark,0.94,0.936923,0.92,0.962886,very_high_development


In [250]:
# Flatten the wide per-country table into one edge per (country, index) pair.
# The [2:-1] slice skips the first two columns and the last column of
# formatted_df, so only the columns in between become edge targets.
index_names = list(formatted_df.columns[2:-1])

# One record per edge, in row-major order:
# (country, development tier, index name, index value).
edge_records = [
    (country_row['country'], country_row['development'], index_name, country_row[index_name])
    for _, country_row in formatted_df.iterrows()
    for index_name in index_names
]

# Project the records into the parallel lists consumed by the next cell.
# The country name serves as both source id and source name, and the index
# name serves as both target id and target name.
source_id_list = [country for country, _, _, _ in edge_records]
source_name_list = [country for country, _, _, _ in edge_records]
source_type_list = [development for _, development, _, _ in edge_records]
target_id_list = [index_name for _, _, index_name, _ in edge_records]
target_name_list = [index_name for _, _, index_name, _ in edge_records]
target_type_list = ['target' for _ in edge_records]
edge_value_list = [value for _, _, _, value in edge_records]

In [251]:
# Column name -> values for the edge table; insertion order here is the column
# order of the resulting DataFrame.
data = dict(
    source_id=source_id_list,
    source_name=source_name_list,
    source_type=source_type_list,
    target_id=target_id_list,
    target_name=target_name_list,
    target_type=target_type_list,
    edge_value=edge_value_list,
)

In [252]:
# Build the edge table from the column lists and display it (pandas truncates
# the rendering of long frames).
formatted_edges_df = pd.DataFrame(data=data)

formatted_edges_df

Unnamed: 0,source_id,source_name,source_type,target_id,target_name,target_type,edge_value
0,Norway,Norway,very_high_development,life_expectancy_index,life_expectancy_index,target,0.960000
1,Norway,Norway,very_high_development,education_index,education_index,target,0.930000
2,Norway,Norway,very_high_development,income_index,income_index,target,0.981816
3,Ireland,Ireland,very_high_development,life_expectancy_index,life_expectancy_index,target,0.958462
4,Ireland,Ireland,very_high_development,education_index,education_index,target,0.923333
...,...,...,...,...,...,...,...
562,Central African Republic,Central African Republic,low_development,education_index,education_index,target,0.354444
563,Central African Republic,Central African Republic,low_development,income_index,income_index,target,0.346758
564,Niger,Niger,low_development,life_expectancy_index,life_expectancy_index,target,0.652308
565,Niger,Niger,low_development,education_index,education_index,target,0.250556


In [253]:
# Write the edge table to the working directory.
# NOTE(review): with the default settings the row index is written as an
# unnamed first column; confirm downstream readers expect that before
# switching to index=False.
formatted_edges_df.to_csv(path_or_buf='hdi_cv_data_formatted.csv')