<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Preliminaries

In [1]:
# Required standard libraries
import pandas as pd
import json
import urllib
import requests
import os
import re
import numpy as np
import bs4 as bs
import selenium
import html5lib
import nltk
from selenium import webdriver

# Extractors (cluster specific)
import extract
# from extract.unesco_extractor import extract_unesco_api_data
# from extract.ilo_extractor import extract_ilo_api_data
# from extract.sdg_extractor import extract_sdg_api_data
# from extract.who_extractor import extract_who_api_data
# from extract.un_treaty_extractor import extract_un_treaties_data
# from extract.ilo_normlex_extractor import extract_ilo_normlex_data

# from extract import save_raw_data

# Cleansers (cluster specific)
import cleanse
# from cleanse.unesco_cleanser import cleanse_unesco_api_data
# from cleanse.ilo_cleanser import cleanse_ilo_api_data
# from cleanse.sdg_cleanser import cleanse_sdg_api_data
# from cleanse.who_cleanser import cleanse_who_api_num_data
# from cleanse.un_treaty_cleanser import cleanse_un_treaty_data
# from cleanse.wpac_cleanser import cleanse_wpac_data

# from cleanse.save_cleansed_data import save_cleansed_data 

# Normalizer (generalised across all clusters)
from normalize import scaler
# from normalize import save_normalized_data

# Utils
from utils import utils

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Filesystem layout shared by every import and export step of this notebook
from pathlib import Path

# Current working directory
cwd = Path('.')

# Folder with the data-in artifacts required to run this notebook
data_in = cwd / 'data_in'

# Export folders, one per pipeline stage: raw -> cleansed -> normalized
data_sources_raw = cwd.joinpath('data_out', 'data_raw')
data_sources_cleansed = cwd.joinpath('data_out', 'data_cleansed')
data_sources_normalized = cwd.joinpath('data_out', 'data_normalized')

# Create the export folders up front; existing folders are left as they are
data_sources_raw.mkdir(parents=True, exist_ok=True)
data_sources_cleansed.mkdir(parents=True, exist_ok=True)
data_sources_normalized.mkdir(parents=True, exist_ok=True)

In [3]:
# Every known spelling variant of the country names, exact duplicate rows removed.
# keep_default_na=False keeps cell text as-is instead of converting NA-like
# strings to NaN (presumably needed for the ISO2 code "NA" - TODO confirm).
country_full_list = pd.read_excel(
    data_in / 'all_countrynames_list.xlsx', keep_default_na=False
).drop_duplicates()

# One row per ISO2 code: the first row found for each code is kept
country_iso_list = country_full_list.drop_duplicates(subset='CountryIso2')

# CRBA country list: the countries that should be in the final CRBA indicator
# list, enriched with the ISO2 code looked up through the ISO3 code.
country_crba_list = (
    pd.read_excel(
        data_in / 'crba_country_list.xlsx',
        header=None,
        usecols=[0, 1],
        names=['COUNTRY_ISO_3', 'COUNTRY_NAME'],
    )
    # validate makes the merge fail if an ISO3 code is not unique on either side
    .merge(
        right=country_iso_list,
        how='left',
        left_on='COUNTRY_ISO_3',
        right_on='CountryIso3',
        validate='one_to_one',
    )
    .loc[:, ['COUNTRY_ISO_3', 'COUNTRY_NAME', 'CountryIso2']]
    .rename(columns={'CountryIso2': "COUNTRY_ISO_2", 'COUNTRY_NAME': "REF_AREA"})
)

## Read data dictionary

In [4]:
# Read all required sheets of the data dictionary in one pass over the workbook.
# With a list as sheet_name, read_excel returns a dict {sheet name: DataFrame}.
crba_data_dictionary_sheets = pd.read_excel(
    data_in / 'indicator_dictionary_CRBA.xlsx',
    sheet_name=["Source", "Snapshot", "Indicator", "Input_Lists"],
    keep_default_na=False,
)

# sources sheet
crba_data_dictionary_source = crba_data_dictionary_sheets["Source"]

# snapshot sheet
crba_data_dictionary_snapshot = crba_data_dictionary_sheets["Snapshot"]

# indicator sheet
crba_data_dictionary_indicator = crba_data_dictionary_sheets["Indicator"]

# Input lists
crba_data_dictionary_input_list = crba_data_dictionary_sheets["Input_Lists"]

# Add 2-digit shortcodes of index, issue and category to indicators sheet.
# Each lookup is de-duplicated first: the three lookups share one sheet, so a
# (name, code) pair can occur on several rows, and merging on repeated pairs
# would multiply the matching indicator rows.
crba_data_dictionary_indicator = crba_data_dictionary_indicator.merge(
    right=crba_data_dictionary_input_list[['INDEX', 'INDEX_CODE']].drop_duplicates(),
    on='INDEX',
).merge(
    right=crba_data_dictionary_input_list[['ISSUE', 'ISSUE_CODE']].drop_duplicates(),
    on='ISSUE',
).merge(
    right=crba_data_dictionary_input_list[['CATEGORY', 'CATEGORY_CODE']].drop_duplicates(),
    on='CATEGORY',
)

# Create indicator code prefix (INDEX_ISSUE_CATEGORY_) and the indicator code,
# which is the prefix followed by the code utils.create_ind_code derives from
# the indicator name. The second column can use the first because assign
# evaluates its keyword arguments in order.
crba_data_dictionary_indicator = crba_data_dictionary_indicator.assign(
    INDICATOR_CODE_PREFIX=lambda df: (
        df.INDEX_CODE + "_" + df.ISSUE_CODE + "_" + df.CATEGORY_CODE + "_"
    ),
    INDICATOR_CODE=lambda df: (
        df.INDICATOR_CODE_PREFIX + df.INDICATOR_NAME.apply(utils.create_ind_code)
    ),
)

In [5]:
import importlib
import inspect

# Registry of extractor classes: every class exposed by the `extract` package
# that carries a `type` attribute is stored under the value of that attribute.
# If two classes share the same `type`, the one listed later by getmembers wins.
_extract_module = importlib.import_module("extract")

extractors = {
    extractor_cls.type: extractor_cls
    for _, extractor_cls in inspect.getmembers(_extract_module, inspect.isclass)
    if hasattr(extractor_cls, 'type')
}

# Extract
## API sources
### CSV API sources

In [8]:
# CSV sources: every source whose type is one of the CSV-based APIs
api_sources = crba_data_dictionary_source[
    crba_data_dictionary_source["SOURCE_TYPE"].isin(
        ["API (ILO)", "API (UNESCO)", "API (WHO)"]
    )
]

# Loop to extract data from API sources; each result is stored as
# <SOURCE_ID>_raw.csv in the raw-data folder.
for index, row in api_sources.iterrows():
    print("\n - - - - - \n Extracting source {} \n".format(row["SOURCE_ID"]))
    try:
        dataframe = extract.CSVExtractor.extract(url = row["ENDPOINT_URL"])
        # format() also works when SOURCE_ID is not a string
        dataframe.to_csv(data_sources_raw / "{}_raw.csv".format(row["SOURCE_ID"]))
    except Exception as error:
        # Best effort: one failing source must not stop the remaining ones, but
        # the reason is reported. Catching Exception (not a bare except) keeps
        # Ctrl-C / kernel interrupts working.
        print("There was an issue with source {}: {!r}".format(row["SOURCE_ID"], error))

attributes.Units has 1 unique values.
The column dimensions.Age has 5 unique values.
The column dimensions.Sex has 3 unique values.
The column dimensions.Reporting Type has 1 unique values.

 - - - - - 
 Extracting source S-61
The following columns are present in the datasets, and this is the number of unique values they have. 
The column goal has 1 unique values.
The column target has 1 unique values.
The column indicator has 1 unique values.
The column series has 1 unique values.
The column seriesDescription has 1 unique values.
The column seriesCount has 1 unique values.
The column geoAreaCode has 142 unique values.
The column geoAreaName has 142 unique values.
The column timePeriodStart has 15 unique values.
The column value has 375 unique values.
The column valueType has 1 unique values.
The column time_detail has 1 unique values.
The column timeCoverage has 1 unique values.
The column upperBound has 1 unique values.
The column lowerBound has 1 unique values.
The column basePeriod

### JSON API sources

In [9]:
# JSON sources: every source served by the SDG API
api_sources = crba_data_dictionary_source[
    crba_data_dictionary_source["SOURCE_TYPE"] == "API (SDG)"
]

# Loop to extract data from API sources; each result is stored as
# <SOURCE_ID>_raw.csv in the raw-data folder.
for index, row in api_sources.iterrows():
    print("\n - - - - - \n Extracting source {} \n".format(row["SOURCE_ID"]))
    try:
        dataframe = extract.JSONExtractor.extract(url = row["ENDPOINT_URL"])
        # format() also works when SOURCE_ID is not a string
        dataframe.to_csv(data_sources_raw / "{}_raw.csv".format(row["SOURCE_ID"]))
    except Exception as error:
        # Best effort: one failing source must not stop the remaining ones, but
        # the reason is reported. Catching Exception (not a bare except) keeps
        # Ctrl-C / kernel interrupts working.
        print("There was an issue with source {}: {!r}".format(row["SOURCE_ID"], error))

# TODO: Also include JSON and HTML as extractor --> No other way to put it into the loop than eval()?

e column dimensions.Age has 5 unique values.
The column dimensions.Sex has 3 unique values.
The column dimensions.Reporting Type has 1 unique values.

 - - - - - 
 Extracting source S-61 

The following columns are present in the datasets, and this is the number of unique values they have. 
The column goal has 1 unique values.
The column target has 1 unique values.
The column indicator has 1 unique values.
The column series has 1 unique values.
The column seriesDescription has 1 unique values.
The column seriesCount has 1 unique values.
The column geoAreaCode has 142 unique values.
The column geoAreaName has 142 unique values.
The column timePeriodStart has 15 unique values.
The column value has 375 unique values.
The column valueType has 1 unique values.
The column time_detail has 1 unique values.
The column timeCoverage has 1 unique values.
The column upperBound has 1 unique values.
The column lowerBound has 1 unique values.
The column basePeriod has 1 unique values.
The column sourc

In [5]:
# Extract data for source S-55 from the UNESCO SDMX API.
# The subscription key is a credential and must not be stored in the notebook:
# it is read from the UNESCO_API_KEY environment variable instead.
# NOTE(review): the key that used to be hardcoded here is in the file history
# and should be considered leaked - rotate it.
unesco_subscription_key = os.environ.get("UNESCO_API_KEY")
if not unesco_subscription_key:
    raise RuntimeError(
        "Set the UNESCO_API_KEY environment variable to your UNESCO API "
        "subscription key before running this cell."
    )

s55_raw = extract.CSVExtractor.extract(
    url=(
        'https://api.uis.unesco.org/sdmx/data/UNESCO,SDG4,2.0/'
        'ROFST.PT.L2+L2_3+L3._T._T+F+M.SCH_AGE_GROUP._T.INST_T._Z._T._Z._Z._Z._T._T._Z._Z._Z.'
        '?startPeriod=2005&endPeriod=2018&format=csv-sdmx&locale=en'
        '&subscription-key=' + unesco_subscription_key
    )
)

# Saving the raw extract is currently disabled:
# s55_raw.to_csv(data_sources_raw / "S_55_raw.csv")

The following columns are present in the datasets, and this is the number of unique values they have. 
The column Dataflow has 1 unique values.
The column STAT_UNIT has 1 unique values.
The column UNIT_MEASURE has 1 unique values.
The column EDU_LEVEL has 3 unique values.
The column EDU_CAT has 1 unique values.
The column SEX has 3 unique values.
The column AGE has 1 unique values.
The column GRADE has 1 unique values.
The column SECTOR_EDU has 1 unique values.
The column EDU_ATTAIN has 1 unique values.
The column SUBJECT has 1 unique values.
The column WEALTH_QUINTILE has 1 unique values.
The column INFRASTR has 1 unique values.
The column LOCATION has 1 unique values.
The column EDU_TYPE has 1 unique values.
The column SE_BKGRD has 1 unique values.
The column SOURCE_FUND has 1 unique values.
The column FUND_FLOW has 1 unique values.
The column IMM_STATUS has 1 unique values.
The column REF_AREA has 326 unique values.
The column TIME_PERIOD has 14 unique values.
The column OBS_VALUE h

# Cleansing


**Stopped here.** The code below runs, but `mapping_dict` has to be defined first. Next step: bring this into a loop and take care of the exceptions.

In [6]:
import cleanse
import pandas as pd

# Raw extract of source S-102, read back from the raw-data folder
s102_raw = pd.read_csv(data_sources_raw / "S-102_raw.csv")

# NOTE(review): mapping_dict is only built in a later cell, and the
# sdmx_df_columns_* lists are not defined in the visible part of this notebook.
# On a fresh kernel this cell fails unless they are defined first.
s102_cleanser = cleanse.Cleanser()
s102_cleansed = s102_cleanser.cleanse(
    raw_data=s102_raw,
    crba_country_list=country_crba_list,
    mapping_dictionary=mapping_dict,
    final_sdmx_col_list=sdmx_df_columns_all,
    dim_cols=sdmx_df_columns_dims,
    country_cols=sdmx_df_columns_country,
    time_cols=sdmx_df_columns_time,
)

# Display the cleansed frame as the cell output
s102_cleansed

['TIME_PERIOD']
Index(['Unnamed: 0', 'DIM_SDG_GOAL', 'target', 'indicator', 'series',
       'ATTR_SDG_INDICATOR_DESCRIPTION', 'seriesCount', 'geoAreaCode',
       'REF_AREA', 'TIME_PERIOD', 'RAW_OBS_VALUE', 'valueType', 'time_detail',
       'timeCoverage', 'upperBound', 'lowerBound', 'basePeriod',
       'ATTR_SOURCE_OF_SOURCE', 'geoInfoUrl', 'ATTR_FOOTNOTE_OF_SOURCE',
       'attributes.Nature', 'ATTR_UNIT_MEASURE', 'DIM_AGE', 'DIM_REP_TYPE'],
      dtype='object')
['TIME_PERIOD']
    DIM_SDG_GOAL                     ATTR_SDG_INDICATOR_DESCRIPTION  \
0          ['2']  Proportion of children moderately or severely ...   
1          ['2']  Proportion of children moderately or severely ...   
2          ['2']  Proportion of children moderately or severely ...   
3          ['2']  Proportion of children moderately or severely ...   
4          ['2']  Proportion of children moderately or severely ...   
..           ...                                                ...   
652        ['2

Unnamed: 0,DIM_SDG_GOAL,ATTR_SDG_INDICATOR_DESCRIPTION,REF_AREA,TIME_PERIOD,RAW_OBS_VALUE,ATTR_SOURCE_OF_SOURCE,ATTR_FOOTNOTE_OF_SOURCE,ATTR_UNIT_MEASURE,DIM_AGE,DIM_REP_TYPE,_merge
0,['2'],Proportion of children moderately or severely ...,Afghanistan,2018.0,4.1,Afghanistan Health Survey 2018,[''],PER_POP_U5,<5Y,G,both
1,['2'],Proportion of children moderately or severely ...,Albania,2017.0,16.4,Albania Demographic and Health Survey 2017-18....,[''],PER_POP_U5,<5Y,G,both
2,_T,,Andorra,,,,,,_T,_T,right_only
3,['2'],Proportion of children moderately or severely ...,Algeria,2012.0,12.4,République Algérienne Démocratiqe et Populaire...,[''],PER_POP_U5,<5Y,G,both
4,['2'],Proportion of children moderately or severely ...,Angola,2015.0,3.4,Inquérito de Indicadores Múltiplos e de Saúde ...,[''],PER_POP_U5,<5Y,G,both
...,...,...,...,...,...,...,...,...,...,...,...
190,_T,,Venezuela,,,,,,_T,_T,right_only
191,_T,,Vietnam,,,,,,_T,_T,right_only
192,['2'],Proportion of children moderately or severely ...,Yemen,2013.0,2.5,Yemen National Health and Demographic Survey 2...,[''],PER_POP_U5,<5Y,G,both
193,['2'],Proportion of children moderately or severely ...,Zambia,2018.0,5.2,Zambia Demographic and Health Survey 2018: Key...,[''],PER_POP_U5,<5Y,G,both


In [13]:
# TODO: load this mapping from utils.mapping_dictionary instead of defining it
# inline (the earlier, commented-out attempts pointed at that module).

# Each *_tuple lists raw column names; the matching *_mapper maps every one of
# them to a single target column name (presumably the harmonised name the
# cleanser renames to - verify against cleanse.Cleanser).
country_tuple = ("REF_AREA", "COUNTRY")
country_mapper = {key: "REF_AREA" for key in country_tuple}

year_tuple = ("TIME_PERIOD", "YEAR")
year_mapper = {key: "TIME_PERIOD" for key in year_tuple}

obs_value_tuple = ("OBS_VALUE", "Display Value")
obs_value_mapper = {key: "OBS_VALUE" for key in obs_value_tuple}

# One-element tuple (note the trailing comma): a bare string would be iterated
# character by character. The mapper is built from dim_sex_tuple; it used to be
# a copy of obs_value_mapper, which left "SEX" unmapped.
# NOTE(review): target name "DIM_SEX" follows the DIM_* column naming seen in
# the cleansed output (DIM_AGE, DIM_REP_TYPE) - confirm.
dim_sex_tuple = ("SEX",)
dim_sex_mapper = {key: "DIM_SEX" for key in dim_sex_tuple}

# Not mapped yet: education and age dimensions (dim_edu_tuple, dim_age_tuple)

# Create list of all mapper dictionaries
mapper_tuple_list = [country_mapper, year_mapper, obs_value_mapper, dim_sex_mapper]

# Define the mapping dictionary as the union of all mappers
mapping_dict = {}

for mapper_tuple in mapper_tuple_list:
    mapping_dict.update(mapper_tuple)

# Persist the mapping next to the notebook
with open("mapping_dict.json", "w") as fp:
    json.dump(mapping_dict, fp)

In [21]:
# Scratch check: dict.update() merges the entries of a second mapper into the
# first one. Uses country_tuple from the mapping-dictionary cell.
year_tuple = (
    "TIME_PERIOD",
    "YEAR",
)

x = {key: "xxx" for key in country_tuple}
y = {key: "yyy" for key in year_tuple}

x.update(y)
x

{'REF_AREA': 'xxx', 'COUNTRY': 'xxx', 'TIME_PERIOD': 'yyy', 'YEAR': 'yyy'}

In [8]:
# Cleanse the raw S-55 data.
# NOTE(review): cleanse is called on the class here with different keyword
# arguments than the S-102 cell, which calls cleanse.Cleanser().cleanse(...)
# with raw_data / mapping_dictionary / ... - confirm which signature is current.
s55_cleansed = cleanse.Cleanser.cleanse(
    raw_data = s55_raw,
    raw_data_iso_2_col = 'REF_AREA',
    country_df = country_crba_list,
    country_df_iso2_col = 'COUNTRY_ISO_2',
    non_dim_cols = ['OBS_VALUE', 'TIME_PERIOD', 'OBS_STATUS']
)

# Cleansed output goes into the cleansed-data folder; it used to be written
# into the raw-data folder next to the raw extracts.
s55_cleansed.to_csv(data_sources_cleansed / "S_55_cleansed.csv")

In [9]:
from normalize.scaler import normalizer

# Columns passed as non_dim_cols, i.e. not to be regarded as dimensions
s55_non_dim_cols = [
    'TIME_PERIOD',
    'REF_AREA',
    'OBS_VALUE',
    'OBS_STATUS',
    'COUNTRY_ISO_3',
    'COUNTRY_NAME',
    'COUNTRY_ISO_2',
    '_merge',
]

s55_normalized = normalizer(
    # Input data and the column holding the raw indicator value
    cleansed_data=s55_cleansed,
    indicator_raw_value='OBS_VALUE',
    cleansed_df_iso2_col='REF_AREA',
    # Indicator metadata
    indicator_code='WP_DW_OC_FREASS',
    indicator_name='Out-of-school adolescents (lower secondary)',
    indicator_index='Workplace',
    indicator_issue='Decent working conditions',
    indicator_category='Outcome',
    # Final CRBA country list and its ISO2 column
    crba_final_country_list=country_crba_list,
    crba_final_country_list_iso_col='COUNTRY_ISO_2',
    inverted=True,
    non_dim_cols=s55_non_dim_cols,
)

# Display the normalized frame as the cell output
s55_normalized

You have a selected a few columns, which will not be regarded as dimensions.These are the remaining columns in the dataset, along with the number of values they take in the dataset.
The column Dataflow has 1 unique values.
The column STAT_UNIT has 1 unique values.
The column UNIT_MEASURE has 1 unique values.
The column EDU_LEVEL has 3 unique values.
The column EDU_CAT has 1 unique values.
The column SEX has 3 unique values.
The column AGE has 1 unique values.
The column GRADE has 1 unique values.
The column SECTOR_EDU has 1 unique values.
The column EDU_ATTAIN has 1 unique values.
The column SUBJECT has 1 unique values.
The column WEALTH_QUINTILE has 1 unique values.
The column INFRASTR has 1 unique values.
The column LOCATION has 1 unique values.
The column EDU_TYPE has 1 unique values.
The column SE_BKGRD has 1 unique values.
The column SOURCE_FUND has 1 unique values.
The column FUND_FLOW has 1 unique values.
The column IMM_STATUS has 1 unique values.
The column UNIT_MULT has 1 uniq

Unnamed: 0,Dataflow,STAT_UNIT,UNIT_MEASURE,EDU_LEVEL,EDU_CAT,SEX,AGE,GRADE,SECTOR_EDU,EDU_ATTAIN,...,COUNTRY_ISO_3_y,COUNTRY_NAME_y,COUNTRY_ISO_2_y,RJ_CRBA_FULL_LIST,INDICATOR_NAME,INDICATOR_INDEX,INDICATOR_ISSUE,INDICATOR_CATEGORY,CRBA_RELEASE_YEAR,INDICATOR_CODE
0,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,M,SCH_AGE_GROUP,_T,INST_T,_Z,...,AFG,Afghanistan,AF,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
1,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,M,SCH_AGE_GROUP,_T,INST_T,_Z,...,ALB,Albania,AL,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
2,,,,,,,,,,,...,AND,Andorra,AD,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
3,,,,,,,,,,,...,DZA,Algeria,DZ,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
4,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,M,SCH_AGE_GROUP,_T,INST_T,_Z,...,AGO,Angola,AO,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,_T,SCH_AGE_GROUP,_T,INST_T,_Z,...,VEN,Venezuela,VE,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
191,,,,,,,,,,,...,VNM,Vietnam,VN,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
192,UNESCO:SDG4(2.0),ROFST,PT,L3,_T,_T,SCH_AGE_GROUP,_T,INST_T,_Z,...,YEM,Yemen,YE,both,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
193,,,,,,,,,,,...,ZMB,Zambia,ZM,right_only,Out-of-school adolescents (lower secondary),Workplace,Decent working conditions,Outcome,2020,WP_DW_OC_FREASS
