In [361]:
dest_dir = "/tmp/un_sdg"

In [362]:
from owid.catalog import Dataset, Table
from owid import walden
from etl.steps.data.converters import convert_walden_metadata
import requests
import json
import pandas as pd
import numpy as np
import re

In [364]:
ds = Dataset.create_empty(dest_dir)

In [365]:
cat = walden.Catalog()

In [366]:
walden_ds = cat.find_one(short_name="un_sdg")
walden_ds

Dataset(namespace='un_sdg', short_name='un_sdg', name='United Nations Sustainable Development Goals - United Nations (2021-10)', description='The UN SDG database gives access to data on more than 210 SDG indicators for countries across the global by indicator, country, region or time period. The 2030 Agenda for Sustainable Development, adopted by all United Nations Member States in 2015, provides a shared blueprint for peace and prosperity for people and the planet, now and into the future. At its heart are the 17 Sustainable Development Goals (SDGs), which are an urgent call for action by all countries - developed and developing - in a global partnership. They recognize that ending poverty and other deprivations must go hand-in-hand with strategies that improve health and education, reduce inequality, and spur economic growth – all while tackling climate change and working to preserve our oceans and forests.', source_name='United Nations Department of Economic and Social Affairs', url

In [367]:
ds.metadata = convert_walden_metadata(walden_ds)
ds.save()

In [368]:
walden_ds.local_path

'/Users/fionaspooner/.owid/walden/un_sdg/2021-09-30/un_sdg.zip'

In [370]:
import tempfile

In [371]:
dir = tempfile.mkdtemp()
dir

'/var/folders/yc/bkg2k6pj6td2tpkj64v1yvbr0000gn/T/tmp4yf8f9c4'

In [372]:
import zipfile

In [373]:
zf = zipfile.ZipFile(walden_ds.local_path)
zf.extractall(dir)
!ls {dir}

zsh:1: command not found: ls


In [374]:
import pandas as pd
import os

### Reading in the data and removing the last row which is all NaN

In [387]:
df = pd.read_csv(os.path.join(dir, "un-sdg-2021-10.csv"), low_memory=False)
df = df.dropna(axis=0, how="all")  # last row is all NaN
df = df.drop(
    columns=[
        "Goal",
        "GeoAreaCode",
        "Target",
        "TimePeriod",
        "UpperBound",
        "LowerBound",
        "TimeCoverage",
        "BasePeriod",
        "GeoInfoUrl",
        "FootNote",
    ]
)

In [402]:
df.head()

Unnamed: 0,Indicator,SeriesCode,SeriesDescription,GeoAreaName,Value,Time_Detail,Source,Substance use disorders,Tariff regime (status),Severity of price levels,...,Activity,Nature,Sampling Stations,Cities,Level of requirement,Quantile,IHR Capacity,Hazard type,Migratory status,Name of international institution
0,1.1.1,SI_POV_DAY1,Proportion of population below international p...,World,42.7,1981,"World Bank, Development Research Group. Data a...",,,,...,,G,,,,,,,,
1,1.1.1,SI_POV_DAY1,Proportion of population below international p...,World,42.3,1982,"World Bank, Development Research Group. Data a...",,,,...,,G,,,,,,,,
2,1.1.1,SI_POV_DAY1,Proportion of population below international p...,World,41.4,1983,"World Bank, Development Research Group. Data a...",,,,...,,G,,,,,,,,
3,1.1.1,SI_POV_DAY1,Proportion of population below international p...,World,39.8,1984,"World Bank, Development Research Group. Data a...",,,,...,,G,,,,,,,,
4,1.1.1,SI_POV_DAY1,Proportion of population below international p...,World,38.2,1985,"World Bank, Development Research Group. Data a...",,,,...,,G,,,,,,,,


### Making a table of just indicators and series codes


In [415]:
ind = df[["Indicator", "SeriesCode"]].drop_duplicates()

In [416]:
ind.head()

Unnamed: 0,Indicator,SeriesCode
0,1.1.1,SI_POV_DAY1
3046,1.1.1,SI_POV_EMP1
25924,1.2.1,SI_POV_NAHC
27296,1.2.2,SD_MDP_MUHC
28880,1.2.2,SD_MDP_ANDI


In [417]:
ind.set_index("Indicator", inplace=True)

In [418]:
t = Table(ind)
t.metadata.short_name = "Indicators"
t.metadata.title = "List of SDG indicators and associated series codes provided in the UN SDG dataset"
ds.add(t)

### Removing square brackets from some of the column names

In [390]:
new_columns = []
for k in df.columns:
    new_columns.append(re.sub(r"[\[\]]", "", k))

df.columns = new_columns

### Some functions to add more descriptive names to the units and dimension values

In [391]:
def attributes_description() -> dict:
    base_url = "https://unstats.un.org/sdgapi"
    # retrieves all goal codes
    url = f"{base_url}/v1/sdg/Goal/List"
    res = requests.get(url)
    assert res.ok
    goals = json.loads(res.content)
    goal_codes = [int(goal["code"]) for goal in goals]
    # retrieves all area codes
    a = []
    for goal in goal_codes:
        url = f"{base_url}/v1/sdg/Goal/{goal}/Attributes"
        res = requests.get(url)
        assert res.ok
        attr = json.loads(res.content)
        for att in attr:
            for code in att["codes"]:
                a.append(
                    {
                        "code": code["code"],
                        "description": code["description"],
                    }
                )
    att_dict = pd.DataFrame(a).drop_duplicates().set_index("code").squeeze().to_dict()
    return att_dict


def dimensions_description() -> pd.DataFrame:
    base_url = "https://unstats.un.org/sdgapi"
    # retrieves all goal codes
    url = f"{base_url}/v1/sdg/Goal/List"
    res = requests.get(url)
    assert res.ok
    goals = json.loads(res.content)
    goal_codes = [int(goal["code"]) for goal in goals]
    # retrieves all area codes
    d = []
    for goal in goal_codes:
        url = f"{base_url}/v1/sdg/Goal/{goal}/Dimensions"
        res = requests.get(url)
        assert res.ok
        dims = json.loads(res.content)
        for dim in dims:
            for code in dim["codes"]:
                d.append(
                    {
                        "id": dim["id"],
                        "code": code["code"],
                        "description": code["description"],
                    }
                )
    dim_dict = pd.DataFrame(d).drop_duplicates()
    # adding an nan code for each id - a problem for the Coverage dimension
    nan_data = {
        "id": dim_dict.id.unique(),
        "code": np.repeat(np.nan, len(dim_dict.id.unique()), axis=0),
        "description": np.repeat(np.nan, len(dim_dict.id.unique()), axis=0),
    }
    nan_df = pd.DataFrame(nan_data)
    dim_dict = pd.concat([dim_dict, nan_df])
    return dim_dict

In [392]:
series = (
    df.SeriesCode.unique()
)  # I think it makes sense to have one table per series code and then drop out the empty columns in each table *but* it would be good to retain a link to the indicators - maybe another table...

df_dict = {elem: pd.DataFrame for elem in series}

### Replacing the units with a more descriptive version

In [393]:
units_desc = attributes_description()
df["Units"] = df["Units"].apply(lambda x: units_desc[x])

The data is very wide with many different dimensions, some only applicable to single series. There are some codes that appear in multiple series and have slightly different meanings, so it is necessary to match each dimension individually to ensure the dimension codes have the correct description.

In [448]:
dim_description = dimensions_description()
dim_description.groupby(dim_description["code"].tolist(), as_index=False).size().sort_values(by="size")
init_dimensions = tuple(dim_description.id.unique())
dim_description

Unnamed: 0,id,code,description
0,Age,<1M,under 1 month old
1,Age,<1Y,under 1 year old
2,Age,<5Y,under 5 years old
3,Age,<15Y,under 15 years old
4,Age,<18Y,under 18 years old
...,...,...,...
39,Mountain Elevation,,
40,Cause of death,,
41,Parliamentary committees,,
42,Tariff regime (status),,


In [396]:
# Having this in seems to make the subsequent stage much longer...

# for dim in init_dimensions:
#    dim_lambda = dim_description[dim_description['id'] == dim][['code', 'description']].set_index('code').squeeze().to_dict()
#    df[dim] = df[dim].map(dim_lambda)

In [399]:
df.columns

Index(['Indicator', 'SeriesCode', 'SeriesDescription', 'GeoAreaName', 'Value',
       'Time_Detail', 'Source', 'Substance use disorders',
       'Tariff regime (status)', 'Severity of price levels', 'Sex',
       'Deviation Level', 'Mountain Elevation', 'Parliamentary committees',
       'Mode of transportation', 'Fiscal intervention stage',
       'Type of support', 'Type of speed', 'Policy instruments',
       'Policy Domains', 'Counterpart', 'Type of skill', 'Education level',
       'Type of waste treatment', 'Location', 'Food Waste Sector',
       'Name of international agreement', 'Reporting Type', 'Freq',
       'Type of product', 'Cause of death', 'Report Ordinal',
       'Type of facilities', 'Government_Name', 'Observation Status',
       'Type of occupation', 'Type of mobile technology',
       'Name of non-communicable disease', 'Grounds of discrimination',
       'Units', 'Level/Status', 'Age', 'Disability status',
       'Frequency of Chlorophyll-a concentration', 'Activi

In [447]:
# This could be much improved:
# * Add more to the metadata including the clean sources JSON from importers - will do in garden
# * Use the series description as the table short name - but this may need some cleaning as it contains punctuation and units.

tables = []
for key in df_dict.keys():
    df_dict[key] = df[:][df.SeriesCode == key]  # pulling out the data for a given SeriesCode
    df_dict[key] = (
        df_dict[key].dropna(axis=1, how="all").reset_index(drop=True)
    )  # dropping out columns which are all NaN
    # print(key)
    t = Table(df_dict[key])
    sn = df_dict[key]["SeriesDescription"].unique()[0]
    sn = re.sub(r"\([^()]*\)", "", sn).lower().rstrip().replace(" ", "_").replace(",", "").replace("/", "")
    t.metadata.short_name = key
    # t.metadata.description = df_dict[key]['SeriesDescription'].unique().astype(str)
    tables.append(t)

In [132]:
for t in tables:
    ds.add(t)

<class 'owid.catalog.tables.Table'>
SI_POV_DAY1
<class 'owid.catalog.tables.Table'>
SI_POV_EMP1
<class 'owid.catalog.tables.Table'>
SI_POV_NAHC
<class 'owid.catalog.tables.Table'>
SD_MDP_MUHC
<class 'owid.catalog.tables.Table'>
SD_MDP_ANDI
<class 'owid.catalog.tables.Table'>
SD_MDP_MUHHC
<class 'owid.catalog.tables.Table'>
SD_MDP_CSMP
<class 'owid.catalog.tables.Table'>
SD_MDP_ANDIHH
<class 'owid.catalog.tables.Table'>
SI_COV_MATNL
<class 'owid.catalog.tables.Table'>
SI_COV_POOR
<class 'owid.catalog.tables.Table'>
SI_COV_SOCAST
<class 'owid.catalog.tables.Table'>
SI_COV_SOCINS
<class 'owid.catalog.tables.Table'>
SI_COV_CHLD
<class 'owid.catalog.tables.Table'>
SI_COV_UEMP
<class 'owid.catalog.tables.Table'>
SI_COV_VULN
<class 'owid.catalog.tables.Table'>
SI_COV_WKINJRY
<class 'owid.catalog.tables.Table'>
SI_COV_BENFTS
<class 'owid.catalog.tables.Table'>
SI_COV_DISAB
<class 'owid.catalog.tables.Table'>
SI_COV_LMKT
<class 'owid.catalog.tables.Table'>
SI_COV_PENSN
<class 'owid.catalog.tabl

### Clean Up

In [100]:
import shutil

shutil.rmtree(dir)