In [17]:
dest_dir = "/tmp/un_sdg"

In [18]:
from owid.catalog import Dataset, Table
from owid import walden
from etl.steps.data.converters import convert_walden_metadata

In [19]:
ds = Dataset.create_empty(dest_dir)

In [20]:
cat = walden.Catalog()

In [21]:
walden_ds = cat.find_one(short_name="un_sdg")
walden_ds

Dataset(namespace='un_sdg', short_name='un_sdg', name='United Nations Sustainable Development Goals - United Nations (2021-10)', description='The UN SDG database gives access to data on more than 210 SDG indicators for countries across the global by indicator, country, region or time period. The 2030 Agenda for Sustainable Development, adopted by all United Nations Member States in 2015, provides a shared blueprint for peace and prosperity for people and the planet, now and into the future. At its heart are the 17 Sustainable Development Goals (SDGs), which are an urgent call for action by all countries - developed and developing - in a global partnership. They recognize that ending poverty and other deprivations must go hand-in-hand with strategies that improve health and education, reduce inequality, and spur economic growth – all while tackling climate change and working to preserve our oceans and forests.', source_name='United Nations Department of Economic and Social Affairs', url

In [22]:
ds.metadata = convert_walden_metadata(walden_ds)
ds.save()

In [23]:
walden_ds.local_path

'/Users/fionaspooner/.owid/walden/un_sdg/2021-09-30/un_sdg.zip'

In [24]:
ds

Dataset(path='/tmp/un_sdg', metadata=DatasetMeta(namespace='un_sdg', short_name='un_sdg', title='United Nations Sustainable Development Goals - United Nations (2021-10)', description='The UN SDG database gives access to data on more than 210 SDG indicators for countries across the global by indicator, country, region or time period. The 2030 Agenda for Sustainable Development, adopted by all United Nations Member States in 2015, provides a shared blueprint for peace and prosperity for people and the planet, now and into the future. At its heart are the 17 Sustainable Development Goals (SDGs), which are an urgent call for action by all countries - developed and developing - in a global partnership. They recognize that ending poverty and other deprivations must go hand-in-hand with strategies that improve health and education, reduce inequality, and spur economic growth – all while tackling climate change and working to preserve our oceans and forests.', sources=[Source(name='United Nati

In [25]:
import tempfile

In [26]:
dir = tempfile.mkdtemp()
dir

'/var/folders/yc/bkg2k6pj6td2tpkj64v1yvbr0000gn/T/tmp_u2liikr'

In [27]:
import zipfile

In [28]:
zf = zipfile.ZipFile(walden_ds.local_path)
zf.extractall(dir)
!ls {dir}

zsh:1: command not found: ls


In [29]:
import pandas as pd
import os

In [30]:
df = pd.read_csv(
    os.path.join(dir, "un-sdg-2021-10.csv"), low_memory=False
)

In [31]:
df.head()

Unnamed: 0,Goal,Target,Indicator,SeriesCode,SeriesDescription,GeoAreaCode,GeoAreaName,TimePeriod,Value,Time_Detail,...,[Activity],[Nature],[Sampling Stations],[Cities],[Level of requirement],[Quantile],[IHR Capacity],[Hazard type],[Migratory status],[Name of international institution]
0,1.0,1.1,1.1.1,SI_POV_DAY1,Proportion of population below international p...,1.0,World,1981.0,42.7,1981,...,,G,,,,,,,,
1,1.0,1.1,1.1.1,SI_POV_DAY1,Proportion of population below international p...,1.0,World,1982.0,42.3,1982,...,,G,,,,,,,,
2,1.0,1.1,1.1.1,SI_POV_DAY1,Proportion of population below international p...,1.0,World,1983.0,41.4,1983,...,,G,,,,,,,,
3,1.0,1.1,1.1.1,SI_POV_DAY1,Proportion of population below international p...,1.0,World,1984.0,39.8,1984,...,,G,,,,,,,,
4,1.0,1.1,1.1.1,SI_POV_DAY1,Proportion of population below international p...,1.0,World,1985.0,38.2,1985,...,,G,,,,,,,,


In [32]:
df.columns

Index(['Goal', 'Target', 'Indicator', 'SeriesCode', 'SeriesDescription',
       'GeoAreaCode', 'GeoAreaName', 'TimePeriod', 'Value', 'Time_Detail',
       'TimeCoverage', 'UpperBound', 'LowerBound', 'BasePeriod', 'Source',
       'GeoInfoUrl', 'FootNote', '[Substance use disorders]',
       '[Tariff regime (status)]', '[Severity of price levels]', '[Sex]',
       '[Deviation Level]', '[Mountain Elevation]',
       '[Parliamentary committees]', '[Mode of transportation]',
       '[Fiscal intervention stage]', '[Type of support]', '[Type of speed]',
       '[Policy instruments]', '[Policy Domains]', '[Counterpart]',
       '[Type of skill]', '[Education level]', '[Type of waste treatment]',
       '[Location]', '[Food Waste Sector]',
       '[Name of international agreement]', '[Reporting Type]', '[Freq]',
       '[Type of product]', '[Cause of death]', '[Report Ordinal]',
       '[Type of facilities]', '[Government_Name]', '[Observation Status]',
       '[Type of occupation]', '[Type of

In [33]:

series = df.SeriesCode.unique() # I think it makes sense to have one table per series code and then drop out the empty columns in each table *but* it would be good to retain a link to the indicators - maybe another table...

series_clean = [x for x in series if str(x) != 'nan']

df_dict = {elem : pd.DataFrame for elem in series_clean}


In [34]:
df_dict

{'SI_POV_DAY1': pandas.core.frame.DataFrame,
 'SI_POV_EMP1': pandas.core.frame.DataFrame,
 'SI_POV_NAHC': pandas.core.frame.DataFrame,
 'SD_MDP_MUHC': pandas.core.frame.DataFrame,
 'SD_MDP_ANDI': pandas.core.frame.DataFrame,
 'SD_MDP_MUHHC': pandas.core.frame.DataFrame,
 'SD_MDP_CSMP': pandas.core.frame.DataFrame,
 'SD_MDP_ANDIHH': pandas.core.frame.DataFrame,
 'SI_COV_MATNL': pandas.core.frame.DataFrame,
 'SI_COV_POOR': pandas.core.frame.DataFrame,
 'SI_COV_SOCAST': pandas.core.frame.DataFrame,
 'SI_COV_SOCINS': pandas.core.frame.DataFrame,
 'SI_COV_CHLD': pandas.core.frame.DataFrame,
 'SI_COV_UEMP': pandas.core.frame.DataFrame,
 'SI_COV_VULN': pandas.core.frame.DataFrame,
 'SI_COV_WKINJRY': pandas.core.frame.DataFrame,
 'SI_COV_BENFTS': pandas.core.frame.DataFrame,
 'SI_COV_DISAB': pandas.core.frame.DataFrame,
 'SI_COV_LMKT': pandas.core.frame.DataFrame,
 'SI_COV_PENSN': pandas.core.frame.DataFrame,
 'SP_ACS_BSRVH2O': pandas.core.frame.DataFrame,
 'SP_ACS_BSRVSAN': pandas.core.frame.

In [50]:
#This could be much improved:
# * Convert the dimension codes to more descriptive values
# * Add more to the metadata including the clean sources JSON from importers
# * Add a table which links indicators to series codes

tables = []
for key in df_dict.keys():
    df_dict[key] = df[:][df.SeriesCode == key] #pulling out the data for a given SeriesCode
    df_dict[key] = df_dict[key].dropna(axis = 1, how = 'all')  # dropping out columns which are all NaN
    t = Table(df_dict[key])
    t.metadata.short_name = 'un_sdg'
    t.metadata.description = df_dict[key]['SeriesDescription'].unique()
    tables.append(t)

ds.add(tables)

[      Goal Target Indicator   SeriesCode  \
 0      1.0    1.1     1.1.1  SI_POV_DAY1   
 1      1.0    1.1     1.1.1  SI_POV_DAY1   
 2      1.0    1.1     1.1.1  SI_POV_DAY1   
 3      1.0    1.1     1.1.1  SI_POV_DAY1   
 4      1.0    1.1     1.1.1  SI_POV_DAY1   
 ...    ...    ...       ...          ...   
 3041   1.0    1.1     1.1.1  SI_POV_DAY1   
 3042   1.0    1.1     1.1.1  SI_POV_DAY1   
 3043   1.0    1.1     1.1.1  SI_POV_DAY1   
 3044   1.0    1.1     1.1.1  SI_POV_DAY1   
 3045   1.0    1.1     1.1.1  SI_POV_DAY1   
 
                                       SeriesDescription  GeoAreaCode  \
 0     Proportion of population below international p...          1.0   
 1     Proportion of population below international p...          1.0   
 2     Proportion of population below international p...          1.0   
 3     Proportion of population below international p...          1.0   
 4     Proportion of population below international p...          1.0   
 ...                

### Clean Up

In [100]:
import shutil
shutil.rmtree(dir)