In [1]:
import pandas as pd
import numpy as np
import os

from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

In [2]:
import xmltodict
from lxml import etree

In [3]:
dsd = 'source/education_dsd.xml'
data = 'source/education.xml'

In [4]:
with open(dsd) as f:
    dsd = xmltodict.parse(f.read())

In [66]:
# concepts

In [5]:
indicators = dsd['message:Structure']['message:CodeLists']['CodeList'][0]

In [6]:
len(indicators['Code'])

1868

In [7]:
indi_info_list = []

for i in indicators['Code']:
    indi_info = []
    indi_info.append(i['@value'])
    
    if '@parentCode' in i.keys():
        indi_info.append(i['@parentCode'])
    else:
        indi_info.append(np.nan)
    
    if isinstance(i['Description'], dict):
        indi_info.append(i['Description']['#text'])
    else:
        indi_info.append(i['Description'][0]['#text'])
        
    indi_info_list.append(indi_info)

In [8]:
conc = pd.DataFrame(indi_info_list, columns=['concept', 'drillup', 'name'])

In [9]:
conc['concept'] = conc['concept'].map(to_concept_id)
conc['drillup'] = conc['drillup'].map(to_concept_id)

In [10]:
conc['concept_type'] = 'measure'

In [None]:
# refer to the cells below to check what's all_data

conc = conc.set_index('concept')

In [36]:
cdf = conc.loc[all_data.keys()]

cdf.sort_index()['concept', 'name', 'concept_type'].to_csv('../ddf--concepts--continuous.csv')

KeyError: ('concept', 'name', 'concept_type')

In [33]:
# discrete concepts

disc = pd.DataFrame([['name', 'Name', 'string'],
                     ['time', 'Year', 'time'],
                     ['location', 'Location', 'entity_domain']
                    ], columns=['concept', 'name', 'concept_type'])

disc.to_csv('../ddf--concepts--discrete.csv', index=False)

In [None]:
# location

In [11]:
locs = dsd['message:Structure']['message:CodeLists']['CodeList'][1]

In [12]:
loc_list = []

for c in locs['Code']:
    cinfo = [c['@value'], c['Description'][0]['#text']]
    
    loc_list.append(cinfo)

In [13]:
loc_df = pd.DataFrame(loc_list, columns=['location', 'name'])

In [14]:
loc_df['location'] = loc_df['location'].map(to_concept_id)

In [15]:
loc_df.to_csv('../ddf--entities--location.csv', index=False)

In [None]:
# datapoints

In [16]:
f2 = open('source/education.xml')

In [17]:
d2 = etree.parse(f2)

In [18]:
root = d2.getroot()

In [19]:
root.getchildren()

[<Element {http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message}Header at 0x110f49b08>,
 <Element {http://www.SDMX.org/resources/SDMXML/schemas/v2_0/generic}DataSet at 0x110f44ac8>]

In [20]:
nsmap = root.nsmap.copy()

In [21]:
nsmap

{'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
 'common': 'http://www.SDMX.org/resources/SDMXML/schemas/v2_0/common',
 None: 'http://www.SDMX.org/resources/SDMXML/schemas/v2_0/generic',
 'message': 'http://www.SDMX.org/resources/SDMXML/schemas/v2_0/message'}

In [22]:
nsmap['xmlns'] = nsmap.pop(None)  # change None to a meaningful name, so that I can use later.

In [23]:
all_data = {}

for item in root.xpath('.//xmlns:Series', namespaces=nsmap):
    item_dict = xmltodict.parse(etree.tostring(item))
    
    attrs = {}
    ser = []
    
    for i in item_dict['Series']['SeriesKey']['Value']:
        if i['@concept'] == 'EDULIT_IND':
            attrs['key'] = to_concept_id(i['@value'])
        if i['@concept'] == 'LOCATION':
            attrs['location'] = to_concept_id(i['@value'])
            
    obs = item_dict['Series']['Obs']
    if isinstance(obs, list):
        for o in item_dict['Series']['Obs']:
            ser.append([o['Time'], o['ObsValue']['@value']])
    else:
        ser.append([obs['Time'], obs['ObsValue']['@value']])
    
    df = pd.DataFrame(ser, columns=['time', attrs['key']])
    df['location'] = attrs['location']

    if attrs['key'] not in all_data.keys():
        all_data[attrs['key']] = [df]
    else:
        all_data[attrs['key']].append(df)

In [24]:
len(all_data.keys())

1683

In [28]:
for k, v in all_data.items():
    all_data[k] = pd.concat(v, ignore_index=True)

In [35]:
for k, df in all_data.items():
    df = df.dropna(how='any')
    df = df[df[k] != 'NaN']
    path = '../ddf--datapoints--{}--by--location--time.csv'.format(k)
    
    df.to_csv(path, index=False)

In [34]:
create_index_file('../')

Unnamed: 0,key,value,file
0,concept,drillup,ddf--concepts--continuous.csv
1,concept,name,ddf--concepts--continuous.csv
2,concept,concept_type,ddf--concepts--continuous.csv
0,concept,name,ddf--concepts--discrete.csv
1,concept,concept_type,ddf--concepts--discrete.csv
0,"location,time",20000,ddf--datapoints--20000--by--location--time.csv
0,"location,time",20001,ddf--datapoints--20001--by--location--time.csv
0,"location,time",20002,ddf--datapoints--20002--by--location--time.csv
0,"location,time",20003,ddf--datapoints--20003--by--location--time.csv
0,"location,time",20004,ddf--datapoints--20004--by--location--time.csv
