In [65]:
import json
import os
import collections
import io
import xmltodict
import pandas as pd

* [TED Schemas](https://publications.europa.eu/en/web/eu-vocabularies/tedschemas)
* [Forms in PDF](http://simap.ted.europa.eu/standard-forms-for-public-procurement)

In [134]:
# Root folder that load_data scans: each sub-folder holds the XML notices to parse.
data_path = './data'

def _as_list(node):
    """Return ``node`` as a list: ``None`` -> ``[]``, a list unchanged, anything else wrapped."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _notice_metadata(notice_data):
    """Extract the notice-level columns that are shared by every form of one file.

    Raises KeyError/TypeError when a field is absent or has an unexpected
    shape (for example several ORIGINAL_CPV or VALUE entries parsed as a list).
    """
    original_cpv = notice_data['ORIGINAL_CPV']
    value = notice_data['VALUES']['VALUE']
    return {
        'NO_DOC_OJS': notice_data['NO_DOC_OJS'],
        'ORIGINAL_CPV_CODE': original_cpv['@CODE'],
        'ORIGINAL_CPV_TEXT': original_cpv['#text'],
        'VALUE': value['#text'],
        'VALUE_CURR': value['@CURRENCY'],
        'REF_NO': notice_data['REF_NOTICE']['NO_DOC_OJS'],
    }


def _flatten_tender(form, date, file, metadata):
    """Build one flat record (a dict of columns) from a single parsed form.

    Raises KeyError/TypeError/AttributeError when the form lacks the
    contracting-body, title, description, country or NUTS information.
    """
    flattened = {'DATE': date, 'LG': form.get('@LG'), 'FILE': file}
    flattened.update(metadata)

    # Copy every field of every mapping-valued CONTRACTING_BODY child
    # (address blocks, activity codes, ...) under its lower-cased name.
    # Children that are plain strings, lists or None are ignored.
    for section in form['CONTRACTING_BODY'].values():
        if isinstance(section, dict):
            for field, content in section.items():
                flattened[field.lower()] = content

    flattened['title'] = form['OBJECT_CONTRACT']['TITLE']['P']
    flattened['short_description'] = form['OBJECT_CONTRACT']['SHORT_DESCR']['P']

    # unwind some fields: keep only the code carried in the XML attribute
    flattened['country'] = flattened['country']['@VALUE']
    flattened['n2016:nuts'] = flattened['n2016:nuts']['@CODE']
    return flattened


def load_data(data_path, language="EN"):
    """Parse the XML notices below ``data_path`` into a flat DataFrame.

    ``data_path`` is expected to contain sub-directories whose names start
    with a date followed by ``_``; every file inside them is parsed with
    ``xmltodict``. Each form found under ``TED_EXPORT/FORM_SECTION`` becomes
    one row.

    Parameters
    ----------
    data_path : str
        Directory holding the sub-directories of XML files. Entries that are
        not directories are ignored.
    language : str or None, default "EN"
        Keep only forms whose ``@LG`` attribute equals this value. ``None``
        keeps the forms of every language.

    Returns
    -------
    pandas.DataFrame
        One row per retained form with the columns DATE, LG, FILE,
        NO_DOC_OJS, ORIGINAL_CPV_CODE, ORIGINAL_CPV_TEXT, VALUE, VALUE_CURR,
        REF_NO, the lower-cased contracting-body fields, ``title`` and
        ``short_description``. Empty when nothing qualifies.

    Notes
    -----
    This is a best-effort loader: a notice without the CPV / value /
    reference metadata, or a form without the contracting-body, title,
    description, country or NUTS data, is skipped rather than reported.
    Files that cannot be parsed, or that lack the TED_EXPORT skeleton,
    still raise.
    """
    records = []

    # Sorted so that the row order does not depend on the file system.
    for dir_ in sorted(os.listdir(data_path)):
        dir_path = os.path.join(data_path, dir_)
        if not os.path.isdir(dir_path):
            # stray top-level files (README, .DS_Store, ...) are not exports
            continue
        date = dir_.split("_")[0]

        for file in sorted(os.listdir(dir_path)):
            with io.open(os.path.join(dir_path, file), 'r', encoding="utf-8") as f:
                xml = f.read()

            export = xmltodict.parse(xml)['TED_EXPORT']
            forms_section = export['FORM_SECTION'] or {}
            notice_data = export['CODED_DATA_SECTION']['NOTICE_DATA']

            # The metadata is identical for every form of the file, so it is
            # extracted once; a notice lacking any of it is skipped entirely.
            try:
                metadata = _notice_metadata(notice_data)
            except (KeyError, TypeError):
                continue

            for form_node in forms_section.values():
                # A form tag repeated in several languages is parsed as a
                # list, a single occurrence as a mapping: handle both alike.
                for form in _as_list(form_node):
                    # dict also covers OrderedDict, which older xmltodict
                    # releases return; attribute strings are skipped here.
                    if not isinstance(form, dict):
                        continue
                    if language is not None and form.get('@LG') != language:
                        continue
                    try:
                        records.append(_flatten_tender(form, date, file, metadata))
                    except (KeyError, TypeError, AttributeError):
                        # incomplete form: best-effort, leave it out
                        continue

    return pd.DataFrame(records)

In [135]:
# Parse everything under data_path, keeping only the forms whose @LG is "EN".
df = load_data(data_path, "EN")

In [136]:
# Preview the first five parsed tenders.
df.head(n=5)

Unnamed: 0,@value,DATE,FILE,LG,NO_DOC_OJS,ORIGINAL_CPV_CODE,ORIGINAL_CPV_TEXT,REF_NO,VALUE,VALUE_CURR,...,n2016:nuts,nationalid,officialname,phone,postal_code,short_description,title,town,url_buyer,url_general
0,GENERAL_PUBLIC_SERVICES,20190102,000098_2019.xml,EN,2019/S 001-000098,44110000,Construction materials,2018/S 199-449921,14786959.49,EUR,...,MT,,Department of Contracts,+356 21220212,FRN 1600,Tender for the design and build of the sustain...,Design and Build of the Sustainable Living Com...,Floriana,https://www.etenders.gov.mt/epps,https://www.etenders.gov.mt/epps
1,,20190102,000502_2019.xml,EN,2019/S 001-000502,38000000,"Laboratory, optical and precision equipments (...",2018/S 039-084815,70532.75,EUR,...,MT,,Water Services Corporation,+356 22443553,LQA 9043,[The subject of this tender is the purchase of...,Purchase of Automated Lab Equipment for the Wa...,Luqa,https://www.etenders.gov.mt/epps,https://www.etenders.gov.mt/epps
2,,20190102,000503_2019.xml,EN,2019/S 001-000503,38000000,"Laboratory, optical and precision equipments (...",2018/S 039-084774,20574.0,EUR,...,MT,,Water Services Corporation,+356 22443553,LQA 9043,"[The subject of this tender is the purchase, a...",Purchase of DPU Sampling Equipment for the Wat...,Luqa,https://www.etenders.gov.mt/epps,https://www.etenders.gov.mt/epps
3,GENERAL_PUBLIC_SERVICES,20190102,000961_2019.xml,EN,2019/S 001-000961,60000000,Transport services (excl. Waste transport),2018/S 086-193477,26740.0,GBP,...,UKG21,,Telford and Wrekin Council,+44 1952384622,TF3 4LF,All age passenger transport service to include...,RERUN — Taxis — 0023,Telford,www.telford.gov.uk,www.telford.gov.uk
4,GENERAL_PUBLIC_SERVICES,20190102,000963_2019.xml,EN,2019/S 001-000963,60000000,Transport services (excl. Waste transport),2018/S 086-193477,45840.0,GBP,...,UKG21,,Telford and Wrekin Council,+44 1952384622,TF3 4LF,All age passenger transport service to include...,Taxis — 0029,Telford,www.telford.gov.uk,www.telford.gov.uk


In [126]:
# Inspect the VALUE cell of the first row with a single label-based lookup.
# NOTE(review): the saved output below shows the whole VALUES/VALUE mapping and
# appears to come from an earlier run (lower execution count); load_data as
# written stores only its '#text', so re-run this cell to refresh the output.
df.loc[0, 'VALUE']

OrderedDict([('@TYPE', 'PROCUREMENT_TOTAL'),
             ('@CURRENCY', 'EUR'),
             ('#text', '14786959.49')])