In [35]:
from xml.etree import ElementTree
import collections
import json

import psycopg2

from scraper_boamp.config import CONFIG_DATABASE



In [4]:
# Connect to the PostgreSQL database described by CONFIG_DATABASE and
# create the cursor used by the query cells of this notebook.
connection = psycopg2.connect(
    password=CONFIG_DATABASE['password'],
    user=CONFIG_DATABASE['username'],
    dbname=CONFIG_DATABASE['name'],
)
cursor = connection.cursor()


In [12]:
# Release the cursor and the database connection.
# NOTE(review): the execution counts show this cell was run after the
# query/fetch cells that follow it; executed top-to-bottom it would close the
# connection before those cells run.
cursor.close()
connection.close()

In [5]:
# Query a sample of at most 1000 rows from the boamp table.
cursor.execute("SELECT * FROM boamp LIMIT 1000;")

In [7]:
# Materialise every row returned by the previous query.
data = cursor.fetchall()

In [10]:
# Use the first fetched row as the sample to explore.
avis_data = data[0]
# NOTE(review): the row comes from a SELECT *, so this unpacking assumes the
# boamp table has exactly these four columns in this order — confirm against
# the table schema.
year, doc_type, ident, content = avis_data

In [13]:
# Parse the row's XML content into an ElementTree element.
xml_data = ElementTree.fromstring(content)

In [90]:
def xml2dict(element):
    """
    Convert an ElementTree element into nested dicts, lists and strings.

    There is no one-to-one mapping between XML and JSON, so some choices must be made:

    - Ordering of children is not kept. Children are grouped by tag: a tag that
      occurs once maps to its converted value, a repeated tag maps to a list.
    - An element with neither children nor attributes is converted to its
      stripped text (None when it has no text at all).
    - Otherwise a dict is returned. Attributes are stored as a copy under the
      '_attributes' key, and non-empty stripped text under the '_text' key.
    - An element with no attributes and no text whose only child tag has an
      empty value (None or '') collapses to that child's tag name.

    Whitespace-only tails, as found in pretty-printed XML, are ignored.

    Raises AssertionError if non-blank text follows an element's closing tag
    (mixed content, which this mapping cannot represent) or if a child is
    tagged '_attributes' (reserved key).
    """

    # Indentation between sibling tags ends up in `tail`; only real text there
    # would be silently lost, so only that is rejected.
    assert not (element.tail and element.tail.strip())

    children = collections.defaultdict(list)
    for child in element:
        # '_attributes' is reserved for the element's own attributes.
        assert child.tag != '_attributes'
        children[child.tag].append(xml2dict(child))

    # Unwrap single occurrences so that only repeated tags become lists.
    grouped = {
        tag: values if len(values) > 1 else values[0]
        for tag, values in children.items()
    }

    text = element.text and element.text.strip()

    # Plain leaf: the text alone represents the element.
    if (not grouped) and (not element.attrib):
        return text

    result = grouped

    if element.attrib:
        # Copy so the result does not alias the element's live attribute mapping.
        result['_attributes'] = dict(element.attrib)

    if text:
        result['_text'] = text

    # A lone empty child (e.g. <A><B/></A>) acts as a flag: return its tag name.
    if (len(result) == 1) and ('_attributes' not in result) and ('_text' not in result):
        key = next(iter(result))
        if not result[key]:
            return key

    return result


In [91]:
# Convert the parsed XML tree into nested dicts/lists/strings.
dict_data = xml2dict(xml_data)

In [83]:
# Spot-check a text field of the converted sample.
dict_data['DONNEES']['IDENTITE']['DENOMINATION']

'Cté de Communes du Pays des Couleurs'

In [87]:
# This key is absent from the sample, so the lookup raises KeyError.
dict_data['DONNEES']['IDENTITE']['CODE_IDENT_NATIONAL']

KeyError: 'CODE_IDENT_NATIONAL'

In [84]:
# Spot-check a field whose converted value is a plain string.
dict_data['DONNEES']['TYPE_POUVOIR_ADJUDICATEUR']

'AUTOTRITE_REGIONAL'

In [85]:
# Spot-check a field whose converted value is a single-key dict.
dict_data['DONNEES']['ACTIVITE_PRINCIPALE']

{'POUVOIR_ADJUDICATEUR': 'ACT_SERV_ADM_PUB'}

In [86]:
# This key is absent from the sample, so the lookup raises KeyError.
dict_data['DONNEES']['TYPE_ORGANISME']

KeyError: 'TYPE_ORGANISME'

In [92]:
# Pretty-print the whole converted sample as JSON; non-ASCII characters are
# shown as \uXXXX escapes in the output.
print(json.dumps(dict_data, indent=4))

{
    "GESTION": {
        "REFERENCE": {
            "IDWEB": "15-125217",
            "TYPE_AVIS": {
                "FAMILLE": "JOUE",
                "PERIMETRE": "DIRECTIVE-18",
                "NATURE": "APPEL_OFFRE",
                "STATUT": "INITIAL"
            }
        },
        "NOM_HTML": "15-125217.html",
        "INDEXATION": {
            "DATE_PUBLICATION": "2015-08-12",
            "DATE_LIMITE_REPONSE": "2015-10-30T11:00:00+01:00",
            "DATE_FIN_DIFFUSION": "2015-10-30",
            "CLASSES": {
                "CLASSE": {
                    "CODE": "45",
                    "LIBELLE": "Travaux de construction"
                }
            },
            "DESCRIPTEURS": {
                "DESCRIPTEUR": [
                    {
                        "CODE": "87",
                        "LIBELLE": "D\u00e9molition"
                    },
                    {
                        "CODE": "265",
                        "LIBELLE": "Piscine"
             