In [None]:
# default_exp xml

# xml

> Functions to parse XML files

In [None]:
# export
import pathlib
import re
import datetime
# import xml.etree.ElementTree

import pandas as pd
from lxml import etree

Directory where the data (*XML* files) are stored

In [None]:
# the sample XML files are looked up in a `samples` folder under the current working directory
directory = pathlib.Path.cwd().joinpath('samples')
assert directory.exists()
directory

PosixPath('/home/manu/dlsproc/samples')

A (sample) file in that directory

In [None]:
# the sample file used by the examples in the rest of the notebook
xml_file = directory.joinpath('sample.xml')
assert xml_file.exists()
xml_file

PosixPath('/home/manu/dlsproc/samples/sample.xml')

*Root* element of the *XML* tree

In [None]:
root = etree.parse(xml_file).getroot()

## Convenience functions

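A function to extract the *namespaces* available on the root element of an *XML* file (the default, prefix-less namespace is returned under the key given by `root_name`)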

In [None]:
# export
def get_namespaces(input_file: str | pathlib.Path, root_name: str = 'base') -> dict:
    """Return the prefix -> URI namespace mapping of the root element of an XML file.

    Parameters
    ----------
    input_file
        Path of the XML file to parse.
    root_name
        Key under which the default namespace (the one stored under `None`
        in the root's `nsmap`) is returned.

    Returns
    -------
    dict
        The root element's `nsmap`, with the `None` key (if any) replaced by `root_name`.
    """

    prefix_to_uri = etree.parse(input_file).getroot().nsmap

    try:
        default_uri = prefix_to_uri.pop(None)
    except KeyError:
        # no default namespace declared: nothing to rename
        return prefix_to_uri

    prefix_to_uri[root_name] = default_uri

    return prefix_to_uri

In [None]:
get_namespaces(xml_file)

{'cbc-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonBasicComponents-2',
 'cac-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonAggregateComponents-2',
 'cbc': 'urn:dgpe:names:draft:codice:schema:xsd:CommonBasicComponents-2',
 'cac': 'urn:dgpe:names:draft:codice:schema:xsd:CommonAggregateComponents-2',
 'ns1': 'urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2',
 'base': 'http://www.w3.org/2005/Atom'}

In order to trim off *namespace*s from a tag:
- a regular expression

In [None]:
# export
# matches a namespace-qualified tag of the form `{namespace}tag`, capturing both parts;
# the raw string keeps `\{` and `\}` from being read as (invalid) string escapes
re_tag = re.compile(r'\{(.*)\}(.*)')

In [None]:
assert re_tag.match('{blabla}foo').groups() == ('blabla', 'foo')

In [None]:
re_tag.match('{some.namespace}id').groups()

('some.namespace', 'id')

* a function

In [None]:
# export
def split_namespace_tag(namespace_tag: str) -> tuple[str, str]:
    """Split a tag of the form ``{namespace}tag`` into ``(namespace, tag)``.

    Parameters
    ----------
    namespace_tag
        Tag name, optionally prefixed by its namespace in curly braces.

    Returns
    -------
    tuple[str, str]
        The namespace and the bare tag name. When the tag carries no
        ``{namespace}`` prefix, the namespace is the empty string and the
        tag is returned unchanged.
    """
    match = re_tag.match(namespace_tag)

    if match is None:
        # no `{namespace}` prefix: the tag is not namespace-qualified
        return '', namespace_tag

    return match.groups()

In [None]:
split_namespace_tag('{some.namespace}id')

('some.namespace', 'id')

In [None]:
split_namespace_tag(root.tag)

('http://www.w3.org/2005/Atom', 'feed')

In [None]:
# export
# tag names (namespace stripped); presumably the elements to ignore when
# processing a feed -- the code using this list is not visible in this notebook
to_be_skipped = [
    'author',
    'id',
    'link',
    'title',
    'updated',
    'deleted-entry',
]
to_be_skipped

['author', 'id', 'link', 'title', 'updated', 'deleted-entry']

A function to get a list of `etree.Element`s with all the *entries* (allegedly, *procurement contracts*)

In [None]:
# export
def get_entries(root: etree.Element) -> list[etree.Element]:
    """Return the direct children of `root` whose tag name (namespace stripped) is ``entry``.

    Parameters
    ----------
    root
        Element whose children are scanned (typically the root of the parsed feed).

    Returns
    -------
    list
        The matching child elements, in document order.
    """

    return [
        child for child in root
        # comments and processing instructions have a non-string `tag`: skip them
        if isinstance(child.tag, str) and split_namespace_tag(child.tag)[1] == 'entry'
    ]

*Entries* are extracted using the above function (only the first 4 are shown)

In [None]:
entries = get_entries(root)
# sanity check: 117 is presumably the number of entries in the sample file parsed above
assert len(entries) == 117
entries[:4]

[<Element {http://www.w3.org/2005/Atom}entry at 0x7f380384c100>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x7f380384c680>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x7f380384e0c0>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x7f380384df00>]

In [None]:
element = entries[0]
split_namespace_tag(element.tag)

('http://www.w3.org/2005/Atom', 'entry')

In [None]:
subelement = element[3]
split_namespace_tag(subelement.tag)

('http://www.w3.org/2005/Atom', 'title')

In [None]:
subelement.text

"L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí"

In [None]:
element

<Element {http://www.w3.org/2005/Atom}entry at 0x7f380384c100>

Format of the dates to be parsed by `entry_to_series`

In [None]:
# export
datetime_format = (
    '%Y-%m-%d'       # date
    'T%H:%M:%S.%f'   # time, including fractional seconds
    '%z'             # UTC offset
)
# datetime_format_without_micro = '%Y-%m-%dT%H:%M:%S%z'

A function to parse an entry into a `pd.Series`

In [None]:
# export
def entry_to_series(entry: etree.Element) -> pd.Series:
    """Turn an entry element into a `pd.Series` indexed by the tag names of its children.

    The namespace of every child tag is dropped. The text of the `updated`
    child is parsed into a timestamp according to `datetime_format`; the text
    of any other child is stored as-is (it may be `None` or whitespace only).
    When several children share a tag name, the last one wins.
    """

    def convert(tag: str, text):
        # only the `updated` timestamp is converted; anything else is kept verbatim
        if tag == 'updated':
            return pd.to_datetime(text, format=datetime_format)
        return text

    fields = {}

    for child in entry:
        # second component of the split is the tag without its namespace
        tag = split_namespace_tag(child.tag)[1]
        fields[tag] = convert(tag, child.text)

    return pd.Series(fields)

In [None]:
element_series = entry_to_series(element)
element_series

id                      https://contrataciondelestado.es/sindicacion/P...
link                                                                 None
summary                 Id licitación: C. 2-2021; Órgano de Contrataci...
title                   L'objecte del contracte és la renovació de tot...
updated                                  2022-01-03 01:11:41.826000+01:00
ContractFolderStatus                                       \n            
dtype: object

We can concatenate together the `pd.Series` for the different *entries* into a `pd.DataFrame`

In [None]:
# with `axis=1` every entry's series becomes one column of the concatenated frame;
# transposing then yields one row per entry and one column per tag
# alternative formulation (kept for reference):
# df = pd.concat([entry_to_series(e).to_frame().T for e in entries[:4]])
df = pd.concat([entry_to_series(e) for e in entries[:4]], axis=1).T
df

Unnamed: 0,id,link,summary,title,updated,ContractFolderStatus
0,https://contrataciondelestado.es/sindicacion/P...,,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03 01:11:41.826000+01:00,\n
1,https://contrataciondelestado.es/sindicacion/P...,,Id licitación: 8128_3/2021; Órgano de Contrata...,Obras de restauración hidromorfológica del río...,2022-01-03 01:00:11.194000+01:00,\n
2,https://contrataciondelestado.es/sindicacion/P...,,Id licitación: 1000_0005-CP01-2021-000063; Órg...,Contrato del servicio de realización de labore...,2022-01-03 01:00:10.399000+01:00,\n
3,https://contrataciondelestado.es/sindicacion/P...,,Id licitación: 1379/2020 4738; Órgano de Contr...,Obres de renovació de l'enllumenat públic a la...,2022-01-03 00:11:40.740000+01:00,\n


The types of the columns

In [None]:
df.dtypes

id                                                    object
link                                                  object
summary                                               object
title                                                 object
updated                 datetime64[ns, pytz.FixedOffset(60)]
ContractFolderStatus                                  object
dtype: object

In [None]:
# hide
import nbdev.export
nbdev.export.notebook2script('10_xml.ipynb')

Converted 10_xml.ipynb.
