In [61]:
#| default_exp xml

# xml

> Parse XML files

In [62]:
!ls

00_core.ipynb	      20_bundle.ipynb	     60_assemble.ipynb	nbdev.yml
05_structure.ipynb    30_hierarchical.ipynb  70_extend.ipynb	_quarto.yml
10_xml.ipynb	      40_io.ipynb	     80_download.ipynb	sidebar.yml
15_postprocess.ipynb  50_parse.ipynb	     index.ipynb	styles.css


In [63]:
import sys
sys.path.append('../')

In [64]:
#| export
import pathlib
import re
import datetime
from collections.abc import Iterable

import numpy as np
import pandas as pd
from lxml import etree

import sproc.structure
import sproc.postprocess

## Sample data

Directory where the data (*XML* files) are stored

In [65]:
directory = pathlib.Path.cwd().parent / 'samples'
assert directory.exists()
directory

PosixPath('/export/usuarios_ml4ds/cggamella/sproc/samples')

A (sample) file in that directory

In [66]:
xml_file = directory / 'PlataformasAgregadasSinMenores_20220104_030016_1.atom'
assert xml_file.exists()
xml_file

PosixPath('/export/usuarios_ml4ds/cggamella/sproc/samples/PlataformasAgregadasSinMenores_20220104_030016_1.atom')

In [67]:
!head {xml_file} --lines=3000

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:cbc-place-ext="urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonBasicComponents-2" xmlns:cac-place-ext="urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonAggregateComponents-2" xmlns:cbc="urn:dgpe:names:draft:codice:schema:xsd:CommonBasicComponents-2" xmlns:cac="urn:dgpe:names:draft:codice:schema:xsd:CommonAggregateComponents-2" xmlns:ns1="urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2">
    <author>
        <name>Plataforma de Contratación del Sector Público</name>
        <uri>https://contrataciondelestado.es</uri>
        <email>contrataciondelestado@minhafp.es</email>
    </author>
    <id>https://contrataciondelestado.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores.atom</id>
    <link href="PlataformasAgregadasSinMenores_20220104_030016_1.atom" rel="self"/>
    <link href="PlataformasAgregadasSinMenores.atom" rel="firs

*Root* element of the *XML* tree

In [68]:
root = etree.parse(xml_file).getroot()

## Convenience functions

A function to extract the *namespace*s declared in an *XML* file

In [69]:
#| export
def get_namespaces(
    input_file: str | pathlib.Path, # XML file
    root_name: str = 'base' # Name of the root element
    ) -> dict[str, str]: # Mapping from *tag* to *namespace*
    "Returns the namespaces in the input XML file"
    
    tree = etree.parse(input_file)
    
    namespaces = tree.getroot().nsmap
    
    if None in namespaces:
        
        namespaces[root_name] = namespaces.pop(None)
        
    return namespaces

In [70]:
get_namespaces(xml_file)

{'cbc-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonBasicComponents-2',
 'cac-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonAggregateComponents-2',
 'cbc': 'urn:dgpe:names:draft:codice:schema:xsd:CommonBasicComponents-2',
 'cac': 'urn:dgpe:names:draft:codice:schema:xsd:CommonAggregateComponents-2',
 'ns1': 'urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2',
 'base': 'http://www.w3.org/2005/Atom'}

In order to trim off *namespace*s from a tag:
- a regular expression

In [71]:
#| export
re_tag = re.compile('\{(.*)\}(.*)')

In [72]:
assert re_tag.match('{blabla}foo').groups() == ('blabla', 'foo')

In [73]:
re_tag.match('{some.namespace}id').groups()

('some.namespace', 'id')

* a convenience function exploiting the latter

In [74]:
#| export
def split_namespace_tag(
    namespace_tag: str # Input
    ) -> tuple[str]: # Namespace and tag
    'Splits a hierarchical "address" in an XML file into *namespace* and *tag*'
    
    return re_tag.match(namespace_tag).groups()

In [75]:
split_namespace_tag('{some.namespace}id')

('some.namespace', 'id')

In [76]:
split_namespace_tag(root.tag)

('http://www.w3.org/2005/Atom', 'feed')

In [77]:
root.tag

'{http://www.w3.org/2005/Atom}feed'

### Regular entries

A function to get a list of `etree.Element` with all the *entries* (allegedly, *procurement contracts*)

In [78]:
#| export
def get_entries(
    root: etree.Element # XML root
    ) -> list[etree.Element]: # Entries
    "Returns all the entries hanging from the input"
    
    return [e for e in root if split_namespace_tag(e.tag)[1] == 'entry']

*Entries* are extracted using the above function (only the 4 first ones are shown)

In [79]:
entries = get_entries(root)
assert len(entries) == 117
entries[:4]

[<Element {http://www.w3.org/2005/Atom}entry at 0x7f83f4d64540>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x7f83f4d927c0>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x7f83a9165200>,
 <Element {http://www.w3.org/2005/Atom}entry at 0x7f83f4d91fc0>]

In [80]:
element = entries[0]
split_namespace_tag(element.tag)

('http://www.w3.org/2005/Atom', 'entry')

In [81]:
subelement = element[3]
split_namespace_tag(subelement.tag)

('http://www.w3.org/2005/Atom', 'title')

In [82]:
subelement.text

"L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí"

In [83]:
element

<Element {http://www.w3.org/2005/Atom}entry at 0x7f83f4d64540>

Everything is stored as a string in an *XML* file. The approach below is used to handle conversions.

In [84]:
numeric_field = '8'
numeric_field.isnumeric()
float(numeric_field).is_integer()

True

### Deleted entries

It makes sense to treat separately since they information provided for them is completely different.

In [85]:
#| export
def get_deleted_entries(
    root: etree.Element # XML root
    ) -> list[etree.Element]: # *Deleted* entries
    "Returns all the *deleted* entries hanging from the input"
    
    return [e for e in root if split_namespace_tag(e.tag)[1] == 'deleted-entry']

In [86]:
deleted_entries = get_deleted_entries(root)
deleted_entries[:2]

[<Element {http://purl.org/atompub/tombstones/1.0}deleted-entry at 0x7f83f4da3700>,
 <Element {http://purl.org/atompub/tombstones/1.0}deleted-entry at 0x7f83f4da3600>]

In [87]:
deleted_element = deleted_entries[0]
split_namespace_tag(deleted_element.tag)

('http://purl.org/atompub/tombstones/1.0', 'deleted-entry')

These are *all-attributes* elements

In [88]:
deleted_element.attrib

{'ref': 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968384', 'when': '2022-01-03T00:11:41.879+01:00'}

A function to process an XML file into a `pd.Series`

In [89]:
#| export
def deleted_to_series(
    input_file: str | pathlib.Path # XML file
) -> pd.Series: # A Pandas Series with XML data
    "Reads and parses 'deleted' entries in an XML file."
    
    tree = etree.parse(input_file)
    root = tree.getroot()
    
    ids = []
    dates = []

    for e in get_deleted_entries(root):
        ids.append(e.attrib['ref'])
        # dates.append(pd.to_datetime(e.attrib['when']))
        dates.append(pd.to_datetime(e.attrib['when'], utc=True))
        
    name = 'deleted_on'
        
    if not ids:
        
        return pd.Series([], dtype='datetime64[ns]', name=name)
    
    else:

        return pd.Series(data=dates, index=ids, name=name)

In [90]:
deleted_series = deleted_to_series(xml_file)
deleted_series

https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968384   2022-01-02 23:11:41.879000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/6802801   2022-01-02 23:11:41.837000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8231582   2022-01-02 23:11:41.790000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968385   2022-01-02 23:11:41.750000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1969766   2022-01-02 23:11:41.698000+00:00
                                                                                                    ...               
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1958181   2021-12-30 23:13:12.646000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1962159   2021-12-30 23:13:12.596000+00:00
https://contrataciondelestado.es/sindicacion/Pla

Are there duplicates?

In [91]:
len(deleted_series) > len(deleted_series.unique())

False

In [92]:
#| hide
# offending_xml = directory / 'deleted_offending.atom'
# offending_xml

In [93]:
#| hide
# offending_deleted_series = deleted_to_series(offending_xml)
# offending_deleted_series

## Parsing

A function to handle *multi-valued* fields

In [94]:
#| export
def set_or_append(
    d: dict, # Input (to be modified)
    key: str, # Entry of the dictionary to be created/extended
    value: str | dict # Value to be set/added
    ) -> None:
    "Set or append a new element to the dictionary storing the data in a single entry"

    # if there is already something in the given `key`...
    if key in d:

        # if what is already there is a list...
        if type(d[key]) == list:

            # ...and so is the `value` to be added...
            if isinstance(value, list):

                # ...whatever came before and was not a list is turned into (a single-element) one
                d[key] = [e if isinstance(e, list) else [e] for e in d[key]] + [value]

            # if what is going to be added is NOT a list
            else:

                # if what is there is actually a list of lists...
                if isinstance(d[key][0], list):

                    # ...the new `value` is turned into a (singleton)
                    d[key].append([value])

                # if what is there is a (plain) list of scalars (and so is `value`)...
                else:
            
                    d[key].append(value)
        
        # if what is already there is NOT a list...
        else:

            # ...we make one, but...

            # ...if the new element is a list...
            if isinstance(value, list):

                # ...whatever scalar was there is turned into a (single-element) list inside the new list
                d[key] = [[d[key]]]

            else:

                # whatever was there becomes the 1st element in a new list
                d[key] = [d[key]]

            # the `value` is finally added
            d[key].append(value)
    
    # if there is nothing for the given `key`...
    else:

        # if `value` is a list AND of scalars...
        if isinstance(value, list) and not isinstance(value[0], list):

            # in order to play it safe, the list is assumed to be just one element in a sequence
            d[key] = [value]

        # if the `value` is not a list OR it IS a list of lists...
        else:

            d[key] = value

    # if the *final* value is a list...
    if isinstance(d[key], list):

        # "double lists" ([[]]) are turned into simple lists
        d[key] = [e[0] if (isinstance(e, list) and (len(e) == 1) and isinstance(e[0], list)) else e  for e in d[key]]

            

A sample dictionary

In [95]:
sample_dict = {}

A value for *foo* is added

In [96]:
set_or_append(sample_dict, 'foo', 1)
sample_dict

{'foo': 1}

Another value for the *same* key is added

In [97]:
set_or_append(sample_dict, 'foo', 2)
sample_dict

{'foo': [1, 2]}

A different key is added

In [98]:
set_or_append(sample_dict, 'blah', 3)
sample_dict

{'foo': [1, 2], 'blah': 3}

A **recursive** function to parse a node of the *XML* tree

In [99]:
#| export
def entry_to_dict(
    entry: etree.Element, # XML entry
    recursive: bool = True # If `True`, children of `entry` are also parsed
    ) -> dict:
    "Parse an XML entry into a Python dictionary"

    res = {}
    
    # for every "child" of `entry` ...
    for e in entry:
        
        # ...the *tag* are extracted
        _, tag = split_namespace_tag(e.tag)
        
        # for the sake of readability
        value = e.text
        print(tag, value)
            
        # if `value` is "something" and not an empty string after striping it of blank characters...
        if value and (value.strip() != ''):
            
            # if the text contains a number...
            if value.isnumeric():
                
                # ...it is turned into a `float`
                value = float(value)
                
                # if the latter is actually an integer...
                if value.is_integer():
                    
                    # ...conversion is performed
                    value = int(value)
            
            # assert tag not in res, f'multiple values for {tag}'
            
            # the value of this element (whether the original text or the obtained number) is stored
            set_or_append(res, tag, value)
        else:
            if tag == "link" and ('href' in e.attrib):
                # Update to take the link field
                value = e.attrib['href']
                set_or_append(res, tag, value)
                
        if tag == 'ID' and ('schemeName' in e.attrib):

            # print(f'Yep...{e.attrib.keys()}')

            res[tag + 'schemeName'] = e.attrib['schemeName']

        # print(tag)
        
        # if in "recursive mode" and this element has children (`len(e)` is different from 0)...
        if recursive and len(e):
            
            # recursion
            sub_res = entry_to_dict(e)
            
            for k, v in sub_res.items():
                
                # the name of the new "key" is assembled from those of the parent and the child
                key_name = f'{tag}{sproc.structure.nested_tags_separator}{k}'
                
                set_or_append(res, key_name, v)
    
    return res

In [100]:
element_series = entry_to_dict(element)
element_series.keys()

id https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8799346
link None
summary Id licitación: C. 2-2021; Órgano de Contratación: Ajuntament de Sant Ramon; Importe: 135553.26; Estado: ADJUDICADA
title L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí
updated 2022-01-03T01:11:41.826+01:00
ContractFolderStatus 
            
ContractFolderID C. 2-2021
ContractFolderStatusCode ADJ
LocatedContractingParty 
                
BuyerProfileURIID https://contractaciopublica.gencat.cat/ecofin_pscp/AppJava/cap.pscp?reqCode=viewDetail&idCap=2763318
Party 
                    
PartyName 
                        
Name Ajuntament de Sant Ramon
ParentLocatedParty 
                    
PartyName 
                        
Name Entitats municipals de Catalunya
ParentLocatedParty None
ProcurementProje

dict_keys(['id', 'link', 'summary', 'title', 'updated', 'ContractFolderStatus - ContractFolderID', 'ContractFolderStatus - ContractFolderStatusCode', 'ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID', 'ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name', 'ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name', 'ContractFolderStatus - ProcurementProject - Name', 'ContractFolderStatus - ProcurementProject - TypeCode', 'ContractFolderStatus - ProcurementProject - BudgetAmount - EstimatedOverallContractAmount', 'ContractFolderStatus - ProcurementProject - BudgetAmount - TaxExclusiveAmount', 'ContractFolderStatus - ProcurementProject - RequiredCommodityClassification - ItemClassificationCode', 'ContractFolderStatus - ProcurementProject - RealizedLocation - CountrySubentityCode', 'ContractFolderStatus - ProcurementProject - PlannedPeriod - DurationMeasure', 'ContractFolderStatus - TenderResult - ResultCode', 'Contract

For field `ContractFolderStatus - TenderResult - WinningParty - PartyIdentification - ID`, and only for it,  the *attribute* `schemeName` is also extracted.

In [101]:
element_series['ContractFolderStatus - TenderResult - WinningParty - PartyIdentification - IDschemeName']

'NIF'

An example of *multi-valued* field

In [102]:
element_series[sproc.structure.assemble_name(
    ['ContractFolderStatus', 'ValidNoticeInfo', 'AdditionalPublicationStatus', 'AdditionalPublicationDocumentReference', 'IssueDate']
)]

[['2021-11-30', '2022-01-03']]

## Data structures

### Series

A function that just wraps the result of `entry_to_dict` into a `pd.Series`

In [103]:
#| export
def entry_to_series(
    entry: etree.Element # Input
    ) -> pd.Series: # Output
    "Turns an XML element into a Pandas' series"

    return pd.Series(entry_to_dict(entry))

Only the first 8 fields are printed (enough to show *nested* elements)

In [104]:
element_series = entry_to_series(element)
element_series[:8]

id https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8799346
link None
summary Id licitación: C. 2-2021; Órgano de Contratación: Ajuntament de Sant Ramon; Importe: 135553.26; Estado: ADJUDICADA
title L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí
updated 2022-01-03T01:11:41.826+01:00
ContractFolderStatus 
            
ContractFolderID C. 2-2021
ContractFolderStatusCode ADJ
LocatedContractingParty 
                
BuyerProfileURIID https://contractaciopublica.gencat.cat/ecofin_pscp/AppJava/cap.pscp?reqCode=viewDetail&idCap=2763318
Party 
                    
PartyName 
                        
Name Ajuntament de Sant Ramon
ParentLocatedParty 
                    
PartyName 
                        
Name Entitats municipals de Catalunya
ParentLocatedParty None
ProcurementProje

id                                                                    https://contrataciondelestado.es/sindicacion/P...
link                                                                  https://contractaciopublica.gencat.cat/ecofin_...
summary                                                               Id licitación: C. 2-2021; Órgano de Contrataci...
title                                                                 L'objecte del contracte és la renovació de tot...
updated                                                                                   2022-01-03T01:11:41.826+01:00
ContractFolderStatus - ContractFolderID                                                                       C. 2-2021
ContractFolderStatus - ContractFolderStatusCode                                                                     ADJ
ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID    https://contractaciopublica.gencat.cat/ecofin_...
dtype: object

### DataFrame

We can concatenate together the `pd.Series` for the different *entries* into a `pd.DataFrame`

In [105]:
df = pd.concat([entry_to_series(e) for e in entries[:4]], axis=1).T
df

id https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8799346
link None
summary Id licitación: C. 2-2021; Órgano de Contratación: Ajuntament de Sant Ramon; Importe: 135553.26; Estado: ADJUDICADA
title L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí
updated 2022-01-03T01:11:41.826+01:00
ContractFolderStatus 
            
ContractFolderID C. 2-2021
ContractFolderStatusCode ADJ
LocatedContractingParty 
                
BuyerProfileURIID https://contractaciopublica.gencat.cat/ecofin_pscp/AppJava/cap.pscp?reqCode=viewDetail&idCap=2763318
Party 
                    
PartyName 
                        
Name Ajuntament de Sant Ramon
ParentLocatedParty 
                    
PartyName 
                        
Name Entitats municipals de Catalunya
ParentLocatedParty None
ProcurementProje

Unnamed: 0,id,link,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,...,ContractFolderStatus - TenderResult - WinningParty - PartyName - Name,ContractFolderStatus - TenderResult - AwardedTenderedProject - LegalMonetaryTotal - TaxExclusiveAmount,ContractFolderStatus - TenderingProcess - ProcedureCode,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndDate,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndTime,ContractFolderStatus - ValidNoticeInfo - NoticeTypeCode,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - PublicationMediaName,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - AdditionalPublicationDocumentReference - IssueDate,ContractFolderStatus - LegalDocumentReference - ID,ContractFolderStatus - LegalDocumentReference - Attachment - ExternalReference - URI
0,https://contrataciondelestado.es/sindicacion/P...,https://contractaciopublica.gencat.cat/ecofin_...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03T01:11:41.826+01:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,...,"AERONAVAL DE CONSTRUCCIONES I INSTALACIONES , ...",90078.51,9,2021-12-17,14:00:00,"[[DOC_CN, DOC_CAN_ADJ]]","[[Perfil del contratante, Perfil del contratan...","[[2021-11-30, 2022-01-03]]",,
1,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 8128_3/2021; Órgano de Contrata...,Obras de restauración hidromorfológica del río...,2022-01-03T01:00:11.194+01:00,8128_3/2021,PUB,,Pleno del Ayuntamiento,AYUNTAMIENTO DE MONREAL,...,,,1,2022-01-22,23:30:00,DOC_CN,Perfil del contratante,2022-01-03,,
2,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 1000_0005-CP01-2021-000063; Órg...,Contrato del servicio de realización de labore...,2022-01-03T01:00:10.399+01:00,1000_0005-CP01-2021-000063,EV,,El Director General de Comunicación y Relacion...,"Departamento de Presidencia, Igualdad, Función...",...,,,1,,,DOC_CN,"[[DOUE, Perfil del contratante]]","[[2021-12-01, 2022-01-03]]",,
3,https://contrataciondelestado.es/sindicacion/P...,https://contractaciopublica.gencat.cat/ecofin_...,Id licitación: 1379/2020 4738; Órgano de Contr...,Obres de renovació de l'enllumenat públic a la...,2022-01-03T00:11:40.740+01:00,1379/2020 4738,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Canet de Mar,Entitats municipals de Catalunya,...,,,9,2022-01-02,23:59:00,DOC_CN,Perfil del contratante,2021-12-13,Plec Clausules.pdf,https://contractaciopublica.gencat.cat/ecofin_...


The types of the columns (every type is `object` since there are missing values everywhere)

In [106]:
df.dtypes

id                                                                                                                           object
link                                                                                                                         object
summary                                                                                                                      object
title                                                                                                                        object
updated                                                                                                                      object
ContractFolderStatus - ContractFolderID                                                                                      object
ContractFolderStatus - ContractFolderStatusCode                                                                              object
ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID          

The function below follows the above strategy to turn an *XML* file into a *Pandas* `pd.DataFrame`

In [107]:
#| export
def to_df(
    input_file: str | pathlib.Path # XML file
) -> pd.DataFrame: # Data in tabular format
    "Reads and parses an XML file into a `pd.DataFrame`"
    
    tree = etree.parse(input_file)
    root = tree.getroot()
    entries = get_entries(root)

    # if the input file was empty....
    if not entries:

        print(f'no entries were found in {input_file.name}')

        return pd.DataFrame()

    # if the the input file was NOT empty
    else:
    
        return pd.concat([entry_to_series(e) for e in entries], axis=1).T

In [108]:
df = to_df(xml_file)
df

id https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8799346
link None
summary Id licitación: C. 2-2021; Órgano de Contratación: Ajuntament de Sant Ramon; Importe: 135553.26; Estado: ADJUDICADA
title L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí
updated 2022-01-03T01:11:41.826+01:00
ContractFolderStatus 
            
ContractFolderID C. 2-2021
ContractFolderStatusCode ADJ
LocatedContractingParty 
                
BuyerProfileURIID https://contractaciopublica.gencat.cat/ecofin_pscp/AppJava/cap.pscp?reqCode=viewDetail&idCap=2763318
Party 
                    
PartyName 
                        
Name Ajuntament de Sant Ramon
ParentLocatedParty 
                    
PartyName 
                        
Name Entitats municipals de Catalunya
ParentLocatedParty None
ProcurementProje

Unnamed: 0,id,link,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,...,ContractFolderStatus - TechnicalDocumentReference - ID,ContractFolderStatus - TechnicalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - ProcurementProject - PlannedPeriod - StartDate,ContractFolderStatus - ProcurementProject - PlannedPeriod - EndDate,ContractFolderStatus - LocatedContractingParty - Party - PartyIdentification - ID,ContractFolderStatus - LocatedContractingParty - Party - PartyIdentification - IDschemeName,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndDate,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndTime,ContractFolderStatus - TenderResult - AwardedTenderedProject - ProcurementProjectLotID
0,https://contrataciondelestado.es/sindicacion/P...,https://contractaciopublica.gencat.cat/ecofin_...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03T01:11:41.826+01:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,...,,,,,,,,,,
1,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 8128_3/2021; Órgano de Contrata...,Obras de restauración hidromorfológica del río...,2022-01-03T01:00:11.194+01:00,8128_3/2021,PUB,,Pleno del Ayuntamiento,AYUNTAMIENTO DE MONREAL,...,,,,,,,,,,
2,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 1000_0005-CP01-2021-000063; Órg...,Contrato del servicio de realización de labore...,2022-01-03T01:00:10.399+01:00,1000_0005-CP01-2021-000063,EV,,El Director General de Comunicación y Relacion...,"Departamento de Presidencia, Igualdad, Función...",...,,,,,,,,,,
3,https://contrataciondelestado.es/sindicacion/P...,https://contractaciopublica.gencat.cat/ecofin_...,Id licitación: 1379/2020 4738; Órgano de Contr...,Obres de renovació de l'enllumenat públic a la...,2022-01-03T00:11:40.740+01:00,1379/2020 4738,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Canet de Mar,Entitats municipals de Catalunya,...,,,,,,,,,,
4,https://contrataciondelestado.es/sindicacion/P...,https://contractaciopublica.gencat.cat/ecofin_...,Id licitación: 2021-44; Órgano de Contratación...,Subministre i la instal·lació fotovoltaica en ...,2022-01-03T00:11:40.696+01:00,2021-44,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Valls,Entitats municipals de Catalunya,...,Enllac plec clausules tecniques.doc,https://contractaciopublica.gencat.cat/ecofin_...,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 1005_391-2021; Órgano de Contra...,Apoyo a la gestión del patrimonio filmográfico...,2021-12-31T01:00:14.946+01:00,1005_391-2021,PUB,,Dirección General de Cultura-Institución Prínc...,"Departamento de Cultura, Deporte y Juventud",...,,,,,,,,,,
113,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 8165_3/2021; Órgano de Contrata...,Asistencia técnica para la prestación del serv...,2021-12-31T01:00:14.393+01:00,8165_3/2021,EV,,Mancomunidad de Servicios Sociales de Base de ...,MANCOMUNIDAD DE SERVICIOS DE HUARTE Y DE ESTER...,...,,,,,,,,,,
114,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 8113_3/2021; Órgano de Contrata...,"Contrato de servicios de desinfección, desinse...",2021-12-31T01:00:13.594+01:00,8113_3/2021,EV,,Subdirector de Gestión y Recursos,Agencia Navarra para la Dependencia,...,,,2022-01-01,2022-12-31,,,,,,
115,https://contrataciondelestado.es/sindicacion/P...,https://hacienda.navarra.es/sicpportal/mtoAnun...,Id licitación: 8113_01 2021; Órgano de Contrat...,Contrato del Servicio de Teleasistencia para l...,2021-12-31T01:00:12.604+01:00,8113_01 2021,EV,,Agencia Navarra de Autonomía y Desarrollo de l...,Agencia Navarra para la Dependencia,...,,,,,,,,,,


A convenience function combining `to_df` and `post_process`

In [109]:
#| export
def to_curated_df(
    input_file: str | pathlib.Path # Input file
    ) -> pd.DataFrame: # A Pandas DataFrame with XML data
    "Reads, parses and tidies up an XML file into a `pd.DataFrame`"

    raw_df = to_df(input_file)

    # if the input file was empty....
    if raw_df.empty:

        return raw_df
    
    # if the the input file was NOT empty
    else:
    
        return sproc.postprocess.typecast_columns(raw_df)

In [110]:
#tidy_df = to_curated_df(xml_file)
#tidy_df.head()

Some (post-)processing took place

In [112]:
#tidy_df.dtypes[:5]

## Assorted

A function to find the depth (inside the *XML*) of every column. Columns associated with *leafs* (0 depth) are not reported.

In [113]:
#| export
def columns_depth(
    df: pd.DataFrame # Input
    ) -> pd.Series: # Depths
    "Returns the depth, inside the original XML, of every column"

    n_nestings = df.columns.str.extractall(f'(\\S{sproc.structure.nested_tags_separator}\\S)')
    n_nestings.index.names = ['column', 'match']
    
    return n_nestings[0].groupby('column').size()

In [115]:
#tidy_df_columns_depth = columns_depth(tidy_df)
#tidy_df_columns_depth.head()

## Robustness

A (sample) file in that directory

In [116]:
xml_file_single = directory / 'PlataformasAgregadasSinMenores_20220104_030016_1_single.atom'
assert xml_file_single.exists()
xml_file_single

PosixPath('/export/usuarios_ml4ds/cggamella/sproc/samples/PlataformasAgregadasSinMenores_20220104_030016_1_single.atom')

In [117]:
to_df(xml_file_single)

id https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8799346
link None
summary Id licitación: C. 2-2021; Órgano de Contratación: Ajuntament de Sant Ramon; Importe: 135553.26; Estado: ADJUDICADA
title L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí
updated 2022-01-03T01:11:41.826+01:00
ContractFolderStatus 
            
ContractFolderID C. 2-2021
ContractFolderStatusCode ADJ
LocatedContractingParty 
                
BuyerProfileURIID https://contractaciopublica.gencat.cat/ecofin_pscp/AppJava/cap.pscp?reqCode=viewDetail&idCap=2763318
Party 
                    
PartyName 
                        
Name Ajuntament de Sant Ramon
ParentLocatedParty 
                    
PartyName 
                        
Name Entitats municipals de Catalunya
ParentLocatedParty None
ProcurementProje

Unnamed: 0,id,link,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,...,ContractFolderStatus - TenderResult - WinningParty - PartyIdentification - ID,ContractFolderStatus - TenderResult - WinningParty - PartyIdentification - IDschemeName,ContractFolderStatus - TenderResult - WinningParty - PartyName - Name,ContractFolderStatus - TenderResult - AwardedTenderedProject - LegalMonetaryTotal - TaxExclusiveAmount,ContractFolderStatus - TenderingProcess - ProcedureCode,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndDate,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndTime,ContractFolderStatus - ValidNoticeInfo - NoticeTypeCode,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - PublicationMediaName,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - AdditionalPublicationDocumentReference - IssueDate
0,https://contrataciondelestado.es/sindicacion/P...,https://contractaciopublica.gencat.cat/ecofin_...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03T01:11:41.826+01:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,...,"[[A28526275, A28526275 II]]","[[NIF, NIF]]",[[AERONAVAL DE CONSTRUCCIONES I INSTALACIONES ...,90078.51,9,2021-12-17,14:00:00,"[[DOC_CN, DOC_CAN_ADJ]]","[[Perfil del contratante, Perfil del contratan...","[[2021-11-30, 2022-01-03]]"


In [None]:
#| hide
from nbdev.doclinks import nbdev_export

In [None]:
#| hide
nbdev_export('10_xml.ipynb')