In [None]:
#| default_exp xml

# xml

> Parse XML files

In [None]:
#| export
import pathlib
import re
import datetime
from collections.abc import Iterable

import numpy as np
import pandas as pd
from lxml import etree

import sproc.structure
import sproc.postprocess

## Sample data

Directory where the data (*XML* files) are stored

In [None]:
directory = pathlib.Path.cwd().parent / 'samples'
assert directory.exists()
directory

PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples')

A (sample) file in that directory

In [None]:
xml_file = directory / 'PlataformasAgregadasSinMenores_20220104_030016_1.atom'
assert xml_file.exists()
xml_file

PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_20220104_030016_1.atom')

In [None]:
!head {xml_file} --lines=20

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:cbc-place-ext="urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonBasicComponents-2" xmlns:cac-place-ext="urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonAggregateComponents-2" xmlns:cbc="urn:dgpe:names:draft:codice:schema:xsd:CommonBasicComponents-2" xmlns:cac="urn:dgpe:names:draft:codice:schema:xsd:CommonAggregateComponents-2" xmlns:ns1="urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2">
    <author>
        <name>Plataforma de Contratación del Sector Público</name>
        <uri>https://contrataciondelestado.es</uri>
        <email>contrataciondelestado@minhafp.es</email>
    </author>
    <id>https://contrataciondelestado.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores.atom</id>
    <link href="PlataformasAgregadasSinMenores_20220104_030016_1.atom" rel="self"/>
    <link href="PlataformasAgregadasSinMenores.atom" rel="first"/>
    

*Root* element of the *XML* tree

In [None]:
root = etree.parse(xml_file).getroot()

## Convenience functions

A function to extract the *namespace*s declared in an *XML* file

In [None]:
#| export
def get_namespaces(
    input_file: str | pathlib.Path, # XML file
    root_name: str = 'base' # Key under which the default (un-prefixed) namespace is stored
    ) -> dict[str, str]: # Mapping from *prefix* to *namespace*
    "Collect the namespaces visible from the root element of an XML file"

    # prefix -> namespace mapping attached to the root element
    nsmap = etree.parse(input_file).getroot().nsmap

    try:

        # the default namespace (if any) is stored under the `None` key...
        default_namespace = nsmap.pop(None)

    except KeyError:

        # ...and when there is none, the mapping is returned as it is
        return nsmap

    # the default namespace is re-filed under the requested name
    nsmap[root_name] = default_namespace

    return nsmap

In [None]:
get_namespaces(xml_file)

{'cbc-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonBasicComponents-2',
 'cac-place-ext': 'urn:dgpe:names:draft:codice-place-ext:schema:xsd:CommonAggregateComponents-2',
 'cbc': 'urn:dgpe:names:draft:codice:schema:xsd:CommonBasicComponents-2',
 'cac': 'urn:dgpe:names:draft:codice:schema:xsd:CommonAggregateComponents-2',
 'ns1': 'urn:oasis:names:specification:ubl:schema:xsd:CommonExtensionComponents-2',
 'base': 'http://www.w3.org/2005/Atom'}

In order to trim off *namespace*s from a tag:
- a regular expression

In [None]:
#| export
# matches `{namespace}tag`, capturing both parts; a raw string is used so that `\{` and `\}`
# reach the regex engine untouched instead of being (invalid) Python string escapes
re_tag = re.compile(r'\{(.*)\}(.*)')

In [None]:
assert re_tag.match('{blabla}foo').groups() == ('blabla', 'foo')

In [None]:
re_tag.match('{some.namespace}id').groups()

('some.namespace', 'id')

* a convenience function exploiting the latter

In [None]:
#| export
def split_namespace_tag(namespace_tag: str) -> tuple[str, str]:
    "Split a `{namespace}tag` string into a `(namespace, tag)` tuple"

    match = re_tag.match(namespace_tag)

    # a string with no leading `{namespace}` part (an un-namespaced tag) does not match:
    # the whole string is then the tag, and the namespace is reported as empty
    if match is None:

        return '', namespace_tag

    return match.groups()

In [None]:
split_namespace_tag('{some.namespace}id')

('some.namespace', 'id')

In [None]:
split_namespace_tag(root.tag)

('http://www.w3.org/2005/Atom', 'feed')

In [None]:
root.tag

'{http://www.w3.org/2005/Atom}feed'

### Regular entries

A function to get a list of `etree.Element` with all the *entries* (allegedly, *procurement contracts*)

In [None]:
#| export
def get_entries(root: etree.Element) -> list[etree.Element]:
    "Collect the direct children of `root` whose tag (namespace aside) is `entry`"

    return [
        e for e in root
        # iterating over an element also yields comments and processing instructions,
        # whose `tag` is not a string and hence cannot be split
        if isinstance(e.tag, str) and split_namespace_tag(e.tag)[1] == 'entry'
    ]

*Entries* are extracted using the above function (only the 4 first ones are shown)

In [None]:
entries = get_entries(root)
assert len(entries) == 117
entries[:4]

[<Element {http://www.w3.org/2005/Atom}entry>,
 <Element {http://www.w3.org/2005/Atom}entry>,
 <Element {http://www.w3.org/2005/Atom}entry>,
 <Element {http://www.w3.org/2005/Atom}entry>]

In [None]:
element = entries[0]
split_namespace_tag(element.tag)

('http://www.w3.org/2005/Atom', 'entry')

In [None]:
subelement = element[3]
split_namespace_tag(subelement.tag)

('http://www.w3.org/2005/Atom', 'title')

In [None]:
subelement.text

"L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí"

In [None]:
element

<Element {http://www.w3.org/2005/Atom}entry>

Everything is stored as a string in an *XML* file. The approach below is used to handle conversions.

In [None]:
numeric_field = '8'
numeric_field.isnumeric()
float(numeric_field).is_integer()

True

### Deleted entries

It makes sense to treat them separately since the information provided for them is completely different.

In [None]:
#| export
def get_deleted_entries(root: etree.Element) -> list[etree.Element]:
    "Collect the direct children of `root` whose tag (namespace aside) is `deleted-entry`"

    return [
        e for e in root
        # iterating over an element also yields comments and processing instructions,
        # whose `tag` is not a string and hence cannot be split
        if isinstance(e.tag, str) and split_namespace_tag(e.tag)[1] == 'deleted-entry'
    ]

In [None]:
deleted_entries = get_deleted_entries(root)
deleted_entries[:2]

[<Element {http://purl.org/atompub/tombstones/1.0}deleted-entry>,
 <Element {http://purl.org/atompub/tombstones/1.0}deleted-entry>]

In [None]:
deleted_element = deleted_entries[0]
split_namespace_tag(deleted_element.tag)

('http://purl.org/atompub/tombstones/1.0', 'deleted-entry')

These are *all-attributes* elements

In [None]:
deleted_element.attrib

{'ref': 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968384', 'when': '2022-01-03T00:11:41.879+01:00'}

A function to process an XML file into a `pd.Series`

In [None]:
#| export
def deleted_to_series(
    input_file: str | pathlib.Path # XML file
) -> pd.Series: # A Pandas Series with XML data
    "Reads and parses 'deleted' entries in an XML file."
    
    tree = etree.parse(input_file)
    root = tree.getroot()
    
    ids = []
    dates = []

    # every deleted entry carries its data in attributes
    for e in get_deleted_entries(root):

        # `ref` is used as the index of the resulting series...
        ids.append(e.attrib['ref'])

        # ...and `when` as its value, parsed into a timezone-aware (UTC) timestamp
        dates.append(pd.to_datetime(e.attrib['when'], utc=True))
        
    name = 'deleted_on'
        
    if not ids:
        
        # an empty series needs an explicit dtype; it is timezone-aware (UTC), like the
        # timestamps built above, so that empty and non-empty results are consistent
        return pd.Series([], dtype='datetime64[ns, UTC]', name=name)
    
    else:

        return pd.Series(data=dates, index=ids, name=name)

In [None]:
deleted_series = deleted_to_series(xml_file)
deleted_series

https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968384   2022-01-02 23:11:41.879000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/6802801   2022-01-02 23:11:41.837000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8231582   2022-01-02 23:11:41.790000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968385   2022-01-02 23:11:41.750000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1969766   2022-01-02 23:11:41.698000+00:00
                                                                                                    ...               
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1958181   2021-12-30 23:13:12.646000+00:00
https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1962159   2021-12-30 23:13:12.596000+00:00
https://contrataciondelestado.es/sindicacion/Pla

Are there duplicates?

In [None]:
len(deleted_series) > len(deleted_series.unique())

False

In [None]:
#| hide
# offending_xml = directory / 'deleted_offending.atom'
# offending_xml

In [None]:
#| hide
# offending_deleted_series = deleted_to_series(offending_xml)
# offending_deleted_series

## Parsing

A function to handle *multi-valued* fields

In [None]:
#| export
def set_or_append(
    d: dict, # Input (to be modified)
    key: str, # Entry of the dictionary to be created/extended
    value: str | dict # Value to be set/added
    ) -> None:
    "Set or append a new element to the dictionary storing the data in a single entry"

    # if there is already something in the given `key`...
    if key in d:

        # if what is already there is a list...
        if type(d[key]) == list:

            # ...and so is the `value` to be added...
            if isinstance(value, list):

                # ...whatever came before and was not a list is turned into (a single-element) one
                d[key] = [e if isinstance(e, list) else [e] for e in d[key]] + [value]

            # if what is going to be added is NOT a list
            else:

                # if what is there is actually a list of lists...
                if isinstance(d[key][0], list):

                    # ...the new `value` is turned into a (singleton)
                    d[key].append([value])

                # if what is there is a (plain) list of scalars (and so is `value`)...
                else:
            
                    d[key].append(value)
        
        # if what is already there is NOT a list...
        else:

            # ...we make one, but...

            # ...if the new element is a list...
            if isinstance(value, list):

                # ...whatever scalar was there is turned into a (single-element) list inside the new list
                d[key] = [[d[key]]]

            else:

                # whatever was there becomes the 1st element in a new list
                d[key] = [d[key]]

            # the `value` is finally added
            d[key].append(value)
    
    # if there is nothing for the given `key`...
    else:

        # if `value` is a list AND of scalars...
        if isinstance(value, list) and not isinstance(value[0], list):

            # in order to play it safe, the list is assumed to be just one element in a sequence
            d[key] = [value]

        # if the `value` is not a list OR it IS a list of lists...
        else:

            d[key] = value

    # if the *final* value is a list...
    if isinstance(d[key], list):

        # "double lists" ([[]]) are turned into simple lists
        d[key] = [e[0] if (isinstance(e, list) and (len(e) == 1) and isinstance(e[0], list)) else e  for e in d[key]]

            

A sample dictionary

In [None]:
sample_dict = {}

A value for *foo* is added

In [None]:
set_or_append(sample_dict, 'foo', 1)
sample_dict

{'foo': 1}

Another value for the *same* key is added

In [None]:
set_or_append(sample_dict, 'foo', 2)
sample_dict

{'foo': [1, 2]}

A different key is added

In [None]:
set_or_append(sample_dict, 'blah', 3)
sample_dict

{'foo': [1, 2], 'blah': 3}

A **recursive** function to parse a node of the *XML* tree

In [None]:
#| export
def entry_to_dict(
    entry: etree.Element, # XML entry
    recursive: bool = True # If `True`, children of `entry` are also parsed
    ) -> dict:
    "Parse an XML entry into a Python dictionary"

    res = {}
    
    # for every "child" of `entry` ...
    for e in entry:

        # comments and processing instructions are also yielded when iterating, but their
        # `tag` is not a string: there is nothing to parse in them
        if not isinstance(e.tag, str):

            continue
        
        # ...the *tag* is extracted (the *namespace* is not needed)
        _, tag = split_namespace_tag(e.tag)
        
        # for the sake of readability
        value = e.text
            
        # if `value` is "something" and not an empty string after stripping it of blank characters...
        if value and (value.strip() != ''):
            
            # if the text is made up of decimal digits only...
            # (`str.isnumeric` would also accept characters such as '½' or '²', which cannot be converted)
            if value.isdecimal():
                
                # ...it is turned into an `int` (directly, since going through `float` loses
                # precision for integers above 2**53)
                value = int(value)
            
            # the value of this element (whether the original text or the obtained number) is stored
            set_or_append(res, tag, value)
        
        # if in "recursive mode" and this element has children (`len(e)` is different from 0)...
        if recursive and len(e):
            
            # recursion
            sub_res = entry_to_dict(e)
            
            for k, v in sub_res.items():
                
                # the name of the new "key" is assembled from those of the parent and the child
                key_name = f'{tag}{sproc.structure.nested_tags_separator}{k}'
                
                set_or_append(res, key_name, v)
    
    return res

In [None]:
element_series = entry_to_dict(element)
element_series

{'id': 'https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/8799346',
 'summary': 'Id licitación: C. 2-2021; Órgano de Contratación: Ajuntament de Sant Ramon; Importe: 135553.26; Estado: ADJUDICADA',
 'title': "L'objecte del contracte és la renovació de totes les llumeneres que formen la il·luminació existent de tots els carrers i vials del casc urbà de la localitat de Sant Ramon i dels nuclis agregats de La Manresana, Portell, Viver i Gospí",
 'updated': '2022-01-03T01:11:41.826+01:00',
 'ContractFolderStatus - ContractFolderID': 'C. 2-2021',
 'ContractFolderStatus - ContractFolderStatusCode': 'ADJ',
 'ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID': 'https://contractaciopublica.gencat.cat/ecofin_pscp/AppJava/cap.pscp?reqCode=viewDetail&idCap=2763318',
 'ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name': 'Ajuntament de Sant Ramon',
 'ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Na

An example of *multi-valued* field

In [None]:
element_series[sproc.structure.assemble_name(
    ['ContractFolderStatus', 'ValidNoticeInfo', 'AdditionalPublicationStatus', 'AdditionalPublicationDocumentReference', 'IssueDate']
)]

[['2021-11-30', '2022-01-03']]

## Data structures

### Series

A function that just wraps the result of `entry_to_dict` into a `pd.Series`

In [None]:
#| export
def entry_to_series(entry: etree.Element) -> pd.Series:
    "Parse an XML entry (by means of `entry_to_dict`) into a `pd.Series`"

    # keys of the dictionary become the index of the series
    parsed_entry = entry_to_dict(entry)

    return pd.Series(parsed_entry)

Only the first 8 fields are printed (enough to show *nested* elements)

In [None]:
element_series = entry_to_series(element)
element_series[:8]

id                                                                           https://contrataciondelestado.es/sindicacion/P...
summary                                                                      Id licitación: C. 2-2021; Órgano de Contrataci...
title                                                                        L'objecte del contracte és la renovació de tot...
updated                                                                                          2022-01-03T01:11:41.826+01:00
ContractFolderStatus - ContractFolderID                                                                              C. 2-2021
ContractFolderStatus - ContractFolderStatusCode                                                                            ADJ
ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID           https://contractaciopublica.gencat.cat/ecofin_...
ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name                             Ajuntamen

### DataFrame

We can concatenate together the `pd.Series` for the different *entries* into a `pd.DataFrame`

In [None]:
df = pd.concat([entry_to_series(e) for e in entries[:4]], axis=1).T
df

Unnamed: 0,id,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - ProcurementProject - Name,...,ContractFolderStatus - TenderResult - WinningParty - PartyName - Name,ContractFolderStatus - TenderResult - AwardedTenderedProject - LegalMonetaryTotal - TaxExclusiveAmount,ContractFolderStatus - TenderingProcess - ProcedureCode,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndDate,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndTime,ContractFolderStatus - ValidNoticeInfo - NoticeTypeCode,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - PublicationMediaName,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - AdditionalPublicationDocumentReference - IssueDate,ContractFolderStatus - LegalDocumentReference - ID,ContractFolderStatus - LegalDocumentReference - Attachment - ExternalReference - URI
0,https://contrataciondelestado.es/sindicacion/P...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03T01:11:41.826+01:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,L'objecte del contracte és la renovació de tot...,...,"AERONAVAL DE CONSTRUCCIONES I INSTALACIONES , ...",90078.51,9,2021-12-17,14:00:00,"[[DOC_CN, DOC_CAN_ADJ]]","[[Perfil del contratante, Perfil del contratan...","[[2021-11-30, 2022-01-03]]",,
1,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 8128_3/2021; Órgano de Contrata...,Obras de restauración hidromorfológica del río...,2022-01-03T01:00:11.194+01:00,8128_3/2021,PUB,,Pleno del Ayuntamiento,AYUNTAMIENTO DE MONREAL,Obras de restauración hidromorfológica del río...,...,,,1,2022-01-22,23:30:00,DOC_CN,Perfil del contratante,2022-01-03,,
2,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1000_0005-CP01-2021-000063; Órg...,Contrato del servicio de realización de labore...,2022-01-03T01:00:10.399+01:00,1000_0005-CP01-2021-000063,EV,,El Director General de Comunicación y Relacion...,"Departamento de Presidencia, Igualdad, Función...",Contrato del servicio de realización de labore...,...,,,1,,,DOC_CN,"[[DOUE, Perfil del contratante]]","[[2021-12-01, 2022-01-03]]",,
3,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1379/2020 4738; Órgano de Contr...,Obres de renovació de l'enllumenat públic a la...,2022-01-03T00:11:40.740+01:00,1379/2020 4738,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Canet de Mar,Entitats municipals de Catalunya,Obres de renovació de l'enllumenat públic a la...,...,,,9,2022-01-02,23:59:00,DOC_CN,Perfil del contratante,2021-12-13,Plec Clausules.pdf,https://contractaciopublica.gencat.cat/ecofin_...


The types of the columns (every type is `object` since there are missing values everywhere)

In [None]:
df.dtypes

id                                                                                                                           object
summary                                                                                                                      object
title                                                                                                                        object
updated                                                                                                                      object
ContractFolderStatus - ContractFolderID                                                                                      object
ContractFolderStatus - ContractFolderStatusCode                                                                              object
ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID                                                           object
ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name   

The function below follows the above strategy to turn an *XML* file into a *Pandas* `pd.DataFrame`

In [None]:
#| export
def to_df(input_file: str | pathlib.Path) -> pd.DataFrame:
    """
    Turns the (regular) entries of an XML file into a `pd.DataFrame`, one row per entry.
    
    **Parameters**
    
    - input_file: str or Path
    
        XML file to be read.
    
    **Returns**
    
    - out: pd.DataFrame
    
        A Pandas DataFrame in which every row holds the data parsed from one entry.
    
    """
    
    root = etree.parse(input_file).getroot()
    
    # one `pd.Series` per entry...
    series_per_entry = [entry_to_series(e) for e in get_entries(root)]
    
    # ...set side by side (as columns)...
    entries_as_columns = pd.concat(series_per_entry, axis=1)
    
    # ...and transposed so that every entry ends up in a row
    return entries_as_columns.T

In [None]:
df = to_df(xml_file)
df

Unnamed: 0,id,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - ProcurementProject - Name,...,ContractFolderStatus - LegalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - TechnicalDocumentReference - ID,ContractFolderStatus - TechnicalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - ProcurementProject - PlannedPeriod - StartDate,ContractFolderStatus - ProcurementProject - PlannedPeriod - EndDate,ContractFolderStatus - LocatedContractingParty - Party - PartyIdentification - ID,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndDate,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndTime,ContractFolderStatus - TenderResult - AwardedTenderedProject - ProcurementProjectLotID
0,https://contrataciondelestado.es/sindicacion/P...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03T01:11:41.826+01:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,L'objecte del contracte és la renovació de tot...,...,,,,,,,,,,
1,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 8128_3/2021; Órgano de Contrata...,Obras de restauración hidromorfológica del río...,2022-01-03T01:00:11.194+01:00,8128_3/2021,PUB,,Pleno del Ayuntamiento,AYUNTAMIENTO DE MONREAL,Obras de restauración hidromorfológica del río...,...,,,,,,,,,,
2,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1000_0005-CP01-2021-000063; Órg...,Contrato del servicio de realización de labore...,2022-01-03T01:00:10.399+01:00,1000_0005-CP01-2021-000063,EV,,El Director General de Comunicación y Relacion...,"Departamento de Presidencia, Igualdad, Función...",Contrato del servicio de realización de labore...,...,,,,,,,,,,
3,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1379/2020 4738; Órgano de Contr...,Obres de renovació de l'enllumenat públic a la...,2022-01-03T00:11:40.740+01:00,1379/2020 4738,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Canet de Mar,Entitats municipals de Catalunya,Obres de renovació de l'enllumenat públic a la...,...,https://contractaciopublica.gencat.cat/ecofin_...,,,,,,,,,
4,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021-44; Órgano de Contratación...,Subministre i la instal·lació fotovoltaica en ...,2022-01-03T00:11:40.696+01:00,2021-44,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Valls,Entitats municipals de Catalunya,Subministre i la instal·lació fotovoltaica en ...,...,https://contractaciopublica.gencat.cat/ecofin_...,Enllac plec clausules tecniques.doc,https://contractaciopublica.gencat.cat/ecofin_...,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1005_391-2021; Órgano de Contra...,Apoyo a la gestión del patrimonio filmográfico...,2021-12-31T01:00:14.946+01:00,1005_391-2021,PUB,,Dirección General de Cultura-Institución Prínc...,"Departamento de Cultura, Deporte y Juventud",Apoyo a la gestión del patrimonio filmográfico...,...,,,,,,,,,,
113,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 8165_3/2021; Órgano de Contrata...,Asistencia técnica para la prestación del serv...,2021-12-31T01:00:14.393+01:00,8165_3/2021,EV,,Mancomunidad de Servicios Sociales de Base de ...,MANCOMUNIDAD DE SERVICIOS DE HUARTE Y DE ESTER...,Asistencia técnica para la prestación del serv...,...,,,,,,,,,,
114,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 8113_3/2021; Órgano de Contrata...,"Contrato de servicios de desinfección, desinse...",2021-12-31T01:00:13.594+01:00,8113_3/2021,EV,,Subdirector de Gestión y Recursos,Agencia Navarra para la Dependencia,"Contrato de servicios de desinfección, desinse...",...,,,,2022-01-01,2022-12-31,,,,,
115,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 8113_01 2021; Órgano de Contrat...,Contrato del Servicio de Teleasistencia para l...,2021-12-31T01:00:12.604+01:00,8113_01 2021,EV,,Agencia Navarra de Autonomía y Desarrollo de l...,Agencia Navarra para la Dependencia,Contrato del Servicio de Teleasistencia para l...,...,,,,,,,,,,


A convenience function combining `to_df` and `sproc.postprocess.typecast_columns`

In [None]:
#| export
def to_curated_df(input_file: str | pathlib.Path) -> pd.DataFrame:
    """
    Parses an XML file with `to_df` and passes the result through `sproc.postprocess.typecast_columns`.
    
    **Parameters**
    
    - input_file: str or Path
    
        XML file to be read.
    
    **Returns**
    
    - out: pd.DataFrame
    
        The Pandas DataFrame returned by `sproc.postprocess.typecast_columns`.
    
    """
    
    # "raw" parsing...
    raw_df = to_df(input_file)
    
    # ...followed by post-processing
    return sproc.postprocess.typecast_columns(raw_df)

In [None]:
tidy_df = to_curated_df(xml_file)
tidy_df.head()

Unnamed: 0,id,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - ProcurementProject - Name,...,ContractFolderStatus - TechnicalDocumentReference - ID,ContractFolderStatus - TechnicalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - ProcurementProject - PlannedPeriod - StartDate,ContractFolderStatus - ProcurementProject - PlannedPeriod - EndDate,ContractFolderStatus - LocatedContractingParty - Party - PartyIdentification - ID,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndDate,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndTime,ContractFolderStatus - TenderResult - AwardedTenderedProject - ProcurementProjectLotID,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod
0,https://contrataciondelestado.es/sindicacion/P...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03 00:11:41.826000+00:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,L'objecte del contracte és la renovació de tot...,...,,,,,,,,,,2021-12-17 14:00:00+00:00
1,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 8128_3/2021; Órgano de Contrata...,Obras de restauración hidromorfológica del río...,2022-01-03 00:00:11.194000+00:00,8128_3/2021,PUB,,Pleno del Ayuntamiento,AYUNTAMIENTO DE MONREAL,Obras de restauración hidromorfológica del río...,...,,,,,,,,,,2022-01-22 23:30:00+00:00
2,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1000_0005-CP01-2021-000063; Órg...,Contrato del servicio de realización de labore...,2022-01-03 00:00:10.399000+00:00,1000_0005-CP01-2021-000063,EV,,El Director General de Comunicación y Relacion...,"Departamento de Presidencia, Igualdad, Función...",Contrato del servicio de realización de labore...,...,,,,,,,,,,NaT
3,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1379/2020 4738; Órgano de Contr...,Obres de renovació de l'enllumenat públic a la...,2022-01-02 23:11:40.740000+00:00,1379/2020 4738,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Canet de Mar,Entitats municipals de Catalunya,Obres de renovació de l'enllumenat públic a la...,...,,,,,,,,,,2022-01-02 23:59:00+00:00
4,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021-44; Órgano de Contratación...,Subministre i la instal·lació fotovoltaica en ...,2022-01-02 23:11:40.696000+00:00,2021-44,EV,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Valls,Entitats municipals de Catalunya,Subministre i la instal·lació fotovoltaica en ...,...,Enllac plec clausules tecniques.doc,https://contractaciopublica.gencat.cat/ecofin_...,,,,,,,,2022-01-02 23:59:00+00:00


Some (post-)processing took place

In [None]:
tidy_df.dtypes[:5]

id                                                      string
summary                                                 string
title                                                   string
updated                                    datetime64[ns, UTC]
ContractFolderStatus - ContractFolderID                 string
dtype: object

## Assorted

A function to find the depth (inside the *XML*) of every column. Columns associated with *leaves* (0 depth) are not reported.

In [None]:
#| export
def columns_depth(df: pd.DataFrame) -> pd.Series:
    "Count, for every column of `df`, the nested-tags separators in its name (columns with none are not reported)"

    # the separator is escaped so that it is always matched literally
    separator = re.escape(sproc.structure.nested_tags_separator)

    # a separator only counts if it has a non-blank character on each side; lookarounds are used
    # so that those characters are not consumed and can also flank the next/previous separator
    n_nestings = df.columns.str.extractall(f'(?<=\\S)({separator})(?=\\S)')

    # one row per match, indexed by (position of the column, number of the match)
    n_nestings.index.names = ['column', 'match']
    
    # number of matches per column
    return n_nestings[0].groupby('column').size()

In [None]:
tidy_df_columns_depth = columns_depth(tidy_df)
tidy_df_columns_depth.head()

column
4    1
5    1
6    2
7    4
8    4
Name: 0, dtype: int64

## Robustness

A (sample) file in that directory

In [None]:
xml_file_single = directory / 'PlataformasAgregadasSinMenores_20220104_030016_1_single.atom'
assert xml_file_single.exists()
xml_file_single

PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_20220104_030016_1_single.atom')

In [None]:
to_df(xml_file_single)

Unnamed: 0,id,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - ProcurementProject - Name,...,ContractFolderStatus - TenderResult - ReceivedTenderQuantity,ContractFolderStatus - TenderResult - WinningParty - PartyIdentification - ID,ContractFolderStatus - TenderResult - WinningParty - PartyName - Name,ContractFolderStatus - TenderResult - AwardedTenderedProject - LegalMonetaryTotal - TaxExclusiveAmount,ContractFolderStatus - TenderingProcess - ProcedureCode,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndDate,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod - EndTime,ContractFolderStatus - ValidNoticeInfo - NoticeTypeCode,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - PublicationMediaName,ContractFolderStatus - ValidNoticeInfo - AdditionalPublicationStatus - AdditionalPublicationDocumentReference - IssueDate
0,https://contrataciondelestado.es/sindicacion/P...,Id licitación: C. 2-2021; Órgano de Contrataci...,L'objecte del contracte és la renovació de tot...,2022-01-03T01:11:41.826+01:00,C. 2-2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Ramon,Entitats municipals de Catalunya,L'objecte del contracte és la renovació de tot...,...,11,"[[A28526275, A28526275 II]]",[[AERONAVAL DE CONSTRUCCIONES I INSTALACIONES ...,90078.51,9,2021-12-17,14:00:00,"[[DOC_CN, DOC_CAN_ADJ]]","[[Perfil del contratante, Perfil del contratan...","[[2021-11-30, 2022-01-03]]"


In [None]:
#| hide
from nbdev.doclinks import nbdev_export

In [None]:
#| hide
nbdev_export('10_xml.ipynb')