In [None]:
#| default_exp bundle

# bundle

> Functionality to deal with multiple XML files

In [None]:
#| export
import pathlib
import zipfile

import pandas as pd

import sproc.structure
import sproc.xml

Some handy imports that are not actually required by the library.

In [None]:
from IPython.display import display

Directory where the zip files are stored

In [None]:
# the samples are expected in a `samples` directory, sibling of the current working directory
directory = pathlib.Path.cwd().parent / 'samples'
assert directory.exists()
directory

PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples')

### Zip file

A (sample) file in that directory

In [None]:
# one of the zip files in the samples directory; fail early if it is missing
input_file = directory / 'PlataformasAgregadasSinMenores_202201_05-06.zip'
assert input_file.exists()
input_file

PosixPath('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_202201_05-06.zip')

A function to read **all** the *XML* files contained in a given zip file.

In [None]:
#| export
def read_zip(
    input_file: str | pathlib.Path, # Input file
    concatenate: bool = False, # If `True` all the files are concatenated in a single `pd.DataFrame`
    return_filenames: bool = False # If `True` the names of the files (read) within the zip are returned too
) -> list | pd.DataFrame | tuple[list, list] | tuple[pd.DataFrame, list]: # XML data
    "Reads and parses an XML file into a `pd.DataFrame`"

    # in case a `str` was passed
    input_file = pathlib.Path(input_file)
    
    dfs = []
    
    # zip file is opened
    with zipfile.ZipFile(input_file) as zip_file:
        
        # for the sake of convenience
        filenames = zip_file.namelist()
        
        # every file within it...
        for name in filenames:
            
            # ...is opened...
            with zip_file.open(name) as f:
                
                # ...and processed
                dfs.append(sproc.xml.to_curated_df(f))
    
    if concatenate:
        
        # dfs = pd.concat(dfs, keys=filenames, names=['file name', 'entry'])

        dfs = pd.concat(dfs, keys=pd.MultiIndex.from_product(([input_file.name], filenames)), names=['zip', 'file name', 'entry'])
    
    if return_filenames:
        
        return dfs, filenames
    
    else:
        
        return dfs

In [None]:
# by default a list is returned, with one element per file in the zip
dfs_zip = read_zip(input_file)
print(f'{len(dfs_zip)=}')
# first rows of the first file
dfs_zip[0].head(3)

len(dfs_zip)=2


Unnamed: 0,id,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - ProcurementProject - Name,...,ContractFolderStatus - ProcurementProject - PlannedPeriod - StartDate,ContractFolderStatus - LegalDocumentReference - ID,ContractFolderStatus - LegalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - ProcurementProject - PlannedPeriod - EndDate,ContractFolderStatus - TechnicalDocumentReference - ID,ContractFolderStatus - TechnicalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndDate,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndTime,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod
0,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021039438; Órgano de Contratac...,L'objecte és la contractació del servei de bug...,2022-01-04 12:12:09.464000+00:00,2021039438,RES,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Girona,Entitats municipals de Catalunya,L'objecte és la contractació del servei de bug...,...,,,,,,,,,,2021-12-13 23:59:00+00:00
1,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021_2568; Órgano de Contrataci...,Servei comunicacions postals de l'Ajuntament d...,2022-01-04 12:12:09.400000+00:00,2021_2568,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Collbató,Entitats municipals de Catalunya,Servei comunicacions postals de l'Ajuntament d...,...,,,,,,,,,,2021-11-29 23:59:00+00:00
2,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 12450/2021; Órgano de Contratac...,L'objecte del contracte és l'execució de les o...,2022-01-04 12:12:09.343000+00:00,12450/2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Mont-roig del Camp,Entitats municipals de Catalunya,L'objecte del contracte és l'execució de les o...,...,,,,,,,,,,2021-12-20 14:00:00+00:00


A single `pd.DataFrame` encompassing all the files in the zip can be returned instead: the name of the zip file is used as the top-level index, and the names of the files within it as the second level.

In [None]:
# `concatenate=True` yields a single `pd.DataFrame` indexed by zip name, file name and entry
dfs_concatenated_zip = read_zip(input_file, concatenate=True)
dfs_concatenated_zip

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,updated,ContractFolderStatus - ContractFolderID,ContractFolderStatus - ContractFolderStatusCode,ContractFolderStatus - LocatedContractingParty - BuyerProfileURIID,ContractFolderStatus - LocatedContractingParty - Party - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - ProcurementProject - Name,...,ContractFolderStatus - ProcurementProject - PlannedPeriod - StartDate,ContractFolderStatus - LegalDocumentReference - ID,ContractFolderStatus - LegalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - ProcurementProject - PlannedPeriod - EndDate,ContractFolderStatus - TechnicalDocumentReference - ID,ContractFolderStatus - TechnicalDocumentReference - Attachment - ExternalReference - URI,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndDate,ContractFolderStatus - TenderingProcess - ParticipationRequestReceptionPeriod - EndTime,ContractFolderStatus - TenderingProcess - TenderSubmissionDeadlinePeriod
zip,file name,entry,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220105_030012.atom,0,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021039438; Órgano de Contratac...,L'objecte és la contractació del servei de bug...,2022-01-04 12:12:09.464000+00:00,2021039438,RES,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Girona,Entitats municipals de Catalunya,L'objecte és la contractació del servei de bug...,...,,,,,,,,,,2021-12-13 23:59:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220105_030012.atom,1,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021_2568; Órgano de Contrataci...,Servei comunicacions postals de l'Ajuntament d...,2022-01-04 12:12:09.400000+00:00,2021_2568,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Collbató,Entitats municipals de Catalunya,Servei comunicacions postals de l'Ajuntament d...,...,,,,,,,,,,2021-11-29 23:59:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220105_030012.atom,2,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 12450/2021; Órgano de Contratac...,L'objecte del contracte és l'execució de les o...,2022-01-04 12:12:09.343000+00:00,12450/2021,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Mont-roig del Camp,Entitats municipals de Catalunya,L'objecte del contracte és l'execució de les o...,...,,,,,,,,,,2021-12-20 14:00:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220105_030012.atom,3,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021/14/M0400100; Órgano de Con...,El serveiobjecte del contracte és la difusió p...,2022-01-04 12:12:09.285000+00:00,2021/14/M0400100,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Casa de Cultura de Girona,Entitats municipals de Catalunya,El serveiobjecte del contracte és la difusió p...,...,,,,,,,,,,2021-10-22 14:00:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220105_030012.atom,4,https://contrataciondelestado.es/sindicacion/P...,Id licitación: ICVI-2022-1; Órgano de Contrata...,Servei de neteja de les instal·lacions de l'IN...,2022-01-04 12:12:09.227000+00:00,ICVI-2022-1,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Institut Català de la Vinya i el Vi (INCAVI),Departaments i Sector Públic de la Generalitat,Servei de neteja de les instal·lacions de l'IN...,...,,,,,,,,,,2021-11-30 18:00:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220106_030013.atom,471,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2021/57-12334; Órgano de Contra...,L'objecte d'aquest contracte és la prestació c...,2022-01-04 12:12:09.949000+00:00,2021/57-12334,PUB,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Palau-solità i Plegamans,Entitats municipals de Catalunya,L'objecte d'aquest contracte és la prestació c...,...,,PCAP subminist. 8 cameres videovigilancia 2021...,https://contractaciopublica.gencat.cat/ecofin_...,,PPT_subministrament 8 cameres videovigilancia_...,https://contractaciopublica.gencat.cat/ecofin_...,,,,2022-01-19 15:00:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220106_030013.atom,472,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 32/2021; Órgano de Contratación...,Licitació per procediment obert harmonitzat i ...,2022-01-04 12:12:09.742000+00:00,32/2021,RES,https://contractaciopublica.gencat.cat/ecofin_...,Consell Comarcal del Baix Llobregat,Entitats municipals de Catalunya,Licitació per procediment obert harmonitzat i ...,...,,,,,,,,,,2021-05-07 15:00:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220106_030013.atom,473,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 1977/2021; Órgano de Contrataci...,L'objecte d'aquest contracte és la prestació d...,2022-01-04 12:12:09.666000+00:00,1977/2021,RES,https://contractaciopublica.gencat.cat/ecofin_...,Ajuntament de Sant Adrià de Besòs,Entitats municipals de Catalunya,L'objecte d'aquest contracte és la prestació d...,...,,,,,,,,,,2021-06-30 23:59:00+00:00
PlataformasAgregadasSinMenores_202201_05-06.zip,PlataformasAgregadasSinMenores_20220106_030013.atom,474,https://contrataciondelestado.es/sindicacion/P...,Id licitación: EXI-2022-7; Órgano de Contratac...,"Servei de trasllat de béns mobles( mobiliari, ...",2022-01-04 12:12:09.602000+00:00,EXI-2022-7,ADJ,https://contractaciopublica.gencat.cat/ecofin_...,Departament d'Acció Exterior i Govern Obert,Departaments i Sector Públic de la Generalitat,"Servei de trasllat de béns mobles( mobiliari, ...",...,,,,,,,,,,2021-12-13 13:00:00+00:00


In [None]:
# data types of the first four columns
dfs_concatenated_zip.dtypes[:4]

id                      string
summary                 string
title                   string
updated    datetime64[ns, UTC]
dtype: object

Filenames can also be requested

In [None]:
# with `return_filenames=True` a tuple is returned; only the file names are of interest here
_, filenames = read_zip(input_file, return_filenames=True)
print(filenames)

['PlataformasAgregadasSinMenores_20220105_030012.atom', 'PlataformasAgregadasSinMenores_20220106_030013.atom']


#### Deleted entries

In [None]:
#| export
def read_deleted_zip(
    input_file: str | pathlib.Path # Input file
    ) -> pd.Series: # XML data
    "Reads and parses an XML file into a `pd.DataFrame`"
    
    series = []

    # in case a `str` was passed
    input_file = pathlib.Path(input_file)
    
    # zip file is opened
    with zipfile.ZipFile(input_file) as zip_file:
        
        # for the sake of convenience
        filenames = zip_file.namelist()
        
        # every file within it...
        for name in filenames:
            
            # ...is opened...
            with zip_file.open(name) as f:
                
                # ...and processed
                series.append(sproc.xml.deleted_to_series(f))

    return pd.concat(series, keys=pd.MultiIndex.from_product(([input_file.name], filenames)), names=['zip', 'file name', 'id'])
    # return pd.concat(series, keys=filenames, names=['file name', 'id'])

In [None]:
# deleted entries across all the files in the zip, as a single `pd.Series`
read_deleted_zip(input_file)

zip                                              file name                                            id                                                                                 
PlataformasAgregadasSinMenores_202201_05-06.zip  PlataformasAgregadasSinMenores_20220105_030012.atom  https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/6724977   2022-01-04 00:12:01.376000+00:00
                                                                                                      https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968315   2022-01-03 23:11:57.567000+00:00
                                                                                                      https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1968345   2022-01-03 23:11:57.516000+00:00
                                                                                                      https://contrataciondelestado.es/sindicacion/Plat

## Extra tests

In [None]:
%%script false --no-raise-error

# data_file = pathlib.Path.cwd() / 'data' / 'agregados'
# data_file /= 'PlataformasAgregadasSinMenores_202201.zip'
# data_file /= 'PlataformasAgregadasSinMenores_202202.zip'
# data_file /= 'PlataformasAgregadasSinMenores_202203.zip'


# zip file used in the extra tests, looked up under `data/perfiles_plataforma` in the current working directory
data_file = pathlib.Path.cwd() / 'data' / 'perfiles_plataforma'
data_file /= 'licitacionesPerfilesContratanteCompleto3_202201.zip'

In [None]:
%%script false --no-raise-error

# every file in the zip, concatenated in a single `pd.DataFrame`
df = read_zip(data_file, concatenate=True)
df.shape

In [None]:
%%script false --no-raise-error

# data type of the total budget amount column
df['ContractFolderStatus - ProcurementProject - BudgetAmount - TotalAmount'].dtype

In [None]:
%%script false --no-raise-error

# column to group by, built from its path components
# NOTE(review): presumably `assemble_name` joins them with ' - ', as in the column names shown above -- confirm
grouping_col = sproc.structure.assemble_name(['ContractFolderStatus', 'LocatedContractingParty', 'Party', 'PartyName', 'Name'])
grouping_col

In [None]:
%%script false --no-raise-error

# column with the amounts to aggregate, built from its path components
# NOTE(review): presumably the same column whose dtype is inspected a few cells above -- confirm
amount_col = sproc.structure.assemble_name(['ContractFolderStatus', 'ProcurementProject', 'BudgetAmount', 'TotalAmount'])
amount_col

In [None]:
%%script false --no-raise-error

# for every value of the grouping column: sum of the amounts (`money`) and number of rows (`n`)
df.groupby(grouping_col).agg(money=(amount_col, 'sum'), n=(amount_col, 'size'))

In [None]:
#| hide
from nbdev.doclinks import nbdev_export

In [None]:
#| hide
# NOTE(review): presumably writes the `#| export` cells of this notebook to the module named in `#| default_exp` -- confirm against the nbdev docs
nbdev_export('20_bundle.ipynb')