In [1]:
#| default_exp core

# core

> Main functionality.

The main functionality is provided by the functions below. Those named `cli_*` are just thin *wrappers* that parse command-line arguments and delegate the actual work.

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import sys
import argparse
import pathlib
import datetime

import yaml
import pandas as pd
from tqdm import tqdm

import sproc.extend
import sproc.hier
import sproc.assemble
import sproc.bundle
import sproc.postprocess
import sproc.structure
import sproc.download
import sproc.parse

Directory where the zip files are stored

In [4]:
# sample files are looked up in the `samples` directory that sits next to the current working directory
directory = pathlib.Path.cwd().parent / 'samples'
# fail early if the samples are not where they are expected
assert directory.exists()
directory

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples')

## Processing a single zip file

In [5]:
#| export
def cli_process_zip(args: list = None) -> None:
    """Command-line entry point: distill the data in a single zip file into a parquet file.

    `args` is the list of command-line arguments (`zip_file`, `output_file`); when it is
    `None`, `argparse` reads them from `sys.argv`. A missing `zip_file` is reported through
    `parser.error`, and an `AssertionError` is raised if `output_file` is not a `.parquet` file.
    """

    parser = argparse.ArgumentParser(description='Process zip file')

    # a plain string (rather than `argparse.FileType`) is requested so that no file handle is opened
    # (in text mode, on a binary file) and then left unclosed: only the *name* of the file is needed
    parser.add_argument('zip_file', help='zip file')
    parser.add_argument('output_file', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    # a missing input file is still rejected up front, with the usual `argparse` usage message
    if not pathlib.Path(command_line_arguments.zip_file).is_file():
        parser.error(f"can't open '{command_line_arguments.zip_file}': not an existing file")

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    # the data itself and the records of deleted entries are read from the zip file...
    data_df, deleted_series = sproc.assemble.distilled_data_from_zip(command_line_arguments.zip_file)

    # ...merged together...
    res = sproc.assemble.merge_deleted(data_df, deleted_series)

    # ...and rearranged for saving in parquet format
    res = sproc.assemble.parquet_amenable(res)

    res.to_parquet(output_file)

In [6]:
# one of the yearly zip files among the samples is used for testing
zip_file = directory /'yearly' / 'PlataformasAgregadasSinMenores_2018.zip'
assert zip_file.exists()
print(f'{zip_file=}')

zip_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2018.zip')


In [7]:
# the result is to be written into the samples directory
output_file = directory / 'year_2018.parquet'
print(f'{output_file=}')

output_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/year_2018.parquet')


In [8]:
# arguments are passed as a list of strings, in the order the parser declares them (input zip, output parquet)
args = [zip_file.as_posix(), output_file.as_posix()]
cli_process_zip(args)

In [9]:
# listing the samples directory to check that the output file is there
%ls {directory}

2018-2021_20samples.parquet
extended_sample.parquet
[0m[01;34mgencat[0m/
merged.parquet
PLACE.yaml
PlataformasAgregadasSinMenores_20220104_030016_1.atom
PlataformasAgregadasSinMenores_20220104_030016_1_single.atom
PlataformasAgregadasSinMenores_202201_05-06.zip
PlataformasAgregadasSinMenores_202201_08-11.zip
PlataformasAgregadasSinMenores_202201_28-29.zip
README.md
renamed_cols_extended_sample.parquet
year_2018.parquet
[01;34myearly[0m/


In [10]:
# a peek at the first rows of the file just written
pd.read_parquet(output_file).head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,...,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess,Unnamed: 23_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Party,Party,Name,TypeCode,...,ID,Attachment,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod,Unnamed: 23_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,PartyIdentification,PartyName,Unnamed: 11_level_3,Unnamed: 12_level_3,...,Unnamed: 14_level_3,ExternalReference,Unnamed: 16_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 22_level_3,Unnamed: 23_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,ID,Name,Unnamed: 11_level_4,Unnamed: 12_level_4,...,Unnamed: 14_level_4,URI,Unnamed: 16_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,...,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,PartyName,ParentLocatedParty,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,...,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Name,PartyName,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,...,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Name,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7
zip,file name,entry,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT


## Extending historical data

A function to extend an existing *parquet* file with new data in a *zip* file.

In [11]:
#| export
def cli_extend_parquet_with_zip(args: list = None) -> None:
    """Command-line entry point: extend an existing parquet file with the data in a zip file.

    `args` is the list of command-line arguments (`history_file`, `zip_file`, `output_file`);
    when it is `None`, `argparse` reads them from `sys.argv`. Missing input files are reported
    through `parser.error`, and an `AssertionError` is raised if `output_file` is not a
    `.parquet` file.
    """

    parser = argparse.ArgumentParser(description='Extend existing parquet file with data from a given zip')

    # plain strings (rather than `argparse.FileType`) are requested so that no file handle is opened
    # (in text mode, on binary files) and then left unclosed: only the *names* of the files are needed
    parser.add_argument('history_file', help='Parquet file')
    parser.add_argument('zip_file', help='Zip file')
    parser.add_argument('output_file', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    history_file = pathlib.Path(command_line_arguments.history_file)
    zip_file = pathlib.Path(command_line_arguments.zip_file)

    # missing input files are still rejected up front, with the usual `argparse` usage message
    for input_file in (history_file, zip_file):
        if not input_file.is_file():
            parser.error(f"can't open '{input_file}': not an existing file")

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    sproc.extend.parquet_with_zip(history_file, zip_file, output_file)

Testing with some sample files

In [12]:
# the existing (historical) parquet file that is to be extended
history_file = directory /'2018-2021_20samples.parquet'
assert history_file.exists()
print(f'{history_file=}')

history_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/2018-2021_20samples.parquet')


In [13]:
# the zip file whose data is to be added on top of the historical file
new_zip_file = directory / 'PlataformasAgregadasSinMenores_202201_28-29.zip'
assert new_zip_file.exists()
print(f'{new_zip_file=}')

new_zip_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_202201_28-29.zip')


In [14]:
# NOTE: `output_file` is re-bound here (it previously pointed at `year_2018.parquet`)
output_file = directory / 'extended_sample.parquet'
output_file

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/extended_sample.parquet')

In [15]:
# arguments in the order the parser declares them: historical parquet, new zip, output parquet
args = [history_file.as_posix(), new_zip_file.as_posix(), output_file.as_posix()]
cli_extend_parquet_with_zip(args)

In [16]:
# listing the samples directory to check that the extended file is there
%ls {directory}

2018-2021_20samples.parquet
extended_sample.parquet
[0m[01;34mgencat[0m/
merged.parquet
PLACE.yaml
PlataformasAgregadasSinMenores_20220104_030016_1.atom
PlataformasAgregadasSinMenores_20220104_030016_1_single.atom
PlataformasAgregadasSinMenores_202201_05-06.zip
PlataformasAgregadasSinMenores_202201_08-11.zip
PlataformasAgregadasSinMenores_202201_28-29.zip
README.md
renamed_cols_extended_sample.parquet
year_2018.parquet
[01;34myearly[0m/


## Renaming columns

In [17]:
#| export
def cli_rename_columns(args: list = None) -> None:
    """Command-line entry point: flatten the hierarchical column names of a parquet file.

    `args` is the list of command-line arguments (`hierarchical_file`, `mapping_file`,
    `output_file`); when it is `None`, `argparse` reads them from `sys.argv`. Missing input
    files are reported through `parser.error`, and an `AssertionError` is raised if any of
    the files does not have the expected extension.
    """

    parser = argparse.ArgumentParser(description='Rename columns')

    # plain strings (rather than `argparse.FileType`) are requested so that no file handle is opened
    # and then left unclosed: only the *names* of the files are needed at this point
    parser.add_argument('hierarchical_file', help='(Hierarchical) Parquet file')
    parser.add_argument('mapping_file', help='YAML file mapping hierarchical colum names to plain ones')
    parser.add_argument('output_file', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    hierarchical_file = pathlib.Path(command_line_arguments.hierarchical_file)
    mapping_file = pathlib.Path(command_line_arguments.mapping_file)

    # missing input files are still rejected up front, with the usual `argparse` usage message
    for input_file in (hierarchical_file, mapping_file):
        if not input_file.is_file():
            parser.error(f"can't open '{input_file}': not an existing file")

    assert hierarchical_file.suffix == '.parquet', 'a (hierarchical) .parquet file was expected'

    # both usual YAML extensions are accepted, regardless of the case
    assert mapping_file.suffix.lower() in ('.yaml', '.yml'), 'a YAML file was expected'

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    with mapping_file.open() as yaml_data:
        # NOTE(review): `FullLoader` can build more than plain data types; if the mapping file may come
        # from an untrusted source, `yaml.safe_load` should be considered instead
        data_scheme = yaml.load(yaml_data, Loader=yaml.FullLoader)

    df = pd.read_parquet(hierarchical_file)

    # the mapping read above is handed over, along with the data, to get the renamed columns
    renamed_cols_df = sproc.hier.flatten_columns_names(df, data_scheme)

    renamed_cols_df.to_parquet(output_file)

In [18]:
# where the file with renamed (flattened) columns is to be written
renamed_cols_output_file = directory / 'renamed_cols_extended_sample.parquet'
renamed_cols_output_file

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/renamed_cols_extended_sample.parquet')

In [19]:
# the YAML file that is passed as the `mapping_file` argument below
mapping_file = directory / 'PLACE.yaml'
assert mapping_file.exists()
mapping_file

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PLACE.yaml')

In [20]:
# the input here is whatever `output_file` is currently bound to (`extended_sample.parquet`, set in an earlier cell)
args = [output_file.as_posix(), mapping_file.as_posix(), renamed_cols_output_file.as_posix()]
cli_rename_columns(args)

In [21]:
# a peek at the first rows of the file with renamed columns
pd.read_parquet(renamed_cols_output_file).head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,updated,Número de Expediente,Estado,ID,Nombre,Objeto del Contrato,Tipo de Contrato,...,Pliego de cláusulas administrativas (URI),Pliego de Prescripciones técnicas,Pliego de Prescripciones técnicas (URI),ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,Presentación de Solicitudes (Fecha),Presentación de Solicitudes (Hora),Presentación de Oferta,deleted_on,URL perfil de contratante
zip,file name,entry,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,...,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT,
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,...,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT,


## Reading a bunch of zip files

The function below, `read_zips`, receives a `list` of *zip* files and returns a (column-hierarchical) `pd.DataFrame` encompassing all the data.

In [22]:
#| export
def read_zips(
    files: list[str | pathlib.Path] # Input files
    ) -> pd.DataFrame: # Procurement data
    "Build a `DataFrame` out of a bunch of zip files"
    
    # at the beginning it is guaranteed that every file is present
    for f in files:
        
        # in case `str` (rather than `Pathlib`s) were passed
        f = pathlib.Path(f)
        
        assert f.exists(), f'{f} doesn\'t exist'
    
    # accumulators for the data itself (contracts) and records of deleted entries
    res_df = None
    res_deleted_series = None

    for f in tqdm(files, desc='Assembling files'):
    # for f in files:

        # print(f'Processing "{f}"')
        tqdm.write(f'Processing "{f}"')

        # data is read from the above *zip* file, and `concatenate`d into a single `pd.DataFrame`...
        df = sproc.bundle.read_zip(f, concatenate=True)

        # ...which is re-structured with multiindexed columns
        df = sproc.hier.flat_df_to_multiindexed_df(df)

        # every ATOM inside the zip file also contains information (at the beginning) about deleted entries
        deleted_series = sproc.bundle.read_deleted_zip(f)

        # if this is NOT the first iteration...
        if res_df is not None:

            # ...the new data is stacked
            res_df = sproc.assemble.stack(res_df, df)
            res_deleted_series = pd.concat((res_deleted_series, deleted_series), axis=0)

        # ...if this is the first iteration
        else:

            # ...the new data is set as the accumulated result
            res_df = df
            res_deleted_series = deleted_series
            
    # some contracts show up more than once, and only the last update is to be kept
    res_last_update_only_df = sproc.postprocess.keep_updates_only(res_df)

    # a new *deleted* `pd.Series` is built by dropping duplicates (again, only the last one is kept)
    deduplicated_deleted_series = sproc.postprocess.deduplicate_deleted_series(res_deleted_series)

    # the *deleted* series is used to flag the appropriate entries in the "main" `pd.DataFrame`;
    # the result is "stateful" in the sense that we know the state of each entry (deleted -and, if so, when- or not)
    stateful_df = sproc.assemble.merge_deleted(res_last_update_only_df, deduplicated_deleted_series)
    
    # the number of filled-in rows for column `deleted_on` should match the number of `id`s in `deduplicated_deleted_series` that show up in `stateful_df`
    assert stateful_df['deleted_on'].notna().sum() == len(set(stateful_df['id']) & set(deduplicated_deleted_series.index.get_level_values(2)))
            
    return stateful_df

Let us pick a couple of files for testing

In [23]:
# two yearly files, given first by name and then turned into full paths inside `yearly`
zip_files = ['PlataformasAgregadasSinMenores_2018.zip', 'PlataformasAgregadasSinMenores_2019.zip']
zip_files = [directory/ 'yearly' / e for e in zip_files]
zip_files

[Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2018.zip'),
 Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2019.zip')]

In [24]:
# both files are assembled into a single `pd.DataFrame`, whose first rows are shown
df = read_zips(zip_files)
df.head()

Assembling files:   0%|          | 0/2 [00:00<?, ?it/s]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2018.zip"


Assembling files:  50%|█████     | 1/2 [00:01<00:01,  1.03s/it]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2019.zip"


Assembling files: 100%|██████████| 2/2 [00:02<00:00,  1.00s/it]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,...,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess,Unnamed: 23_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Party,Party,Name,TypeCode,...,ID,Attachment,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod,Unnamed: 23_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,PartyIdentification,PartyName,Unnamed: 11_level_3,Unnamed: 12_level_3,...,Unnamed: 14_level_3,ExternalReference,Unnamed: 16_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 22_level_3,Unnamed: 23_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,ID,Name,Unnamed: 11_level_4,Unnamed: 12_level_4,...,Unnamed: 14_level_4,URI,Unnamed: 16_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,...,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,PartyName,ParentLocatedParty,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,...,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Name,PartyName,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,...,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Name,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7
zip,file name,entry,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,451,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1281/17, Entidad: Diputación Provi...",Refuerzo de firme en la VP 4013 Melgar de Arri...,2018-01-02 08:02:51.744000+00:00,1281/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de firme en la VP 4013 Melgar de Arri...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,450,https://contrataciondelestado.es/sindicacion/P...,Id licitación: VI/17/04-015; Órgano de Contrat...,Obras de edificación en el barrio de Pumarabul...,2018-01-02 08:02:56.115000+00:00,VI/17/04-015,EV,,Consejería de Servicios y Derechos Sociales,"Edificación de 36 VPP, garaje y trasteros en e...",3.0,...,Pliego_Clausulas_Administrativas_VI-17-04-015.pdf,http://www.asturias.es/Proveedores/FICHEROS/ES...,,,,,,,2017-12-11 14:00:00+00:00,NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,449,https://contrataciondelestado.es/sindicacion/P...,"Id Licitación: PcPG/2017/194222, Órgano de Con...",Suministro de gas natural canalizado y gas nat...,2018-01-02 09:10:49.572000+00:00,PcPG/2017/194222,ADJ,A12017369,"Consellería de Economía, Emprego e Industria",Suministro de gas natural canalizado y gas nat...,1.0,...,,,,,,,,,2017-09-29 23:59:00+00:00,NaT


### CLI

A companion function to allow using the above from the command-line.

In [25]:
#| export
def cli_read_zips(args: list = None) -> None:
    """Command-line entry point: assemble a bunch of zip files into a single parquet file.

    `args` is the list of command-line arguments (one or more `input_files`, and optionally
    `-o`/`--output_file`); when it is `None`, `argparse` reads them from `sys.argv`. Missing
    input files are reported through `parser.error`, and an `AssertionError` is raised if the
    output file is not a `.parquet` file.
    """

    parser = argparse.ArgumentParser(description='Process a bunch of zip files')

    # plain strings (rather than `argparse.FileType`) are requested: otherwise *every* input file
    # is opened at once (in text mode) and never closed, when only the names are needed
    parser.add_argument('input_files', nargs='+', help='zip files')
    parser.add_argument('-o', '--output_file', default='out.parquet', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    # missing input files are still rejected up front, with the usual `argparse` usage message
    for input_file in command_line_arguments.input_files:
        if not pathlib.Path(input_file).is_file():
            parser.error(f"can't open '{input_file}': not an existing file")

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    # the `pd.DataFrame` is built...
    df = read_zips(command_line_arguments.input_files)

    # ...rearranged for saving in parquet format
    parquet_df = sproc.assemble.parquet_amenable(df)

    # the message is shown *before* writing, so that it reflects what is going on
    print(f'writing {output_file}...')

    parquet_df.to_parquet(output_file)

In [26]:
# the output goes to `o.parquet`, a relative path (i.e., it ends up in the current working directory)
cli_read_zips([e.as_posix() for e in zip_files] + '-o o.parquet'.split())

Assembling files:   0%|          | 0/2 [00:00<?, ?it/s]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2018.zip"


Assembling files:  50%|█████     | 1/2 [00:01<00:01,  1.10s/it]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2019.zip"


Assembling files: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]


writing o.parquet...


## Updates

In [27]:
def update(
    kind: str, # One of 'outsiders', 'insiders', or 'minors'
    output_directory: str | pathlib.Path # The path where hosting
    ):

    # `kind` should be one of the pre-set types
    assert kind in sproc.structure.tables

    # just in case
    output_directory = pathlib.Path(output_directory)

    # the name of the output file is determined by `kind`, and it's a parquet file
    output_file = pathlib.Path(output_directory / kind).with_suffix('.parquet')

    # if a there is a previous file...
    if output_file.exists():

        # the latter is read
        df = pd.read_parquet(output_file)

        # date strings are extracted from the "zip" index (level 0)...
        date_strs = df.index.get_level_values(0).drop_duplicates().str.extract('.*_([0-9]*).zip')[0].astype('str')

        # ...and parsed
        date_strs = date_strs.apply(sproc.parse.year_and_maybe_month)

        # the date from which to download new data is taken to be the maximum
        from_date = date_strs.max()

        # print(from_date)

        # print(sproc.download.make_urls(**sproc.structure.tables[kind], from_date=from_date))

        # required files are downloaded
        downloaded_files = sproc.download.from_date(kind, date=from_date, output_directory=output_directory)

        if not downloaded_files:

            print('file is up-to-date')

            return

        # in the beginning, the file to be updated represents the whole history
        history_df = df

        # every file that has been downloaded...
        for f in downloaded_files:

            # ...is used to extend the past
            history_df = sproc.extend.df_with_zip(history_df, f)

    # if a there is NOT a previous file...
    else:

        # print('not existing...')

        # agreed upon
        from_date = datetime.datetime(2017, 12, 1)

        # print(sproc.download.make_urls(**sproc.structure.tables[kind], from_date=from_date))

        # downloading
        downloaded_files = sproc.download.from_date(kind, date=from_date, output_directory=output_directory)

        # assembling
        history_df = read_zips(downloaded_files)

    # tidy up the `DataFrame` so that it can be saved in a parquet file
    parquet_df = sproc.assemble.parquet_amenable(history_df)
    
    # parquet_df.to_parquet(output_file.with_stem('new'))
    parquet_df.to_parquet(output_file)

Let us make a new directory...

In [28]:
# one directory per kind of data: the one to be used is picked by (un)commenting
# output_directory = pathlib.Path.cwd().parent / 'data' / 'agregados'
output_directory = pathlib.Path.cwd().parent / 'data' / 'plataforma'
# output_directory = pathlib.Path.cwd().parent / 'data' / 'menores'

# `parents=True` so that this also works when the intermediate `data` directory doesn't exist yet
output_directory.mkdir(parents=True, exist_ok=True)
print(output_directory)

/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/data/plataforma


In [29]:
# NOTE: this can be slow (assembling the four yearly files took about 46 minutes in the run recorded below)
# update('outsiders', output_directory)
update('insiders', output_directory)
# update('minors', output_directory)

Downloading raw data: 100%|██████████| 4/4 [00:00<00:00, 294.41it/s]


"licitacionesPerfilesContratanteCompleto3_2019.zip" already exists
"licitacionesPerfilesContratanteCompleto3_2020.zip" already exists
"licitacionesPerfilesContratanteCompleto3_2021.zip" already exists
"licitacionesPerfilesContratanteCompleto3_2022.zip" already exists


Assembling files:   0%|          | 0/4 [00:00<?, ?it/s]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/data/plataforma/licitacionesPerfilesContratanteCompleto3_2019.zip"


Assembling files:  25%|██▌       | 1/4 [07:58<23:55, 478.60s/it]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/data/plataforma/licitacionesPerfilesContratanteCompleto3_2020.zip"


Assembling files:  50%|█████     | 2/4 [16:45<16:54, 507.13s/it]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/data/plataforma/licitacionesPerfilesContratanteCompleto3_2021.zip"


Assembling files:  75%|███████▌  | 3/4 [28:32<09:58, 598.36s/it]

Processing "/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/data/plataforma/licitacionesPerfilesContratanteCompleto3_2022.zip"


Assembling files: 100%|██████████| 4/4 [46:18<00:00, 694.56s/it]


In [30]:
# %connect_info

In [31]:
#| hide
from nbdev.doclinks import nbdev_export

In [32]:
#| hide
nbdev_export('00_core.ipynb')