In [None]:
#| default_exp core

# core

> Main functionality.

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import sys
import argparse
import pathlib
import datetime

import yaml
import pandas as pd
from tqdm import tqdm

import sproc.extend
import sproc.hier
import sproc.assemble
import sproc.bundle
import sproc.postprocess
import sproc.structure
import sproc.download
import sproc.parse

Directory where the zip files are stored

In [None]:
# sample files live in the `samples` directory, a sibling of the current working directory
directory = pathlib.Path.cwd().parent.joinpath('samples')
assert directory.exists()
directory

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples')

## Processing a single zip file

A function to read a single *zip* file from the command line. One could also achieve the same result with `cli_read_zips`, though this is slightly more efficient.

In [None]:
#| export
def cli_read_single_zip(
    args: list = None # Command-line arguments (those of the running program if `None`)
    ) -> None:
    """Parses command-line arguments to read a single zip file exploiting the functionality of `sproc.assemble`

    The data distilled from the zip file is merged with its records of deleted entries and written to the
    requested parquet file. A usage error (`SystemExit`) is triggered if the zip file cannot be found, and an
    `AssertionError` is raised if the output file does not carry a *.parquet* extension.
    """

    parser = argparse.ArgumentParser(description='Process zip file')

    # only the *name* of the input is needed: a plain string is requested (rather than `argparse.FileType`)
    # so that no file handle is opened and left behind unclosed
    parser.add_argument('zip_file', help='zip file')
    parser.add_argument('output_file', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    zip_file = command_line_arguments.zip_file

    # a missing input is reported the way `argparse` reports any other bad argument (exit status 2)
    if not pathlib.Path(zip_file).is_file():
        parser.error(f"can't open '{zip_file}': no such file")

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    # contracts and records of deleted entries are read from the zip file...
    data_df, deleted_series = sproc.assemble.distilled_data_from_zip(zip_file)

    # ...combined, and rearranged for saving in parquet format
    res = sproc.assemble.merge_deleted(data_df, deleted_series)
    res = sproc.assemble.parquet_amenable(res)

    res.to_parquet(output_file)

In [None]:
zip_file = directory /'yearly' / 'PlataformasAgregadasSinMenores_2018.zip'
assert zip_file.exists()
print(f'{zip_file=}')

zip_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/yearly/PlataformasAgregadasSinMenores_2018.zip')


In [None]:
output_file = directory / 'year_2018.parquet'
print(f'{output_file=}')

output_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/year_2018.parquet')


In [None]:
args = [zip_file.as_posix(), output_file.as_posix()]
cli_read_single_zip(args)

In [None]:
%ls {directory}

2018-2021_20samples.parquet
extended_sample.parquet
extended_sample_renamed.parquet
[0m[01;34mgencat[0m/
insiders_sample.parquet
merged.parquet
minors_sample.parquet
PLACE.yaml
PlataformasAgregadasSinMenores_20220104_030016_1.atom
PlataformasAgregadasSinMenores_20220104_030016_1_single.atom
[01;31mPlataformasAgregadasSinMenores_202201_05-06.zip[0m
[01;31mPlataformasAgregadasSinMenores_202201_08-11.zip[0m
[01;31mPlataformasAgregadasSinMenores_202201_28-29.zip[0m
README.md
renamed_cols_extended_sample.parquet
year_2018.parquet
[01;34myearly[0m/


In [None]:
pd.read_parquet(output_file).head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,updated,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,ContractFolderID,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,...,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess,Unnamed: 21_level_1,ContractFolderStatusCode,Unnamed: 23_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Party,Party,Name,TypeCode,BudgetAmount,BudgetAmount,...,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,PartyIdentification,PartyName,Unnamed: 9_level_3,Unnamed: 10_level_3,EstimatedOverallContractAmount,TaxExclusiveAmount,...,Unnamed: 14_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,ID,Name,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,...,Unnamed: 14_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,...,Unnamed: 14_level_5,Unnamed: 15_level_5,PartyName,ParentLocatedParty,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,...,Unnamed: 14_level_6,Unnamed: 15_level_6,Name,PartyName,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,...,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Name,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7
zip,file name,entry,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,1284/17,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,...,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,[2018-01-02 08:01:52.024000+00:00],[RES],NaT
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,1282/17,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,...,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,[2018-01-02 08:02:24.833000+00:00],[RES],NaT


## Extending historical data

A function to extend an existing *parquet* file with new data in a *zip* file.

In [None]:
#| export
def cli_extend_parquet_with_zip(
    args: list = None # Command-line arguments (those of the running program if `None`)
    ) -> None:
    """Parses command-line arguments to be passed to `sproc.extend.parquet_with_zip`

    A usage error (`SystemExit`) is triggered if either input file cannot be found, and an `AssertionError`
    is raised if the output file does not carry a *.parquet* extension.
    """

    parser = argparse.ArgumentParser(description='Extend existing parquet file with data from a given zip')

    # only the *paths* of the inputs are needed: plain strings are requested (rather than `argparse.FileType`)
    # so that no file handles are opened and left behind unclosed
    parser.add_argument('history_file', help='Parquet file')
    parser.add_argument('zip_file', help='Zip file')
    parser.add_argument('output_file', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    history_file = pathlib.Path(command_line_arguments.history_file)
    zip_file = pathlib.Path(command_line_arguments.zip_file)

    # a missing input is reported the way `argparse` reports any other bad argument (exit status 2)
    for input_file in (history_file, zip_file):
        if not input_file.is_file():
            parser.error(f"can't open '{input_file}': no such file")

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    sproc.extend.parquet_with_zip(history_file, zip_file, output_file)

Testing with some sample files

In [None]:
history_file = directory /'2018-2021_20samples.parquet'
assert history_file.exists()
print(f'{history_file=}')

history_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/2018-2021_20samples.parquet')


In [None]:
new_zip_file = directory / 'PlataformasAgregadasSinMenores_202201_28-29.zip'
assert new_zip_file.exists()
print(f'{new_zip_file=}')

new_zip_file=Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PlataformasAgregadasSinMenores_202201_28-29.zip')


In [None]:
output_file = directory / 'extended_sample.parquet'
output_file

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/extended_sample.parquet')

In [None]:
args = [history_file.as_posix(), new_zip_file.as_posix(), output_file.as_posix()]
cli_extend_parquet_with_zip(args)

In [None]:
%ls {directory}

2018-2021_20samples.parquet
extended_sample.parquet
extended_sample_renamed.parquet
[0m[01;34mgencat[0m/
insiders_sample.parquet
merged.parquet
minors_sample.parquet
PLACE.yaml
PlataformasAgregadasSinMenores_20220104_030016_1.atom
PlataformasAgregadasSinMenores_20220104_030016_1_single.atom
[01;31mPlataformasAgregadasSinMenores_202201_05-06.zip[0m
[01;31mPlataformasAgregadasSinMenores_202201_08-11.zip[0m
[01;31mPlataformasAgregadasSinMenores_202201_28-29.zip[0m
README.md
renamed_cols_extended_sample.parquet
year_2018.parquet
[01;34myearly[0m/


## Renaming columns

A function to *flatten* a hierarchical (column-multiindex) `pd.DataFrame` using a given *naming scheme* or a default one.

In [None]:
#| export
def cli_rename_columns(
    args: list = None # Command-line arguments
    ) -> pathlib.Path: # Output file
    "Parses command-line arguments to be passed to `sproc.hier.flatten_columns_names`"
    
    parser = argparse.ArgumentParser(description='Rename columns')

    parser.add_argument('hierarchical_file', type=argparse.FileType('r'), help='(Hierarchical) Parquet file')
    # parser.add_argument('output_file', help='Output (parquet) file')
    
    parser.add_argument('-l', '--from-local-file', type=argparse.FileType('r'), help='Local file')
    parser.add_argument('-r', '--from-repository-file', help='Repository file')

    command_line_arguments = parser.parse_args(args)
    
    # for the sake of convenience
    hierarchical_file = pathlib.Path(command_line_arguments.hierarchical_file.name)
    
    assert not (command_line_arguments.from_repository_file and command_line_arguments.from_local_file), f'"from-local-file" and "from-repository-file" options are exclusive'
    
    # assert (command_line_arguments.from_repository_file is None) ^ (command_line_arguments.from_local_file is None), 'Either "from-local-file" or "from-repository-file" is expected'
    
    # if a repository file was requested OR no argument was passed...
    if command_line_arguments.from_repository_file or ((command_line_arguments.from_repository_file is None) and (command_line_arguments.from_local_file is None)):
        
        # print('repository...')

        url = 'https://raw.githubusercontent.com/manuvazquez/sproc/main/naming/'

        # if a repository file was requested...
        if command_line_arguments.from_repository_file is not None:
        
            # url = 'https://raw.githubusercontent.com/manuvazquez/sproc/main/naming/' + command_line_arguments.from_repository_file
            url += command_line_arguments.from_repository_file

        # if no argument was passed
        else:

            if hierarchical_file.stem in sproc.structure.tables:

                url += sproc.structure.tables[hierarchical_file.stem]['naming_filename']
                
                # print(url)

            else:

                print(f'The name of the input file is not associated with any naming scheme')
                return
        
        data_scheme = sproc.download.yaml_to_dict(url)
        
    # elif command_line_arguments.from_local_file:
    else:
        
        # print('local...')
        
        with open(command_line_arguments.from_local_file.name) as yaml_data:
            
            data_scheme = yaml.load(yaml_data, Loader=yaml.FullLoader)
        
    # else:

    #     print('nor repository nor local...trying to guess from the file name')

    #     # kind = command_line_arguments.hierarchical_file.stem
        
    # print(data_scheme)
        
    assert hierarchical_file.suffix == '.parquet', 'a (hierarchical) .parquet file was expected'
    
    # name of the output file is derived from that of the input
    output_file = hierarchical_file.with_stem(hierarchical_file.stem + '_renamed')
        
    df = pd.read_parquet(hierarchical_file)
    renamed_cols_df = sproc.hier.flatten_columns_names(df, data_scheme)
    
    # renamed file is written
    renamed_cols_df.to_parquet(output_file)
    
    return output_file

A local file encompassing a name *mapping*...

In [None]:
mapping_file = directory / 'PLACE.yaml'
assert mapping_file.exists()
mapping_file

Path('/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/samples/PLACE.yaml')

...is used to rename the columns

In [None]:
args = [output_file.as_posix(), '--from-local-file', mapping_file.as_posix()]
renamed_cols_output_file = cli_rename_columns(args)
renamed_cols_output_file_df = pd.read_parquet(renamed_cols_output_file).head(2)
renamed_cols_output_file_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,Número de Expediente,Nombre,Objeto del Contrato,Tipo de Contrato,Valor estimado del contrato,Presupuesto base sin impuestos,Clasificación CPV,...,ID,Lote,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,Presentación de Oferta,ContractFolderStatus - LocatedContractingParty - ParentLocatedParty - ParentLocatedParty - ParentLocatedParty - ParentLocatedParty - PartyName - Name,Presentación de Oferta (Observaciones),URL perfil de contratante,deleted_on,updated,Estado
zip,file name,entry,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,1284/17,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,[45233142.0],...,L02000047,[1.0],,2017-11-02 23:59:00+00:00,,,,NaT,[2018-01-02 08:01:52.024000+00:00],[RES]
PlataformasAgregadasSinMenores_2018.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,1282/17,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,[45233142.0],...,L02000047,[1.0],,2017-11-02 23:59:00+00:00,,,,NaT,[2018-01-02 08:02:24.833000+00:00],[RES]


Columns are **not** multiindexed anymore

In [None]:
renamed_cols_output_file_df.columns[:5]

Index(['id', 'summary', 'title', 'Número de Expediente', 'Nombre'], dtype='object')

A **remote** `naming` scheme stored in the repository can also be used

In [None]:
args = [output_file.as_posix(), '--from-repository-file', 'outsiders.yaml']
renamed_cols_output_file = cli_rename_columns(args)
pd.read_parquet(renamed_cols_output_file).head(2)

## Reading a bunch of zip files

`read_zips` receives a `list` of *zip* files and returns a (column-hierarchical) `pd.DataFrame` encompassing all the data.

In [None]:
#| export
def read_zips(
    files: list[str | pathlib.Path] # Input files
    ) -> pd.DataFrame: # Procurement data
    "Build a `DataFrame` out of a bunch of zip files"
    
    # at the beginning it is guaranteed that every file is present
    for f in files:
        
        # in case `str` (rather than `Pathlib`s) were passed
        f = pathlib.Path(f)
        
        assert f.exists(), f'{f} doesn\'t exist'
    
    # accumulators for the data itself (contracts) and records of deleted entries
    res_df = None
    res_deleted_series = None

    for f in tqdm(files, desc='Assembling files'):
    # for f in files:

        # print(f'Processing "{f}"')
        tqdm.write(f'Processing "{f}"')

        # data is read from the above *zip* file, and `concatenate`d into a single `pd.DataFrame`...
        df = sproc.bundle.read_zip(f, concatenate=True)

        # ...which is re-structured with multiindexed columns
        df = sproc.hier.flat_df_to_multiindexed_df(df)

        # every ATOM inside the zip file also contains information (at the beginning) about deleted entries
        deleted_series = sproc.bundle.read_deleted_zip(f)

        # if this is NOT the first iteration...
        if res_df is not None:

            # ...the new data is stacked
            res_df = sproc.assemble.stack(res_df, df)
            res_deleted_series = pd.concat((res_deleted_series, deleted_series), axis=0)

        # ...if this is the first iteration
        else:

            # ...the new data is set as the accumulated result
            res_df = df
            res_deleted_series = deleted_series
            
    # some contracts show up more than once, and only the last update is to be kept
    res_last_update_only_df = sproc.postprocess.keep_updates_only(res_df)

    # a new *deleted* `pd.Series` is built by dropping duplicates (again, only the last one is kept)
    deduplicated_deleted_series = sproc.postprocess.deduplicate_deleted_series(res_deleted_series)

    # the *deleted* series is used to flag the appropriate entries in the "main" `pd.DataFrame`;
    # the result is "stateful" in the sense that we know the state of each entry (deleted -and, if so, when- or not)
    stateful_df = sproc.assemble.merge_deleted(res_last_update_only_df, deduplicated_deleted_series)
    
    # the number of filled-in rows for column `deleted_on` should match the number of `id`s in `deduplicated_deleted_series` that show up in `stateful_df`
    assert stateful_df['deleted_on'].notna().sum() == len(set(stateful_df['id']) & set(deduplicated_deleted_series.index.get_level_values(2)))
            
    return stateful_df

Let us pick a couple of files for testing

In [None]:
# two of the yearly bundles stored in the `yearly` subdirectory of the samples
zip_names = ('PlataformasAgregadasSinMenores_2018.zip', 'PlataformasAgregadasSinMenores_2019.zip')
zip_files = [directory.joinpath('yearly', name) for name in zip_names]
zip_files

In [None]:
df = read_zips(zip_files)
df.head()

### CLI

A companion function to allow using the above from the command-line.

In [None]:
#| export
def cli_read_zips(
    args: list = None # Command-line arguments (those of the running program if `None`)
    ) -> None:
    """Parses command-line arguments to be passed to `read_zips`

    A usage error (`SystemExit`) is triggered if any input file cannot be found, and an `AssertionError` is
    raised if the output file does not carry a *.parquet* extension.
    """

    parser = argparse.ArgumentParser(description='Process a bunch of zip files')

    # only the *names* of the inputs are needed: plain strings are requested (rather than `argparse.FileType`)
    # so that the files are not all opened at once and then left behind unclosed
    parser.add_argument('input_files', nargs='+', help='zip files')
    parser.add_argument('-o', '--output_file', default='out.parquet', help='Output (parquet) file')

    command_line_arguments = parser.parse_args(args)

    input_files = command_line_arguments.input_files

    # missing inputs are reported the way `argparse` reports any other bad argument (exit status 2)
    missing_files = [f for f in input_files if not pathlib.Path(f).is_file()]
    if missing_files:
        parser.error(f"can't open {', '.join(repr(f) for f in missing_files)}: no such file")

    output_file = pathlib.Path(command_line_arguments.output_file)
    assert output_file.suffix == '.parquet', 'a .parquet file was expected'

    # the `pd.DataFrame` is built...
    df = read_zips(input_files)

    # ...rearranged for saving in parquet format
    parquet_df = sproc.assemble.parquet_amenable(df)

    # the message is shown *before* writing, so that it actually reports what is going on
    print(f'writing {output_file}...')

    parquet_df.to_parquet(output_file)

In [None]:
# the sample files picked above are assembled into `o.parquet`
cli_args = [e.as_posix() for e in zip_files]
cli_args.extend('-o o.parquet'.split())
cli_read_zips(cli_args)

## Downloading new data

Core function to download new data and update existing local structures.

In [None]:
#| export
def dl(
    kind: str, # One of 'outsiders', 'insiders', or 'minors'
    output_directory: str | pathlib.Path # The path where data is to be stored
    ) -> None:
    """Download data or update local one

    If `output_directory` already holds a parquet file named after `kind`, the latter is extended using the
    files that `sproc.download.from_date` yields for the most recent date found in its *zip* index;
    otherwise, a new file is assembled from the data available since December 2017.

    An `AssertionError` is raised if `kind` is unknown or `output_directory` does not exist, and a
    `RuntimeError` if a new file is to be made but nothing was downloaded.
    """

    # `kind` should be one of the pre-set types
    assert kind in sproc.structure.tables

    # just in case
    output_directory = pathlib.Path(output_directory)

    # the output directory is expected to exist
    assert output_directory.exists()

    # the name of the output file is determined by `kind`, and it's a parquet file
    output_file = (output_directory / kind).with_suffix('.parquet')

    # if there is a previous file...
    if output_file.exists():

        print(f'found previous "{output_file}": extending it...')

        # the latter is read
        df = pd.read_parquet(output_file)

        # date strings are extracted from the "zip" index (level 0)...;
        # the dot is escaped so that only a literal ".zip" matches, and at least one digit is required
        date_strs = df.index.get_level_values(0).drop_duplicates().str.extract(r'.*_([0-9]+)\.zip')[0].astype('str')

        # ...and parsed
        date_strs = date_strs.apply(sproc.parse.year_and_maybe_month)

        # the date from which to download new data is taken to be the maximum
        from_date = date_strs.max()

        # required files are downloaded
        downloaded_files = sproc.download.from_date(kind, date=from_date, output_directory=output_directory)

        if not downloaded_files:

            print('file is up-to-date')

            return

        # in the beginning, the file to be updated represents the whole history
        history_df = df

        # every file that has been downloaded...
        for f in tqdm(downloaded_files, desc='Updating'):

            tqdm.write(f'Appending "{f.name}"')

            # ...is used to extend the past
            history_df = sproc.extend.df_with_zip(history_df, f)

    # if there is NOT a previous file...
    else:

        # agreed upon
        from_date = datetime.datetime(2017, 12, 1)

        print(f'no previous "{output_file}" was found: making one using data since {from_date.date()}...')

        # downloading
        downloaded_files = sproc.download.from_date(kind, date=from_date, output_directory=output_directory)

        # without any file there is nothing to assemble: fail here with a meaningful message
        if not downloaded_files:

            raise RuntimeError(f'no "{kind}" data was downloaded into "{output_directory}"')

        # assembling
        history_df = read_zips(downloaded_files)

    # tidy up the `DataFrame` so that it can be saved in a parquet file
    parquet_df = sproc.assemble.parquet_amenable(history_df)

    parquet_df.to_parquet(output_file)

### CLI

A companion function to allow using the above from the command-line.

In [None]:
#| export
def cli_dl(
    args: list = None # Command-line arguments (those of the running program if `None`)
    ) -> None:
    """Parses command-line arguments to be passed to `dl`

    The output directory (the current one by default) is created, along with any missing parent, if it
    doesn't exist.
    """

    parser = argparse.ArgumentParser(description='Update (or make) local data')

    parser.add_argument('kind', choices=sproc.structure.tables.keys())
    parser.add_argument('-o', '--output_directory', help='Output directory', default=pathlib.Path.cwd(), type=pathlib.Path)

    command_line_arguments = parser.parse_args(args)

    # directory is made if it doesn't exist (`parents=True` so that nested paths whose ancestors are also missing work)
    command_line_arguments.output_directory.mkdir(parents=True, exist_ok=True)

    dl(command_line_arguments.kind, command_line_arguments.output_directory)

In [None]:
# output_directory = pathlib.Path.cwd().parent / 'data' / 'plataforma'
# args = ['outsiders', '-o', output_directory.as_posix()]
# cli_dl(args)

In [None]:
#| hide
from nbdev.doclinks import nbdev_export

In [None]:
#| hide
nbdev_export('00_core.ipynb')