In [None]:
import pathlib
import shutil

import pandas as pd
import yaml

import dlsproc.extend
import dlsproc.hier

Configure pandas so that *all* the columns of a `pd.DataFrame` are shown:

In [None]:
# Lift the column cap so that wide frames are displayed without truncation.
pd.options.display.max_columns = None

# Directories

The directory containing downloaded data:

In [None]:
# Downloaded data is expected in ../data/agregados, relative to the current
# working directory; stop right away if it is not there.
data_directory = pathlib.Path.cwd().parent.joinpath('data', 'agregados')
assert data_directory.exists()
print(data_directory)

/home/manu/dlsproc/data/agregados


A `list` with all the *zip* files that are to be processed

In [None]:
# Pick up the 2022 archives. Sorting by path puts these YYYYMM-suffixed names
# in chronological order.
zip_pattern = 'PlataformasAgregadasSinMenores_2022*.zip'
new_zip_files = sorted(data_directory.glob(zip_pattern))
new_zip_files

[PosixPath('/home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202201.zip'),
 PosixPath('/home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202202.zip'),
 PosixPath('/home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202203.zip'),
 PosixPath('/home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202204.zip'),
 PosixPath('/home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202205.zip'),
 PosixPath('/home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202206.zip')]

The final output file

In [None]:
# The result is written next to the downloaded data.
output_file = data_directory.joinpath('agregados_sin_menores_multiindex.parquet')
output_file

PosixPath('/home/manu/dlsproc/data/agregados/agregados_sin_menores_multiindex.parquet')

The historical file (in *parquet* format)

In [None]:
# Alternative location (disabled):
# history_file = pathlib.Path.cwd().parent / 'make' / '2018-2021.parquet'
history_file = data_directory.joinpath('2018-2021.parquet')
# Fail early if the historical data is missing.
assert history_file.exists()
print(f'{history_file=}')

history_file=PosixPath('/home/manu/dlsproc/data/agregados/2018-2021.parquet')


# Files

The output file starts out as a copy of the *historical file*:

In [None]:
# Seed the output with the historical data; the historical file itself is
# left untouched by the copy.
shutil.copy(src=history_file, dst=output_file)

PosixPath('/home/manu/dlsproc/data/agregados/agregados_sin_menores_multiindex.parquet')

In [None]:
# Extend the output file with each new zip archive in turn.
# `output_file` is passed as both the first and the third argument
# (presumably the parquet to read and the parquet to write; confirm against
# `dlsproc.extend.parquet_with_zip`).
for zip_file in new_zip_files:
    print(f'appending {zip_file}')
    dlsproc.extend.parquet_with_zip(output_file, zip_file, output_file)

appending /home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202201.zip
appending /home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202202.zip
appending /home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202203.zip
appending /home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202204.zip
appending /home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202205.zip
appending /home/manu/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202206.zip


In [None]:
# Show the contents of the directory that holds the output file.
%ls {output_file.parent}

2018-2021.parquet
2018-2021.pickle
agregados_sin_menores_multiindex.parquet
[0m[01;31mPlataformasAgregadasSinMenores_2018.zip[0m
[01;31mPlataformasAgregadasSinMenores_2019.zip[0m
[01;31mPlataformasAgregadasSinMenores_2020.zip[0m
[01;31mPlataformasAgregadasSinMenores_2021.zip[0m
[01;31mPlataformasAgregadasSinMenores_202201.zip[0m
[01;31mPlataformasAgregadasSinMenores_202202.zip[0m
[01;31mPlataformasAgregadasSinMenores_202203.zip[0m
[01;31mPlataformasAgregadasSinMenores_202204.zip[0m
[01;31mPlataformasAgregadasSinMenores_202205.zip[0m
[01;31mPlataformasAgregadasSinMenores_202206.zip[0m
[01;34mtmp[0m/


In [None]:
# Read the extended file back and report its (rows, columns) shape.
res_df = pd.read_parquet(path=output_file)
res_df.shape

(207088, 43)

A mapping between columns and *human-readable* fields can be found [here](***REMOVED***). That mapping was processed in `naming.ipynb` to produce the file below.

In [None]:
# data_scheme_file = pathlib.Path.cwd() / 'samples' / 'PLACE.yaml'
# assert data_scheme_file.exists()

It provides (as a `dict`) a mapping from *human-readable* names to (possibly nested) fields in *Atom* files.

In [None]:
# with open(data_scheme_file) as yaml_data:
#     data_scheme = yaml.load(yaml_data, Loader=yaml.FullLoader)

In [None]:
# NOTE(review): `df` is not defined in the visible cells (the loaded frame is
# `res_df`), and `data_scheme` is only assigned in commented-out code above;
# fix both before re-enabling this call.
# dlsproc.hier.flatten_columns_names(df, data_scheme)