In [None]:
import pathlib
import itertools

import pandas as pd
from IPython.display import display
import yaml

import dlsproc.assemble
import dlsproc.core

So that *all* the columns of a `pd.DataFrame` are shown:

In [None]:
# No cap on the number of columns shown when a DataFrame is displayed
pd.options.display.max_columns = None

# Parameters

The directory containing downloaded data:

In [None]:
# The `data` directory is looked up next to (i.e., as a sibling of) the current working directory
data_directory = pathlib.Path.cwd().parent.joinpath('data')
assert data_directory.exists()
print(data_directory)

/home/manu/Sync/git/dlsproc/data


The directory in which the processed data is to be saved:

In [None]:
# Results go into the `agregados` subdirectory of the data directory, which must already exist
output_directory = data_directory.joinpath('agregados')
assert output_directory.exists()
print(output_directory)

/home/manu/Sync/git/dlsproc/data/agregados


The name of the file **without** extension

In [None]:
# Extension-less path shared by every output file; the message reports whether that exact path exists
output_file = output_directory.joinpath('2018-2021')
print(f'{output_file} (existing? {output_file.exists()})')

/home/manu/Sync/git/dlsproc/data/agregados/2018-2021 (existing? False)


From the above *basename*, the paths of the *pickle* and *parquet* output files are derived:

In [None]:
# One output path per format, both derived from the same extension-less path
pickle_file, parquet_file = (
    output_file.with_suffix(extension) for extension in ('.pickle', '.parquet')
)

# Processing

> Licitaciones publicadas en la Plataforma mediante mecanismos de agregación, excluyendo los contratos menores

The directory for *outsiders* data

In [None]:
# Directory holding the downloaded zip files; it must already exist
outsiders_directory = data_directory.joinpath('agregados')
assert outsiders_directory.exists()
print(outsiders_directory)

/home/manu/Sync/git/dlsproc/data/agregados


We infer, from the name of the parent directory, whether we are working on a sample or the full dataset (useful later on)

In [None]:
# Sample data is recognized by its parent directory being named `samples`
working_on_a_sample = outsiders_directory.parent.name == 'samples'
working_on_a_sample

False

The list of files to be processed. They were downloaded from [here](https://www.hacienda.gob.es/es-ES/GobiernoAbierto/Datos%20Abiertos/Paginas/LicitacionesAgregacion.aspx). One could use something like
```
wget https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2018.zip
```

In [None]:
# Full paths of the yearly zip files (2018 to 2021), built in one step so that
# `zip_files` only ever holds paths (never bare file names)
zip_files = [
    outsiders_directory / f'PlataformasAgregadasSinMenores_{year}.zip'
    for year in range(2018, 2022)
]

# Fail early, as done above for the directories, rather than part-way through
# the (lengthy) processing of the files
missing_zip_files = [zip_file for zip_file in zip_files if not zip_file.exists()]
assert not missing_zip_files, f'missing zip files: {missing_zip_files}'

zip_files

[PosixPath('/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2018.zip'),
 PosixPath('/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2019.zip'),
 PosixPath('/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2020.zip'),
 PosixPath('/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2021.zip')]

In [None]:
%%time
df = dlsproc.core.read_zips([outsiders_directory / e for e in zip_files])

Processing "/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2018.zip"
Processing "/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2019.zip"
Processing "/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2020.zip"
Processing "/home/manu/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2021.zip"
CPU times: user 17min 54s, sys: 3.24 s, total: 17min 57s
Wall time: 17min 57s


Saving to *pickle*

In [None]:
# The pickle file is only written when processing the full dataset (not for sample runs)
if not working_on_a_sample:
    df.to_pickle(path=pickle_file)

Saving to *parquet*

In [None]:
# NOTE(review): `parquet_amenable` presumably returns a version of the frame that can be
# written to parquet — confirm in `dlsproc.assemble`
parquet_df = dlsproc.assemble.parquet_amenable(df)
# a peek at the first two rows
parquet_df.iloc[:2]

Unnamed: 0_level_0,Unnamed: 1_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,TenderResult,TenderResult,TenderResult,TenderResult,TenderResult,TenderingProcess,TenderingProcess,TenderingProcess,LegalDocumentReference,LegalDocumentReference,ValidNoticeInfo,ValidNoticeInfo,ValidNoticeInfo,LocatedContractingParty,TenderingProcess,TenderingProcess,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,ProcurementProject,ProcurementProject,LocatedContractingParty,TenderResult,LocatedContractingParty,TenderingProcess,LocatedContractingParty,TenderingProcess,LocatedContractingParty,Unnamed: 44_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Party,Name,TypeCode,BudgetAmount,BudgetAmount,RequiredCommodityClassification,RealizedLocation,PlannedPeriod,ResultCode,ReceivedTenderQuantity,WinningParty,WinningParty,AwardedTenderedProject,ProcedureCode,TenderSubmissionDeadlinePeriod,TenderSubmissionDeadlinePeriod,ID,Attachment,NoticeTypeCode,AdditionalPublicationStatus,AdditionalPublicationStatus,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,ID,Attachment,ParentLocatedParty,PlannedPeriod,PlannedPeriod,Party,AwardedTenderedProject,ParentLocatedParty,TenderSubmissionDeadlinePeriod,ParentLocatedParty,TenderSubmissionDeadlinePeriod,BuyerProfileURIID,Unnamed: 44_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,PartyName,Unnamed: 9_level_3,Unnamed: 10_level_3,EstimatedOverallContractAmount,TaxExclusiveAmount,ItemClassificationCode,CountrySubentityCode,DurationMeasure,Unnamed: 16_level_3,Unnamed: 17_level_3,PartyIdentification,PartyName,LegalMonetaryTotal,Unnamed: 21_level_3,EndDate,EndTime,Unnamed: 24_level_3,ExternalReference,Unnamed: 26_level_3,PublicationMediaName,AdditionalPublicationDocumentReference,PartyName,EndDate,EndTime,Unnamed: 32_level_3,ExternalReference,ParentLocatedParty,StartDate,EndDate,PartyIdentification,ProcurementProjectLotID,ParentLocatedParty,Unnamed: 40_level_3,ParentLocatedParty,Description,Unnamed: 43_level_3,Unnamed: 44_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Name,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,ID,Name,TaxExclusiveAmount,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4,URI,Unnamed: 26_level_4,Unnamed: 27_level_4,IssueDate,Name,Unnamed: 30_level_4,Unnamed: 31_level_4,Unnamed: 32_level_4,URI,PartyName,Unnamed: 35_level_4,Unnamed: 36_level_4,ID,Unnamed: 38_level_4,ParentLocatedParty,Unnamed: 40_level_4,ParentLocatedParty,Unnamed: 42_level_4,Unnamed: 43_level_4,Unnamed: 44_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5,Unnamed: 25_level_5,Unnamed: 26_level_5,Unnamed: 27_level_5,Unnamed: 28_level_5,Unnamed: 29_level_5,Unnamed: 30_level_5,Unnamed: 31_level_5,Unnamed: 32_level_5,Unnamed: 33_level_5,Name,Unnamed: 35_level_5,Unnamed: 36_level_5,Unnamed: 37_level_5,Unnamed: 38_level_5,PartyName,Unnamed: 40_level_5,ParentLocatedParty,Unnamed: 42_level_5,Unnamed: 43_level_5,Unnamed: 44_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6,Unnamed: 24_level_6,Unnamed: 25_level_6,Unnamed: 26_level_6,Unnamed: 27_level_6,Unnamed: 28_level_6,Unnamed: 29_level_6,Unnamed: 30_level_6,Unnamed: 31_level_6,Unnamed: 32_level_6,Unnamed: 33_level_6,Unnamed: 34_level_6,Unnamed: 35_level_6,Unnamed: 36_level_6,Unnamed: 37_level_6,Unnamed: 38_level_6,Name,Unnamed: 40_level_6,PartyName,Unnamed: 42_level_6,Unnamed: 43_level_6,Unnamed: 44_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7,Unnamed: 24_level_7,Unnamed: 25_level_7,Unnamed: 26_level_7,Unnamed: 27_level_7,Unnamed: 28_level_7,Unnamed: 29_level_7,Unnamed: 30_level_7,Unnamed: 31_level_7,Unnamed: 32_level_7,Unnamed: 33_level_7,Unnamed: 34_level_7,Unnamed: 35_level_7,Unnamed: 36_level_7,Unnamed: 37_level_7,Unnamed: 38_level_7,Unnamed: 39_level_7,Unnamed: 40_level_7,Name,Unnamed: 42_level_7,Unnamed: 43_level_7,Unnamed: 44_level_7
file name,entry,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8,Unnamed: 24_level_8,Unnamed: 25_level_8,Unnamed: 26_level_8,Unnamed: 27_level_8,Unnamed: 28_level_8,Unnamed: 29_level_8,Unnamed: 30_level_8,Unnamed: 31_level_8,Unnamed: 32_level_8,Unnamed: 33_level_8,Unnamed: 34_level_8,Unnamed: 35_level_8,Unnamed: 36_level_8,Unnamed: 37_level_8,Unnamed: 38_level_8,Unnamed: 39_level_8,Unnamed: 40_level_8,Unnamed: 41_level_8,Unnamed: 42_level_8,Unnamed: 43_level_8,Unnamed: 44_level_8
PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,[45233142.0],ES418,3.0,[8.0],[14.0],[A47082185],[CONSTRUCCIONES HERMANOS SASTRE S.A.],[60690.08],1.0,2017-11-02,23:59:00,,,[DOC_FORM],[Publicación del anuncio de formalización en u...,[2018-01-02],,2017-11-02,23:59:00,,,,,,L02000047,[1.0],,2017-11-02 23:59:00+00:00,,,,NaT
PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,[45233142.0],ES418,3.0,[8.0],[13.0],[A49012792],[CONTRATAS Y OBRAS SAN GREGORIO S.A.],[118919.49],1.0,2017-11-02,23:59:00,,,[DOC_FORM],[Publicación del anuncio de formalización en u...,[2018-01-02],,2017-11-02,23:59:00,,,,,,L02000047,[1.0],,2017-11-02 23:59:00+00:00,,,,NaT


In [None]:
# The parquet file is only written when processing the full dataset (not for sample runs)
if not working_on_a_sample:
    parquet_df.to_parquet(path=parquet_file)

## Making a sample

If we are working with sample files, a *parquet* sample file with the first and last 10 rows is created

In [None]:
if working_on_a_sample:
    # Positions of the first and last 10 rows, computed explicitly (rather than handing a bare
    # iterator with negative positions to `iloc`) so that frames with fewer than 20 rows neither
    # raise an `IndexError` nor yield duplicated rows
    n_rows = len(parquet_df)
    head_positions = range(min(10, n_rows))
    tail_positions = range(max(n_rows - 10, 0), n_rows)
    sample_positions = sorted(set(head_positions) | set(tail_positions))

    sample_df = parquet_df.iloc[sample_positions]
    # the sample is saved one level above the directory holding the zip files
    sample_df.to_parquet(outsiders_directory.parent / '2018-2021_20samples.parquet')