In [None]:
import pathlib
import itertools

import pandas as pd
from IPython.display import display
import yaml

import dlsproc.assemble
import dlsproc.core

So that *all* the columns of a `pd.DataFrame` are shown:

In [None]:
# No cap on the number of columns rendered when a DataFrame is displayed.
pd.options.display.max_columns = None

# Parameters

The directory containing downloaded data:

In [None]:
# The data lives in a `data` directory next to the current working directory.
data_directory = pathlib.Path.cwd().parent.joinpath('data')
assert data_directory.exists()
print(data_directory)

/export/usuarios01/mvazquez/Sync/git/dlsproc/data


Directory in which the data is to be saved

In [None]:
# Results go into the `agregados` subdirectory, which must already exist.
output_directory = data_directory.joinpath('agregados')
assert output_directory.exists()
print(output_directory)

/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados


The name of the file **without** extension

In [None]:
# Extension-less base path; the concrete output files are derived from it further down.
output_file = output_directory.joinpath('2018-2021')
print(f'{output_file} (existing? {output_file.exists()})')

/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados/2018-2021 (existing? False)


From the above *basename*, the paths of the *pickle* and *parquet* output files are derived:

In [None]:
# Append the extension to the complete base name. `with_suffix` would instead *replace*
# whatever follows the last dot in the name (e.g. '2018-2021.v2' -> '2018-2021.pickle'),
# silently truncating base names that contain a dot.
pickle_file = output_file.with_name(f'{output_file.name}.pickle')
parquet_file = output_file.with_name(f'{output_file.name}.parquet')

# Processing

> Licitaciones publicadas en la Plataforma mediante mecanismos de agregación, excluyendo los contratos menores

The directory for *outsiders* data

In [None]:
outsiders_directory = data_directory / 'agregados'
assert outsiders_directory.exists()
print(outsiders_directory)

/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados


We infer whether we are working on a sample or on the full dataset from the name of the directory that contains the *outsiders* directory (useful later on)

In [None]:
# A sample run is recognised by the parent directory being named `samples`.
# `.parent.name` is used rather than `.parts[-2]` so that a single-component path
# yields False instead of raising IndexError.
working_on_a_sample = outsiders_directory.parent.name == 'samples'
working_on_a_sample

False

The list of files to be processed. They were downloaded from [here](https://www.hacienda.gob.es/es-ES/GobiernoAbierto/Datos%20Abiertos/Paginas/LicitacionesAgregacion.aspx). One could use something like:
```
wget https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2018.zip
```

In [None]:
# Full paths of the archives to process; (un)comment entries to choose the periods.
zip_files = [
    outsiders_directory / archive_name
    for archive_name in (
        # 'PlataformasAgregadasSinMenores_2018.zip',
        # 'PlataformasAgregadasSinMenores_2019.zip',
        # 'PlataformasAgregadasSinMenores_2020.zip',
        'PlataformasAgregadasSinMenores_2021.zip',
        'PlataformasAgregadasSinMenores_202201.zip',
    )
]
zip_files

[PosixPath('/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2021.zip'),
 PosixPath('/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202201.zip')]

In [None]:
%%time
df = dlsproc.core.read_zips([outsiders_directory/e for e in zip_files])

Processing "/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_2021.zip"
Processing "/export/usuarios01/mvazquez/Sync/git/dlsproc/data/agregados/PlataformasAgregadasSinMenores_202201.zip"
CPU times: user 3min 4s, sys: 1.04 s, total: 3min 5s
Wall time: 3min 5s


Saving to *pickle*

In [None]:
# Write `df` to `pickle_file`; the write is skipped when working on a sample.
if not working_on_a_sample:
    df.to_pickle(pickle_file)

Saving to *parquet*

In [None]:
# Run `df` through dlsproc's `parquet_amenable` helper before saving it as parquet.
# NOTE(review): presumably this adapts columns/values that parquet cannot store as they are;
# confirm against `dlsproc.assemble`.
parquet_df = dlsproc.assemble.parquet_amenable(df)
# Preview the first two rows of the result.
parquet_df.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,TenderingProcess,TenderingProcess,TenderingProcess,ValidNoticeInfo,ValidNoticeInfo,ValidNoticeInfo,ProcurementProject,ProcurementProject,LocatedContractingParty,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,TenderResult,TenderResult,TenderResult,TenderResult,TenderResult,TenderingProcess,TenderingProcess,LocatedContractingParty,LocatedContractingParty,TenderResult,TenderingProcess,LocatedContractingParty,Unnamed: 42_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Party,ParentLocatedParty,Name,TypeCode,BudgetAmount,BudgetAmount,RequiredCommodityClassification,RealizedLocation,PlannedPeriod,ProcedureCode,TenderSubmissionDeadlinePeriod,TenderSubmissionDeadlinePeriod,NoticeTypeCode,AdditionalPublicationStatus,AdditionalPublicationStatus,PlannedPeriod,PlannedPeriod,ParentLocatedParty,ID,Attachment,ID,Attachment,ResultCode,ReceivedTenderQuantity,WinningParty,WinningParty,AwardedTenderedProject,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,BuyerProfileURIID,Party,AwardedTenderedProject,TenderSubmissionDeadlinePeriod,ParentLocatedParty,Unnamed: 42_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,PartyName,PartyName,Unnamed: 10_level_3,Unnamed: 11_level_3,EstimatedOverallContractAmount,TaxExclusiveAmount,ItemClassificationCode,CountrySubentityCode,DurationMeasure,Unnamed: 17_level_3,EndDate,EndTime,Unnamed: 20_level_3,PublicationMediaName,AdditionalPublicationDocumentReference,StartDate,EndDate,ParentLocatedParty,Unnamed: 26_level_3,ExternalReference,Unnamed: 28_level_3,ExternalReference,Unnamed: 30_level_3,Unnamed: 31_level_3,PartyIdentification,PartyName,LegalMonetaryTotal,EndDate,EndTime,Unnamed: 37_level_3,PartyIdentification,ProcurementProjectLotID,Unnamed: 40_level_3,ParentLocatedParty,Unnamed: 42_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Name,Name,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,IssueDate,Unnamed: 23_level_4,Unnamed: 24_level_4,PartyName,Unnamed: 26_level_4,URI,Unnamed: 28_level_4,URI,Unnamed: 30_level_4,Unnamed: 31_level_4,ID,Name,TaxExclusiveAmount,Unnamed: 35_level_4,Unnamed: 36_level_4,Unnamed: 37_level_4,ID,Unnamed: 39_level_4,Unnamed: 40_level_4,ParentLocatedParty,Unnamed: 42_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5,Name,Unnamed: 26_level_5,Unnamed: 27_level_5,Unnamed: 28_level_5,Unnamed: 29_level_5,Unnamed: 30_level_5,Unnamed: 31_level_5,Unnamed: 32_level_5,Unnamed: 33_level_5,Unnamed: 34_level_5,Unnamed: 35_level_5,Unnamed: 36_level_5,Unnamed: 37_level_5,Unnamed: 38_level_5,Unnamed: 39_level_5,Unnamed: 40_level_5,PartyName,Unnamed: 42_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6,Unnamed: 24_level_6,Unnamed: 25_level_6,Unnamed: 26_level_6,Unnamed: 27_level_6,Unnamed: 28_level_6,Unnamed: 29_level_6,Unnamed: 30_level_6,Unnamed: 31_level_6,Unnamed: 32_level_6,Unnamed: 33_level_6,Unnamed: 34_level_6,Unnamed: 35_level_6,Unnamed: 36_level_6,Unnamed: 37_level_6,Unnamed: 38_level_6,Unnamed: 39_level_6,Unnamed: 40_level_6,Name,Unnamed: 42_level_6
file name,entry,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7,Unnamed: 24_level_7,Unnamed: 25_level_7,Unnamed: 26_level_7,Unnamed: 27_level_7,Unnamed: 28_level_7,Unnamed: 29_level_7,Unnamed: 30_level_7,Unnamed: 31_level_7,Unnamed: 32_level_7,Unnamed: 33_level_7,Unnamed: 34_level_7,Unnamed: 35_level_7,Unnamed: 36_level_7,Unnamed: 37_level_7,Unnamed: 38_level_7,Unnamed: 39_level_7,Unnamed: 40_level_7,Unnamed: 41_level_7,Unnamed: 42_level_7
PlataformasAgregadasSinMenores_20210101_030039.atom,294,https://contrataciondelestado.es/sindicacion/P...,Id licitación: PA-02-2020; Órgano de Contrata...,SERVICIO DE RECEPCIÓN Y ATENCIÓN TELEFÓNICA A ...,2020-12-30 14:25:27.661000+00:00,PA-02-2020,EV,Consejo Administración,Centro de Desarrollo Empresarial Margen Izquie...,Servicio de recepción y atención telefónica,2.0,104000.0,52000.0,[98341120.0],ES213,1.0,1.0,2020-11-30,13:00:00,[DOC_CN],[Perfil del contratante],[2020-11-10],2021-02-01,,Centro de Desarrollo Empresarial Margen Izquie...,Pliego clausulas_admin_particulares_RECEPCION ...,https://contratacion.euskadi.eus/w32-kpeperfi/...,Pliego prescripciones tecnicas_particulares _R...,https://contratacion.euskadi.eus/w32-kpeperfi/...,[nan],[nan],[nan],[nan],[nan],,,,,[nan],2020-11-30 13:00:00+00:00,,NaT
PlataformasAgregadasSinMenores_20210101_030039.atom,291,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2006OB07; Órgano de Contratació...,Subministrament d'energia elèctrica en BAIXA T...,2020-12-30 14:26:25.241000+00:00,2006OB07,RES,"Corporació Catalana de Mitjans Audiovisuals, S...",Departaments i Sector Públic de la Generalitat,Subministrament d'energia elèctrica en BAIXA T...,1.0,3460000.0,1730000.0,[65310000.0],ES51,1.0,1.0,2020-09-10,13:00:00,"[DOC_CN, DOC_CAN_ADJ, DOC_FORM]","[DOUE, DOUE, Perfil del contratante]","[2020-06-22, ['2020-12-07', '2020-12-07'], ['2...",,,,,,,,"[9.0, 9.0]","[7.0, 7.0]","[A62332580, A62338827]","[NEXUS ENERGIA S.A., AUDAX RENOVABLES ,S.A.]","[235396.58, 1229756.98]",,,,,"[1.0, 2.0]",2020-09-10 13:00:00+00:00,,NaT


In [None]:
# Write the converted frame to `parquet_file`; the write is skipped when working on a sample.
if not working_on_a_sample:
    parquet_df.to_parquet(parquet_file)

## Making a sample

If we are working with sample files, a *parquet* sample file holding the first and the last 10 rows is created

In [None]:
# Build a small parquet sample made of the first and the last rows of the data.
if working_on_a_sample:
    n_edge_rows = 10  # rows taken from each end of the frame
    n_rows = len(parquet_df)
    if n_rows <= 2 * n_edge_rows:
        # Too few rows for two disjoint slices: positional indexing would either go out
        # of bounds (fewer than 10 rows) or pick the same rows twice (10 to 19 rows),
        # so the whole frame is kept instead.
        sample_df = parquet_df
    else:
        # Materialise the positions: `iloc` gets an explicit list rather than a one-shot iterator.
        sample_positions = list(itertools.chain(range(n_edge_rows), range(n_rows - n_edge_rows, n_rows)))
        sample_df = parquet_df.iloc[sample_positions]
    sample_df.to_parquet(outsiders_directory.parent / '2018-2021_20samples.parquet')