In [None]:
import pathlib
import itertools

import pandas as pd
from IPython.display import display
import yaml

import dlsproc.bundle
import dlsproc.hier
import dlsproc.postprocess
import dlsproc.assemble

So that *all* the columns of a `pd.DataFrame` are shown:

In [None]:
# `None` lifts the limit on the number of columns shown when a frame is displayed
pd.options.display.max_columns = None

# Parameters

The directory containing downloaded data:

In [None]:
# `data` is expected to be a sibling of the current working directory
data_directory = pathlib.Path.cwd().parent.joinpath('data')

# fail early if the directory is missing
assert data_directory.exists()
print(f'{data_directory}')

/home/manu/dlsproc/data


Directory in which the data is to be saved

In [None]:
# results are written to the current working directory
output_directory = pathlib.Path.cwd()

# sanity check before anything is written
assert output_directory.exists()
print(f'{output_directory}')

/home/manu/dlsproc/reports


The name of the file **without** extension

In [None]:
# base path of the output files; the extension is added when saving
output_file = output_directory / '2018-2021'

# the extension-less path itself is never written, so checking it would always yield `False`:
# existence is reported for the files that are actually saved further down
for suffix in ('.pickle', '.parquet'):
    target_file = output_file.with_suffix(suffix)
    print(f'{target_file} (existing? {target_file.exists()})')

/home/manu/dlsproc/reports/2018-2021 (existing? False)


# Processing

> Licitaciones publicadas en la Plataforma mediante mecanismos de agregación, excluyendo los contratos menores

The directory for *outsiders* data

In [None]:
# the `samples/yearly` directory next to the working directory is used here; the alternative
# below points at a subdirectory of `data_directory` instead
# outsiders_directory = data_directory / 'agregados'
outsiders_directory = pathlib.Path.cwd().parent.joinpath('samples', 'yearly')

# fail early if the directory is missing
assert outsiders_directory.exists()
print(f'{outsiders_directory}')

/home/manu/dlsproc/samples/yearly


The list of files to be processed. They were downloaded [here](https://www.hacienda.gob.es/es-ES/GobiernoAbierto/Datos%20Abiertos/Paginas/LicitacionesAgregacion.aspx). One could use something like
```
wget https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2018.zip
```

In [None]:
# one zip file per year, from 2018 to 2021 (both included)
zip_files = [
    f'PlataformasAgregadasSinMenores_{year}.zip'
    for year in range(2018, 2022)
]

Accumulators for the data itself (contracts) and records of deleted entries

In [None]:
# both accumulators start out empty (`None` marks "nothing accumulated yet")
res_df = res_deleted_series = None

Every file is processed in a loop

In [None]:
%%time
for filename in zip_files:
    
    print(f'Processing "{filename}"')
    
    # above file names are relative to a directory
    f = outsiders_directory / filename
    
    assert f.exists()
    
    # data is read from the above *zip* file, and concatenated into a single `pd.DataFrame`...
    yearly_df = dlsproc.bundle.read_zip(f, concatenate=True)
    
    # ...and re-structured with multiindexed columns
    yearly_df = dlsproc.hier.flat_df_to_multiindexed_df(yearly_df)
    
    # the same contract might show up more than once due to updates...but only the last one is kept
    last_update_only_df = dlsproc.postprocess.keep_updates_only(yearly_df)
    
    # the same zip file also contains information (at the beginning) about deleted entries
    deleted_series = dlsproc.bundle.read_deleted_zip(f)
    
    # if this is NOT the first iteration...
    if res_df is not None:
    
        # ...the new data is stacked
        res_df = dlsproc.assemble.stack(res_df, last_update_only_df)
        res_deleted_series = pd.concat((res_deleted_series, deleted_series), axis=0)
        
    # ...if this is the first iteration
    else:
        
        # ...the new data is set as the accumulated result
        res_df = last_update_only_df
        res_deleted_series = deleted_series

Processing "PlataformasAgregadasSinMenores_2018.zip"
Processing "PlataformasAgregadasSinMenores_2019.zip"
Processing "PlataformasAgregadasSinMenores_2020.zip"
Processing "PlataformasAgregadasSinMenores_2021.zip"
CPU times: user 7.6 s, sys: 36.8 ms, total: 7.63 s
Wall time: 7.63 s


In [None]:
# a peek at the first two rows of the accumulated data
res_df.iloc[:2]

Unnamed: 0_level_0,Unnamed: 1_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,TenderResult,TenderResult,TenderResult,TenderResult,TenderResult,TenderingProcess,TenderingProcess,TenderingProcess,ValidNoticeInfo,ValidNoticeInfo,ValidNoticeInfo,LocatedContractingParty,LocatedContractingParty,ProcurementProject,TenderResult,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Party,Party,Name,TypeCode,BudgetAmount,BudgetAmount,RequiredCommodityClassification,RealizedLocation,PlannedPeriod,PlannedPeriod,ResultCode,ReceivedTenderQuantity,WinningParty,WinningParty,AwardedTenderedProject,ProcedureCode,TenderSubmissionDeadlinePeriod,TenderSubmissionDeadlinePeriod,NoticeTypeCode,AdditionalPublicationStatus,AdditionalPublicationStatus,ParentLocatedParty,ParentLocatedParty,PlannedPeriod,AwardedTenderedProject,ID,Attachment,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,PartyIdentification,PartyName,Unnamed: 10_level_3,Unnamed: 11_level_3,EstimatedOverallContractAmount,TaxExclusiveAmount,ItemClassificationCode,CountrySubentityCode,StartDate,EndDate,Unnamed: 18_level_3,Unnamed: 19_level_3,PartyIdentification,PartyName,LegalMonetaryTotal,Unnamed: 23_level_3,EndDate,EndTime,Unnamed: 26_level_3,PublicationMediaName,AdditionalPublicationDocumentReference,PartyName,ParentLocatedParty,DurationMeasure,ProcurementProjectLotID,Unnamed: 33_level_3,ExternalReference,Unnamed: 35_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 41_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,ID,Name,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,ID,Name,TaxExclusiveAmount,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,IssueDate,Name,PartyName,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,URI,Unnamed: 35_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 39_level_4,Unnamed: 40_level_4,Unnamed: 41_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5,Unnamed: 25_level_5,Unnamed: 26_level_5,Unnamed: 27_level_5,Unnamed: 28_level_5,Unnamed: 29_level_5,Name,Unnamed: 31_level_5,Unnamed: 32_level_5,Unnamed: 33_level_5,Unnamed: 34_level_5,Unnamed: 35_level_5,Unnamed: 36_level_5,PartyName,ParentLocatedParty,Unnamed: 39_level_5,Unnamed: 40_level_5,Unnamed: 41_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6,Unnamed: 24_level_6,Unnamed: 25_level_6,Unnamed: 26_level_6,Unnamed: 27_level_6,Unnamed: 28_level_6,Unnamed: 29_level_6,Unnamed: 30_level_6,Unnamed: 31_level_6,Unnamed: 32_level_6,Unnamed: 33_level_6,Unnamed: 34_level_6,Unnamed: 35_level_6,Unnamed: 36_level_6,Name,PartyName,Unnamed: 39_level_6,Unnamed: 40_level_6,Unnamed: 41_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7,Unnamed: 24_level_7,Unnamed: 25_level_7,Unnamed: 26_level_7,Unnamed: 27_level_7,Unnamed: 28_level_7,Unnamed: 29_level_7,Unnamed: 30_level_7,Unnamed: 31_level_7,Unnamed: 32_level_7,Unnamed: 33_level_7,Unnamed: 34_level_7,Unnamed: 35_level_7,Unnamed: 36_level_7,Unnamed: 37_level_7,Name,Unnamed: 39_level_7,Unnamed: 40_level_7,Unnamed: 41_level_7
file name,entry,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8,Unnamed: 24_level_8,Unnamed: 25_level_8,Unnamed: 26_level_8,Unnamed: 27_level_8,Unnamed: 28_level_8,Unnamed: 29_level_8,Unnamed: 30_level_8,Unnamed: 31_level_8,Unnamed: 32_level_8,Unnamed: 33_level_8,Unnamed: 34_level_8,Unnamed: 35_level_8,Unnamed: 36_level_8,Unnamed: 37_level_8,Unnamed: 38_level_8,Unnamed: 39_level_8,Unnamed: 40_level_8,Unnamed: 41_level_8
PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,45233142,ES418,,,8,14,A47082185,CONSTRUCCIONES HERMANOS SASTRE S.A.,60690.08,1.0,2017-11-02,23:59:00,DOC_FORM,Publicación del anuncio de formalización en un...,2018-01-02,,,3.0,1,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00
PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,45233142,ES418,,,8,13,A49012792,CONTRATAS Y OBRAS SAN GREGORIO S.A.,118919.49,1.0,2017-11-02,23:59:00,DOC_FORM,Publicación del anuncio de formalización en un...,2018-01-02,,,3.0,1,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00


How many rows and columns?

In [None]:
# (number of rows, number of columns)
len(res_df.index), len(res_df.columns)

(5062, 40)

The *deleted series* might contain duplicates. We build a new `pd.Series` dropping duplicates by keeping only the last one.

In [None]:
# sorting by value (the deletion timestamp) puts the most recent record of every entry last...
sorted_deleted_series = res_deleted_series.sort_values()

# ...so keeping only the last occurrence of every index label keeps the latest deletion.
# NOTE: the labels must be taken from the *sorted* series; an index passed to `groupby` is
# matched by position, so using the index of the unsorted series would pair values with the
# wrong labels.
is_superseded = sorted_deleted_series.index.duplicated(keep='last')
deduplicated_deleted_series = sorted_deleted_series[~is_superseded]
deduplicated_deleted_series

file name                                              id                                                                                 
PlataformasAgregadasSinMenores_20180217_180137_1.atom  https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1974509   2017-12-31 13:10:39.114000+01:00
                                                       https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1970608   2017-12-31 13:10:39.180000+01:00
                                                       https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1973692   2017-12-31 14:10:39.150000+01:00
                                                       https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1975721   2017-12-31 14:10:39.242000+01:00
                                                       https://contrataciondelestado.es/sindicacion/PlataformasAgregadasSinMenores/1974172   2017-12-31 14:10:39.3680

In [None]:
# the deletion records are merged into the contracts data (the output shows a new
# `deleted_on` column)
stateful_df = dlsproc.assemble.merge_deleted(
    res_df,
    deduplicated_deleted_series,
)
stateful_df

Unnamed: 0_level_0,Unnamed: 1_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,TenderResult,TenderResult,TenderResult,TenderResult,TenderResult,TenderingProcess,TenderingProcess,TenderingProcess,ValidNoticeInfo,ValidNoticeInfo,ValidNoticeInfo,LocatedContractingParty,LocatedContractingParty,ProcurementProject,TenderResult,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess,Unnamed: 42_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Party,Party,Name,TypeCode,BudgetAmount,BudgetAmount,RequiredCommodityClassification,RealizedLocation,PlannedPeriod,PlannedPeriod,ResultCode,ReceivedTenderQuantity,WinningParty,WinningParty,AwardedTenderedProject,ProcedureCode,TenderSubmissionDeadlinePeriod,TenderSubmissionDeadlinePeriod,NoticeTypeCode,AdditionalPublicationStatus,AdditionalPublicationStatus,ParentLocatedParty,ParentLocatedParty,PlannedPeriod,AwardedTenderedProject,ID,Attachment,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod,Unnamed: 42_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,PartyIdentification,PartyName,Unnamed: 10_level_3,Unnamed: 11_level_3,EstimatedOverallContractAmount,TaxExclusiveAmount,ItemClassificationCode,CountrySubentityCode,StartDate,EndDate,Unnamed: 18_level_3,Unnamed: 19_level_3,PartyIdentification,PartyName,LegalMonetaryTotal,Unnamed: 23_level_3,EndDate,EndTime,Unnamed: 26_level_3,PublicationMediaName,AdditionalPublicationDocumentReference,PartyName,ParentLocatedParty,DurationMeasure,ProcurementProjectLotID,Unnamed: 33_level_3,ExternalReference,Unnamed: 35_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 41_level_3,Unnamed: 42_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,ID,Name,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,ID,Name,TaxExclusiveAmount,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,IssueDate,Name,PartyName,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,URI,Unnamed: 35_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 39_level_4,Unnamed: 40_level_4,Unnamed: 41_level_4,Unnamed: 42_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5,Unnamed: 25_level_5,Unnamed: 26_level_5,Unnamed: 27_level_5,Unnamed: 28_level_5,Unnamed: 29_level_5,Name,Unnamed: 31_level_5,Unnamed: 32_level_5,Unnamed: 33_level_5,Unnamed: 34_level_5,Unnamed: 35_level_5,Unnamed: 36_level_5,PartyName,ParentLocatedParty,Unnamed: 39_level_5,Unnamed: 40_level_5,Unnamed: 41_level_5,Unnamed: 42_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6,Unnamed: 24_level_6,Unnamed: 25_level_6,Unnamed: 26_level_6,Unnamed: 27_level_6,Unnamed: 28_level_6,Unnamed: 29_level_6,Unnamed: 30_level_6,Unnamed: 31_level_6,Unnamed: 32_level_6,Unnamed: 33_level_6,Unnamed: 34_level_6,Unnamed: 35_level_6,Unnamed: 36_level_6,Name,PartyName,Unnamed: 39_level_6,Unnamed: 40_level_6,Unnamed: 41_level_6,Unnamed: 42_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7,Unnamed: 24_level_7,Unnamed: 25_level_7,Unnamed: 26_level_7,Unnamed: 27_level_7,Unnamed: 28_level_7,Unnamed: 29_level_7,Unnamed: 30_level_7,Unnamed: 31_level_7,Unnamed: 32_level_7,Unnamed: 33_level_7,Unnamed: 34_level_7,Unnamed: 35_level_7,Unnamed: 36_level_7,Unnamed: 37_level_7,Name,Unnamed: 39_level_7,Unnamed: 40_level_7,Unnamed: 41_level_7,Unnamed: 42_level_7
file name,entry,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8,Unnamed: 24_level_8,Unnamed: 25_level_8,Unnamed: 26_level_8,Unnamed: 27_level_8,Unnamed: 28_level_8,Unnamed: 29_level_8,Unnamed: 30_level_8,Unnamed: 31_level_8,Unnamed: 32_level_8,Unnamed: 33_level_8,Unnamed: 34_level_8,Unnamed: 35_level_8,Unnamed: 36_level_8,Unnamed: 37_level_8,Unnamed: 38_level_8,Unnamed: 39_level_8,Unnamed: 40_level_8,Unnamed: 41_level_8,Unnamed: 42_level_8
PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,45233142,ES418,,,8,14,A47082185,CONSTRUCCIONES HERMANOS SASTRE S.A.,60690.08,1.0,2017-11-02,23:59:00,DOC_FORM,Publicación del anuncio de formalización en un...,2018-01-02,,,3.0,1,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,45233142,ES418,,,8,13,A49012792,CONTRATAS Y OBRAS SAN GREGORIO S.A.,118919.49,1.0,2017-11-02,23:59:00,DOC_FORM,Publicación del anuncio de formalización en un...,2018-01-02,,,3.0,1,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_20180217_180137_1.atom,451,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1281/17, Entidad: Diputación Provi...",Refuerzo de firme en la VP 4013 Melgar de Arri...,2018-01-02 08:02:51.744000+00:00,1281/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de firme en la VP 4013 Melgar de Arri...,3.0,229259.52,229259.52,45233142,ES418,,,8,11,B49160567,EXFAMEX S.L.,178478.55,1.0,2017-11-02,23:59:00,DOC_FORM,Publicación del anuncio de formalización en un...,2018-01-02,,,3.0,1,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_20180217_180137_1.atom,450,https://contrataciondelestado.es/sindicacion/P...,Id licitación: VI/17/04-015; Órgano de Contrat...,Obras de edificación en el barrio de Pumarabul...,2018-01-02 08:02:56.115000+00:00,VI/17/04-015,EV,,Consejería de Servicios y Derechos Sociales,"Edificación de 36 VPP, garaje y trasteros en e...",3.0,2850705.78,2850705.78,45211340,ES120,,,,,,,,1.0,2017-12-11,14:00:00,DOC_CN,[Plataforma de Contratación del Sector Público...,"[2017-11-15, 2017-11-15, 2017-11-15]",,,20.0,,Pliego_Clausulas_Administrativas_VI-17-04-015.pdf,http://www.asturias.es/Proveedores/FICHEROS/ES...,,,,,,,2017-12-11 14:00:00+00:00,NaT
PlataformasAgregadasSinMenores_20180217_180137_1.atom,449,https://contrataciondelestado.es/sindicacion/P...,"Id Licitación: PcPG/2017/194222, Órgano de Con...",Suministro de gas natural canalizado y gas nat...,2018-01-02 09:10:49.572000+00:00,PcPG/2017/194222,ADJ,A12017369,"Consellería de Economía, Emprego e Industria",Suministro de gas natural canalizado y gas nat...,1.0,,29359984.62,9123000,ES11,,,"[8, 8]","[3, 1]","[A61797536, A61797536]","[GAS NATURAL COMERCIALIZADORA, SA, GAS NATURAL...","[27716723.57, 1643261.06]",4.0,2017-09-29,23:59:00,"[DOC_CN, DOC_CAN_ADJ]","[BOE, DOUE, DOG, Perfil del contratante]","[2017-09-09, 2017-08-31, 2017-09-07, 2018-01-02]",,,2.0,"[1, 2]",,,,,,,,,2017-09-29 23:59:00+00:00,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PlataformasAgregadasSinMenores_20210106_030028.atom,4,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 901800/2020; Órgano de Contrata...,Subministrament i instal.lació de les noves vi...,2021-01-05 08:42:01.478000+00:00,901800/2020,PUB,,Àrea Metropolitana de Barcelona,Subministrament i instal.lació de les noves vi...,1.0,284082.33,284082.33,39171000,ES511,,,,,,,,1.0,2021-01-18,11:00:00,DOC_CN,DOUE,2020-12-23,Entitats municipals de Catalunya,,3.0,,Plec administratiu_901800_2020.pdf,https://contractaciopublica.gencat.cat/ecofin_...,Memòria tècnica valorada.pdf,https://contractaciopublica.gencat.cat/ecofin_...,,,,,2021-01-18 11:00:00+00:00,NaT
PlataformasAgregadasSinMenores_20210106_030028.atom,3,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 2020ZEBO0071; Órgano de Contra...,"Servicios: conservación básica, reten, reparac...",2021-01-05 08:55:41.039000+00:00,2020ZEBO0071,EV,,Pleno,"Servicios: conservación básica, reten, reparac...",2.0,690803.20,690803.20,"[50711000, 50232110, 50232100]",ES212,,,,,,,,1.0,2021-01-04,23:55:00,DOC_CN,Perfil del contratante,2020-11-30,Ayuntamiento de Beasain,Ayuntamiento de Beasain,3.0,,Pliego admvo.pdf,https://contratacion.euskadi.eus/w32-kpeperfi/...,Pliego técnico.pdf,https://contratacion.euskadi.eus/w32-kpeperfi/...,,,,,2021-01-04 23:55:00+00:00,NaT
PlataformasAgregadasSinMenores_20210106_030028.atom,2,https://contrataciondelestado.es/sindicacion/P...,Id licitación: 255K; Órgano de Contratación: ...,CONTRATO DE EJECUCIÓN DE OBRAS DE EDIFICACIÓN ...,2021-01-05 08:55:41.290000+00:00,255K,EV,,Consejo de Admnistración,Ejecución de las obras de edificación de 39 VP...,3.0,6843971.79,5703309.82,45000000,ES212,,,,,,,,1.0,2021-01-04,23:59:00,DOC_CN,Perfil del contratante,2020-12-03,Azpeitia Berritzen S.A.,Azpeitia Berritzen S.A.,18.0,,255K PCAP.pdf,https://contratacion.euskadi.eus/w32-kpeperfi/...,255K Proyecto de ejecución.odt,https://contratacion.euskadi.eus/w32-kpeperfi/...,,,,,2021-01-04 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_20210106_030028.atom,1,https://contrataciondelestado.es/sindicacion/P...,Id licitación: SA-12-2020; Órgano de Contrata...,"Servicio de un sistema de grabación, creación ...",2021-01-05 08:55:41.492000+00:00,SA-12-2020,EV,,Junta de Gobierno Local,Contratación del servicio de un sistema de gra...,2.0,30100.00,15050.00,72510000,ES213,,,,,,,,1.0,2021-01-04,23:59:00,DOC_CN,Perfil del contratante,2020-12-17,Ayuntamiento de Muskiz,Ayuntamiento de Muskiz,2.0,,PCAP SERVICIO VIDEOACTA 2021.doc,https://contratacion.euskadi.eus/w32-kpeperfi/...,PLIEGO CLÁUSULAS TÉCNICAS GRABACIÓN PLENOS.doc,https://contratacion.euskadi.eus/w32-kpeperfi/...,,,,,2021-01-04 23:59:00+00:00,NaT


Saving to *pickle*

In [None]:
# same base name as every other output, with a `.pickle` extension
pickle_file = output_file.with_suffix('.pickle')
stateful_df.to_pickle(pickle_file)

Saving to *parquet*

In [None]:
# the data is adapted so that it can be written as *parquet*
# NOTE(review): judging by the displayed rows, columns that may hold lists get every value
# wrapped in a list -- confirm against `dlsproc.assemble.parquet_amenable`
parquet_df = dlsproc.assemble.parquet_amenable(stateful_df)

# a peek at the first two rows
parquet_df.iloc[:2]

Unnamed: 0_level_0,Unnamed: 1_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,ProcurementProject,TenderResult,TenderResult,TenderResult,TenderResult,TenderResult,TenderingProcess,TenderingProcess,TenderingProcess,ValidNoticeInfo,ValidNoticeInfo,ValidNoticeInfo,LocatedContractingParty,LocatedContractingParty,ProcurementProject,TenderResult,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess,Unnamed: 42_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Party,Party,Name,TypeCode,BudgetAmount,BudgetAmount,RequiredCommodityClassification,RealizedLocation,PlannedPeriod,PlannedPeriod,ResultCode,ReceivedTenderQuantity,WinningParty,WinningParty,AwardedTenderedProject,ProcedureCode,TenderSubmissionDeadlinePeriod,TenderSubmissionDeadlinePeriod,NoticeTypeCode,AdditionalPublicationStatus,AdditionalPublicationStatus,ParentLocatedParty,ParentLocatedParty,PlannedPeriod,AwardedTenderedProject,ID,Attachment,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod,Unnamed: 42_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,PartyIdentification,PartyName,Unnamed: 10_level_3,Unnamed: 11_level_3,EstimatedOverallContractAmount,TaxExclusiveAmount,ItemClassificationCode,CountrySubentityCode,StartDate,EndDate,Unnamed: 18_level_3,Unnamed: 19_level_3,PartyIdentification,PartyName,LegalMonetaryTotal,Unnamed: 23_level_3,EndDate,EndTime,Unnamed: 26_level_3,PublicationMediaName,AdditionalPublicationDocumentReference,PartyName,ParentLocatedParty,DurationMeasure,ProcurementProjectLotID,Unnamed: 33_level_3,ExternalReference,Unnamed: 35_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 41_level_3,Unnamed: 42_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,ID,Name,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,ID,Name,TaxExclusiveAmount,Unnamed: 23_level_4,Unnamed: 24_level_4,Unnamed: 25_level_4,Unnamed: 26_level_4,Unnamed: 27_level_4,IssueDate,Name,PartyName,Unnamed: 31_level_4,Unnamed: 32_level_4,Unnamed: 33_level_4,URI,Unnamed: 35_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 39_level_4,Unnamed: 40_level_4,Unnamed: 41_level_4,Unnamed: 42_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5,Unnamed: 25_level_5,Unnamed: 26_level_5,Unnamed: 27_level_5,Unnamed: 28_level_5,Unnamed: 29_level_5,Name,Unnamed: 31_level_5,Unnamed: 32_level_5,Unnamed: 33_level_5,Unnamed: 34_level_5,Unnamed: 35_level_5,Unnamed: 36_level_5,PartyName,ParentLocatedParty,Unnamed: 39_level_5,Unnamed: 40_level_5,Unnamed: 41_level_5,Unnamed: 42_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6,Unnamed: 24_level_6,Unnamed: 25_level_6,Unnamed: 26_level_6,Unnamed: 27_level_6,Unnamed: 28_level_6,Unnamed: 29_level_6,Unnamed: 30_level_6,Unnamed: 31_level_6,Unnamed: 32_level_6,Unnamed: 33_level_6,Unnamed: 34_level_6,Unnamed: 35_level_6,Unnamed: 36_level_6,Name,PartyName,Unnamed: 39_level_6,Unnamed: 40_level_6,Unnamed: 41_level_6,Unnamed: 42_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7,Unnamed: 24_level_7,Unnamed: 25_level_7,Unnamed: 26_level_7,Unnamed: 27_level_7,Unnamed: 28_level_7,Unnamed: 29_level_7,Unnamed: 30_level_7,Unnamed: 31_level_7,Unnamed: 32_level_7,Unnamed: 33_level_7,Unnamed: 34_level_7,Unnamed: 35_level_7,Unnamed: 36_level_7,Unnamed: 37_level_7,Name,Unnamed: 39_level_7,Unnamed: 40_level_7,Unnamed: 41_level_7,Unnamed: 42_level_7
file name,entry,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8,Unnamed: 24_level_8,Unnamed: 25_level_8,Unnamed: 26_level_8,Unnamed: 27_level_8,Unnamed: 28_level_8,Unnamed: 29_level_8,Unnamed: 30_level_8,Unnamed: 31_level_8,Unnamed: 32_level_8,Unnamed: 33_level_8,Unnamed: 34_level_8,Unnamed: 35_level_8,Unnamed: 36_level_8,Unnamed: 37_level_8,Unnamed: 38_level_8,Unnamed: 39_level_8,Unnamed: 40_level_8,Unnamed: 41_level_8,Unnamed: 42_level_8
PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,89917.95,89917.95,[45233142.0],ES418,,,[8.0],[14.0],[A47082185],[CONSTRUCCIONES HERMANOS SASTRE S.A.],[60690.08],1.0,2017-11-02,23:59:00,[DOC_FORM],[Publicación del anuncio de formalización en u...,[2018-01-02],,,3.0,[1.0],,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,175708.46,175708.46,[45233142.0],ES418,,,[8.0],[13.0],[A49012792],[CONTRATAS Y OBRAS SAN GREGORIO S.A.],[118919.49],1.0,2017-11-02,23:59:00,[DOC_FORM],[Publicación del anuncio de formalización en u...,[2018-01-02],,,3.0,[1.0],,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT


In [None]:
# same base name as every other output, with a `.parquet` extension
parquet_file = output_file.with_suffix('.parquet')
parquet_df.to_parquet(parquet_file)

## Making a sample

If we are working with sample files, a *parquet* sample file is created

In [None]:
# a sample is only made when the input comes from a subdirectory of `samples`
if outsiders_directory.parent.name == 'samples':

    # (up to) the first 10 and the last 10 rows are kept; positions are bounded by the number
    # of rows and never overlap, so that a frame with fewer than 20 rows neither raises an
    # out-of-bounds error nor yields repeated rows
    n_rows = len(parquet_df)
    n_edge = 10
    head_positions = list(range(min(n_edge, n_rows)))
    tail_positions = list(range(max(n_rows - n_edge, len(head_positions)), n_rows))

    sample_df = parquet_df.iloc[head_positions + tail_positions]
    sample_df.to_parquet(outsiders_directory.parent / '2018-2021_20samples.parquet')