In [None]:
#| default_exp download

# download

> Download new data from the internet

In [None]:
#| export
import pathlib
import urllib
import datetime

import urllib3
import pandas as pd
# import rich.progress
# from tqdm.autonotebook import tqdm
from tqdm import tqdm
import yaml

import sproc.structure

In order to avoid errors like
> certificate verify failed: unable to get local issuer certificate

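when downloading from <https://contrataciondelsectorpublico.gob.es/>, certificates are not verified (see `file` below); the warnings `urllib3` issues for such unverified requests are silenced here.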

In [None]:
#| export
# `file` (below) makes its requests with `cert_reqs='CERT_NONE'`, i.e., without verifying certificates;
# this silences the `InsecureRequestWarning` that would otherwise accompany those requests
# NOTE(review): this is a process-wide setting, it also affects any other code using `urllib3`
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

A function to download a file from the internet.

In [None]:
#| export
def file(
    url: str, # URL for the file to be downloaded
    output_file: str | pathlib.Path | None, # Name of the local file to be saved; if `None` its content is returned
    timeout: float = 2. # How long to wait for a response
    ) -> None | bytes: # Content of the file if `output_file` is `None`, nothing otherwise
    "Download a file from an URL, either saving it to disk or returning its content"

    # certificates are deliberately *not* verified
    pool_manager = urllib3.PoolManager(cert_reqs='CERT_NONE')

    try:

        # the request is made
        response = pool_manager.request('GET', url, timeout=timeout)

    except urllib3.exceptions.MaxRetryError as error:

        # the original error is kept as the cause to ease debugging
        raise Exception(f'can\'t download "{url}"') from error

    # `urllib3` does not raise on HTTP errors: without this check the body of, e.g., a "404 Not Found" page
    # would be returned (or saved) as though it were the requested file
    if response.status >= 400:

        raise Exception(f'can\'t download "{url}" (HTTP status {response.status})')

    # if no output file was given...
    if output_file is None:

        # ...the content is returned
        return response.data

    # in case a `str` was passed
    output_file = pathlib.Path(output_file)

    with output_file.open('wb') as f:

        f.write(response.data)

A sample file (from this repository) is downloaded

In [None]:
# `output_file` is a relative name, so the sample ends up in the current working directory
url = 'https://raw.githubusercontent.com/manuvazquez/sproc/main/samples/2018-2021_20samples.parquet'
output_file = 'download_sample.parquet'
file(url, output_file)

Let us check it is readable

In [None]:
# only the first rows of the file downloaded above are displayed
pd.read_parquet(output_file).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,summary,title,updated,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,ContractFolderStatus,deleted_on
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,ContractFolderID,ContractFolderStatusCode,LocatedContractingParty,LocatedContractingParty,ProcurementProject,ProcurementProject,...,LegalDocumentReference,LegalDocumentReference,TechnicalDocumentReference,TechnicalDocumentReference,LocatedContractingParty,LocatedContractingParty,TenderingProcess,TenderingProcess,TenderingProcess,Unnamed: 23_level_1
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Party,Party,Name,TypeCode,...,ID,Attachment,ID,Attachment,ParentLocatedParty,ParentLocatedParty,ParticipationRequestReceptionPeriod,ParticipationRequestReceptionPeriod,TenderSubmissionDeadlinePeriod,Unnamed: 23_level_2
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,PartyIdentification,PartyName,Unnamed: 11_level_3,Unnamed: 12_level_3,...,Unnamed: 14_level_3,ExternalReference,Unnamed: 16_level_3,ExternalReference,ParentLocatedParty,ParentLocatedParty,EndDate,EndTime,Unnamed: 22_level_3,Unnamed: 23_level_3
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,ID,Name,Unnamed: 11_level_4,Unnamed: 12_level_4,...,Unnamed: 14_level_4,URI,Unnamed: 16_level_4,URI,ParentLocatedParty,ParentLocatedParty,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4
Unnamed: 0_level_5,Unnamed: 1_level_5,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,...,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,PartyName,ParentLocatedParty,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5
Unnamed: 0_level_6,Unnamed: 1_level_6,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,...,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Name,PartyName,Unnamed: 20_level_6,Unnamed: 21_level_6,Unnamed: 22_level_6,Unnamed: 23_level_6
Unnamed: 0_level_7,Unnamed: 1_level_7,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,...,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Name,Unnamed: 20_level_7,Unnamed: 21_level_7,Unnamed: 22_level_7,Unnamed: 23_level_7
zip,file name,entry,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8,Unnamed: 20_level_8,Unnamed: 21_level_8,Unnamed: 22_level_8,Unnamed: 23_level_8
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,453,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1284/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 3001 Renedo de Esgu...,2018-01-02 08:01:52.024000+00:00,1284/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 3001 Renedo de Esgu...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,452,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1282/17, Entidad: Diputación Provi...",Refuerzo de Firme en la VP 6603 Mota del Marqu...,2018-01-02 08:02:24.833000+00:00,1282/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de Firme en la VP 6603 Mota del Marqu...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,451,https://contrataciondelestado.es/sindicacion/P...,"Expediente: 1281/17, Entidad: Diputación Provi...",Refuerzo de firme en la VP 4013 Melgar de Arri...,2018-01-02 08:02:51.744000+00:00,1281/17,RES,L02000047,Diputación Provincial de Valladolid,Refuerzo de firme en la VP 4013 Melgar de Arri...,3.0,...,,,,,,,2017-11-02,23:59:00,2017-11-02 23:59:00+00:00,NaT
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,450,https://contrataciondelestado.es/sindicacion/P...,Id licitación: VI/17/04-015; Órgano de Contrat...,Obras de edificación en el barrio de Pumarabul...,2018-01-02 08:02:56.115000+00:00,VI/17/04-015,EV,,Consejería de Servicios y Derechos Sociales,"Edificación de 36 VPP, garaje y trasteros en e...",3.0,...,Pliego_Clausulas_Administrativas_VI-17-04-015.pdf,http://www.asturias.es/Proveedores/FICHEROS/ES...,,,,,,,2017-12-11 14:00:00+00:00,NaT
some.zip,PlataformasAgregadasSinMenores_20180217_180137_1.atom,449,https://contrataciondelestado.es/sindicacion/P...,"Id Licitación: PcPG/2017/194222, Órgano de Con...",Suministro de gas natural canalizado y gas nat...,2018-01-02 09:10:49.572000+00:00,PcPG/2017/194222,ADJ,A12017369,"Consellería de Economía, Emprego e Industria",Suministro de gas natural canalizado y gas nat...,1.0,...,,,,,,,,,2017-09-29 23:59:00+00:00,NaT


In [None]:
# URL of the zip file for 2018 on the official server
aggregate_2018 = 'https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2018.zip'
# NOTE(review): the download itself is left disabled, presumably to avoid hitting the official server
# every time the notebook is run; confirm before re-enabling
# file(aggregate_2018, 'foo.zip')

A convenience function leveraging `file` to *read* YAML files

In [None]:
#| export
def yaml_to_dict(
    url: str, # URL for the file to be downloaded
    timeout: float = 2. # How long to wait for a response
    ) -> dict: # YAML data
    "Fetch the YAML document at an URL and parse it"

    # no output file is requested, so `file` hands back the raw content rather than saving it...
    raw_content = file(url, None, timeout)

    # ...and that content is parsed with the *safe* loader
    return yaml.safe_load(raw_content)

An example

In [None]:
# the file is fetched and parsed on the fly: nothing is written to disk
yaml_to_dict('https://raw.githubusercontent.com/manuvazquez/sproc/main/samples/PLACE.yaml')

{'id': ['id', nan, nan, nan, nan, nan, nan],
 'summary': ['summary', nan, nan, nan, nan, nan, nan],
 'title': ['title', nan, nan, nan, nan, nan, nan],
 'updated': ['updated', nan, nan, nan, nan, nan, nan],
 'Número de Expediente': ['ContractFolderStatus',
  'ContractFolderID',
  nan,
  nan,
  nan,
  nan,
  nan],
 'Estado': ['ContractFolderStatus',
  'ContractFolderStatusCode',
  nan,
  nan,
  nan,
  nan,
  nan],
 'ID': ['ContractFolderStatus',
  'LocatedContractingParty',
  'Party',
  'PartyIdentification',
  'ID',
  nan,
  nan],
 'Nombre': ['ContractFolderStatus',
  'LocatedContractingParty',
  'Party',
  'PartyName',
  'Name',
  nan,
  nan],
 'URL perfil de contratante': ['ContractFolderStatus',
  'LocatedContractingParty',
  'BuyerProfileURIID',
  nan,
  nan,
  nan,
  nan],
 'Ubicación orgánica': ['ContractFolderStatus',
  'LocatedContractingParty',
  'ParentLocatedParty',
  'PartyName',
  'Name',
  nan,
  nan],
 'Objeto del Contrato': ['ContractFolderStatus',
  'ProcurementProject'

URLs are produced from the given date, `from_date`, onwards.

In [None]:
#| export
def make_urls(
    base_url: str, # URL to the server including the hosting directory
    base_filename: str, # File name without neither date information nor extension
    from_date: datetime.datetime # The starting date
    ) -> list[tuple[str, str]]: # List of tuples (URL, file name)
    "Assemble URLs for files of a given kind that are to be downloaded"

    # a "hack" to get a date one month (never mind the day) after `from_date`
    next_month = from_date.replace(day=28) + datetime.timedelta(days=5)

    today = datetime.datetime.today()

    # print(f'{from_date=}')
    # print(f'{next_month=}')
    # print(f'{today=}')

    end_year = today.year
    end_month = today.month

    urls_filenames = []

    def append(filename: str):

        urls_filenames.append((urllib.parse.urljoin(base_url, filename), filename))

    # if a month after `from_date` is still the same year...
    if next_month.year == from_date.year:
    
        # ...loop through the remaining months
        for month in range(next_month.month, 12+1):

            filename = base_filename + str(next_month.year) + str(month).zfill(2) + '.zip'

            append(filename)

    # for the year of `from_date` until the *previous* year
    for year in range(from_date.year+1, today.year):
    # for year in range(next_month.year+1, today.year):

        # print(f'{year=}')

        filename = base_filename + str(year) + '.zip'

        append(filename)
    
    for month in range(1, today.month):
        
        filename = base_filename + str(today.year) + str(month).zfill(2) + '.zip'

        append(filename)

    return urls_filenames

As an example, let us assemble the URLs of all the *outsiders* files from November 2019 on.

In [None]:
# settings stored for the 'outsiders' kind of data
sproc.structure.tables['outsiders']

{'base_url': 'https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/',
 'base_filename': 'PlataformasAgregadasSinMenores_',
 'naming_filename': 'outsiders.yaml'}

In [None]:
# the two settings that are passed to `make_urls`
sproc.structure.tables['outsiders']['base_url'], sproc.structure.tables['outsiders']['base_filename']

('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/',
 'PlataformasAgregadasSinMenores_')

In [None]:
# NOTE: `make_urls` looks at the current date, so the result depends on *when* this cell is run
urls_filenames = make_urls(
    sproc.structure.tables['outsiders']['base_url'],
    sproc.structure.tables['outsiders']['base_filename'],
    from_date=datetime.datetime(2019, 11, 1))
# urls_filenames[:3]
urls_filenames

[('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_201912.zip',
  'PlataformasAgregadasSinMenores_201912.zip'),
 ('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2020.zip',
  'PlataformasAgregadasSinMenores_2020.zip'),
 ('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2021.zip',
  'PlataformasAgregadasSinMenores_2021.zip'),
 ('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2022.zip',
  'PlataformasAgregadasSinMenores_2022.zip'),
 ('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_202301.zip',
  'PlataformasAgregadasSinMenores_202301.zip')]

Starting from New Year's Eve 2021 (i.e., the last day before 2022)

In [None]:
# the starting date is 2021-12-31 (one day before 2022-01-01), so the month that follows is in a different year
make_urls(
    sproc.structure.tables['outsiders']['base_url'],
    sproc.structure.tables['outsiders']['base_filename'],
    from_date=datetime.datetime(2022, 1, 1) - datetime.timedelta(days=1))

[('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_2022.zip',
  'PlataformasAgregadasSinMenores_2022.zip'),
 ('https://contrataciondelsectorpublico.gob.es/sindicacion/sindicacion_1044/PlataformasAgregadasSinMenores_202301.zip',
  'PlataformasAgregadasSinMenores_202301.zip')]

The following function *actually* downloads all the files of a given `kind` from a certain `date` onwards.

In [None]:
#| export
def from_date(
    kind: str, # One of 'outsiders', 'insiders', or 'minors'
    date: datetime.datetime, # The starting date
    output_directory: str | pathlib.Path | None = None # Output directory; if `None`, the working directory at the time of the call
    ) -> list[pathlib.Path]: # Path of every file, whether it was downloaded now or was already there
    "Downloads all the files of a given kind from a certain moment in time"

    # the default is resolved here rather than in the signature: a `pathlib.Path.cwd()` default value would be
    # evaluated only once (when the function is defined), and would not follow later changes of directory
    if output_directory is None:

        output_directory = pathlib.Path.cwd()

    # in case a `str` was passed
    output_directory = pathlib.Path(output_directory)

    # `kind` should be one of the pre-set types
    assert kind in sproc.structure.tables, f'unknown kind "{kind}"'

    info = sproc.structure.tables[kind]

    urls_filenames = make_urls(info['base_url'], info['base_filename'], from_date=date)

    every_output_file = []

    # in order to avoid showing a useless progress bar
    if not urls_filenames:

        return every_output_file

    for url, filename in tqdm(urls_filenames, desc='Downloading raw data'):

        output_file = output_directory / filename

        # file is annotated *regardless* of whether download was needed or it was already there
        every_output_file.append(output_file)

        if output_file.exists():

            # `tqdm.write` (rather than `print`) so that the progress bar is not garbled
            tqdm.write(f'"{output_file.name}" already exists')

            continue

        tqdm.write(f'downloading "{output_file.name}"...')

        # file is actually downloaded
        file(url, output_file)

    return every_output_file

Let us make a new directory...

In [None]:
# `downloads` is placed next to (not inside) the current working directory
output_directory = pathlib.Path.cwd().parent / 'downloads'
# `exist_ok=True` so that the cell can be run again without failing
output_directory.mkdir(exist_ok=True)
print(output_directory)

/home/manu/Sync/UC3M/proyectos/2022/nextProcurement/sproc/downloads


...into which files are to be downloaded

In [None]:
try:
    dl_files = from_date('outsiders', datetime.datetime(2021, 10, 1), output_directory=output_directory)
# only genuine errors are caught: a bare `except:` would also swallow, e.g., `KeyboardInterrupt`
except Exception as error:
    # the name is defined anyway, so that cells relying on it don't fail with a `NameError`
    dl_files = []
    # the underlying error is shown rather than hidden
    print(f'can\'t download...most likely due to banning... ({error})')

Downloading raw data: 100%|██████████| 4/4 [00:00<00:00, 315.46it/s]

"PlataformasAgregadasSinMenores_202111.zip" already exists
"PlataformasAgregadasSinMenores_202112.zip" already exists
"PlataformasAgregadasSinMenores_2022.zip" already exists
"PlataformasAgregadasSinMenores_202301.zip" already exists





In [None]:
#| hide
from nbdev.doclinks import nbdev_export

In [None]:
#| hide
# the cells marked with `#| export` are written to the module named by `#| default_exp` (first cell)
nbdev_export('80_download.ipynb')