# Scraping the AVB for PDFs of bulletins

In [52]:
import os
import re
import time
import sys

import requests

In [53]:
## Fonctions

In [54]:
def get_urls():
    """Retrieve all URLs from root AVB page"""
    root_url = "https://archives.bruxelles.be/bulletins/date"
    resp = requests.get(root_url)
    print(f"Status: {resp.status_code}")
    print(f"Encoding: {resp.encoding}")
    html = resp.text
    print(f"Text length: {len(html)}")

    pattern = r"https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/.*\.pdf"
    urls = re.findall(pattern, html)
    print(f"{len(urls)} PDF files found")
    return urls

In [55]:
def download(urls, offset=0):
    """Dowloading all files starting from offset"""
    for url in urls[offset:]:
        filename = url.split("/")[-1]
        print(f"Dowloading {filename}...")
        start_time = time.time()
        response = requests.get(url)
        print(f"   done in {(time.time() - start_time):.1f} seconds")
        with open(f"data/pdf/{filename}", 'wb') as f:
            f.write(response.content)

In [56]:
def check(urls):
    """Check if all files have been downloaded"""
    ok_count = 0
    for url in urls:
        filename = url.split("/")[-1]
        downloads = os.listdir('data/pdf')
        if filename not in downloads:
            print(f"{filename} is missing!")
        else:
            ok_count += 1
    print(f"{ok_count} PDFs found on {len(urls)}!")

## Obtenir les URLs des bulletins communaux 

In [57]:
urls = get_urls()

Status: 200
Encoding: utf-8
Text length: 821431
2833 PDF files found


In [58]:
len(urls)

2833

In [59]:
urls[:]

ocuments/Bxl_1899_Tome_I1_Part_7.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I1_Part_8.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I1_Part_9.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_1.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_10.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_11.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_12.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_13.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_2.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1899_Tome_I2_Part_3.

## Créer un dossier `data/pdf` s'il n'existe pas

In [60]:
pdf_path = '/Volumes/Macintosh HD/Data/ADB/pdf'

In [61]:
from pathlib import Path

Path("data/pdf").mkdir(parents=True, exist_ok=True)

## Télécharger les 3 premiers PDFs

In [62]:
%%time
download(pdf_path, urls[:])

_Part_1.pdf...
   done in 1.2 seconds
Dowloading Bxl_1971_Tome_I_Part_10.pdf...
   done in 1.4 seconds
Dowloading Bxl_1971_Tome_I_Part_11.pdf...
   done in 1.5 seconds
Dowloading Bxl_1971_Tome_I_Part_12.pdf...
   done in 1.3 seconds
Dowloading Bxl_1971_Tome_I_Part_13.pdf...
   done in 1.1 seconds
Dowloading Bxl_1971_Tome_I_Part_14.pdf...
   done in 0.9 seconds
Dowloading Bxl_1971_Tome_I_Part_2.pdf...
   done in 1.3 seconds
Dowloading Bxl_1971_Tome_I_Part_3.pdf...
   done in 1.4 seconds
Dowloading Bxl_1971_Tome_I_Part_4.pdf...
   done in 1.4 seconds
Dowloading Bxl_1971_Tome_I_Part_5.pdf...
   done in 1.7 seconds
Dowloading Bxl_1971_Tome_I_Part_6.pdf...
   done in 1.5 seconds
Dowloading Bxl_1971_Tome_I_Part_7.pdf...
   done in 1.5 seconds
Dowloading Bxl_1971_Tome_I_Part_8.pdf...
   done in 1.4 seconds
Dowloading Bxl_1971_Tome_I_Part_9.pdf...
   done in 1.4 seconds
Dowloading Bxl_1971_Tome_RptAn_Part_1.pdf...
   done in 1.0 seconds
Dowloading Bxl_1971_Tome_RptAn_Part_2.pdf...
   done in 0

## Vérifier si les 3 premiers PDFs ont été téléchargés

In [64]:
check(pdf_path, urls[:])

2833 PDFs found on 2833!
