# Scraping the AVB for PDFs of bulletins

In [1]:
import os
import re
import sys
import time
from pathlib import Path

import requests

## Fonctions

In [2]:
def get_urls(timeout=30):
    """Retrieve the URLs of all bulletin PDFs linked from the AVB index page.

    Fetches the "bulletins by date" page, prints a few diagnostics (HTTP
    status, encoding, page length, number of matches) and extracts every
    link that points into the bulletin PDF document store.

    Args:
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        list[str]: The matched PDF URLs in the order they appear in the
        page. Duplicates are not removed.

    Raises:
        requests.HTTPError: If the index page answers with an error status.
    """
    root_url = "https://archives.bruxelles.be/bulletins/date"
    resp = requests.get(root_url, timeout=timeout)
    print(f"Status: {resp.status_code}")
    print(f"Encoding: {resp.encoding}")
    # Fail loudly rather than returning an empty list for an error page.
    resp.raise_for_status()
    html = resp.text
    print(f"Text length: {len(html)}")

    # The host dots are escaped so they only match literal dots. The file-name
    # part is non-greedy and may not cross a quote, a tag boundary or a line
    # break: a greedy ``.*`` would merge several links sharing one line into a
    # single bogus URL.
    pattern = (
        r'https://archief\.brussel\.be/Colossus/BulletinsCommunaux/Bulletins/Documents/'
        r'[^"<>\r\n]*?\.pdf'
    )
    urls = re.findall(pattern, html)
    print(f"{len(urls)} PDF files found")
    return urls

In [3]:
def download(pdf_path, urls, offset=0, timeout=60):
    """Download the files behind ``urls`` into the directory ``pdf_path``.

    Each file is saved under the last path segment of its URL. A file that
    already exists is overwritten. Progress and the time spent on each
    request are printed.

    Args:
        pdf_path: Target directory; created if it does not exist yet.
        urls: Sequence of URLs to fetch.
        offset: Index of the first URL to download; earlier ones are skipped.
        timeout: Seconds to wait for the server on each request; passed
            straight to ``requests.get``.
    """
    os.makedirs(pdf_path, exist_ok=True)
    for url in urls[offset:]:
        filename = url.split("/")[-1]
        print(f"Dowloading {filename}...")
        start_time = time.time()
        response = requests.get(url, timeout=timeout)
        print(f"   done in {(time.time() - start_time):.1f} seconds")
        # Never store the body of an error response under a .pdf name: the
        # file would look like a successful download to any later check.
        if response.status_code >= 400:
            print(f"   skipped: HTTP status {response.status_code}")
            continue
        with open(os.path.join(pdf_path, filename), 'wb') as f:
            f.write(response.content)

In [4]:
def check(pdf_path, urls):
    """Report which of the files behind ``urls`` are present in ``pdf_path``.

    A file counts as present when the last path segment of its URL is an
    entry of the directory. Every missing file is printed, followed by a
    summary line. Only names are compared; file contents are not inspected.

    Args:
        pdf_path: Directory expected to hold the downloaded files. A
            directory that does not exist is treated as empty.
        urls: Sequence of URLs whose files should be present.
    """
    # List the directory once (it does not change during the loop) and use a
    # set so each membership test is constant-time.
    try:
        downloads = set(os.listdir(pdf_path))
    except FileNotFoundError:
        downloads = set()

    ok_count = 0
    for url in urls:
        filename = url.split("/")[-1]
        if filename in downloads:
            ok_count += 1
        else:
            print(f"{filename} is missing!")
    print(f"{ok_count} PDFs found on {len(urls)}!")

## Obtenir les URLs des bulletins communaux 

In [5]:
# Fetch the list of bulletin PDF URLs from the AVB index page (network call).
urls = get_urls()

Status: 200
Encoding: utf-8
Text length: 821431
2833 PDF files found


In [6]:
# Directory the PDFs are downloaded to, relative to the current working directory.
pdf_path = '../data/pdf'

In [7]:
# Create the download directory (and any missing parents); a no-op if it already exists.
Path(pdf_path).mkdir(parents=True, exist_ok=True)

## Télécharger une tranche de PDFs (positions 200 à 399 de la liste)

In [10]:
%%time
# Download the URLs at positions 200-399 of the list (fewer if the list is shorter).
download(pdf_path, urls[200:400])

Dowloading Bxl_1869_Tome_I1_Part_3.pdf...
   done in 9.8 seconds
Dowloading Bxl_1869_Tome_I1_Part_4.pdf...
   done in 0.9 seconds
Dowloading Bxl_1869_Tome_I2_Part_1.pdf...
   done in 9.8 seconds
Dowloading Bxl_1869_Tome_I2_Part_2.pdf...
   done in 9.3 seconds
Dowloading Bxl_1869_Tome_I2_Part_3.pdf...
   done in 8.8 seconds
Dowloading Bxl_1869_Tome_I2_Part_4.pdf...
   done in 1.3 seconds
Dowloading Bxl_1869_Tome_II1_Part_1.pdf...
   done in 8.6 seconds
Dowloading Bxl_1869_Tome_II1_Part_2.pdf...
   done in 10.2 seconds
Dowloading Bxl_1869_Tome_II1_Part_3.pdf...
   done in 8.0 seconds
Dowloading Bxl_1869_Tome_II1_Part_4.pdf...
   done in 11.3 seconds
Dowloading Bxl_1869_Tome_II1_Part_5.pdf...
   done in 3.0 seconds
Dowloading Bxl_1870_Tome_I1_Part_1.pdf...
   done in 8.4 seconds
Dowloading Bxl_1870_Tome_I1_Part_2.pdf...
   done in 11.5 seconds
Dowloading Bxl_1870_Tome_I1_Part_3.pdf...
   done in 9.4 seconds
Dowloading Bxl_1870_Tome_I1_Part_4.pdf...
   done in 9.1 seconds
Dowloading Bxl_18

KeyboardInterrupt: 

## Vérifier si les 3 premiers PDFs ont été téléchargés

In [18]:
# Verify that the files for the first three URLs are present in the download directory.
check(pdf_path, urls[:3])

3 PDFs found on 3!
