# Scraping the AVB (Archives de la Ville de Bruxelles) for PDFs of communal bulletins

In [2]:
import os
import re
import time
import sys

import requests

## Fonctions

In [3]:
def get_urls(root_url="https://archives.bruxelles.be/bulletins/date", timeout=60):
    """Retrieve the URLs of all bulletin PDFs linked from the root AVB page.

    Args:
        root_url: Address of the page that lists the bulletins.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        List of PDF URLs in the order they appear in the page
        (duplicates, if any, are kept).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    resp = requests.get(root_url, timeout=timeout)
    print(f"Status: {resp.status_code}")
    print(f"Encoding: {resp.encoding}")
    # Fail loudly instead of scanning an error page and reporting 0 files.
    resp.raise_for_status()
    html = resp.text
    print(f"Text length: {len(html)}")

    # Dots are escaped so they only match literal dots. The file name part is
    # non-greedy and cannot cross whitespace, quotes or angle brackets, so two
    # links sitting on the same line of HTML are returned as two URLs instead
    # of one string running from the first link to the last ".pdf".
    pattern = (
        r"https://archief\.brussel\.be/Colossus/BulletinsCommunaux/Bulletins/Documents/"
        r"""[^\s"'<>]+?\.pdf"""
    )
    urls = re.findall(pattern, html)
    print(f"{len(urls)} PDF files found")
    return urls

In [4]:
def download(urls, offset=0, dest_dir="data/pdf", timeout=60):
    """Download every file in ``urls`` starting from index ``offset``.

    Each file is saved in ``dest_dir`` under the last path segment of its URL.
    A URL answered with an HTTP error status is reported and skipped, so an
    error page is never saved with a ``.pdf`` name.

    Args:
        urls: List of file URLs.
        offset: Index of the first URL to download (useful to resume a run).
        dest_dir: Directory receiving the files; created if missing.
        timeout: Seconds to wait for the server before giving up on a request.
    """
    os.makedirs(dest_dir, exist_ok=True)
    for url in urls[offset:]:
        filename = url.split("/")[-1]
        print(f"Dowloading {filename}...")
        start_time = time.time()
        response = requests.get(url, timeout=timeout)
        if not response.ok:
            # Do not write the error body to disk as if it were the PDF.
            print(f"   failed with HTTP status {response.status_code}, skipped")
            continue
        print(f"   done in {(time.time() - start_time):.1f} seconds")
        with open(os.path.join(dest_dir, filename), 'wb') as f:
            f.write(response.content)

In [26]:
def check(urls, dest_dir="data/pdf"):
    """Report which files from ``urls`` are present in ``dest_dir``.

    Prints one line per missing file, then a summary line with the number of
    files found out of the number of URLs given.

    Args:
        urls: List of file URLs; the expected file name is the last path
            segment of each URL.
        dest_dir: Directory where the files were downloaded. If it does not
            exist, every file is reported as missing.
    """
    # List the directory once (not once per URL) and use a set for fast lookups.
    downloads = set(os.listdir(dest_dir)) if os.path.isdir(dest_dir) else set()
    ok_count = 0
    for url in urls:
        filename = url.split("/")[-1]
        if filename not in downloads:
            print(f"{filename} is missing!")
        else:
            ok_count += 1
    print(f"{ok_count} PDFs found on {len(urls)}!")

## Obtenir les URLs des bulletins communaux 

In [8]:
# Fetch the root page once and keep the list of PDF links for the cells below.
urls = get_urls()

Status: 200
Encoding: utf-8
Text length: 821431
2833 PDF files found


In [12]:
# Sanity check: number of PDF links collected.
len(urls)

2833

In [13]:
# Preview the first three links.
urls[:3]

['https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_1.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_2.pdf',
 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_3.pdf']

## Créer un dossier `data/pdf` s'il n'existe pas

In [14]:
from pathlib import Path

# download() and check() use the "data/pdf" directory; exist_ok=True makes
# this cell safe to re-run.
Path("data/pdf").mkdir(parents=True, exist_ok=True)

## Télécharger les 3 premiers PDFs

In [29]:
# Only the first three files are fetched here; each one takes several seconds.
download(urls[:3])

Dowloading Bxl_1847_Tome_I1_Part_1.pdf...
   done in 15.2 seconds
Dowloading Bxl_1847_Tome_I1_Part_2.pdf...
   done in 14.2 seconds
Dowloading Bxl_1847_Tome_I1_Part_3.pdf...
   done in 13.8 seconds


## Vérifier si les 3 premiers PDFs ont été téléchargés

In [30]:
# Verify the same three files that were downloaded above.
check(urls[:3])

3 PDFs found on 3!
