In [15]:
#imports#
import os
import time
#from gallipy import Resource, Ark
import requests
import xml.etree.ElementTree as ET
import re
from pathlib import Path

In [16]:
# Create the directory where newspapers harvested from Gallica are saved.
try:
    # IPython/Jupyter exposes its directory history as `_dh`; the last entry
    # is the most recent working directory of the notebook.
    HERE = _dh[-1]
except NameError:
    # `_dh` does not exist outside IPython (e.g. when this cell is exported to
    # a plain script), so fall back to the process working directory.
    HERE = Path.cwd()
OUT_DIR = Path(HERE) / "gallica_data"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [17]:
# Newspaper title -> identifier placed in the `arkPress all "..."` clause of
# the SRU query built by create_cql().
# NOTE(review): presumably these are the Gallica ARK names of each
# periodical's by-date index — confirm against the Gallica API docs.
cds = {
    "L'Action française": "cb326819451_date",
    "L'Aurore": "cb32706846t_date",
    "Le Constitutionnel": "cb32747578p_date",
    "La Croix": "cb343631418_date",
    "Figaro : journal non politique": "cb34355551z_date",
    "Le Populaire": "cb34393339w_date",
    "L'Humanité": "cb327877302_date",
    "Le Temps": "cb34431794k_date",
    "Le Petit Journal": "cb32895690j_date",
    "Le Petit Parisien": "cb34419111x_date",
    "La Justice": "cb32802914p_date",
}

In [18]:
# Newspaper title -> filesystem-safe folder name; the harvest loop creates
# one sub-directory of OUT_DIR per newspaper using these names.
repo_titles = {
    "L'Action française": "L_Action_francaise",
    "L'Aurore": "L_Aurore",
    "Le Constitutionnel": "Le_Constitutionnel",
    "La Croix": "La_Croix",
    "Figaro : journal non politique": "Le_Figaro",
    "Le Populaire": "Le_Populaire",
    "L'Humanité": "L_Humanite",
    "Le Temps": "Le_Temps",
    "Le Petit Journal": "Le_Petit_Journal",
    "Le Petit Parisien": "Le_Petit_Parisien",
    "La Justice": "La_Justice",
}

In [None]:
# Newspapers to harvest, in harvesting order. The spellings are also the keys
# of the per-title lookup tables (cds, repo_titles, FIRST_YEAR, LAST_YEAR).
TITLES = [
    "L'Action française",
    "L'Aurore",
    "Le Constitutionnel",
    "La Croix",
    "Figaro : journal non politique",
    "Le Populaire",
    "L'Humanité",
    "Le Temps",
    "Le Petit Journal",
    "Le Petit Parisien",
    "La Justice",
]

# One `dc.title adj "<title>"` clause per newspaper, OR-ed together.
_title_clauses = ['dc.title adj "' + name + '"' for name in TITLES]
title_part = " OR ".join(_title_clauses)

# Combined query for all titles on one fixed date (18 Feb 1914).
# NOTE(review): not referenced by the other cells shown here; the harvest
# builds its queries with create_cql() instead.
CQL = "(" + title_part + ") " + 'and dc.type all "fascicule" and dc.date adj "19140218"'
def create_cql(journal, year) -> str:
    """Build the CQL query selecting one newspaper's issues for one year.

    Parameters
    ----------
    journal : str
        Newspaper title; must be a key of the module-level ``cds`` mapping.
    year : int or str
        Year interpolated into the ``dc.date`` clause.

    Returns
    -------
    str
        Three clauses (title, date, arkPress) joined with ``" and "``.

    Raises
    ------
    KeyError
        If ``journal`` has no entry in ``cds``.
    """
    ark_code = cds[journal]
    clauses = (
        f'dc.title adj "{journal}"',
        f'(dc.date = "{year}")',
        f'arkPress all "{ark_code}"',
    )
    return " and ".join(clauses)
# Number of records requested per SRU page (sent as `maximumRecords`) and the
# stride used to compute the next `startRecord` in the harvest loop.
HITS_PER_PAGE = 15

# Seconds the harvest loop sleeps between two consecutive SRU result pages.
PAUSE_BETWEEN_HITS = 60.0    
# NOTE(review): not referenced by any code visible in this notebook — confirm
# whether it is still needed.
PAUSE_BETWEEN_ISSUES = 0.2


In [26]:
# First year to harvest for each newspaper. Every title listed in TITLES needs
# an entry here and in LAST_YEAR, otherwise the harvest loop cannot build the
# year range for it.
FIRST_YEAR = {
    "L'Action française": 1908,
    "L'Aurore":            1897,
    "Le Constitutionnel":  1870,
    "La Croix":            1880,
    "Figaro : journal non politique":           1870,
    "Le Populaire":        1918,
    "L'Humanité":          1904,
    "Le Temps":            1870,
    "Le Petit Journal":     1870,
    "Le Petit Parisien":    1876,   # first year seen in the recorded harvest output
    "La Justice":           1880,   # assumed founding year — TODO confirm against Gallica holdings
}

# Last year (inclusive) to harvest for each newspaper.
LAST_YEAR = {
    "L'Action française": 1940,   # banned 1944, but we harvest ≤ 1940
    "L'Aurore":           1940,   # continues later, cap at 1940
    "Le Constitutionnel": 1914,
    "La Croix":           1940,   # still published; cap at 1940
    "Figaro : journal non politique":          1940,
    "Le Populaire":       1940,
    "L'Humanité":         1939,
    "Le Temps":           1940,   # actually ceases 1942; cap at 1940
    "Le Petit Journal":   1940,   # ceases 1944; cap at 1940
    "Le Petit Parisien":  1940,   # capped at 1940 like the other titles — TODO confirm
    "La Justice":         1940,   # capped at 1940; a year without hits simply yields no ARKs — TODO confirm
}

In [21]:
# Gallica endpoints
# SRU_URL is the search endpoint queried by run_sru_page().
SRU_URL = "https://gallica.bnf.fr/SRU"
# NOTE(review): DOC_URL is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
DOC_URL = "https://gallica.bnf.fr/services/Document"

In [None]:

#helpers

def run_sru_page(start_record: int, title, year, timeout: float = 30.0):
    """Fetch one page of SRU search results for a newspaper and year.

    Parameters
    ----------
    start_record : int
        1-based index of the first record to return (SRU ``startRecord``).
    title : str
        Newspaper title, passed to ``create_cql``.
    year : int or str
        Year, passed to ``create_cql``.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up. Without
        a timeout a stalled connection would block the harvest indefinitely.

    Returns
    -------
    tuple[str, int]
        The raw XML response text and the total hit count reported in
        ``<srw:numberOfRecords>`` (0 when that element is missing or empty).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Build the query once and reuse it for both the log line and the request.
    query = create_cql(title, year)
    print(query)
    params = {
        "version": "1.2",
        "operation": "searchRetrieve",
        "query": query,

        "startRecord": start_record,
        "maximumRecords": HITS_PER_PAGE,
        "recordSchema": "dc",
        "collapsing": "false"      # ← key line: get *every* fascicule
    }

    resp = requests.get(SRU_URL, params=params, timeout=timeout)
    resp.raise_for_status()
    xml_text = resp.text
    root = ET.fromstring(xml_text)
    ns = {"srw": "http://www.loc.gov/zing/srw/"}
    num_elem = root.find("./srw:numberOfRecords", ns)
    # An element without text has `.text is None`; treat that like "no hits".
    count_text = (num_elem.text or "").strip() if num_elem is not None else ""
    total_hits = int(count_text) if count_text else 0

    return xml_text, total_hits

def extract_arks_from_sru(xml_text: str):
    """Collect the ARK identifier of every record in an SRU response.

    Parameters
    ----------
    xml_text : str
        Raw XML of an SRU ``searchRetrieve`` response.

    Returns
    -------
    list[str]
        One ``"ark:/12148/<id>"`` string per record whose
        ``srw:extraRecordData/uri`` element holds a non-empty identifier,
        in document order. Records without a usable ``uri`` are skipped.
    """
    ns = {
        "srw": "http://www.loc.gov/zing/srw/",
    }
    root = ET.fromstring(xml_text)
    records = root.findall(".//srw:record", ns)
    print(f"Found {len(records)} records in SRU response.")

    arks = []
    for record in records:
        uri_elem = record.find(".//srw:extraRecordData/uri", ns)
        if uri_elem is None:
            continue
        # `.text` is None for an empty <uri/>; an empty or blank id would
        # otherwise produce the bogus ARK "ark:/12148/".
        obj_id = (uri_elem.text or "").strip()
        if obj_id:
            arks.append(f"ark:/12148/{obj_id}")
    return arks



In [23]:
import random, time, requests, pathlib, urllib3
from requests.exceptions import ConnectionError, ReadTimeout

# Identify the harvester (with a contact address) on every request, and reuse
# one HTTP session for all calls made through safe_get().
UA = "GallicaHarvester/0.3 (mara00008@stud.uni-saarland.de)"
SESSION = requests.Session()
SESSION.headers["User-Agent"] = UA

def _retry_after_seconds(value) -> float:
    """Convert a Retry-After header value into a delay in seconds.

    The header may carry either a number of seconds or an HTTP-date.
    Returns 0.0 when the header is absent, unparseable, or in the past.
    """
    if not value:
        return 0.0
    try:
        return max(0.0, float(int(value)))
    except ValueError:
        pass

    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    try:
        when = parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return 0.0
    if when.tzinfo is None:
        # A date without zone information is interpreted as UTC.
        when = when.replace(tzinfo=timezone.utc)
    return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())


def safe_get(url: str,
             tries: int = 6,
             base_wait: float = 10.0,
             max_wait: float = 120.0) -> str:
    """GET ``url`` through the shared session, retrying with back-off.

    A response counts as success when its status is 200 and the first 400
    bytes of the body do not contain ``b"denied"``. Any other response, and
    any ``ConnectionError``/``ReadTimeout``, triggers a retry after an
    exponentially growing pause (``base_wait * 2**(attempt-1)``, at least the
    server's Retry-After, capped at ``max_wait``) plus 3-12 s of random jitter.

    Parameters
    ----------
    url : str
        Address to fetch.
    tries : int
        Maximum number of attempts.
    base_wait : float
        Pause in seconds after the first failed attempt.
    max_wait : float
        Upper bound in seconds for the pause, before jitter is added.

    Returns
    -------
    str
        Decoded body of the first successful response.

    Raises
    ------
    RuntimeError
        When every attempt failed.
    """
    for attempt in range(1, tries + 1):
        # Sleeping after the final attempt would only delay the error.
        is_last = attempt == tries
        try:
            r = SESSION.get(url, timeout=30)
        except (ConnectionError, ReadTimeout) as e:
            if is_last:
                break
            wait = min(max_wait, base_wait * 2 ** (attempt - 1))
            wait += random.uniform(3, 12)
            print(f"[{attempt}/{tries}] connection problem "
                  f"({e.__class__.__name__}), sleep {wait:.1f}s")
            time.sleep(wait)
            continue

        # HTTP response received:
        if r.status_code == 200 and b"denied" not in r.content[:400]:
            return r.text

        if is_last:
            break

        # Any other response (typically 429 "Too many requests" or
        # 503 "Service unavailable") is retried after a pause.
        retry_after = r.headers.get("Retry-After")
        wait = max(_retry_after_seconds(retry_after),
                   base_wait * 2 ** (attempt - 1))
        wait = min(wait, max_wait)
        wait += random.uniform(3, 12)
        print(f"[{attempt}/{tries}] {r.status_code} blocked, "
              f"sleep {wait:.1f}s (Retry-After={retry_after})")
        time.sleep(wait)

    # All tries exhausted
    raise RuntimeError(f"safe_get(): gave up after {tries} attempts → {url}")

def download_issue_pdf(ark, outdir):
    """Download the raw OCR text of one issue into ``outdir``.

    Despite its name, this saves plain text (``<id>.txt``) fetched from the
    issue's ``f1n4.texteBrut`` view, not a PDF.

    Parameters
    ----------
    ark : str
        ARK of the issue, e.g. ``"ark:/12148/<id>"``; the part after the last
        ``/`` becomes the file name.
    outdir : pathlib.Path
        Existing directory that receives the ``.txt`` file.

    Returns
    -------
    bool
        ``False`` if the file already existed (nothing fetched), ``True``
        after a fresh download.
    """
    out = outdir / f"{ark.split('/')[-1]}.txt"
    if out.exists():
        return False            # don't hit the server again
    url = f"https://gallica.bnf.fr/{ark}/f1n4.texteBrut"
    txt = safe_get(url)

    # Write to a temporary sibling first and rename it into place: a failed or
    # interrupted write must not leave a partial "<id>.txt" behind, because
    # the exists() check above would then skip this issue on every later run.
    tmp = out.with_name(out.name + ".part")
    try:
        tmp.write_text(txt, encoding="utf-8")
        tmp.replace(out)
    finally:
        if tmp.exists():
            tmp.unlink()

    # polite pause before next OCR page
    time.sleep(random.uniform(2, 8))
    return True


In [None]:
def harvest_all_press_issues():
    """Download the OCR text of every issue of every configured newspaper.

    For each title in ``TITLES`` and each year between ``FIRST_YEAR[title]``
    and ``LAST_YEAR[title]`` (inclusive), the SRU results are paged through
    ``HITS_PER_PAGE`` records at a time and each issue is stored under
    ``OUT_DIR/<repo_titles[title]>/<year>/``. Issues whose file already exists
    are skipped by ``download_issue_pdf``, so an interrupted harvest can simply
    be restarted.

    A title without an entry in ``FIRST_YEAR`` or ``LAST_YEAR`` is reported and
    skipped instead of aborting the whole run with a ``KeyError``.
    """
    for journal in TITLES:
        first_year = FIRST_YEAR.get(journal)
        last_year = LAST_YEAR.get(journal)
        if first_year is None or last_year is None:
            print(f"\nSkipping {journal}: no FIRST_YEAR/LAST_YEAR entry configured")
            continue

        jdir = OUT_DIR / repo_titles[journal]
        jdir.mkdir(parents=True, exist_ok=True)

        for year in range(first_year, last_year + 1):
            print(f"\nHarvesting {journal} – {year}")
            ydir = jdir / str(year)
            ydir.mkdir(exist_ok=True)

            page = 1
            downloaded = 0

            while True:
                # SRU record indices are 1-based.
                start_record = (page - 1) * HITS_PER_PAGE + 1
                xml, total = run_sru_page(start_record, journal, year)

                if page == 1:
                    print(f"Gallica reports {total} issues")

                arks = extract_arks_from_sru(xml)
                if not arks:
                    print("No more ARKs -> stop year")
                    break

                print(f"  SRU page {page}: {len(arks)} ARKs")
                for ark in arks:
                    # Only fresh downloads count; existing files return False.
                    if download_issue_pdf(ark, ydir):
                        downloaded += 1

                # Have we reached / passed the last block
                if page * HITS_PER_PAGE >= total:
                    break

                page += 1
                time.sleep(PAUSE_BETWEEN_HITS)

            print(f"Finished {year}: {downloaded}/{total} issues saved")

In [25]:
# Start the full harvest. Already-saved issues are skipped, so this cell can
# be re-run after an interruption (the recorded output ends with a
# KeyboardInterrupt).
harvest_all_press_issues()


Harvesting Le Petit Parisien – 1876
dc.title adj "Le Petit Parisien" and (dc.date = "1876") and arkPress all "cb34419111x_date"
Gallica reports 75 issues
Found 15 records in SRU response. <Element '{http://www.loc.gov/zing/srw/}searchRetrieveResponse' at 0x7f8ca4bd7ec0>
  SRU page 1: 15 ARKs


dc.title adj "Le Petit Parisien" and (dc.date = "1876") and arkPress all "cb34419111x_date"
Found 15 records in SRU response. <Element '{http://www.loc.gov/zing/srw/}searchRetrieveResponse' at 0x7f8ca4bd78d0>
  SRU page 2: 15 ARKs


KeyboardInterrupt: 