### First installed gallipy through pip install git+https://github.com/GeoHistoricalData/gallipy.git ###


In [2]:
#imports#
import os
import time
from gallipy import Resource, Ark
import requests
import xml.etree.ElementTree as ET
import re
from pathlib import Path

In [3]:
#create directory to save newspapers from gallica 
HERE = _dh[-1]
OUT_DIR = Path(HERE) / "gallica_data"
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [4]:
cds= { 
      "L'Action française": "cb326819451_date",
      "L'Aurore": "cb32706846t_date",
      "Le Constitutionnel": "cb32747578p_date",
      "La Croix": "cb343631418_date",
      "Le Figaro": "cb34355551z_date",
      "Le Populaire": "cb34393339w_date",
      "L'Humanité": "cb327877302_date",
      "Le Temps": "cb34431794k_date",
    }

In [5]:
repo_titles= { 
      "L'Action française": "L_Action_francaise",
      "L'Aurore": "L_Aurore",
      "Le Constitutionnel": "Le_Constitutionnel",
      "La Croix": "La_Croix",
      "Le Figaro": "Le_Figaro",
      "Le Populaire": "Le_Populaire",
      "L'Humanité": "L_Humanite",
      "Le Temps": "Le_Temps",
    }

In [57]:
TITLES = [
    "L'Action française", "L'Aurore", "Le Constitutionnel", "La Croix",
    "Le Figaro", "Le Populaire", "L'Humanité", "Le Temps",
]
title_part = " OR ".join(f'dc.title adj "{t}"' for t in TITLES)

CQL = (
    f'({title_part}) '
    'and dc.type all "fascicule" and dc.date adj "19140218"'
)
def create_cql(journal, year) -> str:
    # start = f"{year}0101"          # 1 Jan YEAR
    # end   = f"{year}1231"          # 31 Dec YEAR
    # title_part = " OR ".join(f'dc.title adj "{t}"' for t in TITLES)
    cd = cds[journal]
    return (
        
        f'dc.title adj "{journal}"'
        #f' and dc.type all "fascicule"'
        f' and (dc.date = "{year}")'
        f' and arkPress all "{cds[journal]}"'

        # f'AND dc.date >= {start} AND dc.date <= {end}'
    )
HITS_PER_PAGE = 15

PAUSE_BETWEEN_HITS = 60.0    
PAUSE_BETWEEN_ISSUES = 0.2


In [56]:
FIRST_YEAR = {
    "L'Action française": 1909,
    "L'Aurore":            1897,
    "Le Constitutionnel":  1870,
    "La Croix":            1880,
    "Le Figaro":           1870,
    "Le Populaire":        1921,
    "L'Humanité":          1904,
    "Le Temps":            1870,
}

LAST_YEAR = {
    "L'Action française": 1940,   # banned 1944, but we harvest ≤ 1940
    "L'Aurore":           1940,   # continues later, cap at 1940
    "Le Constitutionnel": 1914,
    "La Croix":           1940,   # still published; cap at 1940
    "Le Figaro":          1940,
    "Le Populaire":       1940,
    "L'Humanité":         1940,
    "Le Temps":           1940,   # actually ceases 1942; cap at 1940
}

In [8]:
# Gallica endpoints
SRU_URL = "https://gallica.bnf.fr/SRU"
DOC_URL = "https://gallica.bnf.fr/services/Document"

In [14]:

#helpers

def run_sru_page(start_record: int, title, year):
    print(create_cql(title, year))
    params = {
        "version": "1.2",
        "operation": "searchRetrieve",
        "query": create_cql(title, year),

        "startRecord": start_record,
        "maximumRecords": HITS_PER_PAGE,
        "recordSchema": "dc",
        "collapsing": "false"      # ← key line: get *every* fascicule
    }
    
    resp = requests.get(SRU_URL, params=params)
    resp.raise_for_status()
    xml_text = resp.text
    root = ET.fromstring(xml_text)
    ns = {"srw": "http://www.loc.gov/zing/srw/"}
    num_elem = root.find("./srw:numberOfRecords", ns)
    total_hits = int(num_elem.text) if num_elem is not None else 0

    return xml_text, total_hits

def extract_arks_from_sru(xml_text: str):
    ns = {
        "srw": "http://www.loc.gov/zing/srw/",
    }
    root = ET.fromstring(xml_text)
    arks = []
    print(f"Found {len(root.findall('.//srw:record', ns))} records in SRU response.", root)

    for record in root.findall(".//srw:record", ns):
        uri_elem = record.find(".//srw:extraRecordData/uri", ns)
        if uri_elem is not None:
            obj_id = uri_elem.text.strip()  
            arks.append(f"ark:/12148/{obj_id}")
    return arks

def download_issue_pdf(ark, outdir) -> bool:
    r = Resource(ark)
    either = r.content_sync(mode="texteBrut")      # full-issue OCR

    if either.is_left:                # no OCR => skip
        print("   ! Skipped (no OCR)")
        return False

    text = either.value               # <-- already a str
    if not text.strip():              # empty / whitespace-only?
        print("   ! Skipped (empty OCR payload)")
        return False

    ark_id = ark.replace("ark:/", "").replace("/", "_")
    out_path = os.path.join(outdir, f"{ark_id}.txt")

    if os.path.exists(out_path):
        return True                   # already on disk

    # write **text**, not bytes
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"   [SAVED OCR] {ark_id}.txt  ({len(text)//1024} KB)")
    return True



In [52]:
def harvest_all_press_issues():
    for journal in TITLES:
        jdir = OUT_DIR / repo_titles[journal]
        jdir.mkdir(parents=True, exist_ok=True)

        for year in range(FIRST_YEAR[journal], LAST_YEAR[journal] + 1):
            print(f"\nHarvesting {journal} – {year}")
            ydir = jdir / str(year)
            ydir.mkdir(exist_ok=True)

            page = 1                 # ← reset for every year
            downloaded = 0

            while True:
                start_record = (page - 1) * HITS_PER_PAGE + 1
                xml, total = run_sru_page(start_record, journal, year)

                if page == 1:
                    print(f"Gallica reports {total} issues")

                arks = extract_arks_from_sru(xml)
                if not arks:
                    print("No more ARKs -> stop year")
                    break

                print(f"  SRU page {page}: {len(arks)} ARKs")
                for ark in arks:
                    if download_issue_pdf(ark, ydir):   # or _pdf, your helper
                        downloaded += 1

                # Have we reached / passed the last block ?
                if page * HITS_PER_PAGE >= total:
                    break

                page += 1
                time.sleep(PAUSE_BETWEEN_HITS)

            print(f"Finished {year}: {downloaded}/{total} issues saved")

In [53]:
import random, time, requests, pathlib

UA = "GallicaHarvester/0.2 (+youremail@example.com)"
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": UA})

def safe_get(url, tries=6, base_wait=10):
    for i in range(tries):
        r = SESSION.get(url, timeout=30)
        if r.status_code == 200 and b"denied" not in r.content[:200]:
            return r.text
        wait = max(base_wait, int(r.headers.get("Retry-After", 0)))
        wait += random.uniform(3, 12)           # jitter
        print(f"  blocked (try {i+1}), sleeping {wait:.1f}s")
        time.sleep(wait)
    raise RuntimeError(f"still blocked after {tries} tries → {url}")

def download_issue_pdf(ark, outdir):
    out = outdir / f"{ark.split('/')[-1]}.txt"
    if out.exists():
        return False            # don’t hit the server again
    url = f"https://gallica.bnf.fr/{ark}/f1n4.texteBrut"
    txt = safe_get(url)
    out.write_text(txt, encoding="utf-8")
    # polite pause before next OCR page
    time.sleep(random.uniform(2, 8))
    return True


In [58]:
harvest_all_press_issues()


Harvesting L'Action française – 1909
dc.title adj "L'Action française" and (dc.date = "1909") and arkPress all "cb326819451_date"
Gallica reports 365 issues
Found 15 records in SRU response. <Element '{http://www.loc.gov/zing/srw/}searchRetrieveResponse' at 0x7ff1de2c7f60>
  SRU page 1: 15 ARKs
dc.title adj "L'Action française" and (dc.date = "1909") and arkPress all "cb326819451_date"
Found 15 records in SRU response. <Element '{http://www.loc.gov/zing/srw/}searchRetrieveResponse' at 0x7ff1de9e2480>
  SRU page 2: 15 ARKs
dc.title adj "L'Action française" and (dc.date = "1909") and arkPress all "cb326819451_date"
Found 15 records in SRU response. <Element '{http://www.loc.gov/zing/srw/}searchRetrieveResponse' at 0x7ff1dd638950>
  SRU page 3: 15 ARKs
  blocked (try 1), sleeping 15.2s
  blocked (try 2), sleeping 14.1s
  blocked (try 3), sleeping 19.8s
  blocked (try 1), sleeping 17.1s
  blocked (try 2), sleeping 13.2s
dc.title adj "L'Action française" and (dc.date = "1909") and arkPress

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))