In [2]:
import os, time, json, pathlib, re
import requests
import pandas as pd
from tqdm import tqdm

# -------------------------------
# Settings
# -------------------------------
TOPIC          = "romance"     # bookshelf/subject (case-insensitive)
LANGS          = "en"          # two-letter codes, comma-separated if multiple
COPYRIGHT      = "false"       # 'false' => public domain in the USA (Gutendex semantics)
PAGE_SIZE_NOTE = "Gutendex returns up to 32 results per page by default."  # FYI
MAX_BOOKS      = 100          # how many to pull in total
RATE_LIMIT_S   = 0.15          # be polite
OUTDIR         = pathlib.Path("pg_romance_texts")
OUTDIR.mkdir(exist_ok=True, parents=True)

# Preferred download formats in order
PREFERRED_MIME = [
    "text/plain; charset=utf-8",
    "text/plain",
    "text/html; charset=utf-8",
    "text/html"
]

def gutendex_page(url):
    """GET a Gutendex page and return parsed JSON."""
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return r.json()

def build_url(page_url=None):
    """Compose the initial URL with filters, or return the next-page URL."""
    if page_url:
        return page_url
    # Docs: topic filters bookshelves/subjects; languages & copyright are supported
    # e.g., https://gutendex.com/books?topic=romance&languages=en&copyright=false
    return f"https://gutendex.com/books?topic={TOPIC}&languages={LANGS}&copyright={COPYRIGHT}"

def pick_best_text(formats: dict):
    """Choose the best text-like URL using preferred MIME order."""
    for mime in PREFERRED_MIME:
        if mime in formats:
            return mime, formats[mime]
    # fall-back: any text/* if present
    for k, v in formats.items():
        if k.startswith("text/"):
            return k, v
    return None, None

# -------------------------------
# Crawl Gutendex for Romance
# -------------------------------
books = []
url = build_url()
pbar = tqdm(total=MAX_BOOKS, desc="Fetching Romance (Gutendex topic)")
while url and len(books) < MAX_BOOKS:
    js = gutendex_page(url)
    results = js.get("results", [])
    for b in results:
        # Basic fields we care about
        bid = b.get("id")
        title = b.get("title")
        authors = [a.get("name") for a in b.get("authors", []) if a.get("name")]
        download_count = b.get("download_count")
        subjects = b.get("subjects", [])
        bookshelves = b.get("bookshelves", [])
        languages = b.get("languages", [])
        formats = b.get("formats", {})

        mime, dl = pick_best_text(formats)
        books.append({
            "pg_id": bid,
            "title": title,
            "authors": authors,
            "download_count": download_count,
            "subjects": subjects,
            "bookshelves": bookshelves,
            "languages": languages,
            "chosen_mime": mime,
            "text_url": dl
        })
        if len(books) >= MAX_BOOKS:
            break
    pbar.update(len(results))
    url = js.get("next")  # Gutendex provides a full URL for the next page
    time.sleep(RATE_LIMIT_S)
pbar.close()

df = pd.DataFrame(books)
print(f"Collected {len(df)} romance items from Gutendex (topic={TOPIC}, lang={LANGS}, PD={COPYRIGHT}).")
display(df.head(5))

Fetching Romance (Gutendex topic):   3%|▎         | 32/1000 [00:14<07:32,  2.14it/s]
Fetching Romance (Gutendex topic): 128it [00:23,  5.42it/s]


Collected 100 romance items from Gutendex (topic=romance, lang=en, PD=false).


Unnamed: 0,pg_id,title,authors,download_count,subjects,bookshelves,languages,chosen_mime,text_url
0,1513,Romeo and Juliet,"[Shakespeare, William]",87684,"[Conflict of generations -- Drama, Juliet (Fic...","[Category: British Literature, Category: Class...",[en],text/html,https://www.gutenberg.org/ebooks/1513.html.images
1,1342,Pride and Prejudice,"[Austen, Jane]",76463,"[Courtship -- Fiction, Domestic fiction, Engla...","[Best Books Ever Listings, Category: British L...",[en],text/html,https://www.gutenberg.org/ebooks/1342.html.images
2,2641,A Room with a View,"[Forster, E. M. (Edward Morgan)]",57628,"[British -- Italy -- Fiction, England -- Ficti...","[Category: British Literature, Category: Novel...",[en],text/html,https://www.gutenberg.org/ebooks/2641.html.images
3,67979,The Blue Castle: a novel,"[Montgomery, L. M. (Lucy Maud)]",47359,"[Canada -- History -- 1914-1945 -- Fiction, Ch...","[Category: Novels, Category: Romance]",[en],text/html,https://www.gutenberg.org/ebooks/67979.html.im...
4,1260,Jane Eyre: An Autobiography,"[Brontë, Charlotte]",41093,"[Bildungsromans, Charity-schools -- Fiction, C...","[Category: British Literature, Category: Class...",[en],text/html,https://www.gutenberg.org/ebooks/1260.html.images
