In [1]:
import fitz  # PyMuPDF

# Copy a fixed page window of the scanned catalogue into its own PDF
# (presumably the U.S. section, going by the output filename -- confirm).
# PyMuPDF page numbers are 0-based and to_page is inclusive, so 33..95
# covers 63 pages of the source document.
FIRST_PAGE, LAST_PAGE = 33, 95

# Context managers close both documents even if the copy or the save fails.
with fitz.open("Scotts_1917.pdf") as doc, fitz.open() as subdoc:
    # One ranged insert copies the whole window in a single call.
    subdoc.insert_pdf(doc, from_page=FIRST_PAGE, to_page=LAST_PAGE)
    subdoc.save("usa_pages.pdf")


In [2]:
import fitz  # PyMuPDF
import pandas as pd
import re

# Load the PDF
# Open the page extract that the first cell saved as "usa_pages.pdf".
doc = fitz.open("usa_pages.pdf")

# Running context carried from line to line while scanning the text:
# the most recent section heading and the most recent year seen.
current_country = current_year = None

# Accumulates one dict per matched stamp entry.
data = list()

# Line-level matchers used while scanning the extracted text.

# A standalone four-digit number starting with 18 or 19.
year_pattern = re.compile(r"\b(18\d{2}|19\d{2})\b")

# A stamp entry: denomination, colour text, then up to two decimal prices.
# The adjacent raw strings are concatenated by Python into a single pattern.
entry_pattern = re.compile(
    r'(?P<value>\d+c|\$?\d+\.\d+|\d+\s?c)'  # "5c", "5 c", "1.00" or "$1.00"
    r'\s'
    r'(?P<color>[a-zA-Z\.\- ]+)'            # letters, dots, hyphens, spaces
    r'\s+'
    r'(?P<used>\d+\.\d+)?'                  # first decimal price, if present
    r'\s*'
    r'(?P<unused>\d+\.\d+)?'                # second decimal price, if present
)

# Country heading heuristic
def is_country_heading(line):
    """Heuristically decide whether ``line`` looks like a section heading.

    Returns True only when all of the following hold:

    * the line is upper-case (``str.isupper``),
    * it has fewer than six whitespace-separated words,
    * after stripping, it neither ends with '.' nor starts with "TABLES",
    * it does not contain the text "CENTS",
    * it contains at least one alphabetic character.
    """
    if not line.isupper():
        return False
    if len(line.split()) >= 6:
        return False

    trimmed = line.strip()
    if trimmed.endswith('.') or trimmed.startswith("TABLES"):
        return False
    if "CENTS" in line:
        return False

    return any(ch.isalpha() for ch in line)

# Scan every line of every page, carrying the latest heading and year
# forward, and record each line that matches the stamp-entry pattern.
for page in doc:
    for raw_line in page.get_text().split('\n'):
        line = raw_line.strip()

        # Heading lines only update the section tracker and are then skipped.
        # NOTE(review): because of this skip, a year printed on a heading line
        # never updates current_year -- confirm that is intended.
        # NOTE(review): current_country is tracked here but is not written
        # into the records below.
        if is_country_heading(line):
            current_country = line.title()
            continue

        # Remember the most recent year seen; it labels the entries that follow.
        found_year = year_pattern.search(line)
        if found_year is not None:
            current_year = found_year.group()

        # Lines that do not look like a stamp entry contribute nothing.
        entry = entry_pattern.search(line)
        if entry is None:
            continue

        fields = entry.groupdict()
        data.append({
            "Year": current_year,
            "Stamp Value": fields["value"],
            "Color": fields["color"].strip(),
            "Used Price": fields["used"],
            "Unused Price": fields["unused"],
            "Raw Line": line,
        })

# Build the table from the collected entry records.
df = pd.DataFrame(data=data)

# Write it out without the index column, then confirm on stdout.
df.to_csv(path_or_buf="stamps_usa_over_time.csv", index=False)
print("✅ Saved as 'stamps_usa_over_time.csv'")


✅ Saved as 'stamps_usa_over_time.csv'


In [3]:
!pip install pymupdf

Collecting pymupdf
  Obtaining dependency information for pymupdf from https://files.pythonhosted.org/packages/4a/26/8c72973b8833a72785cedc3981eb59b8ac7075942718bbb7b69b352cdde4/pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata
  Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-win_amd64.whl (18.7 MB)
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/18.7 MB 330.3 kB/s eta 0:00:57
   ---------------------------------------- 0.0/18.7 MB 393.8 kB/s eta 0:00:48
   ---------------------------------------- 0.1/18.7 MB 798.9 kB/s eta 0:00:24
   - -------------------------------------- 0.9/18.7 MB 5.1 MB/s eta 0:00:04
   ----- ---------------------------------- 2.6/18.7 MB 11.2 MB/s eta 0:00:02
   ---------- ----------------------------- 5.1/18.7 MB 19.3 MB/s eta 0:00:01
   --------------- -------------