## Bulk download ELODIE spectra for FGK list

This notebook:
- Reads `../data/fgk.txt`
- Filters rows with `origin == "ELODIE"`
- Extracts object identifiers (prefers `HD...` names)
- Downloads all available ELODIE spectra (+ CCF) with concurrency
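- Saves:
  - raw FITS to `../data/spectra/elodie/`
  - Zarr to `../data/elodie.zarr`
  - not-found identifiers to `../data/elodie_not_found.jsonl`
  - error records to `../data/elodie_errors.jsonl` (the `error_path` handed to the downloader)
  - a per-identifier resume cache to `../data/elodie_bulk_cache.jsonl`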

> Note: This runs real network downloads and may take a while.

In [1]:
import sys
import os
import pandas as pd
# Make `spectra_download` importable by adding the parent of the notebook's
# directory to sys.path. When `__file__` is not defined (as is typical in a
# notebook kernel) the current working directory stands in for the notebook's
# directory.
notebook_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
project_root = os.path.dirname(notebook_dir)
# Guard against duplicates so re-running this cell does not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from __future__ import annotations

import logging
from pathlib import Path

from spectra_download import ElodieSource, SpectraRequest, bulk_download

# Emit INFO-level log records so the downloader's progress shows up in the cell output.
logging.basicConfig(level=logging.INFO)

# Every input and output path hangs off the parent of the current working directory.
ROOT = Path("..").resolve()
FGK_PATH = ROOT.joinpath("data", "fgk.txt")
RAW_SAVE_PATH = ROOT.joinpath("data", "spectra", "elodie")
ZARR_PATH = ROOT.joinpath("data", "elodie.zarr")
NOT_FOUND_PATH = ROOT.joinpath("data", "elodie_not_found.jsonl")
ERROR_PATH = ROOT.joinpath("data", "elodie_errors.jsonl")

# The raw FITS directory must exist before anything is written into it.
RAW_SAVE_PATH.mkdir(parents=True, exist_ok=True)

# Source registry handed to bulk_download, keyed by the name used in each request.
sources = dict(elodie=ElodieSource(timeout=20, max_retries=3))

# Echo the resolved locations so a wrong working directory is spotted early.
for label, location in (
    ("FGK:", FGK_PATH),
    ("raw_save_path:", RAW_SAVE_PATH),
    ("zarr_paths:", ZARR_PATH),
    ("not_found_path:", NOT_FOUND_PATH),
    ("error_path:", ERROR_PATH),
):
    print(label, location)

FGK: /Users/mjablons/Documents/spectra/data/fgk.txt
raw_save_path: /Users/mjablons/Documents/spectra/data/spectra/elodie
zarr_paths: /Users/mjablons/Documents/spectra/data/elodie.zarr
not_found_path: /Users/mjablons/Documents/spectra/data/elodie_not_found.jsonl


In [3]:
import csv
import re

# Matches a whole value of the form "HD" + digits (case-insensitive, no spaces).
HD_RE = re.compile(r"^(HD\d+)$", re.IGNORECASE)


def iter_elodie_identifiers(path: Path) -> list[str]:
    """Collect the unique ELODIE object identifiers from a tab-separated table.

    The file is read as tab-delimited text with a header row. Only rows whose
    ``origin`` column equals ``ELODIE`` (ignoring case and surrounding
    whitespace) are used. For each such row the ``star_alt1``, ``star_alt2``
    and ``star`` columns are inspected in that order; empty values and ``-``
    are ignored. The first value matching ``HD_RE`` wins and is upper-cased.
    If none matches, the first usable value is taken as written.

    Args:
        path: Location of the tab-separated input file.

    Returns:
        Identifiers in first-seen order, without duplicates. Despite the
        ``iter_`` prefix this is a fully built list, not a lazy iterator.
    """
    out: list[str] = []
    seen: set[str] = set()

    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires for file objects passed to a reader.
    with path.open("r", encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f, delimiter="\t"):
            origin = (row.get("origin") or "").strip().upper()
            if origin != "ELODIE":
                continue

            # Prefer an HD name if available; otherwise fall back to the
            # first usable candidate.
            identifier = ""
            for column in ("star_alt1", "star_alt2", "star"):
                c = (row.get(column) or "").strip()
                if not c or c == "-":
                    continue
                m = HD_RE.match(c)
                if m:
                    identifier = m.group(1).upper()
                    break
                if not identifier:
                    identifier = c

            if not identifier or identifier in seen:
                continue
            seen.add(identifier)
            out.append(identifier)

    return out


# Build the de-duplicated identifier list, then report its size and a short preview.
identifiers = iter_elodie_identifiers(FGK_PATH)
for label, value in (
    ("ELODIE identifiers:", len(identifiers)),
    ("First 20:", identifiers[:20]),
):
    print(label, value)

ELODIE identifiers: 113
First 20: ['HD216174', 'HD219615', 'HD103095', 'HD124897', 'HD164922', 'HD177153', 'HD182736', 'HD198149', 'HD201091', 'HD201092', 'HD201891', 'HD215665', 'HD215648', 'HD216131', 'HD217014', 'HD219134', 'HD219449', 'HD221170', 'HD222368', 'HD16160']


In [4]:
# --- Concurrency ---
# MAX_WORKERS is the total number of concurrent requests;
# PER_SOURCE_MAX_WORKERS caps a single source (useful to avoid hammering an archive).
MAX_WORKERS = 8
PER_SOURCE_MAX_WORKERS = 4

# --- Resume / cache ---
# With RESUME on, identifiers already recorded as finished in CACHE_PATH are skipped.
RESUME = True
CACHE_PATH = ROOT.joinpath("data", "elodie_bulk_cache.jsonl")

# --- Per-identifier persistence parameters (attached to every request) ---
#   include_ccf    -> fetch & persist CCF products
#   progress_every -> log every N persisted products (always logs first/last)
#   process_ccf    -> set False if you want to treat CCFs as plain spectra in Zarr
extra_params = dict(
    raw_save_path=str(RAW_SAVE_PATH),
    zarr_paths=str(ZARR_PATH),
    not_found_path=str(NOT_FOUND_PATH),
    error_path=str(ERROR_PATH),
    include_ccf=True,
    progress_every=10,
    process_ccf=True,
)

# Load cache of previously completed identifiers.
# Only "ok" and "not_found" records count as completed, so an identifier whose
# attempts all ended in "error" is picked up again on the next run.
import json

completed: set[str] = set()
if RESUME and CACHE_PATH.exists():
    cache_lines = CACHE_PATH.read_text(encoding="utf-8").splitlines()
    for line_no, line in enumerate(cache_lines, start=1):
        if not line.strip():
            continue
        try:
            rec = json.loads(line)
        except json.JSONDecodeError as exc:
            # A malformed line (e.g. a partial write from an interrupted run)
            # must not abort the resume, but it should not vanish silently either.
            logging.warning("Skipping malformed cache line %d in %s: %s", line_no, CACHE_PATH, exc)
            continue
        if not isinstance(rec, dict):
            # Valid JSON that is not a record object carries nothing usable.
            continue
        identifier = rec.get("identifier")
        if identifier is None:
            # Without this check the literal string "None" would be cached.
            continue
        if rec.get("source") == "elodie" and rec.get("status") in {"ok", "not_found"}:
            completed.add(str(identifier))

# Keep only identifiers the cache does not already mark as completed.
filtered = [name for name in identifiers if name not in completed]
for label, count in (
    ("Total identifiers:", len(identifiers)),
    ("Already completed (from cache):", len(completed)),
    ("To process now:", len(filtered)),
):
    print(label, count)

# One request per remaining identifier; all of them carry the same extra_params.
requests = [
    SpectraRequest(source="elodie", identifier=name, extra_params=extra_params)
    for name in filtered
]
print("Requests:", len(requests))
print("Cache path:", CACHE_PATH)

Total identifiers: 113
Already completed (from cache): 2
To process now: 111
Requests: 111
Cache path: /Users/mjablons/Documents/spectra/data/elodie_bulk_cache.jsonl


In [5]:
# Execute concurrent bulk download with a progress bar + cache-as-you-go
import json

from tqdm.auto import tqdm

# Create progress bar
pbar = tqdm(total=len(requests), desc="ELODIE", unit="star")

# Append results to cache immediately as they finish (so you can restart safely).
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)

# Serialises progress-bar updates and cache appends, in case bulk_download
# invokes the callback from several threads at once. `threading` is part of
# the standard library, so no import fallback is needed.
import threading

lock = threading.Lock()


def on_result(res, done, total):  # type: ignore[no-untyped-def]
    """Record one finished request: advance the progress bar and append to the cache.

    Passed to ``bulk_download`` as its ``on_result`` callback. One JSON line is
    appended to ``CACHE_PATH`` per call, so an interrupted run can be resumed.

    Args:
        res: Result object; ``success``, ``spectra``, ``error``,
            ``request.source`` and ``request.identifier`` are read from it.
        done: Unused; accepted because the callback is called with it.
        total: Unused; accepted because the callback is called with it.

    The recorded status is ``"error"`` for a failed result, ``"not_found"`` for
    a successful result with no spectra, and ``"ok"`` otherwise.
    """
    from contextlib import nullcontext

    # `lock` may be None when no lock was set up; fall back to a no-op context.
    guard = lock if lock is not None else nullcontext()
    with guard:
        pbar.update(1)
        status = "ok" if res.success else "error"
        if res.success and len(res.spectra) == 0:
            status = "not_found"
        rec = {
            "source": res.request.source,
            "identifier": res.request.identifier,
            "status": status,
            "error": res.error,
            "n_products": len(res.spectra),
        }
        # default=str keeps the callback from raising if `res.error` is not a
        # plain JSON value (e.g. an exception object rather than a string).
        payload = json.dumps(rec, default=str) + "\n"
        with CACHE_PATH.open("a", encoding="utf-8") as f:
            f.write(payload)


# Run the download. try/finally guarantees the progress bar is closed even if
# the run raises or is interrupted part-way through.
try:
    results = bulk_download(
        requests,
        sources,
        max_workers=MAX_WORKERS,
        per_source_max_workers=PER_SOURCE_MAX_WORKERS,
        on_result=on_result,
    )
finally:
    pbar.close()

# Split by the result's own success flag. A successful result with zero
# spectra (cached as "not_found") is counted under "Success" here.
ok = [r for r in results if r.success]
fail = [r for r in results if not r.success]

print("Completed:", len(results))
print("Success:", len(ok))
print("Failed:", len(fail))

# Show a few failures
for r in fail[:10]:
    print("FAILED", r.request.identifier, "->", r.error)

print("Cache updated:", CACHE_PATH)

ELODIE:   0%|          | 0/111 [00:00<?, ?star/s]

INFO:spectra_download.bulk:Bulk download start
INFO:spectra_download.bulk:Bulk download start
INFO:spectra_download.sources.elodie:Downloading ELODIE object page
INFO:spectra_download.bulk:Bulk download start
INFO:spectra_download.sources.elodie:Downloading ELODIE object page
INFO:spectra_download.bulk:Bulk download start
INFO:spectra_download.sources.elodie:Downloading ELODIE object page
INFO:spectra_download.sources.elodie:Downloading ELODIE object page
INFO:spectra_download.sources.elodie:ELODIE spectra links extracted
INFO:spectra_download.sources.elodie:ELODIE search_ccf pages extracted
INFO:spectra_download.sources.elodie:ELODIE spectra links extracted
INFO:spectra_download.sources.elodie:ELODIE search_ccf pages extracted
INFO:spectra_download.sources.elodie:ELODIE spectra links extracted
INFO:spectra_download.sources.elodie:ELODIE search_ccf pages extracted
INFO:spectra_download.sources.elodie:ELODIE spectra links extracted
INFO:spectra_download.sources.elodie:ELODIE search_ccf 

Completed: 111
Success: 94
Failed: 17
FAILED HD164922 -> Failed to download http://atlas.obs-hp.fr/elodie/fE.cgi?n=e501&c=o&a=hexp&z=d&fql=[datenuit%20='20030620'],[imanum%20='0030']: <urlopen error timed out>
FAILED HD217014 -> Failed to download http://atlas.obs-hp.fr/elodie/fE.cgi?n=e501&c=o&a=hexp&z=d&fql=[datenuit%20='19970923'],[imanum%20='0018']: <urlopen error timed out>
FAILED HD222368 -> Failed to download http://atlas.obs-hp.fr/elodie/CC.cgi?c=i&dataset=19980829&imanum=0017&masque=R37000K0&type=obj: <urlopen error timed out>
FAILED HD29139 -> Failed to download http://atlas.obs-hp.fr/elodie/fE.cgi?n=e500&c=i&z=s1d&a=mime:application/fits&o=elodie:20040428/0013: <urlopen error timed out>
FAILED HD43023 -> Failed to download http://atlas.obs-hp.fr/elodie/fE.cgi?n=e501&c=o&a=hexp&z=d&fql=[datenuit%20='20021024'],[imanum%20='0010']: <urlopen error timed out>
FAILED HD75732 -> Failed to download http://atlas.obs-hp.fr/elodie/fE.cgi?n=e501&c=o&a=hexp&z=d&fql=[datenuit%20='19980213

In [6]:
# Quick sanity checks / outputs
print("Raw FITS dir exists:", RAW_SAVE_PATH.exists())
print("Example raw FITS files:", sorted([p.name for p in RAW_SAVE_PATH.glob('*.fits')])[:10])

print("Zarr exists:", ZARR_PATH.exists())
print("Not-found file exists:", NOT_FOUND_PATH.exists())
if NOT_FOUND_PATH.exists():
    # Show last few lines
    lines = NOT_FOUND_PATH.read_text(encoding='utf-8').splitlines()
    print("Not-found count:", len(lines))
    print("Last 5:")
    for line in lines[-5:]:
        print(line)

Raw FITS dir exists: True
Example raw FITS files: ['HD102870.fits', 'HD102870_ccf.fits', 'HD10476_1.fits', 'HD10476_10.fits', 'HD10476_2.fits', 'HD10476_3.fits', 'HD10476_4.fits', 'HD10476_5.fits', 'HD10476_6.fits', 'HD10476_7.fits']
Zarr exists: True
Not-found file exists: True
Not-found count: 2
Last 5:
{"ts": 1767929041.41737, "source": "elodie", "identifier": "HD216174", "reason": "no_records"}
{"ts": 1767929041.419203, "source": "elodie", "identifier": "HD103095", "reason": "no_records"}
