# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 
import re

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big: raise PIL's pixel-count limit to 1e9 pixels
# NOTE(review): presumably this avoids PIL's oversized-image warning/error on
# large paper figures -- confirm against the Pillow documentation
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    """Warning category for affiliation-related issues."""

class AffiliationError(RuntimeError):
    """Error raised when the affiliation check of a paper does not pass."""

def validation(source: str):
    """Check the affiliations of a paper before its TeX source is parsed.

    Runs ``mpia.affiliation_verifications`` on the source. Any result other
    than ``True`` is treated as a failure and is reported as the reason.

    :param source: source content handed to the affiliation check
    :raises AffiliationError: if the verification does not return ``True``
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        # str() leaves string reasons unchanged and avoids a TypeError if the
        # check ever returns a non-string value (e.g. False or None)
        raise AffiliationError("mpia.affiliation_verifications: " + str(check))

        
# Register an 'always' filter so AffiliationWarning is shown on every occurrence
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com

    :param paper_id: Arxiv identifier of the paper (e.g. ``2507.01096``)
    :returns: HTML snippet (usable inside markdown) displaying the QR code
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    target = f"https://arxiv.org/abs/{paper_id}"
    # The quotes must wrap the entire src value: quoting only the arXiv link
    # yields an invalid unquoted attribute and puts the quote characters into
    # the QR-code payload.
    txt = f"""<img src="{url}{target}">"""
    return '<div id="qrcode">' + txt + '</div>'


def clean_non_western_encoded_characters_commands(text: str) -> str:
    """ Strip the LaTeX CJK environment wrappers from a string

    Only the ``\\begin{CJK}{UTF8}{gbsn} ... \\end{CJK}`` commands are removed;
    the text they enclose is kept. The list of handled commands may need to grow.

    :param text: the text to clean
    :return: the cleaned text
    """
    cjk_environment = re.compile(r"(\\begin{CJK}{UTF8}{gbsn})(.*?)(\\end{CJK})")
    # keep only what sits between the begin/end commands
    return cjk_environment.sub(lambda found: found.group(2), text)


def get_initials(name: str) -> str:
    """ Get the short name, e.g., A.-B. FamName

    Every token except the last is reduced to its initial (hyphenated tokens
    keep their hyphen, e.g. ``Jean-Pierre`` -> ``J.-P.``); the last token is
    kept as is. A parenthesised part, if any, is re-appended at the end.

    Malformed input does not raise: an empty name gives an empty string, a
    ``(`` without a matching ``)`` is treated as part of the name, and empty
    pieces of hyphenated tokens are skipped.

    :param name: full name
    :returns: initials
    """
    initials = []
    suffix = ''
    # account for non western names often in ()
    if '(' in name:
        name = clean_non_western_encoded_characters_commands(name)
        in_parentheses = re.findall(r"\((.*?)\)", name)
        if in_parentheses:
            suffix = in_parentheses[0]
            name = name.replace(f"({suffix})", '')
    split = name.split()
    if not split:
        # nothing but (possibly) the parenthesised part
        return f"({suffix})" if suffix else ''
    for token in split[:-1]:
        if '-' in token:
            # ignore empty pieces produced by leading/trailing/double hyphens
            current = '-'.join(k[0] + '.' for k in token.split('-') if k)
        else:
            current = token[0] + '.'
        if current:
            initials.append(current)
    initials.append(split[-1])
    if suffix:
        initials.append(f"({suffix})")
    return ' '.join(initials)

## get list of arxiv paper candidates

We use the MPIA mitarbeiter (staff) list webpage from mpia.de to get the author names.
We then get all new papers from arXiv and match their authors against that list.

In [3]:
# deal with the author list and edge cases of people that cannot be consistent on their name  

def filter_non_scientists(name: str) -> bool:
    """ Loose filter on expected authorships

    Rejects names of IT, administration and technical staff. A name is
    rejected as soon as it contains one of the listed fragments.

    :param name: name
    :returns: False if name is not a scientist, True otherwise
    """
    excluded_fragments = (
        'Licht', 'Binroth', 'Witzel', 'Jordan',
        'Zähringer', 'Scheerer', 'Hoffmann', 'Düe',
        'Hellmich', 'Enkler-Scharpegge', 'Witte-Nguy',
        'Dehen', 'Beckmann', 'Jager', 'Jäger',
    )
    return not any(fragment in name for fragment in excluded_fragments)

def add_author_to_list(author_list: list) -> list:
    """ Make sure a fixed set of extra authors is part of the list

    The list is modified in place; names already present are not duplicated.

    :param author_list: list of authors
    :returns: the same list object, updated
    """
    always_included = ('T. Henning',)

    for extra_author in always_included:
        if extra_author in author_list:
            continue
        author_list.append(extra_author)
    return author_list

# Author names come from the MPIA website staff list (second field of each
# entry); non-scientists are dropped here on top of the filtering already done
# by mpia.get_mpia_mitarbeiter_list(). Authors whose MPIA name and paper
# author name are inconsistent are then added by hand.
mpia_authors = add_author_to_list(
    [entry[1]
     for entry in mpia.get_mpia_mitarbeiter_list()
     if filter_non_scientists(entry[1])]
)

In [4]:
# fetch the new arXiv papers
new_papers = get_new_papers()
# add manual references: identifiers listed in add_paper_refs are fetched
# one by one and appended to the new papers (empty list = nothing to add)
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

def robust_call(fn, value, *args, **kwargs):
    """Apply ``fn`` to ``value``, falling back to ``value`` itself on failure.

    :param fn: callable invoked as ``fn(value, *args, **kwargs)``
    :param value: first argument of ``fn`` and fallback result
    :returns: the result of ``fn``, or ``value`` unchanged if ``fn`` raised
              an ``Exception``
    """
    try:
        outcome = fn(value, *args, **kwargs)
    except Exception:
        outcome = value
    return outcome

# Select the papers having at least one author matching the MPIA author list
candidates = []
for paperk in new_papers:
    # Check author list with their initials
    # (names that cannot be converted are kept unchanged by robust_call)
    normed_author_list = [robust_call(mpia.get_initials, k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, mpia_authors, verbose=True)
    # A highlighted author is wrapped in a <mark> tag. Test for the tag itself:
    # the bare substring 'mark' would also match unhighlighted names that
    # merely contain it (e.g. "Stenmark").
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if '<mark>' in hl]
    # the paper keeps the normalised/highlighted author list from here on
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))
print("""          {0:,d} with possible author matches""".format(len(candidates)))

A. d. Graaff  ->  A. D. Graaff  |  ['A. D. Graaff']
Wang  ->  N. Wang  |  ['Wang']
K. Kreckel  ->  K. Kreckel  |  ['K. Kreckel']
J. Neumann  ->  J. Neumann  |  ['J. Neumann']
E. Schinnerer  ->  E. Schinnerer  |  ['E. Schinnerer']
S. Joharle  ->  S. Joharle  |  ['S. Joharle']


J. Liu  ->  J. Liu  |  ['J. Liu']
Arxiv has 91 new papers today
          5 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [5]:
# Process every candidate paper: fetch its source, check the affiliations,
# parse the TeX and build the markdown summary.
#   documents: (paper_id, markdown text) of the papers that went through
#   failed:    (paper, reason) of the papers that did not
documents = []
failed = []
for paper in tqdm(candidates):
    # normalise the identifier: lower case, no 'arxiv:' prefix, no surrounding
    # whitespace. Note that r'\n' is the two-character sequence backslash + n,
    # not a newline character.
    paper['identifier'] = paper['identifier'].lower().replace('arxiv:', '').replace(r'\n', '').strip()
    paper_id = paper['identifier']

    folder = f'tmp_{paper_id}'

    try:
        # an already existing tmp_<id> folder is reused instead of retrieving
        # the source again
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')

        try:
            # `validation` is passed as a hook; it raises AffiliationError
            # when the affiliation check fails
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            # NOTE(review): `msg` is built but never used afterwards
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue

        # Hack because sometimes author parsing does not work well:
        # if the number of parsed authors differs from the arXiv listing,
        # fall back to the (already highlighted) arXiv author list
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [get_initials(k) for k in doc.authors], 
                mpia_authors, verbose=True)
        # fall back to the arXiv abstract when none was extracted from the TeX
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']

        # comment line: arXiv badge, appearance date and, if any, the arXiv comments
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"

        full_md = doc.generate_markdown_text()

        # append the QR code pointing to the arXiv page
        full_md += get_markdown_qrcode(paper_id)

        # replace citations (best effort: on failure the issue is printed and
        # the markdown is kept as it is)
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print("Issues with the citations")
            print(e)

        documents.append((paper_id, full_md))
    except Exception as e:
        # any other failure: warn and record the paper as failed, then carry
        # on with the next candidate
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/5 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2507.01096


extracting tarball to tmp_2507.01096... done.


A. d. Graaff  ->  A. D. Graaff  |  ['A. D. Graaff']


Found 127 bibliographic references in tmp_2507.01096/main.bbl.
Retrieving document from  https://arxiv.org/e-print/2507.01343
extracting tarball to tmp_2507.01343...

 done.
Retrieving document from  https://arxiv.org/e-print/2507.01508



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)


extracting tarball to tmp_2507.01508...

 done.



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)


K. Kreckel  ->  K. Kreckel  |  ['K. Kreckel']
J. Neumann  ->  J. Neumann  |  ['J. Neumann']
E. Schinnerer  ->  E. Schinnerer  |  ['E. Schinnerer']


Found 162 bibliographic references in tmp_2507.01508/main.bbl.
Retrieving document from  https://arxiv.org/e-print/2507.01510


extracting tarball to tmp_2507.01510...

 done.
Retrieving document from  https://arxiv.org/e-print/2507.01939


extracting tarball to tmp_2507.01939...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper; see `_build/html/logs/log-{today's date}.md`.

In [6]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"

# The log directory is not created anywhere else: make sure it exists
os.makedirs(os.path.dirname(logfile), exist_ok=True)

# Explicit UTF-8: titles, author names and abstracts are arbitrary paper text
with open(logfile, 'w', encoding='utf-8') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Successful papers\n\n")
    display(Markdown("## Successful papers"))
    # identifiers of the papers for which a document was produced
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    # sort by reason so that identical reasons share one sub-heading in the log
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        # affiliation rejections are expected (green); anything else is a
        # processing error (red)
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')

        # every failure is displayed here as well as written to the log
        # (to display only the important ones, restrict to color == 'red')
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2507.01096-b31b1b.svg)](https://arxiv.org/abs/2507.01096) | **Breaking Through the Cosmic Fog: JWST/NIRSpec Constraints on Ionizing Photon Escape in Reionization-Era Galaxies**  |
|| E. Giovinazzo, et al. -- incl., <mark>A. d. Graaff</mark> |
|*Appeared on*| *2025-07-03*|
|*Comments*| *Submitted to A&A*|
|**Abstract**|            The escape fraction of Lyman continuum photons (fesc(LyC)) is the last key unknown in our understanding of cosmic reionization. Directly estimating the escape fraction (fesc) of ionizing photons in the epoch of reionization (EoR) is impossible, due to the opacity of the intergalactic medium (IGM). However, a high fesc leaves clear imprints in the spectrum of a galaxy, due to reduced nebular line and continuum emission, which also leads to bluer UV continuum slopes (betaUV). Here, we exploit the large archive of deep JWST/NIRSpec spectra from the DAWN JWST Archive to analyze over 1'400 galaxies at 5 < zspec < 10 and constrain their fesc based on SED fitting enhanced with a picket fence model. We identify 71 high-confidence sources with significant fesc based on Bayes factor analysis strongly favouring fesc > 0 over fesc = 0 solutions. We compare the characteristics of this high-escape subset against both the parent sample and established diagnostics including betaUV slope, O32, and SFR surface density (SigmaSFR). For the overall sample, we find that most sources have a low escape fraction (<1%), however, a small subset of sources seems to emit a large number of their ionizing photons into the IGM, such that the average fesc is found to be ~10%, as needed for galaxies to drive reionization. Although uncertainties remain regarding recent burstiness and the intrinsic stellar ionizing photon output at low metallicities, our results demonstrate the unique capability of JWST/NIRSpec to identify individual LyC leakers, measure average fesc and thus constrain the drivers of cosmic reionization.         |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2507.01508-b31b1b.svg)](https://arxiv.org/abs/2507.01508) | **Duration and properties of the embedded phase of star formation in 37 nearby galaxies from PHANGS-JWST**  |
|| L. Ramambason, et al. -- incl., <mark>K. Kreckel</mark>, <mark>J. Neumann</mark>, <mark>E. Schinnerer</mark> |
|*Appeared on*| *2025-07-03*|
|*Comments*| *20 pages, 14 figures, submitted to A&A, comments welcome*|
|**Abstract**|            Light reprocessed by dust grains emitting in the infrared allows the study of the physics at play in dusty, embedded regions, where ultraviolet and optical wavelengths are attenuated. Infrared telescopes such as JWST have made it possible to study the earliest feedback phases, when stars are shielded by cocoons of gas and dust. This phase is crucial for unravelling the effects of feedback from young stars, leading to their emergence and the dispersal of their host molecular clouds. Here we show that the transition from the embedded to the exposed phase of star formation is short (< 4 Myr) and sometimes almost absent (< 1 Myr), across a sample of 37 nearby star-forming galaxies, covering a wide range of morphologies from massive barred spirals to irregular dwarfs. The short duration of the dust-clearing timescales suggests a predominant role of pre-supernova feedback mechanisms in revealing newborn stars, confirming previous results on smaller samples and allowing, for the first time, a statistical analysis of their dependencies. We find that the timescales associated with mid-infrared emission at 21 {\mu}m, tracing a dust-embedded feedback phase, are controlled by a complex interplay between giant molecular cloud properties (masses and velocity dispersions) and galaxy morphology. We report relatively longer durations of the embedded phase of star formation in barred spiral galaxies, while this phase is significantly reduced in low-mass irregular dwarf galaxies. We discuss tentative trends with gas-phase metallicity, which may favor faster cloud dispersal at low metallicities.         |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2507.01343-b31b1b.svg)](https://arxiv.org/abs/2507.01343) | **47 Tuc in Rubin Data Preview 1: Exploring Early LSST Data and Science Potential**  |
|| Y. Choi, et al. -- incl., <mark>Wang</mark> |
|*Appeared on*| *2025-07-03*|
|*Comments*| *16 pages, 15 figures, comments are welcome*|
|**Abstract**|            We present analyses of the early data from Rubin Observatory's Data Preview 1 (DP1) for the globular cluster 47 Tuc field. The DP1 dataset for 47 Tuc includes four nights of observations from the Rubin Commissioning Camera (LSSTComCam), covering multiple bands (ugriy). We address challenges of crowding near the cluster core and toward the SMC in DP1, and demonstrate improved star-galaxy separation by fitting fifth-degree polynomials to the stellar loci in color-color diagrams and applying multi-dimensional sigma clipping. We compile a catalog of 3,576 probable 47 Tuc member stars selected via a combination of isochrone, Gaia proper-motion, and color-color space matched filtering. We explore the sources of photometric scatter in the 47 Tuc color-color sequence, evaluating contributions from various potential sources, including differential extinction within the cluster. Finally, we recover five known variable stars, including three RR Lyrae and two eclipsing binaries. Although the DP1 lightcurves have sparse temporal sampling, they appear to follow the patterns of densely-sampled literature lightcurves well. Despite some data limitations for crowded-field stellar analysis, DP1 demonstrates the promising scientific potential for future LSST data releases.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2507.01510-b31b1b.svg)](https://arxiv.org/abs/2507.01510) | **Impact of a granular mass distribution on the orbit of S2 in the Galactic center**  |
|| M. S. Bordoni, et al. -- incl., <mark>S. Joharle</mark> |
|*Appeared on*| *2025-07-03*|
|*Comments*| *Paper of 11 pages, 7 figures, 3 tables. Accepted for publication in Astronomy & Astrophysics on June 28, 2025*|
|**Abstract**|            The orbit of the S2 star around Sagittarius A* provides a unique opportunity to test general relativity and study dynamical processes near a supermassive black hole. Observations have shown that the orbit of S2 is consistent with a Schwarzschild orbit at a 10$\sigma$ confidence level, constraining the amount of extended mass within its orbit to less than 1200 M$_\odot$, under the assumption of a smooth, spherically symmetric mass distribution. In this work we investigate the effects on the S2 orbit of granularity in the mass distribution, assuming it consists of a cluster of equal-mass objects surrounding Sagittarius A*. Using a fast dynamical approach validated by full N-body simulations, we perform a large set of simulations of the motion of S2 with different realizations of the cluster objects distribution. We find that granularity can induce significant deviations from the orbit in case of a smooth potential, causing precession of the orbital plane and a variation of the in-plane precession. Interactions with the cluster objects also induce a sort of "Brownian motion" of Sagittarius A*. Mock data analysis reveals that these effects could produce observable deviations in the trajectory of S2 from a Schwarzschild orbit, especially near apocenter. During the next apocenter passage of S2 in 2026, astrometric residuals in Declination may exceed the astrometric accuracy threshold of GRAVITY of about 30 $\mu as$, as it happens in 35 to 60% of simulations for black holes of 20 to 100 M$_\odot$. This presents a unique opportunity to detect, for the first time, scattering effects on the orbit of S2 caused by stellar-mass black holes, thanks to the remarkable precision achievable with GRAVITY. We also demonstrate that any attempt to constrain the extended mass enclosed within the orbit of S2 must explicitly account for granularity in the stellar-mass black hole population.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2507.01939-b31b1b.svg)](https://arxiv.org/abs/2507.01939) | **SpecCLIP: Aligning and Translating Spectroscopic Measurements for Stars**  |
|| X. Zhao, et al. -- incl., <mark>J. Liu</mark> |
|*Appeared on*| *2025-07-03*|
|*Comments*| *26 pages, 6 figures, 5 tables. To be submitted to AAS Journals. Comments welcome*|
|**Abstract**|            In recent years, large language models (LLMs) have transformed natural language understanding through vast datasets and large-scale parameterization. Inspired by this success, we present SpecCLIP, a foundation model framework that extends LLM-inspired methodologies to stellar spectral analysis. Stellar spectra, akin to structured language, encode rich physical and chemical information about stars. By training foundation models on large-scale spectral datasets, our goal is to learn robust and informative embeddings that support diverse downstream applications. As a proof of concept, SpecCLIP involves pre-training on two spectral types--LAMOST low-resolution and Gaia XP--followed by contrastive alignment using the CLIP (Contrastive Language-Image Pre-training) framework, adapted to associate spectra from different instruments. This alignment is complemented by auxiliary decoders that preserve spectrum-specific information and enable translation (prediction) between spectral types, with the former achieved by maximizing mutual information between embeddings and input spectra. The result is a cross-spectrum framework enabling intrinsic calibration and flexible applications across instruments. We demonstrate that fine-tuning these models on moderate-sized labeled datasets improves adaptability to tasks such as stellar-parameter estimation and chemical-abundance determination. SpecCLIP also enhances the accuracy and precision of parameter estimates benchmarked against external survey data. Additionally, its similarity search and cross-spectrum prediction capabilities offer potential for anomaly detection. Our results suggest that contrastively trained foundation models enriched with spectrum-aware decoders can advance precision stellar spectroscopy.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [7]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images

    Figures referenced in the markdown (``[Fig...](path)`` links and
    ``<img src="path" .../>`` tags) are copied below ``directory`` under the
    same relative path; online figures (path containing ``http``) and missing
    files are skipped. The markdown itself is written to
    ``directory/md_fname``.

    :param md: markdown text to export
    :param md_fname: file name of the exported markdown document
    :param directory: output directory, created (with its parents) if missing
    :raises RuntimeError: if ``directory`` exists but is not a directory
    """
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        # makedirs (unlike mkdir) also creates the missing parent directories
        os.makedirs(directory, exist_ok=True)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    print("found figures", fig_fnames)
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        if not os.path.exists(fname):
            print("file not found", fname)
            continue
        print("copying ", fname, "to", directory)
        # keep the figure's relative path below the output directory so the
        # links inside the markdown stay valid
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        os.makedirs(destdir, exist_ok=True)
        shutil.copy(fname, destfname)

    md_path = os.path.join(directory, md_fname)
    with open(md_path, 'w', encoding='utf-8') as fout:
        fout.write(md)
    print("exported in ", md_path)
    # lists every referenced figure, whether or not it was actually copied
    for fk in fig_fnames:
        print("    + " + os.path.join(directory, fk))

In [8]:
# write one <paper_id>.md file (plus its figures) per successful paper
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

found figures ['tmp_2507.01096/./Figures/fesc_MUV_scaled_wErr.png', 'tmp_2507.01096/./Figures/jades-gdn09-v4_prism-clear_1181_71983_sfh.png', 'tmp_2507.01096/./Figures/Model_galaxy.png']
copying  tmp_2507.01096/./Figures/fesc_MUV_scaled_wErr.png to _build/html/
copying  tmp_2507.01096/./Figures/jades-gdn09-v4_prism-clear_1181_71983_sfh.png to _build/html/
copying  tmp_2507.01096/./Figures/Model_galaxy.png to _build/html/
exported in  _build/html/2507.01096.md
    + _build/html/tmp_2507.01096/./Figures/fesc_MUV_scaled_wErr.png
    + _build/html/tmp_2507.01096/./Figures/jades-gdn09-v4_prism-clear_1181_71983_sfh.png
    + _build/html/tmp_2507.01096/./Figures/Model_galaxy.png
found figures ['tmp_2507.01508/./Fig7.png', 'tmp_2507.01508/./Fig8_1.png', 'tmp_2507.01508/./Fig8_2.png', 'tmp_2507.01508/./Fig3.png']
copying  tmp_2507.01508/./Fig7.png to _build/html/
copying  tmp_2507.01508/./Fig8_1.png to _build/html/
copying  tmp_2507.01508/./Fig8_2.png to _build/html/
copying  tmp_2507.01508/./F

## Display the papers

Not necessary but allows for a quick check.

In [9]:
# render the markdown of every exported paper; the trailing ';' keeps the
# resulting list of None out of the cell output
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\EG}[1]{\textcolor{Purple}{#1}}$
$\newcommand{\fesc}{f_{\rm{esc}}\xspace}$
$\newcommand{\uvbeta}{\beta_{\rm UV}\xspace}$
$\newcommand{\arraystretch}{1.2}$
$\newcommand{\arraystretch}{1.5}$
$\newcommand{\arraystretch}{1.5}$</div>



<div id="title">

# Breaking Through the Cosmic Fog: JWST/NIRSpec Constraints on Ionizing Photon Escape in Reionization-Era Galaxies

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2507.01096-b31b1b.svg)](https://arxiv.org/abs/2507.01096)<mark>Appeared on: 2025-07-03</mark> -  _Submitted to A&A_

</div>
<div id="authors">

E. Giovinazzo, et al. -- incl., <mark>A. d. Graaff</mark>

</div>
<div id="abstract">

**Abstract:** ${The escape fraction of Lyman continuum photons ($f_{\rm esc}(\rm LyC)$) is the last key unknown in our understanding of cosmic reionization. Directly estimating the escape fraction (\fesc) of ionizing photons in the epoch of reionization (EoR) is impossible, due to the opacity of the intergalactic medium (IGM). However, a high \fesc leaves clear imprints in the spectrum of a galaxy, due to reduced nebular line and continuum emission, which also leads to bluer UV continuum slopes (\uvbeta). Here, we exploit the large archive of deep JWST/NIRSpec spectra from the DAWN JWST Archive to analyze over 1'400 galaxies at $5<z_\mathrm{spec}<10$ and constrain their \fesc based on SED fitting enhanced with a picket fence model.}$ ${We identify 71 high-confidence sources with significant \fesc based on Bayes factor analysis strongly favouring \fesc> 0 over \fesc= 0 solutions. We compare the characteristics of this high-escape subset against both the parent sample and established diagnostics including \uvbeta slope, O32, and SFR surface density ($\Sigma_{\rm SFR}$).}$ ${For the overall sample, we find that most sources have a low escape fraction (<1$\%$), however, a small subset of sources seems to emit a large number of their ionizing photons into the IGM, such that the average \fesc is found to be $\sim$10\%, as needed for galaxies to drive reionization.}$ ${Although uncertainties remain regarding recent burstiness and the intrinsic stellar ionizing photon output at low metallicities, our results demonstrate the unique capability of JWST/NIRSpec to identify individual LyC leakers, measure average \fesc and thus constrain the drivers of cosmic reionization.}$

</div>

<div id="div_fig1">

<img src="tmp_2507.01096/./Figures/fesc_MUV_scaled_wErr.png" alt="Fig7" width="100%"/>

**Figure 7. -** **Top:**$\fesc$ vs $\rm M_{\rm UV}$. The parent sample is shown with the pink dots, the high confidence sample is the dark diamonds and the average $\fesc$ in bins of $\rm M_{\rm UV}$ is shown as the black squares. The mean $\fesc$ does not show a trend with $\rm M_{\rm UV}$. The average $\fesc$ of our sample is consistently measured between 10-15\% in all bins.
    **Bottom:** The fraction of sources with $f_{\rm esc}$>0.1 in each UV magnitude bin. This fraction also shows no trend with $\rm M_{\rm UV}$. (*fig:fesc_MUV*)

</div>
<div id="div_fig2">

<img src="tmp_2507.01096/./Figures/jades-gdn09-v4_prism-clear_1181_71983_sfh.png" alt="Fig3" width="100%"/>

**Figure 3. -** Example galaxy with a Bayes factor $>$ 100. **Top: ** Here we show the observed spectrum (blue line) and the two models, one with high $\fesc$(orange solid line) and one with $\fesc$ = 0 (magenta dashed line). The grey shaded region represents the masked region. A clear difference between the two models can be seen in the $\uvbeta$ part of the spectrum, where the high $\fesc$ solution fits the data much better than the other solution.
    **Middle: ** Difference between the two models highlighting the difference in the $\uvbeta$ slope and to some extent in the emission line strengths.
    **Bottom: ** Comparison of the SFH for the two models. The models are extremely different, as reproducing the weak lines and steep $\uvbeta$ slope without $\fesc$ is only possible with a recent quenching of star formation. This indicates a degeneracy between SFH and $\fesc$, which is discussed more in Section \ref{ch:discussion}.  (*fig:rubies_spec*)

</div>
<div id="div_fig3">

<img src="tmp_2507.01096/./Figures/Model_galaxy.png" alt="Fig2" width="100%"/>

**Figure 2. -** **Top**: Spectrum of a model galaxy with various $\fesc$. The model galaxy is at  $z=6$, has a mass of $\rm M_{*} = 10^9 M_{\odot}$, metallicity $Z= 0.05 Z_{\odot}$, ionization parameter $\rm logU = -2$, constant star formation  switched on at 5 Myr and dust modelled with the Calzetti  ([Calzetti, Armus and Bohlin 2000]())  dust curve with $\rm A_{\rm v} = 0.2$.
    As $\fesc$ increases the $\uvbeta$ slope becomes steeper due to reduced nebular continuum emission, and the emission lines become weaker. The continuum emission is also affected, as its nebular component scales with $\fesc$. The full spectrum thus contains information on the escape fractions, which we exploit to constrain the $\fesc$ of galaxies with NIRSpec spectra.
    **Bottom left**: Zoom in on H$\alpha$. **Bottom right**: Zoom in on H$\beta$+[$\ion${O}{iii}] (*fig:model_galaxy*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2507.01096"></div>

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\toadd}[1]{{\color{red}{{[TO ADD: \bf #1]}}}}$
$\newcommand{\hi}{H~{\sc i}}$
$\newcommand{\hii}{H~{\sc ii}}$
$\newcommand{\ha}{\ifmmode{\rm H}\alpha \else H\alpha\fi}$
$\newcommand{\hb}{\ifmmode{\rm H}\beta \else H\beta\fi}$
$\newcommand{\lya}{\ifmmode{\rm Ly}\alpha \else Ly\alpha\fi}$
$\newcommand{\hei}{He~{\sc i}}$
$\newcommand{\Hei}{He~{\sc i} \lambda4471}$
$\newcommand{\heii}{He~{\sc ii}}$
$\newcommand{\Heiiuv}{He~{\sc ii} \lambda1640}$
$\newcommand{\Heiiopt}{He~{\sc ii} \lambda4686}$
$\newcommand{\ebv}{\ifmmode E_{\rm B-V} \else E_{\rm B-V}\fi}$
$\newcommand{\av}{\ifmmode A_{\rm V} \else A_{\rm V}\fi}$
$\newcommand{\alphaCO}{\ifmmode \alpha_{\rm CO} \else \alpha_{\rm CO}\fi}$
$\newcommand{\oh}{\ifmmode 12 + \log({\rm O/H}) \else12 + \log({\rm O/H})\fi}$
$\newcommand{\nii}{[N~{\sc ii}]}$
$\newcommand{\niii}{[N~{\sc iii}]}$
$\newcommand{\oi}{[O~{\sc i}]}$
$\newcommand{\oii}{[O~{\sc ii}]}$
$\newcommand{\oiii}{[O~{\sc iii}]}$
$\newcommand{\oiv}{[O~{\sc iv}]}$
$\newcommand{\sii}{[S~{\sc ii}]}$
$\newcommand{\siii}{[S~{\sc iii}]}$
$\newcommand{\siv}{[S~{\sc iv}]}$
$\newcommand{\ci}{[C~{\sc i}]}$
$\newcommand{\cii}{[C~{\sc ii}]}$
$\newcommand{\civ}{[C~{\sc iv}]}$
$\newcommand{\neiii}{[Ne~{\sc iii}]}$
$\newcommand{\neii}{[Ne~{\sc ii}]}$
$\newcommand{\nev}{[Ne~{\sc v}]}$
$\newcommand{\nevi}{[Ne~{\sc vi}]}$
$\newcommand{\neiv}{[Ne~{\sc iv}]}$
$\newcommand{\feii}{[Fe~{\sc ii}]}$
$\newcommand{\feiii}{[Fe~{\sc iii}]}$
$\newcommand{\feiv}{[Fe~{\sc iv}]}$
$\newcommand{\fev}{[Fe~{\sc v}]}$
$\newcommand{\fevi}{[Fe~{\sc vi}]}$
$\newcommand{\fevii}{[Fe~{\sc vii}]}$
$\newcommand{\silii}{[Si~{\sc ii}]}$
$\newcommand{\arii}{[Ar~{\sc ii}]}$
$\newcommand{\ariii}{[Ar~{\sc iii}]}$
$\newcommand{\ariv}{[Ar~{\sc iv}]}$
$\newcommand{\mgiv}{[Mg~{\sc iv}]}$
$\newcommand{\Niiib}{N~{\sc iii} \lambda4512Å}$
$\newcommand{\Nv}{N~{\sc v} \lambda4612Å}$
$\newcommand{\Niii}{N~{\sc iii} \lambda4640Å}$
$\newcommand{\Civb}{C~{\sc iv} \lambda4658Å}$
$\newcommand{\Heii}{He~{\sc ii} \lambda4686}$
$\newcommand{\Ciii}{C~{\sc iii} \lambda5696}$
$\newcommand{\Civ}{C~{\sc iv} \lambda5808}$
$\newcommand{\Ciiiuv}{C~{\sc iii}] \lambda1909}$
$\newcommand{\Oiiiuv}{O~{\sc iii}] \lambda1666}$
$\newcommand{\arraystretch}{1.3}$
$\newcommand{\micron}{\mum}$
$\newcommand{\kms}{km s^{-1}}$
$\newcommand{\kmsmpc}{km s^{-1} Mpc^{-1}}$
$\newcommand{\cmc}{cm^{-3}}$
$\newcommand{\erg}{erg s^{-1} cm^{-2} Å^{-1}}$
$\newcommand{\ergs}{erg s^{-1}}$
$\newcommand{\ergscm}{erg s^{-1} cm^{-2}}$
$\newcommand{\msun}{\ifmmode M_{\odot} \else M_{\odot}\fi}$
$\newcommand{\msunyr}{\ifmmode M_{\odot} {\rm yr}^{-1} \else M_{\odot} yr^{-1}\fi}$
$\newcommand{\zsun}{\ifmmode Z_{\odot} \else Z_{\odot}\fi}$
$\newcommand{\lsun}{\ifmmode L_{\odot} \else L_{\odot}\fi}$
$\newcommand{\mup}{\ifmmode M_{\rm up} \else M_{\rm up}\fi}$
$\newcommand{\mlow}{\ifmmode M_{\rm low} \else M_{\rm low}\fi}$
$\newcommand{\aap}{A\&A}$
$\newcommand{\aaps}{A\&AS}$
$\newcommand{\aas}{A\&AS}$
$\newcommand{\aj}{AJ}$
$\newcommand{\apj}{ApJ}$
$\newcommand{\apjl}{ApJL}$
$\newcommand{\apjs}{ApJS}$
$\newcommand{\mnras}{MNRAS}$
$\newcommand{\pasp}{PASP}$
$\newcommand{\rmxaa}{Revista Mexicana de Astronomía y Astrofísica}$
$\newcommand{\Nii}{[N\small II]\normalsize \lambda\lambda6548,6584A}$
$\newcommand{\Sii}{[S~{\sc ii}] \lambda\lambda6716,6731Å}$
$\newcommand{\Siii}{[S~{\sc iii}] \lambda\lambda9068,9532Å}$
$\newcommand{\Oi}{[O~{\sc i}] \lambda6300Å}$
$\newcommand{\Oii}{[O~{\sc ii}] \lambda\lambda3726, 3728Å}$
$\newcommand{\Oiii}{[O~{\sc iii}] \lambda\lambda4959,5007Å}$
$\newcommand{\oiiil}{[O~{\sc iii}]\lambda 5007Å}$
$\newcommand{\oiiill}{[O~{\sc iii}]\lambda 4959Å}$
$\newcommand{\fesc}{\ifmmode f_{\rm esc} \else f_{\rm esc}\fi}$
$\newcommand{\feschii}{\ifmmode f_{\rm esc,HII} \else f_{\rm esc,HII}\fi}$</div>



<div id="title">

# Duration and properties of the embedded phase of star formation in 37 nearby galaxies from PHANGS-JWST

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2507.01508-b31b1b.svg)](https://arxiv.org/abs/2507.01508)<mark>Appeared on: 2025-07-03</mark> -  _20 pages, 14 figures, submitted to A&A, comments welcome_

</div>
<div id="authors">

L. Ramambason, et al. -- incl., <mark>K. Kreckel</mark>, <mark>J. Neumann</mark>, <mark>E. Schinnerer</mark>

</div>
<div id="abstract">

**Abstract:** Light reprocessed by dust grains emitting in the infrared allows the study of the physics at play in dusty, embedded regions, where ultraviolet and optical wavelengths are attenuated. Infrared telescopes such as JWST have made it possible to study the earliest feedback phases, when stars are shielded by cocoons of gas and dust. This phase is crucial for unravelling the effects of feedback from young stars, leading to their emergence and the dispersal of their host molecular clouds. Here we show that the transition from the embedded to the exposed phase of star formation is short ( $< 4$ Myr) and sometimes almost absent ( $< 1$ Myr), across a sample of 37 nearby star-forming galaxies, covering a wide range of morphologies from massive barred spirals to irregular dwarfs. The short duration of the dust-clearing timescales suggests a predominant role of pre-supernova feedback mechanisms in revealing newborn stars, confirming previous results on smaller samples and allowing, for the first time, a statistical analysis of their dependencies. We find that the timescales associated with mid-infrared emission at 21 $\mu$ m, tracing a dust-embedded feedback phase, are controlled by a complex interplay between giant molecular cloud properties (masses and velocity dispersions) and galaxy morphology. We report relatively longer durations of the embedded phase of star formation in barred spiral galaxies, while this phase is significantly reduced in low-mass irregular dwarf galaxies. We discuss tentative trends with gas-phase metallicity, which may favour faster cloud dispersal at low metallicities.

</div>

<div id="div_fig1">

<img src="tmp_2507.01508/./Fig7.png" alt="Fig13" width="100%"/>

**Figure 13. -** Spearman's rank correlation coefficients and associated p-values measured between galaxy properties (columns) and our measurements (rows). Statistically significant correlations according to the Holm-Bonferroni method (described in Section \ref{subsec_significance}) are highlighted as black squares, and marginally significant correlations ($\log p$-values < -2) are shown as blue squares. Our measurements are the total timescale of 21 $\mu$m emission ($t_{\rm 21 \mu m}$), the ratio between timescales of SFR and gas($t_{\rm 21 \mu m}$/$t_{\rm CO}$), and the diffuse emission fractions of 21 $\mu$m (f$_{\rm diffuse}^{21 \mu m}$). We correlate these measurements with various parameters grouped in six categories, described in \ref{subsec_selected_params}, along with the corresponding references. (*correlation*)

</div>
<div id="div_fig2">

<img src="tmp_2507.01508/./Fig8_1.png" alt="Fig14.1" width="50%"/><img src="tmp_2507.01508/./Fig8_2.png" alt="Fig14.2" width="50%"/>

**Figure 14. -** **Left**: Total duration of the 21 $\mu$m vs. CO-luminosity-weighted average velocity dispersion of GMCs. The colorbar shows the CO luminosity-weighted average mass of GMCs. Galaxies with high surface density contrasts are identified with squares. We show in gray a linear regression fitted to the data and the gray-shaded area represents the 95\% confidence interval on the regression, obtained with bootstrapping data. **Right**: Total duration of the 21 $\mu$m vs. the Hubble morphological type. The colorbar shows the metallicity measurements for the galaxies observed with MUSE. (*plot_corr_t21*)

</div>
<div id="div_fig3">

<img src="tmp_2507.01508/./Fig3.png" alt="Fig12" width="100%"/>

**Figure 12. -** Measured deviation of the gas-to-stellar flux ratio with respect to the galactic average, as a function of aperture sizes for each galaxy, obtained by contrasting CO emission as a gas tracer with respectively H$\alpha$ (black triangles) and 21 $\mu$m (blue circles) as a SFR tracer. The positive deviations correspond to measurements focusing on gas peaks (traced by CO), while the negative deviations are obtained focusing on stellar peaks (traced respectively by H$\alpha$ or 21 $\mu$m). For each data point, we also show the effective 1$\sigma$ error, after the covariance between data points is taken into account. The horizontal plain line corresponding to a deviation of zero in log represents the galactic average. The dotted gray lines connecting the measurements correspond to a polynomial fit of the tuning-fork branches, following \cite{Kruijssen_tf_2018}. The arrows indicate the typical separation length, $\lambda$, at which the two tracers decorrelate. The two last panels show the histograms of $\lambda$ and inferred feedback timescales ($t_{\rm fb}$, defined in Section \ref{subsec_measuring_timescales}), derived for the whole sample using either H$\alpha$ or 21 $\mu$m as a proxy for SFR, as well as the median and 1$\sigma$ standard deviations associated with these distributions. (*tuning_fork_ha_21*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2507.01508"></div>

# Create HTML index

In [10]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

# Count the generated markdown pages whose `st_ctime` timestamp is at most
# `days` days old.
files = glob('_build/html/*.md')
days = 7
# Use a timezone-aware "now" in UTC so that it is directly comparable with
# the UTC timestamps built below (mixing naive local time with naive UTC
# shifts every age by the local UTC offset).
now = datetime.now(timezone.utc)
res = []
for fk in files:
    # NOTE(review): st_ctime is the metadata-change time on Unix and the
    # creation time on Windows, not the content-modification time -- confirm
    # this is the intended timestamp.
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc)
    delta = now - modified
    if delta <= timedelta(days=days):
        # total_seconds() keeps the day component, unlike `.seconds`
        res.append((delta.total_seconds(), fk))
# Keep the file names only, in reverse lexicographic order of their path
res = [k[1] for k in sorted(res, key=lambda x: x[1], reverse=True)]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")

554  publications files modified in the last 7 days.


In [11]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days

    :param lst: iterable of (filename, date) pairs. Dates are compared as
                strings against an ISO `YYYY-MM-DD` cutoff, so they must use
                that same format for the comparison to be chronological.
    :param days: size of the window, counted back from today's local date
    :returns: generator of filenames, most recent date first
    """
    # The cutoff does not depend on the element: compute it once.
    cutoff = str(datetime.date.today() - datetime.timedelta(days=days))
    for fname, date in sorted(lst, key=lambda x: x[1], reverse=True):
        if date < cutoff:
            # Dates are in descending order: everything after is older too.
            break
        yield fname

def extract_appearance_dates(lst_file):
    """ Extract the "Appeared on" date recorded in each file

    Each file is scanned line by line until the first line containing
    "Appeared on"; the date is the text found between `Appeared on:` and the
    following `</mark>` on that line. Files without such a line are skipped.

    :param lst_file: iterable of paths of the files to read
    :returns: list of (filename, date string) pairs, in input order
    """
    def get_date(line):
        # Text after the last 'Appeared on:' and before the next '</mark>'
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    dates = []
    # Iterate over the argument (not over a module-level list) so that the
    # function works with whatever the caller passes in.
    for fname in lst_file:
        with open(fname, 'r') as f:
            for line in f:
                if "Appeared on" in line:
                    dates.append((fname, get_date(line)))
                    # Only the first occurrence counts: stop reading
                    break
    return dates

from glob import glob
# Read the appearance date of every generated markdown page and keep the
# pages from the last `days` days.
# The pattern is '*.md', the same one used to list the files when counting
# recently modified pages ('*md' would also match names merely ending in "md").
lst = glob('_build/html/*.md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

14  publications in the last 7 days.


In [12]:
def create_carousel(npub=4):
    """ Build the HTML markup of a carousel made of `npub` slides

    :param npub: number of slide placeholders, numbered from 1
    :returns: HTML text with one element per line
    """
    opening = ["""  <div class="carousel" """,
               """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""]
    cell_template = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    # One placeholder cell per publication: slide1 ... slide<npub>
    cells = [cell_template.format(k=number) for number in range(1, npub + 1)]
    closing = ["  </div>"]
    return '\n'.join(opening + cells + closing)

def create_grid(npub=4):
    """ Build the HTML markup of a flat grid made of `npub` slides

    :param npub: number of slide placeholders, numbered from 1
    :returns: HTML text with one element per line
    """
    cell_template = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    lines = ["""  <div class="grid"> """]
    # One placeholder item per publication: slide1 ... slide<npub>
    lines.extend(cell_template.format(k=number) for number in range(1, npub + 1))
    lines.append("  </div>")
    return '\n'.join(lines)

In [13]:
# Fill the carousel template with the selected documents and write the
# 7-day archive page.
carousel = create_carousel(npub)
# Quoted, comma-separated base names of the documents / ids of the slides
docs = ', '.join('"{0:s}"'.format(path.split('/')[-1]) for path in res)
slides = ', '.join(f'"slide{number}"' for number in range(1, npub + 1))

with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# Placeholders are substituted one after the other, in this order
for placeholder, value in (("{%-- carousel:s --%}", carousel),
                           ("{%-- suptitle:s --%}", "7-day archives"),
                           ("{%-- docs:s --%}", docs),
                           ("{%-- slides:s --%}", slides)):
    page = page.replace(placeholder, value)

with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [14]:
# Same carousel page, restricted to the documents of the last day
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), " publications in the last day.")

carousel = create_carousel(npub)
# Quoted, comma-separated base names of the documents / ids of the slides
docs = ', '.join(['"{0:s}"'.format(path.split('/')[-1]) for path in res])
slides = ', '.join(['"slide{0:d}"'.format(number) for number in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    template = tpl.read()

# Substitute the template placeholders one after the other
page = template.replace("{%-- carousel:s --%}", carousel)
page = page.replace("{%-- suptitle:s --%}", "Daily")
page = page.replace("{%-- docs:s --%}", docs)
page = page.replace("{%-- slides:s --%}", slides)

with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

7  publications in the last day.


In [15]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice

# Maximum number of papers shown in the grid
npub = 6
# Most recent first; ties on the date are broken by file name so that the
# selection does not depend on the order in which the files were listed.
latest_first = sorted(dates, key=lambda x: (x[1], x[0]), reverse=True)
res = [k[0] for k in islice(latest_first, npub)]
# Fewer papers than requested may exist: size the grid on what was found so
# that the slides and the documents always match one to one.
npub = len(res)
print(len(res), " publications selected.")

grid = create_grid(npub)
# Quoted, comma-separated base names of the documents / ids of the slides
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
