# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 
import re

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    pass

class AffiliationError(RuntimeError):
    pass

def validation(source: str):
    """Raises error paper during parsing of source file
    
    Allows checks before parsing TeX code.
    
    Raises AffiliationWarning
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        raise AffiliationError("mpia.affiliation_verifications: " + check)

        
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper: Arxiv paper
    :returns: markdown text
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    txt = f"""<img src={url}"https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt


def clean_non_western_encoded_characters_commands(text: str) -> str:
    """ Remove non-western encoded characters from a string
    List may need to grow.
    
    :param text: the text to clean
    :return: the cleaned text
    """
    text = re.sub(r"(\\begin{CJK}{UTF8}{gbsn})(.*?)(\\end{CJK})", r"\2", text)
    return text


def get_initials(name: str) -> str:
    """ Get the short name, e.g., A.-B. FamName
    :param name: full name
    :returns: initials
    """
    initials = []
    # account for non western names often in ()
    if '(' in name:
        name = clean_non_western_encoded_characters_commands(name)
        suffix = re.findall(r"\((.*?)\)", name)[0]
        name = name.replace(f"({suffix})", '')
    else:
        suffix = ''
    split = name.split()
    for token in split[:-1]:
        if '-' in token:
            current = '-'.join([k[0] + '.' for k in token.split('-')])
        else:
            current = token[0] + '.'
        initials.append(current)
    initials.append(split[-1].strip())
    if suffix:
        initials.append(f"({suffix})")
    return ' '.join(initials)

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names
We then get all new papers from Arxiv and match authors

In [3]:
# deal with the author list and edge cases of people that cannot be consistent on their name  

def filter_non_scientists(name: str) -> bool:
    """ Loose filter on expected authorships

    removing IT, administration, technical staff
    :param name: name
    :returns: False if name is not a scientist
    """
    remove_list = ['Licht', 'Binroth', 'Witzel', 'Jordan',
                   'Zähringer', 'Scheerer', 'Hoffmann', 'Düe',
                   'Hellmich', 'Enkler-Scharpegge', 'Witte-Nguy',
                   'Dehen', 'Beckmann', 'Jager', 'Jäger'
                  ]

    for k in remove_list:
        if k in name:
            return False
    return True

def add_author_to_list(author_list: list) -> list:
    """ Add author to list if not already in list
    
    :param author: author name
    :param author_list: list of authors
    :returns: updated list of authors
    """
    add_list = ['T. Henning']

    for author in add_list:
        if author not in author_list:
            author_list.append(author)
    return author_list

# get list from MPIA website
# filter for non-scientists (mpia.get_mpia_mitarbeiter_list() does some filtering)
mpia_authors = [k[1] for k in mpia.get_mpia_mitarbeiter_list() if filter_non_scientists(k[1])]
# add some missing author because of inconsistencies in their MPIA name and author name on papers
mpia_authors = add_author_to_list(mpia_authors)

In [4]:
new_papers = get_new_papers()
# add manual references
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

def robust_call(fn, value, *args, **kwargs):
    try:
        return fn(value, *args, **kwargs)
    except Exception:
        return value

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [robust_call(mpia.get_initials, k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, mpia_authors, verbose=True)
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

K. Jahnke  ->  K. Jahnke  |  ['K. Jahnke']
I. J. M. Crossfield  ->  I. J. M. Crossfield  |  ['I. J. M. Crossfield']
M. Zhang  ->  M. Zhang  |  ['M. Zhang']
J. Liu  ->  J. Liu  |  ['J. Liu']
P. Garcia  ->  A. P. Garcia  |  ['P. Garcia']
Y. Wang  ->  Y. Wang  |  ['Y. Wang']
Arxiv has 63 new papers today
          5 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [5]:
documents = []
failed = []
for paper in tqdm(candidates):
    # debug crap
    paper['identifier'] = paper['identifier'].lower().replace('arxiv:', '').replace(r'\n', '').strip()
    paper_id = paper['identifier']
    
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [get_initials(k) for k in doc.authors], 
                mpia_authors, verbose=True)
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print("Issues with the citations")
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/5 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2508.02779


extracting tarball to tmp_2508.02779...

 done.


Found 85 bibliographic references in tmp_2508.02779/AandA.bbl.
Retrieving document from  https://arxiv.org/e-print/2508.02782


extracting tarball to tmp_2508.02782...

 done.
Retrieving document from  https://arxiv.org/e-print/2508.03019


extracting tarball to tmp_2508.03019...

 done.
Retrieving document from  https://arxiv.org/e-print/2508.03229


extracting tarball to tmp_2508.03229... done.


Found 112 bibliographic references in tmp_2508.03229/main.bbl.
Retrieving document from  https://arxiv.org/e-print/2508.03291


extracting tarball to tmp_2508.03291...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper. see `logs-{today date}.md` 

In [6]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"


with open(logfile, 'w') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Sucessful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2508.02779-b31b1b.svg)](https://arxiv.org/abs/2508.02779) | **EUCLID: Photometric redshift calibration with self-organising maps**  |
|| W. Roster, et al. -- incl., <mark>K. Jahnke</mark> |
|*Appeared on*| *2025-08-06*|
|*Comments*| *20 pages, 16 figures*|
|**Abstract**|            The Euclid survey aims to trace the evolution of cosmic structures up to redshift $z$ $\sim$ 3 and beyond. Its success depends critically on obtaining highly accurate mean redshifts for ensembles of galaxies $n(z)$ in all tomographic bins, essential for deriving robust cosmological constraints. However, photometric redshifts (photo-$z$s) suffer from systematic biases arising from various sources of uncertainty. To address these challenges, we utilised self-organising maps (SOMs) with mock samples resembling the Euclid Wide Survey (EWS), to validate Euclid's uncertainty requirement of $|\Delta\langle z \rangle| = \langle z_{\text{est}} \rangle - \langle z \rangle \leq 0.002 (1+z)$ per tomographic bin, assuming DR3-level data. We observe that defining the redshift tomography using the mean spectroscopic redshift (spec-$z$) per SOM cell, results in none of the ten tomographic redshift bins satisfying the requirement. In contrast, the redshift tomography on the photo-$z$s of the EWS-like sample yields superior results, with eight out of ten bins [$0 < z\leq 2.5$] meeting the Euclid requirement. To enhance the realism of our study, we morph our calibration sample to mimic the C3R2 survey in incremental steps. In this context, a maximum of six out of ten bins meet the requirement, strongly advocating the adoption of a redshift tomography defined by the photo-$z$s of individual galaxies rather than the commonly used mean spec-$z$ of SOM cells. To examine the impact on the expected biases for $\Omega_{\text{m}}$, $\sigma_{8}$, and $\Delta w_{0}$ measured by Euclid, we perform a Fisher forecast for cosmic shear only, based on our redshift uncertainties. Here, we find that even under an evaluation of the uncertainty where the impact of the redshift bias is substantial, most absolute biases remain below 0.1$\sigma$ in the idealised scenario and below 0.3$\sigma$ in the more realistic case.         |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2508.03229-b31b1b.svg)](https://arxiv.org/abs/2508.03229) | **The ALMA-QUARKS Survey: III. Clump-to-core fragmentation and search for high-mass starless cores**  |
|| D. Yang, et al. -- incl., <mark>P. Garcia</mark> |
|*Appeared on*| *2025-08-06*|
|*Comments*| *30 pages, 16 figures, accepted by ApJS*|
|**Abstract**|            The Querying Underlying mechanisms of massive star formation with ALMA-Resolved gas Kinematics and Structures (QUARKS) survey observed 139 infrared-bright (IR-bright) massive protoclusters at 1.3 mm wavelength with ALMA. This study investigates clump-to-core fragmentation and searches for candidate high-mass starless cores within IR-bright clumps using combined ALMA 12-m (C-2) and Atacama Compact Array (ACA) 7-m data, providing $\sim$ 1 arcsec ($\sim\rm0.02~pc$ at 3.7 kpc) resolution and $\sim\rm0.6\,mJy\,beam^{-1}$ continuum sensitivity ($\sim 0.3~M_{\odot}$ at 30 K). We identified 1562 compact cores from 1.3 mm continuum emission using getsf. Observed linear core separations ($\lambda_{\rm obs}$) are significantly less than the thermal Jeans length ($\lambda_{\rm J}$), with the $\lambda_{\rm obs}/\lambda_{\rm J}$ ratios peaking at $\sim0.2$. This indicates that thermal Jeans fragmentation has taken place within the IR-bright protocluster clumps studied here. The observed low ratio of $\lambda_{\rm obs}/\lambda_{\rm J}\ll 1$ could be the result of evolving core separation or hierarchical fragmentation. Based on associated signatures of star formation (e.g., outflows and ionized gas), we classified cores into three categories: 127 starless, 971 warm, and 464 evolved cores. Two starless cores have mass exceeding 16$\,M_{\odot}$, and represent high-mass candidates. The scarcity of such candidates suggests that competitive accretion-type models could be more applicable than turbulent core accretion-type models in high-mass star formation within these IR-bright protocluster clumps.         |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2508.02782-b31b1b.svg)](https://arxiv.org/abs/2508.02782) | **Validation of TESS Planet Candidates with Multi-Color Transit Photometry and TRICERATOPS+**  |
|| J. G. Barrientos, et al. -- incl., <mark>I. J. M. Crossfield</mark> |
|*Appeared on*| *2025-08-06*|
|*Comments*| *26 pages, 7 Figures, accepted for publication in AJ*|
|**Abstract**|            We present an upgraded version of TRICERATOPS, a software package designed to calculate false positive probabilities for planet candidates identified by the Transiting Exoplanet Survey Satellite (TESS). This enhanced framework now incorporates ground-based light curves in separate bandpasses, which are routinely obtained as part of the candidate vetting process. We apply this upgraded framework to explore the planetary nature of 14 TESS planet candidates, combining primarily J band light curves acquired with the 200-inch Hale Telescope at Palomar Observatory with complementary archival observations from the Las Cumbres Observatory Global Telescope (LCOGT), the Fred Lawrence Whipple Observatory (FLWO), and the Teide Observatory, along with existing TESS data and contrast curves from high-resolution imaging. As a result of this analysis we statistically validate (False Positive Probability < 1.5% and Nearby False Positive Probability < 0.1%) six new planets in five systems: TOI-1346 b, TOI-1346 c, TOI-2719 b, TOI-4155 b, TOI-6000 b, and TOI-6324 b. For these systems, we provide updated estimates of their stellar and planetary properties derived from the TESS and ground-based observations. These new systems contain planets with radii between 0.9-6 Re and orbital periods between 0.3-5.5 days. Finally, we use our upgraded version of TRICERATOPS to quantify the relative importance of multi-wavelength transit photometry and high-resolution imaging for exoplanet candidate validation, and discuss which kinds of candidates typically benefit the most from ground-based multi-color transit observations.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2508.03019-b31b1b.svg)](https://arxiv.org/abs/2508.03019) | **Spectroscopic ages for 4 million main-sequence dwarf stars from LAMOST DR10 estimated with data-driven approach**  |
|| J.-H. Wang, et al. -- incl., <mark>M. Zhang</mark>, <mark>J. Liu</mark> |
|*Appeared on*| *2025-08-06*|
|*Comments*| *Accepted for publication in ApJS*|
|**Abstract**|            Stellar age determination for large samples of stars opens new avenues for a broad range of astronomical sciences. While precise stellar ages for evolved stars have been derived from large ground- and space-based stellar surveys, reliable age determination for cool main-sequence dwarf stars remains a challenge. In this work, we set out to estimate the age of dwarf stars from the LAMOST spectra with a data-driven approach. We build a training set by using wide binaries that the primary component has reliable isochrone age estimate thus gives the age of the secondary. This training set is further supplemented with field stars and cluster stars whose ages are known. We then train a data-driven model for inferring age from their spectra with the XGBoost algorithm. Given a spectral signal-to-noise ratio greater than 50, the age estimation precise to 10% to 25% for K-type stars, as younger stars have larger relative errors. Validations suggest that the underlying information used for our age estimation is largely attributed to the LAMOST spectral features of chemical abundances. It means our result is a manifestation of stellar chemical clock effectively acted on LAMOST spectra ($R\simeq1800$). Applying our model to the LAMOST DR10 yields a massive age catalog for $\sim4$ million dwarf stars. Statistical properties, such as the age distribution, age-abundance and age-stellar activity relations of the sample stars are discussed. The catalog is publicly accessible and can be helpful for extensive sciences from detection and characterization of Earth-like planets to Galactic archaeology.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2508.03291-b31b1b.svg)](https://arxiv.org/abs/2508.03291) | **Investigation on deep learning-based galaxy image translation models**  |
|| H. Ruan, et al. -- incl., <mark>Y. Wang</mark> |
|*Appeared on*| *2025-08-06*|
|*Comments*| *Accepted at A&A; 18+6 pages; 12+6 figures*|
|**Abstract**|            Galaxy image translation is an important application in galaxy physics and cosmology. With deep learning-based generative models, image translation has been performed for image generation, data quality enhancement, information extraction, and generalized for other tasks such as deblending and anomaly detection. However, most endeavors on image translation primarily focus on the pixel-level and morphology-level statistics of galaxy images. There is a lack of discussion on the preservation of complex high-order galaxy physical information, which would be more challenging but crucial for studies that rely on high-fidelity image translation. Therefore, we investigated the effectiveness of generative models in preserving high-order physical information (represented by spectroscopic redshift) along with pixel-level and morphology-level information. We tested four representative models, i.e. a Swin Transformer, an SRGAN, a capsule network, and a diffusion model, using the SDSS and CFHTLS galaxy images. We found that these models show different levels of incapabilities in retaining redshift information, even if the global structures of galaxies and morphology-level statistics can be roughly reproduced. In particular, the cross-band peak fluxes of galaxies were found to contain meaningful redshift information, whereas they are subject to noticeable uncertainties in the translation of images, which may substantially be due to the nature of many-to-many mapping. Nonetheless, imperfect translated images may still contain a considerable amount of information and thus hold promise for downstream applications for which high image fidelity is not strongly required. Our work can facilitate further research on how complex physical information is manifested on galaxy images, and it provides implications on the development of image translation models for scientific use.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [7]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images"""
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        os.mkdir(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    print("found figures", fig_fnames)
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        if not os.path.exists(fname):
            print("file not found", fname)
            continue
        print("copying ", fname, "to", directory)
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        try:
            os.makedirs(destdir)
        except FileExistsError:
            pass
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    [print("    + " + os.path.join(directory,fk)) for fk in fig_fnames]

In [8]:
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

found figures ['tmp_2508.02779/./vio_SOM_cell_tomo.png', 'tmp_2508.02779/./cshift.png', 'tmp_2508.02779/./Flowchart.png']
copying  tmp_2508.02779/./vio_SOM_cell_tomo.png to _build/html/
copying  tmp_2508.02779/./cshift.png to _build/html/
copying  tmp_2508.02779/./Flowchart.png to _build/html/
exported in  _build/html/2508.02779.md
    + _build/html/tmp_2508.02779/./vio_SOM_cell_tomo.png
    + _build/html/tmp_2508.02779/./cshift.png
    + _build/html/tmp_2508.02779/./Flowchart.png
found figures ['tmp_2508.03229/./flowchart1.png', 'tmp_2508.03229/./continuum.png', 'tmp_2508.03229/./Jeans_Separation.png']
copying  tmp_2508.03229/./flowchart1.png to _build/html/
copying  tmp_2508.03229/./continuum.png to _build/html/
copying  tmp_2508.03229/./Jeans_Separation.png to _build/html/
exported in  _build/html/2508.03229.md
    + _build/html/tmp_2508.03229/./flowchart1.png
    + _build/html/tmp_2508.03229/./continuum.png
    + _build/html/tmp_2508.03229/./Jeans_Separation.png


## Display the papers

Not necessary but allows for a quick check.

In [9]:
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\orcid}[1]$
$\newcommand{\arraystretch}{1.1}$
$\newcommand{\arraystretch}{1.0}$</div>



<div id="title">

# $\Euclid$ \/: Photometric redshift calibration with \ self-organising maps$\thanks{This paper is published on     behalf of the Euclid Consortium}$

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2508.02779-b31b1b.svg)](https://arxiv.org/abs/2508.02779)<mark>Appeared on: 2025-08-06</mark> -  _20 pages, 16 figures_

</div>
<div id="authors">

W. Roster, et al. -- incl., <mark>K. Jahnke</mark>

</div>
<div id="abstract">

**Abstract:** The $\Euclid$ large-scale weak-lensing survey aims to trace the evolution of cosmic structures up to redshift $z$ $\sim$ 3 and beyond. Its success depends critically on obtaining highly accurate mean redshifts for ensembles of galaxies $n(z)$ in all tomographic bins, essential for deriving robust cosmological constraints. However, photometric redshifts (photo- $z$ s) suffer from systematic biases, arising from various sources of uncertainty and dominated by selection effects of the spectroscopic sample used for calibration. To address these challenges, we utilised self-organising maps (SOMs) with mock samples resembling the Euclid Wide Survey (EWS) from the Flagship2 simulation, to validate $\Euclid$ 's uncertainty requirement of $|\Delta\langle z \rangle| = \langle z_{\text{est}} \rangle - \langle z \rangle \leq 0.002 (1+z)$ per tomographic bin, assuming DR3-level data. Consequently, we identify the most effective galaxy selection for our tomographic bins, while systematically examining the implementation of quality control cuts to reduce sources of uncertainty. In particular, we observe that defining the redshift tomography using the mean spectroscopic redshift (spec- $z$ ) per SOM cell, results in none of the ten tomographic redshift bins satisfying the requirement. In contrast, the redshift tomography on the photo- $z$ s of the EWS-like sample yields superior results, with eight out of ten bins [ $0 < z\leq 2.5$ ] meeting the $\Euclid$ requirement. To enhance the realism of our study, we morph our calibration sample to mimic the C3R2 survey in incremental steps. In this context, a maximum of six out of ten bins meet the requirement, strongly advocating the adoption of a redshift tomography defined by the photo- $z$ s of individual galaxies rather than the commonly used mean spec- $z$ of SOM cells. To examine the impact on the expected biases for $\Omega_{\text{m}}$ , $\sigma_{8}$ , and $\Delta w_{0}$ measured by $\Euclid$ , we perform a Fisher forecast for cosmic shear only, based on our redshift uncertainties. Here, we find that even under an evaluation of the uncertainty where the impact of the redshift bias is substantial, most absolute biases remain below 0.1 $\sigma$ in the idealised scenario and below 0.3 $\sigma$ in the more realistic case.

</div>

<div id="div_fig1">

<img src="tmp_2508.02779/./vio_SOM_cell_tomo.png" alt="Fig12" width="100%"/>

**Figure 12. -** Violin plot of biases per bin using tomography defined by SOM spec-$z$ before (red) and after (blue) applying QC. These distributions also include box-and-whisker plots. Lastly, the dynamic ($\Delta\langle z \rangle$) < 0.002(1+$z$) \Euclid requirement are given by grey shaded area. (*fig:6.10*)

</div>
<div id="div_fig2">

<img src="tmp_2508.02779/./cshift.png" alt="Fig2" width="100%"/>

**Figure 2. -** *Top*: Mean spec-$z$ distributions for the validation and calibration sample subject to a non-trivial selection function. The respective distributions are also depicted for a single SOM cell chosen at random. *Bottom*: Display of the distributions found for the same randomly selected cell after applying photo-$z$ binning. (*fig:2.02*)

</div>
<div id="div_fig3">

<img src="tmp_2508.02779/./Flowchart.png" alt="Fig5" width="100%"/>

**Figure 5. -** Flowchart outlining the sequential process of utilising noiseless SOMs to create a set of calibration samples, which are then used to train a secondary set of SOMs using noisy photometric data. Lastly, the trained noisy SOMs are populated by the EWS-like data. (*fig:4.1.1*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2508.02779"></div>

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\vdag}{(v)^\dagger}$
$\newcommand$
$\newcommand$
$\newcommand{\arcm}{\hbox{^\prime}}$
$\newcommand{\etal}{{\rm et al.}\thinspace}$
$\newcommand{\eg}{{\it e.g. }}$
$\newcommand{\etc}{{\it etc. }}$
$\newcommand{\ie}{{\it i.e. }}$
$\newcommand{\cf}{{\it c.f. }}$
$\newcommand{◦ee}{\hbox{^\circ}}$
$\newcommand{\NHH}{\ensuremath{N_{\mathrm{H_{2}}}}}$
$\newcommand{\s}{\ensuremath{\mbox{~s}}}$
$\newcommand{\ps}{\ensuremath{\s^{-1}}}$
$\newcommand{\cm}{\ensuremath{\mbox{~cm}}}$
$\newcommand{\pcmsq}{\ensuremath{\cm^{-2}}}$
$\newcommand{\pcmcu}{\ensuremath{\cm^{-3}}}$
$\newcommand{\km}{\ensuremath{\mbox{~km}}}$
$\newcommand{\erg}{\ensuremath{\mbox{~erg}}}$
$\newcommand{\ergps}{\ensuremath{\erg \ps}}$
$\newcommand{\mJy}{\ensuremath{\mbox{~mJy}}}$
$\newcommand{\ML}{\ensuremath{\mbox{\Msol/\LBsol}}}$
$\newcommand{\Hi}{H\textsc{i}}$
$\newcommand{\Hii}{H\textsc{ii}}$
$\newcommand{\Ha}{\ensuremath{\mathrm{H\alpha}}}$
$\newcommand{\nh}{\ensuremath{\mathrm{n}_\mathrm{H}}}$
$\newcommand{\Mdot}{\ensuremath{\dot{\mathrm{M}}}}$
$\newcommand{\thco}{^{13}CO}$
$\newcommand{\twco}{^{12}CO}$
$\newcommand{\etco}{C^{18}O}$
$\newcommand{\vel}{km s^{-1}}$
$\newcommand{\filAname}{G350.5-N}$
$\newcommand{\filBname}{G350.5-S}$
$\newcommand{\imcoor}{\alpha_{2000}=17^{\mathrm{h}}18^{\mathrm{m}}13\fs84, \delta_{2000}=-36◦28\arcmin21\farcs5}$
$\newcommand{\her}{Herschel}$
$\newcommand{\mline}{M_{\rm line}}$
$\newcommand{\msun}{M_{\odot}}$
$\newcommand{\lsun}{L_{\odot}}$
$\newcommand{\um}{\mum}$
$\newcommand{\cmcm}{cm^{-2}}$
$\newcommand{\egcite}{\citep[e.g.,][]}$
$\newcommand{\lmsun}{M_{\odot}~pc^{-1}}$
$\newcommand{\chiiioh}{CH_3OH}$
$\newcommand{\hciiin}{HC_3N}$
$\newcommand{\hcop}{HCO^{+}}$
$\newcommand{\htcop}{H^{13}CO^{+}}$
$\newcommand{\halpha}{H40_{\alpha}}$
$\newcommand{\chthocho}{CH_3OCHO}$
$\newcommand{\chthcho}{CH_3CHO}$
$\newcommand{\chthoh}{CH_3OH}$
$\newcommand{\chii}{H/UC-H\textsc{ii}}$
$\newcommand{\uchii}{UC-H\textsc{ii}}$
$\newcommand{\hchii}{HC-H\textsc{ii}}$
$\newcommand{\hii}{H\textsc{ii}}$
$\newcommand{\CHMC}{s-cHMC}$
$\newcommand{\PCHMC}{w-cHMC}$
$\newcommand{\filname}{G34}$
$\newcommand{\mdotyr}{M_{\odot}~yr^{-1}}$
$\newcommand{\tred}{\textcolor{red}}$
$\newcommand{\tblue}{\textcolor{blue}}$
$\newcommand{\torange}{\textcolor{orange}}$
$\newcommand{\orcidauthorHL}{0000-0003-3343-9645}$
$\newcommand{\mgt}{\color{magenta}}$
$\newcommand{\arraystretch}{1.8}$
$\newcommand{\arraystretch}{1.8}$
$\newcommand\aj{{\rm{AJ}}}$
$\newcommand\araa{{\rm{ARA\&A}}}$
$\newcommand\apj{{\rm{ApJ}}}$
$\newcommand\icarus{{\rm{Icarus}}}$
$\newcommand\apjs{{\rm{ApJS}}}$
$\newcommand\apjl{{\rm{ApJL}}}$
$\newcommand\apss{{\rm{Ap\&SS}}}$
$\newcommand\aap{{\rm{A\&A}}}$
$\newcommand\aapr{{\rm{A\&AR}}}$
$\newcommand\aaps{{\rm{A\&AS}}}$
$\newcommand\baas{{\rm{BAAS}}}$
$\newcommand\memras{{\rm{MmRAS}}}$
$\newcommand\mnras{{\rm{MNRAS}}}$
$\newcommand\pasp{{\rm{PASP}}}$
$\newcommand\prl{{\rm{Phys. Rev. Lett.}}}$
$\newcommand\jqsrt{{\rm{Journal of Quantitative Spectroscopy and Radiative$
$Transfer}}}$
$\newcommand\actaa{{\rm{Acta Astronomica}}}$</div>



<div id="title">

# $\bf$ The ALMA-QUARKS Survey: III. Clump-to-core fragmentation and search for high-mass starless cores

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2508.03229-b31b1b.svg)](https://arxiv.org/abs/2508.03229)<mark>Appeared on: 2025-08-06</mark> -  _30 pages, 16 figures, accepted by ApJS_

</div>
<div id="authors">

D. Yang, et al.

</div>
<div id="abstract">

**Abstract:** The Querying Underlying mechanisms of massive star formation with ALMA-Resolved gas Kinematics and Structures (QUARKS) survey observed 139 infrared-bright (IR-bright) massive protoclusters at 1.3 mm wavelength with ALMA. This study investigates clump-to-core fragmentation and searches for candidate high-mass starless cores within IR-bright clumps using combined ALMA 12-m (C-2) and Atacama Compact Array (ACA) 7-m data, providing $\sim$ 1 $\arcsec$ ( $\sim\rm0.02$ pc at 3.7 kpc) resolution and $\sim\rm0.6 mJy beam^{-1}$ continuum sensitivity ( $\sim 0.3 $ $\msun$ at 30 K). We identified 1562 compact cores from 1.3 mm continuum emission using $_ getsf_$ . Observed linear core separations ( $\lambda_{\rm obs}$ ) are significantly less than the thermal Jeans length ( $\lambda_{\rm J}$ ), with the $\lambda_{\rm obs}/\lambda_{\rm J}$ ratios peaking at $\sim0.2$ . This indicates that thermal Jeans fragmentation has taken place within the IR-bright protocluster clumps studied here. The observed low ratio of $\lambda_{\rm obs}/\lambda_{\rm J}\ll 1$ could be the result of evolving core separation or hierarchical fragmentation. Based on associated signatures of star formation (e.g., outflows and ionized gas), we classified cores into three categories: 127 starless, 971 warm, and 464 evolved cores. Two starless cores have mass exceeding 16 $\msun$ , and represent high-mass candidates. The scarcity of such candidates suggests that competitive accretion-type models could be more applicable than turbulent core accretion-type models in high-mass star formation within these IR-bright protocluster clumps.

</div>

<div id="div_fig1">

<img src="tmp_2508.03229/./flowchart1.png" alt="Fig1" width="100%"/>

**Figure 1. -** Flow chart for ALMA-QUARKS ACA and TM2 combined data reduction.
    The elliptical boxes denote the raw data and the final reduced data. The rectangle indicates a specific processing step, while the parallelogram does an intermediate product during the data reduction. (*fig:flowchat*)

</div>
<div id="div_fig2">

<img src="tmp_2508.03229/./continuum.png" alt="Fig10" width="100%"/>

**Figure 10. -** Comparison of ATOMS 3 mm and QUARKS 1.3 mm continuum images for two protocluster clumps. The black dashed circle in each panel defines the field of view of the QUARKS survey. Left panels (a, c): ATOMS 3 mm continuum emission. The dashed contour levels are [3, 6, 12, 24, 48, 96] rms, with $\rm rms$\s$im 1.4$ and $\rm $\s$im 0.4 mJy beam^{-1}$ for I17271-3439 and I17244-3536, respectively. Red contours correspond to integrated emission of the $\rm H40\alpha$ recombination line representative of ionized gas from the ATOMS survey, with rms $\sim\rm 0.2$ and $\sim \rm 0.1 Jy beam^{-1} km s^{-1}$ for I17271-3439 and I17244-3536, respectively.
    Right panels (b, d): QUARKS TM2+ACA 1.3 mm continuum emission. The contour levels are the same as in left panels, but with $\rm rms $\s$im 2.1$ and $\sim \rm 0.4 mJy beam^{-1}$ for I17271-3439 and I17244-3536, respectively.
    The cross symbols correspond to the cores extracted from 1.3mm continuum emission using the _ getsf_ algorithm, where black markers denote those detected by the algorithm at a configuration with a minimum source size employing the beam size, while red markers correspond to relatively faint cores detected at the other configuration employing half-beam.
    The beam sizes of ATOMS and QUARKS TM2+ACA continuum emission are shown on the lower left, and the 0.1 pc scale bar is on the upper right of each panel. (*fig:mst*)

</div>
<div id="div_fig3">

<img src="tmp_2508.03229/./Jeans_Separation.png" alt="Fig4" width="100%"/>

**Figure 4. -** Distribution of the ratio between the observed core separation to the predicted thermal Jeans length. The black dashed line shows the peak of the distribution. The red contour represents the distribution with the spatial projection correction applied (see text).
     (*fig:Jeans_sep*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2508.03229"></div>

# Create HTML index

In [10]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now.today() - modified
    if delta <= timedelta(days=days):
        res.append((delta.seconds, fk))
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

135  publications files modified in the last 7 days.


In [11]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days """
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= str(datetime.date.today() - datetime.timedelta(days=days)):
            yield fname

def extract_appearance_dates(lst_file):
    dates = []

    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    for fname in lst:
        with open(fname, 'r') as f:
            found_date = False
            for line in f:
                if not found_date:
                    if "Appeared on" in line:
                        found_date = True
                        dates.append((fname, get_date(line)))
                else:
                    break
    return dates

from glob import glob
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

8  publications in the last 7 days.


In [12]:
def create_carousel(npub=4):
    """ Generate the HTML code for a carousel with `npub` slides """
    carousel = ["""  <div class="carousel" """,
                """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""
                ]
    
    item_str = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        carousel.append(item_str.format(k=k))
    carousel.append("  </div>")
    return '\n'.join(carousel)

def create_grid(npub=4):
    """ Generate the HTML code for a flat grid with `npub` slides """
    grid = ["""  <div class="grid"> """,
                ]
    
    item_str = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        grid.append(item_str.format(k=k))
    grid.append("  </div>")
    return '\n'.join(grid)

In [13]:
carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "7-day archives" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [14]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "Daily" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(carousel, docs, slides)
# print(page)
with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

4  publications in the last day.


In [15]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

npub = 6
res = [k[0] for k in (islice(reversed(sorted(dates, key=lambda x: x[1])), 6))]
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
