# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 
import re

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    """Warning category for affiliation-related issues.

    Registered below with an 'always' filter so that every occurrence is shown.
    """

class AffiliationError(RuntimeError):
    """Error raised by :func:`validation` when the affiliation check does not pass."""

def validation(source: str):
    """Check the affiliations found in a source file before parsing it.

    Meant to be passed as the ``validation`` callback of
    ``latex.LatexDocument`` so that the check runs before the TeX code is parsed.

    :param source: content of the TeX source to check
    :raises AffiliationError: if ``mpia.affiliation_verifications`` returns
                              anything other than ``True``; the returned value
                              is included in the error message.
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        # str() keeps the message construction safe even if the check returns
        # a non-string value (e.g. False or None).
        raise AffiliationError("mpia.affiliation_verifications: " + str(check))

        
# Always show AffiliationWarning, even when the same warning was already emitted once.
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com

    :param paper_id: Arxiv paper identifier (e.g., ``2512.05899``)
    :returns: HTML snippet (usable in markdown): a ``<div id="qrcode">``
              wrapping an ``<img>`` that points to the QR-code service
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    # The whole URL must sit inside the quoted attribute value; opening the
    # quote in the middle of the URL yields an unquoted attribute whose value
    # (and therefore the encoded QR data) contains literal '"' characters.
    txt = f"""<img src="{url}https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt


def clean_non_western_encoded_characters_commands(text: str) -> str:
    """ Strip the LaTeX CJK environment wrappers from a string.

    Only the ``\\begin{CJK}{UTF8}{gbsn} ... \\end{CJK}`` wrapper is handled
    for now (the list may need to grow); the wrapped content itself is kept.

    :param text: the text to clean
    :return: the cleaned text
    """
    cjk_wrapper = r"(\\begin{CJK}{UTF8}{gbsn})(.*?)(\\end{CJK})"
    keep_content_only = r"\2"
    return re.sub(cjk_wrapper, keep_content_only, text)


def get_initials(name: str) -> str:
    """ Get the short name, e.g., A.-B. FamName

    Every whitespace-separated token except the last is reduced to its
    initial(s); hyphenated tokens keep the hyphen (``Jean-Pierre`` -> ``J.-P.``).
    The last token is kept unchanged. A parenthesised part (often used for
    non-western names) is removed before abbreviating and appended back,
    in parentheses, at the end.

    :param name: full name
    :returns: initials followed by the last token (and the parenthesised
              suffix, if any); an empty string if `name` has no token at all
    """
    suffix = ''
    # account for non western names often in ()
    if '(' in name:
        name = clean_non_western_encoded_characters_commands(name)
        found = re.findall(r"\((.*?)\)", name)
        # an unclosed '(' yields no match: leave the name untouched in that case
        if found:
            suffix = found[0]
            name = name.replace(f"({suffix})", '')

    tokens = name.split()
    if not tokens:
        # empty name, or a name made only of a parenthesised part
        return f"({suffix})" if suffix else ''

    initials = []
    for token in tokens[:-1]:
        # drop empty fragments produced by leading/trailing/double hyphens
        parts = [part for part in token.split('-') if part]
        if not parts:
            continue
        initials.append('-'.join(part[0] + '.' for part in parts))
    initials.append(tokens[-1].strip())
    if suffix:
        initials.append(f"({suffix})")
    return ' '.join(initials)

## get list of arxiv paper candidates

We use the MPIA mitarbeiter (staff) list webpage from mpia.de to get the author names.
We then get all new papers from arXiv and match their authors against that list.

In [3]:
# deal with the author list and edge cases of people that cannot be consistent on their name  

def filter_non_scientists(name: str) -> bool:
    """ Loose filter on expected authorships

    Names containing one of the listed fragments are rejected
    (intended to remove IT, administration, and technical staff).

    :param name: name
    :returns: False if name contains one of the excluded fragments, True otherwise
    """
    excluded_fragments = (
        'Licht', 'Binroth', 'Witzel', 'Jordan',
        'Zähringer', 'Scheerer', 'Hoffmann', 'Düe',
        'Hellmich', 'Enkler-Scharpegge', 'Witte-Nguy',
        'Dehen', 'Beckmann', 'Jager', 'Jäger',
    )
    # substring match, not an exact comparison
    return not any(fragment in name for fragment in excluded_fragments)

def add_author_to_list(author_list: list) -> list:
    """ Append the hard-coded extra authors that are missing from the list

    The list is modified in place and also returned.

    :param author_list: list of authors
    :returns: updated list of authors (the same list object)
    """
    extra_authors = ('T. Henning',)

    for candidate in extra_authors:
        if candidate in author_list:
            continue
        author_list.append(candidate)
    return author_list

# Build the list of author names to match against:
#  1. get the list from the MPIA website
#     (mpia.get_mpia_mitarbeiter_list() does some filtering already),
#  2. keep the name (second field of each entry) when it passes filter_non_scientists,
#  3. add the authors missing because their MPIA name and their name on papers differ.
mpia_authors = add_author_to_list([
    entry[1]
    for entry in mpia.get_mpia_mitarbeiter_list()
    if filter_non_scientists(entry[1])
])

In [4]:
# Retrieve the new papers (see arxiv_on_deck_2.arxiv2.get_new_papers).
new_papers = get_new_papers()
# add manual references
# Identifiers listed here are fetched one by one and appended to the new papers.
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

def robust_call(fn, value, *args, **kwargs):
    """Best-effort application of `fn` to `value`.

    :param fn: callable invoked as ``fn(value, *args, **kwargs)``
    :param value: first argument given to `fn`, also used as the fallback result
    :returns: the result of the call, or `value` unchanged if the call
              raised any ``Exception``
    """
    try:
        result = fn(value, *args, **kwargs)
    except Exception:
        # deliberate fallback: keep the input as is when it cannot be processed
        result = value
    return result

# Select the papers with at least one author matching the MPIA list.
candidates = []
for paperk in new_papers:
    # Normalise every author to initials; names that cannot be processed are kept as is
    normed_author_list = [robust_call(mpia.get_initials, author)
                          for author in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, mpia_authors, verbose=True)
    # a highlighted author is recognised by the 'mark' tag in its text
    matches = [pair for pair in zip(hl_authors, paperk['authors']) if 'mark' in pair[0]]
    # the paper keeps the highlighted author list whether or not it is selected
    paperk['authors'] = hl_authors
    if not matches:
        continue
    # only select paper if an author matched our list
    candidates.append(paperk)

n_new_papers = len(new_papers)
n_candidates = len(candidates)
print("""Arxiv has {0:,d} new papers today""".format(n_new_papers))
print("""          {0:,d} with possible author matches""".format(n_candidates))

Y. Wu  ->  Y. Wu  |  ['Y. Wu']
S. Kraus  ->  S. Kraus  |  ['S. Kraus']
M. Zhang  ->  M. Zhang  |  ['M. Zhang']
Y. Wang  ->  Y. Wang  |  ['Y. Wang']
H. Klahr  ->  H. Klahr  |  ['H. Klahr']
T. Henning  ->  T. Henning  |  ['T. Henning']


K. Jahnke  ->  K. Jahnke  |  ['K. Jahnke']
K. Naidoo  ->  K. Naidoo  |  ['K. Naidoo']
Arxiv has 51 new papers today
          7 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [5]:
# Parse every candidate paper and collect:
#   documents: (paper_id, markdown text) for the papers that were processed,
#   failed:    (paper, reason) for the papers that were rejected or raised an error.
documents = []
failed = []
for paper in tqdm(candidates):
    # Normalise the identifier: lowercase, without the 'arxiv:' prefix or surrounding whitespace.
    # NOTE(review): r'\n' is a raw string, so this removes a literal backslash followed by 'n',
    # not a newline character -- confirm this is the intent.
    paper['identifier'] = paper['identifier'].lower().replace('arxiv:', '').replace(r'\n', '').strip()
    paper_id = paper['identifier']
    
    # Working folder of the paper sources; an existing folder is reused as is.
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            # `validation` is given as a callback; an AffiliationError it raises is caught below.
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            # NOTE(review): `msg` is assigned but not used anywhere in the visible code.
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        # If the number of parsed authors differs from the arXiv record, use the arXiv
        # author list (already highlighted when the candidates were selected).
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [get_initials(k) for k in doc.authors], 
                mpia_authors, verbose=True)
        # Fall back to the arXiv abstract when none was extracted from the source.
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        # Comment line: arXiv badge, appearance date, then the arXiv comments if any.
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        # Best effort: on any error the markdown is kept without replaced citations.
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print("Issues with the citations")
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        # Any other error: warn, record the failure, and go on with the next paper.
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/7 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2512.05180


extracting tarball to tmp_2512.05180...

 done.
Retrieving document from  https://arxiv.org/e-print/2512.05220


extracting tarball to tmp_2512.05220...

 done.
Retrieving document from  https://arxiv.org/e-print/2512.05548
extracting tarball to tmp_2512.05548... done.
Retrieving document from  https://arxiv.org/e-print/2512.05622


extracting tarball to tmp_2512.05622...

 done.
Retrieving document from  https://arxiv.org/e-print/2512.05751


extracting tarball to tmp_2512.05751...

 done.
Retrieving document from  https://arxiv.org/e-print/2512.05899


extracting tarball to tmp_2512.05899...

 done.


Error retrieving bib data for Q1-SP052: 'q1-sp052'
Retrieving document from  https://arxiv.org/e-print/2512.05909


extracting tarball to tmp_2512.05909...

 done.


Issues with the citations
entry with key miguel_a_aragon_calvo_galaxy_2016 has a duplicate doi field


### Export the logs

Throughout, we also keep track of the logs per paper. See `_build/html/logs/log-{today's date}.md`.

In [6]:
import datetime

# One log file per run date
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"
# The log directory may not exist yet (e.g., fresh checkout); open() does not create it.
os.makedirs(os.path.dirname(logfile), exist_ok=True)


# Explicit UTF-8: titles and abstracts contain non-ASCII characters and the
# platform default encoding is not guaranteed to handle them.
with open(logfile, 'w', encoding='utf-8') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Successful papers\n\n")
    display(Markdown("## Successful papers"))
    # identifiers of the papers for which a document was generated
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    # sort by reason so that identical reasons end up under a single heading
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        # affiliation rejections are expected (green); anything else is a real error (red)
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        # write a heading only when the reason changes
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05899-b31b1b.svg)](https://arxiv.org/abs/2512.05899) | **Euclid Quick Data Release (Q1). From simulations to sky: Advancing machine-learning lens detection with real Euclid data**  |
|| E. Collaboration, et al. -- incl., <mark>K. Jahnke</mark> |
|*Appeared on*| *2025-12-08*|
|*Comments*| *16 pages*|
|**Abstract**|            In the era of large-scale surveys like Euclid, machine learning has become an essential tool for identifying rare yet scientifically valuable objects, such as strong gravitational lenses. However, supervised machine-learning approaches require large quantities of labelled examples to train on, and the limited number of known strong lenses has lead to a reliance on simulations for training. A well-known challenge is that machine-learning models trained on one data domain often underperform when applied to a different domain: in the context of lens finding, this means that strong performance on simulated lenses does not necessarily translate into equally good performance on real observations. In Euclid's Quick Data Release 1 (Q1), covering 63 deg2, 500 strong lens candidates were discovered through a synergy of machine learning, citizen science, and expert visual inspection. These discoveries now allow us to quantify this performance gap and investigate the impact of training on real data. We find that a network trained only on simulations recovers up to 92% of simulated lenses with 100% purity, but only achieves 50% completeness with 24% purity on real Euclid data. By augmenting training data with real Euclid lenses and non-lenses, completeness improves by 25-30% in terms of the expected yield of discoverable lenses in Euclid DR1 and the full Euclid Wide Survey. Roughly 20% of this improvement comes from the inclusion of real lenses in the training data, while 5-10% comes from exposure to a more diverse set of non-lenses and false-positives from Q1. We show that the most effective lens-finding strategy for real-world performance combines the diversity of simulations with the fidelity of real lenses. This hybrid approach establishes a clear methodology for maximising lens discoveries in future data releases from Euclid, and will likely also be applicable to other surveys such as LSST.         |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05909-b31b1b.svg)](https://arxiv.org/abs/2512.05909) | **Learning the Cosmic Web: Graph-based Classification of Simulated Galaxies by their Dark Matter Environments**  |
|| D. Kololgi, <mark>K. Naidoo</mark>, A. Saintonge, O. Lahav |
|*Appeared on*| *2025-12-08*|
|*Comments*| *15 pages, 7 figures, 9 tables, submitted to Royal Astronomical Society Techniques and Instruments*|
|**Abstract**|            We present a novel graph-based machine learning classifier for identifying the dark matter cosmic web environments of galaxies. Large galaxy surveys offer comprehensive statistical views of how galaxy properties are shaped by large-scale structure, but this requires robust classifications of galaxies' cosmic web environments. Using stellar mass-selected IllustrisTNG-300 galaxies, we apply a three-stage, simulation-based framework to link galaxies to the total (mainly dark) underlying matter distribution. Here, we apply the following three steps: First, we assign the positions of simulated galaxies to a void, wall, filament, or cluster environment using the T-Web classification of the underlying matter distribution. Second, we construct a Delaunay triangulation of the galaxy distribution to summarise the local geometric structure with ten graph metrics for each galaxy. Third, we train a graph attention network (GAT) on each galaxy's graph metrics to predict its cosmic web environment. For galaxies with stellar mass $\mathrm{>10^9 M_{\odot}}$, our GAT+ model achieves an accuracy of $85\,\%$, outperforming graph-agnostic multilayer perceptrons and graph convolutional networks. Our results demonstrate that graph-based representations of galaxy positions provide a powerful and physically meaningful way to infer dark matter environments. We plan to apply this simulation-based graph modelling to investigate how the properties of observed galaxies from the Dark Energy Spectroscopic Instrument (DESI) survey are influenced by their dark matter environments.         |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05220-b31b1b.svg)](https://arxiv.org/abs/2512.05220) | **Multiple outflows and delayed ejections revealed by early imaging of novae**  |
|| E. Aydi, et al. -- incl., <mark>S. Kraus</mark> |
|*Appeared on*| *2025-12-08*|
|*Comments*| *62 pages, 29 Figures, 6 Tables, Published in Nature Astronomy*|
|**Abstract**|            Novae are thermonuclear eruptions on accreting white dwarfs in interacting binaries. Although most of the accreted envelope is expelled, the mechanism -- impulsive ejection, multiple outflows or prolonged winds, or a common-envelope interaction -- remains uncertain. GeV $\gamma$-ray detections from $>20$ Galactic novae establish these eruptions as nearby laboratories for shock physics and particle acceleration, underscoring the need to determine how novae eject their envelopes. Here we report on near-infrared interferometry, supported with multiwavelength observations, of two $\gamma$-ray detected novae. The images of the very fast 2021 nova V1674~Her, taken just 2--3 days after discovery, reveal the presence of two perpendicular outflows. The interaction between these outflows likely drives the observed $\gamma$-ray emission. Conversely, the images of the very slow 2021 nova V1405~Cas suggest a delay in the ejection of the bulk of the accreted envelope of more than 50 days after the start of eruption, as the nova slowly rises to visible peak and during which the envelope engulfed the system in a common envelope phase. These unprecedented images offer direct observational evidence that the mechanisms driving mass ejection from the surfaces of accreting white dwarfs are not as simple as previously thought, revealing multiple outflows and delayed ejections.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: '69117' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05751-b31b1b.svg)](https://arxiv.org/abs/2512.05751) | **Exoplanet formation inference using conditional invertible neural networks**  |
|| R. Burn, V. F. Ksoll, <mark>H. Klahr</mark>, <mark>T. Henning</mark> |
|*Appeared on*| *2025-12-08*|
|*Comments*| *10 pages, accepted poster for the Machine Learning and the Physical Sciences Workshop at the 39th conference on Neural Information Processing Systems (NeurIPS 2025)*|
|**Abstract**|            The interpretation of the origin of observed exoplanets is usually done only qualitatively due to uncertainties of key parameters in planet formation models. To allow a quantitative methodology which traces back in time to the planet birth locations, we train recently developed conditional invertible neural networks (cINN) on synthetic data from a global planet formation model which tracks growth from dust grains to evolved final giant planets. In addition to deterministic single planet formation runs, we also include gravitationally interacting planets in multiplanetary systems, which include some measure of chaos. For the latter case, we treat them as individual planets or choose the two or three planets most likely to be discovered by telescopes. We find that training on multiplanetary data, each planet treated as individual point, is promising. The single-planet data only covers a small range of planets and does not extrapolate well to planet properties not included in the training data. Extension to planetary systems will require more training data due to the higher dimensionality of the problem.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: '69117' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05180-b31b1b.svg)](https://arxiv.org/abs/2512.05180) | **Little red dot variability over a century reveals black hole envelope via a giant Einstein cross**  |
|| Z. Zhang, et al. -- incl., <mark>Y. Wu</mark> |
|*Appeared on*| *2025-12-08*|
|*Comments*| *59 pages, 13 figures, 3 tables, submitted to a peer-reviewed journal; comments are welcome*|
|**Abstract**|            "Little red dots" (LRDs) represent a new population of astronomical objects uncovered by JWST whose nature remains debated. Although many LRDs are suspected as active galactic nuclei (AGN), they show little variability on days-years timescales. We report the discovery of two gravitationally lensed LRDs at redshift $\sim$4.3 behind the cluster RXCJ2211-0350, one of which (RX1) is quadruply imaged with time delays spanning $\sim$130 years. RX1 exhibits intrinsic color and brightness variations of up to 0.7 magnitude among its images. These changes are consistent with blackbody-temperature variations of a photosphere, indicating long-term variability analogous to Cepheid-like pulsations but in a far more extended ($R \sim 2000$ AU) and massive ($M \gtrsim 10^6 \, M_{\odot}$) systems. These results suggest LRDs as a distinct class of AGN with stellar-like envelopes.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05548-b31b1b.svg)](https://arxiv.org/abs/2512.05548) | **How Dark Sector Equations of State Govern Interaction Signatures**  |
|| P.-J. Wu, <mark>M. Zhang</mark>, S.-J. Jin |
|*Appeared on*| *2025-12-08*|
|*Comments*| *7 pages, 2 figures,*|
|**Abstract**|            Using late-Universe observations, we demonstrate that freeing dark energy and dark matter equations of state (EoS) dramatically alters the inferred strength and direction of their interactions. When dark sector EoS are fixed to $w_{\mathrm{de}}=-1$ and $w_{\mathrm{dm}}=0$, the data consistently favor an energy transfer from dark energy to dark matter across various interaction forms. This apparent evidence, however, proves highly sensitive to the EoS assumptions: treating $w_{\mathrm{de}}$ as a free parameter substantially weakens the evidence for interaction, with its value converging to the quintessence regime ($w_{\mathrm{de}}>-1$). In contrast, freeing $w_{\mathrm{dm}}$ maintains a preference for interaction, revealing a correlation where positive $w_{\mathrm{dm}}$ is associated with energy transfer from dark energy to dark matter, and negative $w_{\mathrm{dm}}$ with energy transfer from dark matter to dark energy. These findings caution against the simplistic assumption of $\Lambda$CDM EoS values when attempting to detect a possible interaction. Despite these fundamental degeneracies, model comparison using the Akaike and Deviance information criteria shows that all of the tested interacting dark energy scenarios receive substantial support over the $\Lambda$CDM model.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2512.05622-b31b1b.svg)](https://arxiv.org/abs/2512.05622) | **First Statistical Detection of Cool Gas Outflows with JWST Towards Cosmic Dawn**  |
|| C. Lyu, et al. -- incl., <mark>Y. Wang</mark> |
|*Appeared on*| *2025-12-08*|
|*Comments*| *16 pages, 6+2 figures, to be submitted*|
|**Abstract**|            Galactic-scale outflows are a crucial component of galaxy evolution, yet their properties in the early universe remain poorly constrained. We present the first statistical investigation of cool gas outflows in galaxies spanning a wide cosmic timeline from $z \approx 1$ to $z \approx 10$. Using thousands of public JWST/NIRSpec spectra, we employ a signal-to-noise weighted spectral stacking technique on the \ion{Mg}{2} $\lambda\lambda2796, 2803$ absorption doublet. We robustly detect blueshifted \ion{Mg}{2} absorption in all stellar mass and redshift bins. The outflow equivalent width exhibits a strong, positive correlation with stellar mass ($M_*$) at all epochs, increasing from $\sim 1$~Å at $M_* \approx 10^9~\mathrm{M}_\odot$ to over $3$~Å at $M_* > 10^{10.5}~\mathrm{M}_\odot$. Our work provides the first statistical constraints on cool outflows in the low-mass ($M_* \lesssim 10^{9.5}~\mathrm{M}_\odot$), high-redshift ($z > 3$) regime, vital for constraining feedback in the numerous progenitors of typical present-day galaxies. Crucially, the scaling relation between outflow properties and stellar mass shows no significant evolution at $z > 3$. This suggests a persistent, unevolving feedback mechanism governing the baryon cycle in the early universe, placing strong constraints on models that invoke a fundamental change in feedback physics at Cosmic Dawn, such as the feedback-free starburst model.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [7]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images

    Figure paths are extracted from `md` (markdown links whose label starts
    with ``Fig`` and ``<img src="..." />`` tags). Each local figure that
    exists is copied into `directory`, keeping its relative sub-path; online
    figures and missing files are skipped. The markdown text itself is
    written to ``directory/md_fname``.

    :param md: markdown text to export
    :param md_fname: name of the markdown file to write inside `directory`
    :param directory: destination directory (created if needed, including parents)
    :raises RuntimeError: if `directory` exists but is not a directory
    """
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        # makedirs: the destination can be nested (e.g. '_build/html/')
        os.makedirs(directory, exist_ok=True)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    print("found figures", fig_fnames)
    for fname in fig_fnames:
        # Match the URL scheme only, so that a local file whose path merely
        # contains 'http' is still copied.
        if fname.startswith(('http://', 'https://')):
            # No need to copy online figures
            continue
        if not os.path.exists(fname):
            print("file not found", fname)
            continue
        print("copying ", fname, "to", directory)
        # keep the relative sub-path of the figure inside the destination
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        os.makedirs(destdir, exist_ok=True)
        shutil.copy(fname, destfname)
    # Explicit UTF-8: the text contains non-ASCII characters and the platform
    # default encoding is not guaranteed to handle them.
    with open(os.path.join(directory, md_fname), 'w', encoding='utf-8') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    # lists every figure reference found, including the skipped ones
    for fk in fig_fnames:
        print("    + " + os.path.join(directory, fk))

In [8]:
# Write one markdown file per processed paper, named after its identifier,
# together with its local figures, into the HTML build directory.
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

found figures ['tmp_2512.05899/./figs/roc_dr1_ews_lin.png', 'tmp_2512.05899/./figs/adding_Q1_data.png', 'tmp_2512.05899/./figs/sims-vs-real.png']
copying  tmp_2512.05899/./figs/roc_dr1_ews_lin.png to _build/html/
copying  tmp_2512.05899/./figs/adding_Q1_data.png to _build/html/
copying  tmp_2512.05899/./figs/sims-vs-real.png to _build/html/
exported in  _build/html/2512.05899.md
    + _build/html/tmp_2512.05899/./figs/roc_dr1_ews_lin.png
    + _build/html/tmp_2512.05899/./figs/adding_Q1_data.png
    + _build/html/tmp_2512.05899/./figs/sims-vs-real.png
found figures ['tmp_2512.05909/./Figures/MLP_GNN_GATPlus.png', 'tmp_2512.05909/./Figures/umap_gat_embeddings_test_predictions.png', 'tmp_2512.05909/./Figures/training_validation_accuracies_losses.png']
copying  tmp_2512.05909/./Figures/MLP_GNN_GATPlus.png to _build/html/
copying  tmp_2512.05909/./Figures/umap_gat_embeddings_test_predictions.png to _build/html/
copying  tmp_2512.05909/./Figures/training_validation_accuracies_losses.png to 

## Display the papers

Not necessary but allows for a quick check.

In [9]:
# Render the markdown text (second field) of every processed paper.
for _document in documents:
    display(Markdown(_document[1]))

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\red}[1]{\textcolor{black}{#1}}$
$\newcommand{\orcid}[1]$
$\newcommand{\linenumbers}[0]$</div>



<div id="title">

# Euclid Quick Data Release (Q1): From simulations to sky: Advancing machine-learning lens detection with real Euclid data

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2512.05899-b31b1b.svg)](https://arxiv.org/abs/2512.05899)<mark>Appeared on: 2025-12-08</mark> -  _16 pages_

</div>
<div id="authors">

E. Collaboration, et al. -- incl., <mark>K. Jahnke</mark>

</div>
<div id="abstract">

**Abstract:** In the era of large-scale surveys like $\Euclid$ , machine learning has become an essential tool for identifying rare yet scientifically valuable objects, such as strong gravitational lenses.However, supervised machine-learning approaches require large quantities of labelled examples to train on, and the limited number of known strong lenses has lead to a reliance on simulations for training.A well-known challenge is that machine-learning models trained on one data domain often underperform when applied to a different domain: in the context of lens finding, this means that strong performance on simulated lenses does not necessarily translate into equally good performance on real observations.In \textcolor{black}{\Euclid's Quick Data Release 1 (Q1)} , covering 63 deg $^{2}$ , 500 strong lens candidates were discovered through a synergy of machine learning, citizen science, and expert visual inspection.These discoveries now allow us to quantify this performance gap and investigate the impact of training on real data. We find that a network trained only on simulations recovers up to 92 \% of simulated lenses with 100 \% purity, but only achieves 50 \% completeness with 24 \% purity on real $\Euclid$ data. By augmenting training data with real $\Euclid$ lenses and non-lenses, completeness improves by 25--30 \% in terms of the expected yield of discoverable lenses in $\Euclid$ 's Data Release 1 and the full Euclid Wide Survey.Roughly 20 \% of this improvement comes from the inclusion of real lenses in the training data, while 5--10 \% comes from exposure to a more diverse set of non-lenses and false-positives from Q1.We show that the most effective lens-finding strategy for real-world performance combines the diversity of simulations with the fidelity of real lenses. 
This hybrid approach establishes a clear methodology for maximising lens discoveries in future data releases from $\Euclid$ , and will likely also be applicable to other surveys such as the Vera Rubin Observatory's Legacy Survey of Space and Time.

</div>

<div id="div_fig1">

<img src="tmp_2512.05899/./figs/roc_dr1_ews_lin.png" alt="Fig3" width="100%"/>

**Figure 3. -** Projected number of lenses discoverable in DR1 and EWS as a function of number of images to inspect, for the network trained with and without the Q1 data. (*fig:dr1_ews_forecast*)

</div>
<div id="div_fig2">

<img src="tmp_2512.05899/./figs/adding_Q1_data.png" alt="Fig7" width="100%"/>

**Figure 7. -** Impact of augmenting the training data with available Q1 lenses and non-lenses, in terms of performance across a range of metrics. These include F1 score and AUC, as well as the projected fraction of lenses that would be discoverable in a data set the size of EWS, assuming that one million images can be visually inspected. (*fig:adding-Q1-data*)

</div>
<div id="div_fig3">

<img src="tmp_2512.05899/./figs/sims-vs-real.png" alt="Fig1" width="100%"/>

**Figure 1. -** _Top_: simulated \Euclid lenses from Q1-SP052. _Bottom_: real lenses found in Q1. (*fig:simims*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2512.05899"></div>

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\cmark}{\ding{51}}$
$\newcommand{\xmark}{\ding{55}}$
$\newcommand{\LM}[1]{{\color{blue}LM: #1}}$
$\newcommand{\thebibliography}{\DeclareRobustCommand{\VAN}[3]{##3}\VANthebibliography}$</div>



<div id="title">

# Learning the Cosmic Web: Graph-based Classification of Simulated Galaxies by their Dark Matter Environments

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2512.05909-b31b1b.svg)](https://arxiv.org/abs/2512.05909)<mark>Appeared on: 2025-12-08</mark> -  _15 pages, 7 figures, 9 tables, submitted to Royal Astronomical Society Techniques and Instruments_

</div>
<div id="authors">

D. Kololgi, <mark>K. Naidoo</mark>, A. Saintonge, O. Lahav

</div>
<div id="abstract">

**Abstract:** We present a novel graph-based machine learning classifier for identifying the dark matter cosmic web environments of galaxies. Large galaxy surveys offer comprehensive statistical views of how galaxy properties are shaped by large-scale structure, but this requires robust classifications of galaxies' cosmic web environments. Using stellar mass-selected IllustrisTNG-300 galaxies, we apply a three-stage, simulation-based framework to link galaxies to the total (mainly dark) underlying matter distribution. Here, we apply the following three steps: First, we assign the positions of simulated galaxies to a void, wall, filament, or cluster environment using the T-Web classification of the underlying matter distribution. Second, we construct a Delaunay triangulation of the galaxy distribution to summarise the local geometric structure with ten graph metrics for each galaxy. Third, we train a graph attention network (GAT) on each galaxy's graph metrics to predict its cosmic web environment. For galaxies with stellar mass $\mathrm{>10^9 M_{\odot}}$ , our GAT+ model achieves an accuracy of $85 \%$ , outperforming graph-agnostic multilayer perceptrons and graph convolutional networks. Our results demonstrate that graph-based representations of galaxy positions provide a powerful and physically meaningful way to infer dark matter environments. We plan to apply this simulation-based graph modelling to investigate how the properties of observed galaxies from the Dark Energy Spectroscopic Instrument (DESI) survey are influenced by their dark matter environments.

</div>

<div id="div_fig1">

<img src="tmp_2512.05909/./Figures/MLP_GNN_GATPlus.png" alt="Fig5" width="100%"/>

**Figure 5. -** Overview of the baseline and graph-based neural network architectures explored in this work.
The MLP (left) serves as the baseline model, taking node features as independent inputs passed through successive fully-connected layers. The GCN (middle) introduces relational inductive biases by aggregating information from connected nodes in the Delaunay graph, enabling feature propagation along edges. The GAT+ (right) extends this by applying multi-head attention mechanisms and edge features, allowing the network to learn the relative importance of neighbouring nodes. The architectures were refined through iterative experimentation, adjusting the number of layers, hidden dimensions, and normalisation or dropout configurations until convergence performance and stability were optimised across validation runs. (*fig:mlpgnngatplusarchitecture*)

</div>
<div id="div_fig2">

<img src="tmp_2512.05909/./Figures/umap_gat_embeddings_test_predictions.png" alt="Fig7" width="100%"/>

**Figure 7. -** UMAP projections into three-dimensions represented by the columns. Top row: UMAP contours of the GAT+ embeddings, but only limited to the test galaxy set and coloured by the model-predicted environments. Bottom row: GAT+ embeddings of the test galaxy set in UMAP projected space. Darker edges represent higher model uncertainty, which is derived from the Shannon entropy of the output probabilities. The overlap of cosmic web environments in the top figure corresponds with regions of high model uncertainty given by the bottom figure, indicating that the GAT+ model learns a physically meaningful representation of the cosmic web. (*fig:UMAP*)

</div>
<div id="div_fig3">

<img src="tmp_2512.05909/./Figures/training_validation_accuracies_losses.png" alt="Fig6" width="100%"/>

**Figure 6. -** Training and validation performance of the GAT+ model. Accuracy (left) and loss (right) curves are shown for the training and validation datasets over 10,000 epochs for the $\mathrm{10^{9} M_{\odot}}$ stellar mass cut. The convergence and close overlap between training and validation curves indicate stable optimisation and minimal overfitting. (*fig:gat_4H_plus_10_9_loss_accuracy_curves*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2512.05909"></div>

# Create HTML index

In [10]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

# Select the exported markdown files whose filesystem timestamp falls within
# the last `days` days.
files = glob('_build/html/*.md')
days = 7
# Naive UTC "now", taken once.  The file timestamps below are also expressed as
# naive UTC, so both sides of the subtraction share the same reference
# (mixing local time with UTC skews the age by the local UTC offset).
now = datetime.now(timezone.utc).replace(tzinfo=None)
res = []
for fk in files:
    # st_ctime: metadata-change time on Unix, creation time on Windows.
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now - modified
    if delta <= timedelta(days=days):
        # total_seconds() keeps the day component, unlike `.seconds`.
        res.append((delta.total_seconds(), fk))
# Only the file names are kept, in reverse alphabetical order.
res = [k[1] for k in reversed(sorted(res, key=lambda x: x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

139  publications files modified in the last 7 days.


In [11]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days

    :param lst: iterable of ``(fname, date)`` pairs where ``date`` is a string
                that is compared against ``str(datetime.date)`` (i.e.
                ``YYYY-MM-DD``), so plain string ordering is used throughout.
    :param days: how many days back from today to go (cut-off date included)
    :yields: the ``fname`` of each selected pair, most recent date first
    """
    # The cut-off does not depend on the loop variable: compute it once so all
    # entries are judged against the same date.
    cutoff = str(datetime.date.today() - datetime.timedelta(days=days))
    for fname, date in sorted(lst, key=lambda x: x[1], reverse=True):
        if date < cutoff:
            # Entries are in descending date order: nothing further can match.
            break
        yield fname

def extract_appearance_dates(lst_file):
    """ Extract the "Appeared on" date from each of the given files

    Each file is scanned line by line; the first line containing
    ``"Appeared on"`` provides the date and the rest of the file is skipped.
    Files without such a line contribute nothing to the result.

    :param lst_file: iterable of file names to scan
    :returns: list of ``(fname, date)`` tuples, in the order of ``lst_file``,
              where ``date`` is the stripped text found between
              ``Appeared on:`` and ``</mark>``
    """

    def get_date(line):
        """ Return the text between 'Appeared on:' and '</mark>' """
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    dates = []
    # Iterate over the argument, not over a module-level list.
    for fname in lst_file:
        with open(fname, 'r') as f:
            for line in f:
                if "Appeared on" in line:
                    dates.append((fname, get_date(line)))
                    break  # only the first occurrence counts
    return dates

from glob import glob
# Markdown exports to scan; same '*.md' pattern as the file-timestamp listing
# above ('*md' would also match any name merely ending in "md").
# The module-level name `lst` is kept for code that reads it directly.
lst = glob('_build/html/*.md')
days = 7
# (fname, date) pairs for every export carrying an "Appeared on" line.
dates = extract_appearance_dates(lst)
# File names of the papers that appeared within the last `days` days.
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

15  publications in the last 7 days.


In [12]:
def create_carousel(npub=4):
    """ Build the HTML markup of a carousel made of `npub` slides

    The wrapper ``<div class="carousel">`` carries the ``data-flickity``
    options; each slide is a ``carousel-cell`` holding a ``md_view`` div with
    id ``slide1`` ... ``slide<npub>`` and the placeholder text ``Content <k>``.

    :param npub: number of slides to generate
    :returns: the HTML code as a single newline-joined string
    """
    cell_template = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    opening = [
        """  <div class="carousel" """,
        """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">""",
    ]
    cells = [cell_template.format(k=index) for index in range(1, npub + 1)]
    closing = ["  </div>"]
    return '\n'.join(opening + cells + closing)

def create_grid(npub=4):
    """ Build the HTML markup of a flat grid made of `npub` slides

    The wrapper is a ``<div class="grid">``; each slide is a ``grid-item``
    holding a ``md_view`` div with id ``slide1`` ... ``slide<npub>`` and the
    placeholder text ``Content <k>``.

    :param npub: number of slides to generate
    :returns: the HTML code as a single newline-joined string
    """
    item_template = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    items = [item_template.format(k=index) for index in range(1, npub + 1)]
    return '\n'.join(["""  <div class="grid"> """] + items + ["  </div>"])

In [13]:
# Render the 7-day archive page: one carousel slide per selected document.
carousel = create_carousel(npub)
# Quoted, comma-separated base names of the documents (text after the last '/').
docs = ', '.join(f'"{path.split("/")[-1]}"' for path in res)
# Quoted, comma-separated slide ids matching those created by the carousel.
slides = ', '.join(f'"slide{idx}"' for idx in range(1, npub + 1))

with open("daily_template.html", "r") as template_file:
    page = template_file.read()

# Placeholders are filled in this order, one after the other.
substitutions = (
    ("{%-- carousel:s --%}", carousel),
    ("{%-- suptitle:s --%}", "7-day archives"),
    ("{%-- docs:s --%}", docs),
    ("{%-- slides:s --%}", slides),
)
for placeholder, value in substitutions:
    page = page.replace(placeholder, value)

with open("_build/html/index_7days.html", 'w') as html_out:
    html_out.write(page)

In [14]:
# Same page as the 7-day archive, restricted to the last day.
days = 1
res = [fname for fname in get_last_n_days(dates, days)]
npub = len(res)
print(npub, " publications in the last day.")

carousel = create_carousel(npub)
# Quoted, comma-separated base names of the documents (text after the last '/').
docs = ', '.join(f'"{path.split("/")[-1]}"' for path in res)
# Quoted, comma-separated slide ids matching those created by the carousel.
slides = ', '.join(f'"slide{idx}"' for idx in range(1, npub + 1))

with open("daily_template.html", "r") as template_file:
    page = template_file.read()

# Placeholders are filled in this order, one after the other.
substitutions = (
    ("{%-- carousel:s --%}", carousel),
    ("{%-- suptitle:s --%}", "Daily"),
    ("{%-- docs:s --%}", docs),
    ("{%-- slides:s --%}", slides),
)
for placeholder, value in substitutions:
    page = page.replace(placeholder, value)

with open("_build/html/index_daily.html", 'w') as html_out:
    html_out.write(page)

2  publications in the last day.


In [15]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

# Number of papers requested for the grid.
npub = 6
# Most recent appearance dates first.  reversed(sorted(...)) is kept on purpose:
# it fixes the relative order of papers sharing the same date.
# The slice uses `npub` rather than a second hard-coded 6.
res = [k[0] for k in islice(reversed(sorted(dates, key=lambda x: x[1])), npub)]
print(len(res), f" {npub} publications selected.")

# Fewer than `npub` documents may be available: size the grid, the slide ids
# and the title on what was actually selected so every slide has a document.
nsel = len(res)
grid = create_grid(nsel)
# Quoted, comma-separated base names of the documents (text after the last '/').
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, nsel + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {nsel:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)

# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
