# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 
import re

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big: raise Pillow's pixel-count limit
# (the threshold of its decompression-bomb check) to 1e9 pixels.
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    """Warning category reserved for affiliation-related issues."""

class AffiliationError(RuntimeError):
    """Error raised when the affiliation check of a paper source fails."""

def validation(source: str):
    """Check the affiliations of a paper before its TeX code is parsed

    Delegates to ``mpia.affiliation_verifications`` and turns any result other
    than ``True`` into an exception, so that parsing can be aborted early.

    :param source: source handed over to ``mpia.affiliation_verifications``
    :raises AffiliationError: if the verification does not return ``True``
                              (the returned value is included in the message)
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        # str() guards against a non-string failure value (e.g. False or None),
        # which would otherwise end in a TypeError instead of an AffiliationError
        raise AffiliationError("mpia.affiliation_verifications: " + str(check))

        
# Always show AffiliationWarning, even when the same warning repeats
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com

    :param paper_id: Arxiv paper identifier (e.g., ``2509.02842``)
    :returns: HTML snippet (usable in markdown) showing the QR code image
    """
    from urllib.parse import quote

    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    # Percent-encode the target so it travels as one clean `data` value;
    # literal quotes around it would end up inside the QR code content.
    target = quote(f"https://arxiv.org/abs/{paper_id}", safe='')
    # The whole `src` attribute value is quoted to produce valid HTML.
    txt = f"""<img src="{url}{target}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt


def clean_non_western_encoded_characters_commands(text: str) -> str:
    """ Strip the LaTeX CJK environment markers wrapped around a piece of text

    Only the ``CJK`` environment with the ``UTF8``/``gbsn`` options is handled;
    what is inside the environment is kept. The list of patterns may need to grow.

    :param text: the text to clean
    :return: the cleaned text
    """
    cjk_environment = re.compile(r"(\\begin{CJK}{UTF8}{gbsn})(.*?)(\\end{CJK})")
    # keep only the content (2nd group) of each environment
    return cjk_environment.sub(lambda found: found.group(2), text)


def get_initials(name: str) -> str:
    """ Get the short name, e.g., A.-B. FamName

    Every token but the last one is reduced to its initial(s); hyphenated
    tokens keep their hyphen (``Anna-Beth`` -> ``A.-B.``). The first
    parenthesised part of the name, if any, is moved to the end unchanged.

    :param name: full name
    :returns: initials followed by the last token of the name; an empty string
              if the name contains no token at all
    """
    suffix = ''
    # account for non western names often in ()
    if '(' in name:
        name = clean_non_western_encoded_characters_commands(name)
        found = re.search(r"\((.*?)\)", name)
        # an opening parenthesis without its closing one has no suffix to extract
        if found is not None:
            suffix = found.group(1)
            name = name.replace(f"({suffix})", '')
    tokens = name.split()
    if not tokens:
        # nothing left to abbreviate (empty name, or only a parenthesised part)
        return f"({suffix})" if suffix else ''
    initials = []
    for token in tokens[:-1]:
        # drop empty pieces so that 'Jean-' or a lone '-' cannot fail on part[0]
        parts = [part for part in token.split('-') if part]
        if not parts:
            continue
        initials.append('-'.join(part[0] + '.' for part in parts))
    initials.append(tokens[-1].strip())
    if suffix:
        initials.append(f"({suffix})")
    return ' '.join(initials)

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names.
We then get all new papers from Arxiv and match authors.

In [3]:
# deal with the author list and edge cases of people that cannot be consistent on their name  

def filter_non_scientists(name: str) -> bool:
    """ Loose filter on expected authorships

    Rejects names of IT, administration and technical staff. The check is a
    plain substring test against a fixed list of family names.

    :param name: name
    :returns: False if name contains one of the listed names, True otherwise
    """
    excluded = ('Licht', 'Binroth', 'Witzel', 'Jordan',
                'Zähringer', 'Scheerer', 'Hoffmann', 'Düe',
                'Hellmich', 'Enkler-Scharpegge', 'Witte-Nguy',
                'Dehen', 'Beckmann', 'Jager', 'Jäger')
    return not any(fragment in name for fragment in excluded)

def add_author_to_list(author_list: list) -> list:
    """ Append the hard-coded extra authors that are not in the list yet

    The list is modified in place and also returned.

    :param author_list: list of authors
    :returns: updated list of authors (same object as `author_list`)
    """
    add_list = ['T. Henning']
    missing = [extra for extra in add_list if extra not in author_list]
    author_list.extend(missing)
    return author_list

# Author names come from the MPIA website (mpia.get_mpia_mitarbeiter_list() already
# does some filtering); non-scientists are dropped here, then authors whose MPIA
# name and paper name are inconsistent are added by hand.
mpia_authors = add_author_to_list([
    entry[1]
    for entry in mpia.get_mpia_mitarbeiter_list()
    if filter_non_scientists(entry[1])
])

In [4]:
# today's new papers (the count is printed at the end of this cell)
new_papers = get_new_papers()
# add manual references: every identifier listed here is fetched with
# get_paper_from_identifier and appended to new_papers (none by default)
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

def robust_call(fn, value, *args, **kwargs):
    """ Apply `fn` to `value`, falling back to `value` itself on failure

    :param fn: callable invoked as ``fn(value, *args, **kwargs)``
    :param value: first argument of `fn`, also the fallback result
    :returns: the result of the call, or `value` unchanged if the call raised
    """
    try:
        outcome = fn(value, *args, **kwargs)
    except Exception:
        # best effort on purpose: keep the input rather than fail
        outcome = value
    return outcome

candidates = []
for paperk in new_papers:
    # Compare authors through their initials; names that cannot be
    # processed by mpia.get_initials are kept unchanged (robust_call)
    normed_author_list = [robust_call(mpia.get_initials, k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, mpia_authors, verbose=True)
    # an entry containing 'mark' is treated as a highlighted (matched) author
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    # the paper keeps the highlighted author list from now on
    paperk['authors'] = hl_authors
    if not matches:
        continue
    # only select paper if an author matched our list
    candidates.append(paperk)
print(f"Arxiv has {len(new_papers):,d} new papers today")
print(f"          {len(candidates):,d} with possible author matches")

J. Liu  ->  J. Liu  |  ['J. Liu']
J. Wolf  ->  D. J. Wolf  |  ['J. Wolf']
K. El-Badry  ->  K. El-Badry  |  ['K. El-Badry']
X. Zhang  ->  X. Zhang  |  ['X. Zhang']
Y. Wang  ->  Y. Wang  |  ['Y. Wang']
K. El-Badry  ->  K. El-Badry  |  ['K. El-Badry']


Arxiv has 105 new papers today
          6 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [5]:
# Process every candidate paper: fetch its source, check the affiliations,
# parse the TeX and build the markdown summary.
#   documents: (paper_id, markdown text) for each paper processed successfully
#   failed:    (paper, reason) for each paper rejected or that raised an error
documents = []
failed = []
for paper in tqdm(candidates):
    # Normalise the identifier: lower case, no 'arxiv:' prefix, no surrounding whitespace.
    # NOTE(review): r'\n' is a raw string, so this removes a literal backslash
    # followed by 'n', not a newline character -- confirm this is intended.
    paper['identifier'] = paper['identifier'].lower().replace('arxiv:', '').replace(r'\n', '').strip()
    paper_id = paper['identifier']
    
    folder = f'tmp_{paper_id}'

    try:
        # reuse the source folder if it is already on disk
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            # `validation` raises AffiliationError when the affiliation check fails
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            # NOTE(review): `msg` is built but never used in this cell.
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well:
        # if the number of parsed authors differs from the arXiv record,
        # use the (already highlighted) arXiv author list instead
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [get_initials(k) for k in doc.authors], 
                mpia_authors, verbose=True)
        # fall back to the arXiv abstract when none was extracted from the TeX
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        # comment line: arXiv badge, appearance date, then the arXiv comments if any
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations (best effort: on failure the markdown is kept as it is)
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print("Issues with the citations")
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        # any other failure: warn, record the paper as failed, move on to the next one
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/6 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2509.02759


extracting tarball to tmp_2509.02759... done.
Retrieving document from  https://arxiv.org/e-print/2509.02842


extracting tarball to tmp_2509.02842...

 done.


Found 89 bibliographic references in tmp_2509.02842/aanda.bbl.
Retrieving document from  https://arxiv.org/e-print/2509.02906


extracting tarball to tmp_2509.02906...

 done.
  0: tmp_2509.02906/ms.tex, 635 lines
  1: tmp_2509.02906/aassymbols.tex, 579 lines
Retrieving document from  https://arxiv.org/e-print/2509.02945



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


extracting tarball to tmp_2509.02945...

 done.
Retrieving document from  https://arxiv.org/e-print/2509.02995


extracting tarball to tmp_2509.02995... done.
Retrieving document from  https://arxiv.org/e-print/2509.03216



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)


extracting tarball to tmp_2509.03216...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper; see `_build/html/logs/log-{today}.md`.

In [6]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"

# make sure the log directory exists before writing (no-op if it is already there)
os.makedirs(os.path.dirname(logfile), exist_ok=True)

# explicit UTF-8: titles, abstracts and author names regularly contain non-ASCII characters
with open(logfile, 'w', encoding='utf-8') as logs:
    # Success: summary counts, then one entry per paper that produced a document
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Successful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed: sorted by reason so that papers sharing a reason end up under one heading
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        # affiliation rejections are shown in green, every other failure in red
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2509.02842-b31b1b.svg)](https://arxiv.org/abs/2509.02842) | **Counterpart identification and classification for eRASS1 and characterisation of the AGN content**  |
|| M. Salvato, et al. -- incl., <mark>J. Wolf</mark> |
|*Appeared on*| *2025-09-04*|
|*Comments*| *Paper resubmitted to A&A after taking into account the comments of the referee. The associated 7 catalogues will be released via the eROSITA pages and Vizier as soon as the paper is accepted for publication*|
|**Abstract**|            [abridged] Accurately accounting for the AGN phase in galaxy evolution requires a large, clean AGN sample. This is now possible with SRG/eROSITA. The public Data Release 1 (DR1, Jan 31, 2024) includes 930,203 sources from the Western Galactic Hemisphere. The data enable the selection of a large AGN sample and the discovery of rare sources. However, scientific return depends on accurate characterisation of the X-ray emitters, requiring high-quality multiwavelength data. This paper presents the identification and classification of optical and infrared counterparts to eRASS1 sources using Gaia DR3, CatWISE2020, and Legacy Survey DR10 (LS10) with the Bayesian NWAY algorithm and trained priors. Sources were classified as Galactic or extragalactic via a Machine Learning model combining optical/IR and X-ray properties, trained on a reference sample. For extragalactic LS10 sources, photometric redshifts were computed using Circlez. Within the LS10 footprint, all 656,614 eROSITA/DR1 sources have at least one possible optical counterpart; about 570,000 are extragalactic and likely AGN. Half are new detections compared to AllWISE, Gaia, and Quaia AGN catalogues. Gaia and CatWISE2020 counterparts are less reliable, due to the surveys shallowness and the limited amount of features available to assess the probability of being an X-ray emitter. In the Galactic Plane, where the overdensity of stellar sources also increases the chance of associations, using conservative reliability cuts, we identify approximately 18,000 Gaia and 55,000 CatWISE2020 extragalactic sources. We release three high-quality counterpart catalogues, plus the training and validation sets, as a benchmark for the field. These datasets have many applications, but in particular empower researchers to build AGN samples tailored for completeness and purity, accelerating the hunt for the Universe most energetic engines.         |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2509.02906-b31b1b.svg)](https://arxiv.org/abs/2509.02906) | **Double White Dwarf Binaries in SDSS-V DR19 : A catalog of DA white dwarf binaries and constraints on the binary population**  |
|| G. A. Pallathadka, et al. -- incl., <mark>K. El-Badry</mark> |
|*Appeared on*| *2025-09-04*|
|*Comments*| *Submitted to AAS journals*|
|**Abstract**|            The fifth-generation Sloan Digital Sky Survey (SDSS-V) includes the first large-scale spectroscopic survey of white dwarfs (WDs) in the era of Gaia parallaxes. SDSS-V collects multiple exposures per target, making it ideal for binary detection. We present a search for hydrogen atmosphere (DA) double white dwarf (DWD) binaries in this rich dataset. We quantify radial velocity variations between sub-exposures to identify binary candidates, and also measure the orbital period for a subset of DWD binary candidates. We find 63 DWD binary candidates, of which 43 are new discoveries, and we provide tentative periods for 10 binary systems. Using these measurements, we place constraints on the binary fraction of the Galactic WD population with $< 0.4$ AU separations $f_{\mathrm{bin,0.4}} = 9\%$, and the power-law index of the initial separation distribution $\alpha = -0.62$. Using the simulated binary population, we estimate that $\leq 10$ super-Chandrasekhar binaries that merge within a Hubble time are expected in our sample. We predict that $\leq 5$ systems in our sample should be detectable via gravitational waves by LISA (Laser Interferometer Space Antenna), one of which has already been identified as a LISA verification source. We also estimate a total of about 10,000 - 20,000 LISA-detectable DWD binaries in the galaxy. Our catalog of WD+WD binary candidates in SDSS-V is now public, and promises to uncover a large number of exciting DWD systems.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: '69117' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2509.03216-b31b1b.svg)](https://arxiv.org/abs/2509.03216) | **Magnetic Atmospheres and Circumstellar Interaction in J1901+1458: Revisiting the Most Compact White Dwarf Merger Remnant in the light of new UV and X-ray data**  |
|| A. Desai, et al. -- incl., <mark>K. El-Badry</mark> |
|*Appeared on*| *2025-09-04*|
|*Comments*| *28 pages, submitted to A&A, comments are welcome*|
|**Abstract**|            Double degenerate white dwarf (WD) mergers can exhibit extreme magnetic fields exceeding $10^{8}$ G and rapid rotation, but their spectral-energy distributions and high-energy emission mechanisms remain poorly characterised. ZTF J1901+1458 stands out as the most compact and strongly magnetised object discovered in this class to date. Recent Chandra observations have revealed that the white dwarf is also a source of soft X-ray emission, inconsistent with a photospheric origin. We analyse new phase resolved UV spectroscopy from the HST combined with optical and near-infrared photometry and spectroscopy, with newly developed magnetic atmosphere models to determine its effective temperature, radius, mass, average surface magnetic field strength, and cooling age. Our results demonstrate that the spectral break at $\approx$3000 Å, observed in several highly magnetised WDs, is well-reproduced by our new models, which take into account the effect of magnetic opacities on the structure of the atmosphere. Our best-fit parameters for the WD yield an effective temperature ($T_{\rm{eff}}=28,015\pm 20$ K) and larger radius ($2630\pm10$ km) than previously reported. Furthermore, the near-infrared data exclude the presence of a stellar or brown dwarf companion hotter than $\approx$700 K. We also jointly analyse the previously published Chandra data and new XMM-Newton X-ray spectra. The faint X-ray emission, $L_X =(1.3\pm0.2)\times10^{27}$ erg/s is very soft and highly pulsed on the rotation period of the WD. We suggest that the X-rays are powered by accretion or via the interaction of the WD magnetosphere with CSM. If the rapidly rotating magnetic field could power a weak wind along open field lines, material could be extracted directly from the surface of the WD. Alternatively, accretion of fallback material from the merger or the tidal disruption of a planetary body are possible sources of CSM.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: '69117' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2509.02759-b31b1b.svg)](https://arxiv.org/abs/2509.02759) | **Reconciling Jupiter's Vertical Motions with the Observed Cloud Structure in the Upper Troposphere**  |
|| J. M. Mendonça, T. Schneider, <mark>J. Liu</mark>, Y. Lian |
|*Appeared on*| *2025-09-04*|
|*Comments*| *Accepted for publication in Icarus (this https URL)*|
|**Abstract**|            The eddy fluxes of angular momentum in Jupiter's upper troposphere are known to converge in prograde jets and diverge in retrograde jets. Away from the equator, this implies convergence of the Eulerian mean meridional flow in zones (anticyclonic shear) and divergence in belts (cyclonic shear). It indicates lower-tropospheric downwelling in zones and upwelling in belts because the mean meridional circulation almost certainly closes at depth. Yet the observed banded structure of Jupiter's clouds and hazes suggests that there is upwelling in the brighter zones and downwelling in the darker belts. Here, we show that this apparent contradiction can be resolved by considering not the Eulerian but the transformed Eulerian mean circulation, which includes a Stokes drift owing to eddies and is a better approximation of the Lagrangian mean transport of tracers such as ammonia. The potential vorticity structure inferred from observations paired with mixing length arguments suggests that there is transformed Eulerian mean upwelling in zones and downwelling in belts. Simulations with a global circulation model of Jupiter's upper atmosphere demonstrate the plausibility of these inferences and allow us to speculate on the band structure at deeper levels.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2509.02945-b31b1b.svg)](https://arxiv.org/abs/2509.02945) | **Observational challenges to holographic and Ricci dark energy paradigms: Insights from ACT DR6 and DESI DR2**  |
|| P.-J. Wu, T.-N. Li, G.-H. Du, <mark>X. Zhang</mark> |
|*Appeared on*| *2025-09-04*|
|*Comments*| *12 pages, 4 figures*|
|**Abstract**|            Recent studies suggest that dark energy may be dynamical rather than being a mere cosmological constant $\Lambda$. In this work, we examine the viability of two physically well-motivated dynamical dark energy models -- holographic dark energy (HDE) and Ricci dark energy (RDE) -- by confronting them with the latest observational data, including ACT cosmic microwave background anisotropies, DESI baryon acoustic oscillations, and DESY5 supernovae. Our analysis reveals a fundamental tension between early- and late-universe constraints within both frameworks: ACT favors a quintom scenario where the dark energy equation of state (EoS) evolves from $w>-1$ at early times to $w<-1$ at late times, while DESI+DESY5 exhibits a distinct preference for quintessence where $w>-1$ across cosmic evolution. The joint analysis yields constraints that align more closely with the ACT preference. Critically, the canonical RDE model fails to provide a coherent description of cosmic evolution, as it manifests severe tensions (even exceeding $10\sigma$ significance) between early- and late-universe parameter reconstructions. Based on the combined data, Bayesian evidence decisively disfavors both HDE and RDE models relative to the $\Lambda$ cold dark matter model, with ACT providing decisive disfavor and DESI+DESY5 yielding moderate disfavor. Our results show that the HDE and RDE models remain excluded by the new data, reinforcing earlier conclusions drawn from previous datasets.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2509.02995-b31b1b.svg)](https://arxiv.org/abs/2509.02995) | **Extending the short gamma-ray burst population from sub-threshold triggers in Fermi/GBM and GECAM data and its implications**  |
|| C. Cai, et al. -- incl., <mark>Y. Wang</mark> |
|*Appeared on*| *2025-09-04*|
|*Comments*| *13 pages, 8 figures*|
|**Abstract**|            Detection of short gamma-ray bursts (SGRBs) is critically important for the research of compact object mergers and multi-messenger astrophysics, but a significant part of SGRBs fall below the trigger threshold of GRB detectors, and thus are often missed. Here we present a systematic search for and verification of missed SGRBs using Fermi/GBM subthreshold triggers, jointly analyzing data from GBM, GECAM-B, and GECAM-C. Among 466 Fermi/GBM sub-threshold events (with reliability >= 5) from 2021 to 2024, 181 are within GECAM's field of view. We find that 49 out of 181 are confirmed astrophysical transients, and 41 can be classified as SGRBs. Thus, the SGRB detection rate of Fermi/GBM is increased to about 50 per year. Additionally, a complete multi-instrument monitoring and systematic verification of GBM sub-threshold events is expected to further increase the SGRB rate to about 80 per year, which is about 100% improvement relative to the GBM-triggered SGRBs. These results may have important implications on the local formation rate of SGRBs and the binary neutron star merger rate. We also searched for potential temporal coincidences between these SGRBs and gravitational waves from the LIGO-Virgo-KAGRA O4 run resulting in no detection.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [7]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images

    Local figures referenced in the markdown (``[Fig...](path)`` links and
    ``<img src="path" .../>`` tags) are copied below `directory`, keeping their
    relative path; the markdown text is then written to ``directory/md_fname``.

    :param md: markdown text to export
    :param md_fname: name of the markdown file to write
    :param directory: output directory (created if missing, parents included)
    :raises RuntimeError: if `directory` exists but is not a directory
    """
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        # makedirs rather than mkdir: nested targets such as '_build/html/'
        # must also work when the parent directory does not exist yet
        os.makedirs(directory, exist_ok=True)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    print("found figures", fig_fnames)
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        if not os.path.exists(fname):
            print("file not found", fname)
            continue
        print("copying ", fname, "to", directory)
        # keep the figure's relative path below the output directory
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        os.makedirs(destdir, exist_ok=True)
        shutil.copy(fname, destfname)
    md_path = os.path.join(directory, md_fname)
    # explicit UTF-8: the markdown regularly contains non-ASCII characters
    with open(md_path, 'w', encoding='utf-8') as fout:
        fout.write(md)
    print("exported in ", md_path)
    for fk in fig_fnames:
        print("    + " + os.path.join(directory, fk))

In [8]:
# write one `<paper_id>.md` file per successful paper (figures are copied alongside)
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

found figures ['tmp_2509.02842/./density_plots_outline.png', 'tmp_2509.02842/./CTP_DR1.drawio.png', 'tmp_2509.02842/./purcomp_LS10_new.png', 'tmp_2509.02842/./purcomp_GDR3_window_new.png', 'tmp_2509.02842/./purcomp_CW2020_new.png']
copying  tmp_2509.02842/./density_plots_outline.png to _build/html/
copying  tmp_2509.02842/./CTP_DR1.drawio.png to _build/html/
copying  tmp_2509.02842/./purcomp_LS10_new.png to _build/html/
copying  tmp_2509.02842/./purcomp_GDR3_window_new.png to _build/html/
copying  tmp_2509.02842/./purcomp_CW2020_new.png to _build/html/
exported in  _build/html/2509.02842.md
    + _build/html/tmp_2509.02842/./density_plots_outline.png
    + _build/html/tmp_2509.02842/./CTP_DR1.drawio.png
    + _build/html/tmp_2509.02842/./purcomp_LS10_new.png
    + _build/html/tmp_2509.02842/./purcomp_GDR3_window_new.png
    + _build/html/tmp_2509.02842/./purcomp_CW2020_new.png


## Display the papers

Not necessary but allows for a quick check.

In [9]:
# render the markdown of every exported document; the trailing `;` hides the list of None
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\citetemp}[1]{(#1)}$
$\newcommand{\tdcomm}[1]{\textcolor{orange}{(TD: #1)}}$</div>



<div id="title">

# Counterpart identification and classification for eRASS1 and characterisation of the AGN content

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2509.02842-b31b1b.svg)](https://arxiv.org/abs/2509.02842)<mark>Appeared on: 2025-09-04</mark> -  _Paper resubmitted to A&A after taking into account the comments of the referee. The associated 7 catalogues will be released via the eROSITA pages and Vizier as soon as the paper is accepted for publication_

</div>
<div id="authors">

M. Salvato, et al. -- incl., <mark>J. Wolf</mark>

</div>
<div id="abstract">

**Abstract:** Accurately accounting for the AGN phase in galaxy evolution requires a large, clean AGN sample. This is now possible with SRG/eROSITA, which completed its first all-sky X-ray survey (eRASS1) on June 12, 2020. The public Data Release 1 (DR1, Jan 31, 2024) includes 930,203 sources from the Western Galactic Hemisphere. The all-sky X-ray data enable the selection of a large AGN sample and the discovery of rare sources. However, scientific return depends on accurate characterisation of the X-ray emitters, requiring high-quality multiwavelength data. This paper presents the identification and classification of optical and infrared counterparts to eRASS1 sources. Counterparts to eRASS1 X-ray point sources were identified using Gaia DR3, CatWISE2020, and Legacy Survey DR10 (LS10) with the Bayesian NWAY algorithm and trained priors. Sources were classified as Galactic or extragalactic via a Machine Learning model combining optical/IR and X-ray properties, trained on a reference sample. For extragalactic LS10 sources, photometric redshifts were computed using ${\sc Circlez}$ . Within the LS10 footprint, all 656,614 eROSITA/DR1 sources have at least one possible optical counterpart; $\sim$ 570,000 are extragalactic and likely AGN. Half are new detections compared to AllWISE, Gaia, and Quaia AGN catalogues. Gaia and CatWISE2020 counterparts are less reliable, due to the survey's shallowness and the limited amount of features available to assess the probability of being an X-ray emitter. In the Galactic Plane, where the overdensity of stellar sources also increases the chance of associations, using conservative reliability cuts, we identify approximately 18,000 Gaia and 55,000 CatWISE2020 extragalactic sources. We release three high-quality counterpart catalogues — plus the training and validation sets — as a benchmark for the field. 
These datasets have many applications, but in particular empower researchers to build AGN samples tailored for completeness and purity, accelerating the hunt for the Universe’s most energetic engines.

</div>

<div id="div_fig1">

<img src="tmp_2509.02842/./density_plots_outline.png" alt="Fig10" width="100%"/>

**Figure 10. -** Source density per eROSITA sky tile  ($3\deg \times  3\deg$) of LS DR10 (LS10; left panel), Gaia DR3 (GDR3; middle panel) and CatWISE2020 (CW2020; right panel). On the LS10 map, the dark green (magenta) regions of InAllLS10 (InAnyLS10), indicating whether all (at least one of) the LS10 bands reach the nominal depth of the survey (see Section \ref{sec:limitations}), are overplotted. (*fig:densityfield*)

</div>
<div id="div_fig2">

<img src="tmp_2509.02842/./CTP_DR1.drawio.png" alt="Fig17" width="100%"/>

**Figure 17. -** Flowchart describing the construction of six samples of candidate AGN with different supporting data, on different regions and with different levels of completeness and purity. For the detailed construction of the samples, see Section \ref{sec:eRASS1_AGN}.) (*fig:FlowChart*)

</div>
<div id="div_fig3">

<img src="tmp_2509.02842/./purcomp_LS10_new.png" alt="Fig2.1" width="33%"/><img src="tmp_2509.02842/./purcomp_GDR3_window_new.png" alt="Fig2.2" width="33%"/><img src="tmp_2509.02842/./purcomp_CW2020_new.png" alt="Fig2.3" width="33%"/>

**Figure 2. -** Mean purity and completeness as  a function of \texttt{p\_any} for LS10, GDR3 and CW2020, averaged over their respective surveys' footprints (i.e, considering eROSITA sources outside the Galactic Plane in the case of LS10) (*fig:Meanpurcomp*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2509.02842"></div>

# Create HTML index

In [10]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

# Select the markdown files of the build directory whose change time
# (st_ctime) lies within the last `days` days.
files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    # Local naive datetime, directly comparable with `now` (local and naive too);
    # mixing a naive UTC value with a naive local one shifts the age by the UTC offset
    modified = datetime.fromtimestamp(stat_result)
    delta = now - modified
    if delta <= timedelta(days=days):
        # total_seconds(): `.seconds` alone ignores the days part of the age
        res.append((delta.total_seconds(), fk))
# keep the file names only, in reverse alphabetical order
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

129  publications files modified in the last 7 days.


In [11]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days

    :param lst: iterable of ``(fname, date)`` pairs; ``date`` is a string that
                is compared lexicographically with an ISO ``YYYY-MM-DD`` date
    :param days: number of days to go back from today; the cutoff day itself
                 is included
    :returns: generator over the selected ``fname`` values, ordered by
              decreasing ``date``
    """
    # Imported locally under a private alias: another cell of this notebook
    # binds the bare name `datetime` to the class (`from datetime import
    # datetime`), which would make `datetime.date.today()` fail depending on
    # the order in which cells were executed.
    import datetime as _dt

    # Loop invariant: computed once so that every entry is tested against the
    # same cutoff.
    cutoff = str(_dt.date.today() - _dt.timedelta(days=days))
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= cutoff:
            yield fname

def extract_appearance_dates(lst_file):
    """ Extract the "Appeared on" date of each document

    Each file is scanned line by line; the first line containing
    ``Appeared on`` provides the date and scanning of that file stops there.
    Files without such a line are left out of the result.

    :param lst_file: iterable of paths of the files to scan
    :returns: list of ``(fname, date)`` tuples, ``date`` being the stripped
              text found between ``Appeared on:`` and ``</mark>``
    """
    def get_date(line):
        """ Return the text between 'Appeared on:' and '</mark>', stripped """
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    dates = []
    # Iterate over the argument, not over a notebook-level variable, so the
    # function works on whatever list the caller provides.
    for fname in lst_file:
        with open(fname, 'r') as f:
            for line in f:
                if "Appeared on" in line:
                    dates.append((fname, get_date(line)))
                    break
    return dates

from glob import glob
# Only Markdown files: '*md' (without the dot) would also match any file whose
# name merely ends in "md".
lst = glob('_build/html/*.md')
days = 7
# (file, date) pairs, then the files whose date falls within the last `days` days
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

11  publications in the last 7 days.


In [12]:
def create_carousel(npub=4):
    """ Build the HTML markup of a Flickity carousel

    :param npub: number of slides; slides are numbered from 1 to `npub` and
                 get the ids ``slide1`` ... ``slide<npub>``
    :returns: the HTML code as a single newline-joined string
    """
    # Opening tag, split over two lines of markup
    opening = (
        '  <div class="carousel" ',
        """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">""",
    )
    cell_template = '    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>'
    cells = [cell_template.format(k=number) for number in range(1, npub + 1)]
    return '\n'.join([*opening, *cells, "  </div>"])

def create_grid(npub=4):
    """ Build the HTML markup of a flat grid

    :param npub: number of grid items; items are numbered from 1 to `npub`
                 and get the ids ``slide1`` ... ``slide<npub>``
    :returns: the HTML code as a single newline-joined string
    """
    cell_template = '    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>'
    rows = ['  <div class="grid"> ']
    rows.extend(cell_template.format(k=number) for number in range(1, npub + 1))
    rows += ["  </div>"]
    return '\n'.join(rows)

In [13]:
# Build the 7-day archive page: one carousel slide per selected document.
carousel = create_carousel(npub)
# os.path.basename follows the platform's path separator, whereas
# split('/') only works with POSIX-style paths.
docs = ', '.join('"{0:s}"'.format(os.path.basename(k)) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

# Keep the template open only for the time needed to read it.
with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# Fill in the placeholders of the template.
page = page.replace("{%-- carousel:s --%}", carousel)\
           .replace("{%-- suptitle:s --%}",  "7-day archives" )\
           .replace("{%-- docs:s --%}", docs)\
           .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [14]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

# Build the daily page: one carousel slide per selected document.
carousel = create_carousel(npub)
# os.path.basename follows the platform's path separator, whereas
# split('/') only works with POSIX-style paths.
docs = ', '.join('"{0:s}"'.format(os.path.basename(k)) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

# Keep the template open only for the time needed to read it.
with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# Fill in the placeholders of the template.
page = page.replace("{%-- carousel:s --%}", carousel)\
           .replace("{%-- suptitle:s --%}",  "Daily" )\
           .replace("{%-- docs:s --%}", docs)\
           .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

4  publications in the last day.


In [15]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

# Upper bound on the number of papers shown in the grid (single place to tune).
max_papers = 6
# Most recent appearance dates first, truncated to `max_papers` entries.
res = [k[0] for k in islice(reversed(sorted(dates, key=lambda x: x[1])), max_papers)]
# Fewer than `max_papers` documents may be available: size the grid on what
# was actually selected so that slides and documents stay paired one to one.
npub = len(res)
print(len(res), f" {max_papers} publications selected.")

grid = create_grid(npub)
# os.path.basename follows the platform's path separator, whereas
# split('/') only works with POSIX-style paths.
docs = ', '.join('"{0:s}"'.format(os.path.basename(k)) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

# Keep the template open only for the time needed to read it.
with open("grid_template.html", "r") as tpl:
    page = tpl.read()

# Fill in the placeholders of the template.
page = page.replace("{%-- grid-content:s --%}", grid)\
           .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
           .replace("{%-- docs:s --%}", docs)\
           .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
