# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big: raise PIL's MAX_IMAGE_PIXELS limit to 1e9 pixels.
# NOTE(review): presumably needed so that PIL does not reject very large figures
# when the library opens them -- confirm against the Pillow documentation.
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    """Warning category reserved for affiliation-related issues."""

class AffiliationError(RuntimeError):
    """Error raised by :func:`validation` when the affiliation check does not pass."""

def validation(source: str):
    """Check the affiliations of a paper before its TeX code is parsed.

    Used as the ``validation`` callback given to :class:`latex.LatexDocument`.
    The check itself is delegated to :func:`mpia.affiliation_verifications`.

    :param source: TeX source to check (handed over unchanged to
                   :func:`mpia.affiliation_verifications`)
    :raises AffiliationError: when the verification returns anything other
                              than ``True``; the returned value is reported
                              in the error message.
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        # f-string rather than `+`: a non-string result must still produce an
        # AffiliationError and not a TypeError.
        raise AffiliationError(f"mpia.affiliation_verifications: {check}")

        
# Always show AffiliationWarning, even if the same warning was already issued.
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper_id: arXiv identifier of the paper (e.g. ``2310.19866``)
    :returns: HTML snippet, an ``<img>`` wrapped in ``<div id="qrcode">``
    """
    from urllib.parse import quote

    target = f"https://arxiv.org/abs/{paper_id}"
    # The whole `src` value is quoted and the encoded page URL is percent-encoded,
    # so that the QR code holds the bare URL (no literal quote characters) and
    # the attribute cannot be broken by the identifier.
    url = (r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
           + quote(target, safe=""))
    return f'<div id="qrcode"><img src="{url}"></div>'

## get list of arxiv paper candidates

We use the MPIA staff (Mitarbeiter) list webpage from mpia.de to get author names.
We then get all new papers from arXiv and match their authors against this list.

In [3]:
# Select the papers of the day that have at least one author matching the MPIA list.
#
# get list from MPIA website
# it automatically filters identified non-scientists :func:`mpia.filter_non_scientists`
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [k[1] for k in mpia_authors]   # initials + fullname
new_papers = get_new_papers()
# add manual references
# (identifiers listed here are fetched individually and appended to today's papers)
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [mpia.get_initials(k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, normed_mpia_authors, verbose=True)
    # An author counts as matched when its highlighted form contains 'mark'
    # (presumably the <mark>...</mark> tag added by highlight_authors_in_list -- confirm).
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    # The paper's author list is replaced in place by the highlighted names.
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

K. El-Badry  ->  K. El-Badry  |  ['K. El-Badry']
H.-W. Rix  ->  H.-W. Rix  |  ['H.-W. Rix']
L. Eisert  ->  L. Eisert  |  ['L. Eisert']
A. Pillepich  ->  A. Pillepich  |  ['A. Pillepich']
J. Li  ->  J. Li  |  ['J. Li']


F. Walter  ->  F. Walter  |  ['F. Walter']
Arxiv has 79 new papers today
          6 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handle `\graphicspath` if provided
* Generate the .md document.

In [4]:
# Parse every candidate paper and build its markdown summary.
documents = []   # (paper_id, markdown text) of each successfully processed paper
failed = []      # (paper, reason) of each paper that was rejected or raised an error
for paper in tqdm(candidates):
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    
    folder = f'tmp_{paper_id}'

    try:
        # Fetch the sources only when the folder is not already present.
        if not os.path.isdir(folder):
            folder = retrieve_document_source(paper_id, folder)
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            # Failed affiliation check: record the reason and skip the paper.
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            # Fall back on the (already highlighted) author list of the arXiv entry.
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                normed_mpia_authors, verbose=True)
        # Use the arXiv abstract when none could be extracted from the TeX source.
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        # (best effort: a failure is printed and the text is kept as is)
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        # Any other failure is reported as a warning and logged as a latex error.
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/6 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2310.19866


extracting tarball to tmp_2310.19866... done.


K. El-Badry  ->  K. El-Badry  |  ['K. El-Badry']
H.-W. Rix  ->  H.-W. Rix  |  ['H.-W. Rix']


Found 51 bibliographic references in tmp_2310.19866/paper.bbl.
Retrieving document from  https://arxiv.org/e-print/2310.19904


extracting tarball to tmp_2310.19904...

 done.


Found 87 bibliographic references in tmp_2310.19904/main.bbl.
syntax error in line 32: '=' expected
Retrieving document from  https://arxiv.org/e-print/2310.19963


extracting tarball to tmp_2310.19963... done.
Retrieving document from  https://arxiv.org/e-print/2310.19966



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)


extracting tarball to tmp_2310.19966...

 done.
Retrieving document from  https://arxiv.org/e-print/2310.20161



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)


extracting tarball to tmp_2310.20161... done.
Retrieving document from  https://arxiv.org/e-print/2310.20675


extracting tarball to tmp_2310.20675...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper; see `_build/html/logs/log-{today's date}.md`.

In [5]:
# Write the daily log file and display the same summary in the notebook.
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"

# `open(..., 'w')` does not create missing parent directories.
os.makedirs(os.path.dirname(logfile), exist_ok=True)

# Explicit UTF-8: titles and abstracts may contain non-ASCII characters.
with open(logfile, 'w', encoding='utf-8') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Successful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        # A candidate is successful when its identifier is among the exported documents.
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    # Sort by reason so that papers sharing a reason end up under one heading.
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        # Affiliation rejections are shown in green, any other failure in red.
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2310.19866-b31b1b.svg)](https://arxiv.org/abs/arXiv:2310.19866) | **Measuring The Mass-Radius Relation of White Dwarfs Using Wide Binaries**  |
|| S. Arseneau, et al. -- incl., <mark>K. El-Badry</mark>, <mark>H.-W. Rix</mark> |
|*Appeared on*| *2023-11-01*|
|*Comments*| *12 pages, 9 figures*|
|**Abstract**| Measuring the mass-radius relation of individual white dwarfs is an empirically challenging task that has been performed for only a few dozen stars. We measure the white dwarf mass-radius relation using gravitational redshifts and radii of 137 white dwarfs in wide binaries with main sequence companions. We obtain the space velocities to these systems using the main sequence companion, and subtract these Doppler redshifts from the white dwarfs' apparent motions, isolating their gravitational redshifts. We use Gaia data to calculate the surface temperatures and radii of these white dwarfs, thereby deriving an empirical gravitational redshift-radius relation. This work demonstrates the utility of low-resolution Galactic surveys to measure the white dwarf equation of state. Our results are consistent with theoretical models, and represent the largest sample of individual white dwarf gravitational redshift measurements to date. |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2310.19904-b31b1b.svg)](https://arxiv.org/abs/arXiv:2310.19904) | **ERGO-ML -- Comparing IllustrisTNG and HSC galaxy images via contrastive  learning**  |
|| <mark>L. Eisert</mark>, et al. -- incl., <mark>A. Pillepich</mark> |
|*Appeared on*| *2023-11-01*|
|*Comments*| *26 pages, 17 figures. Submitted to MNRAS. Comments are welcome. Highlights in Fig. 3, 7 and 13*|
|**Abstract**| Modern cosmological hydrodynamical galaxy simulations provide tens of thousands of reasonably realistic synthetic galaxies across cosmic time. However, quantitatively assessing the level of realism of simulated universes in comparison to the real one is difficult. In this paper of the ERGO-ML series (Extracting Reality from Galaxy Observables with Machine Learning), we utilize contrastive learning to directly compare a large sample of simulated and observed galaxies based on their stellar-light images. This eliminates the need to specify summary statistics and allows to exploit the whole information content of the observations. We produce survey-realistic galaxy mock datasets resembling real Hyper Suprime-Cam (HSC) observations using the cosmological simulations TNG50 and TNG100. Our focus is on galaxies with stellar masses between $10^9$ and $10^{12} M_\odot$ at $z=0.1-0.4$. This allows us to evaluate the realism of the simulated TNG galaxies in comparison to actual HSC observations. We apply the self-supervised contrastive learning method NNCLR to the images from both simulated and observed datasets (g, r, i - bands). This results in a 256-dimensional representation space, encoding all relevant observable galaxy properties. Firstly, this allows us to identify simulated galaxies that closely resemble real ones by seeking similar images in this multi-dimensional space. Even more powerful, we quantify the alignment between the representations of these two image sets, finding that the majority ($\gtrsim 70$ per cent) of the TNG galaxies align well with observed HSC images. However, a subset of simulated galaxies with larger sizes, steeper Sersic profiles, smaller Sersic ellipticities, and larger asymmetries appears unrealistic. We also demonstrate the utility of our derived image representations by inferring properties of real HSC galaxies using simulated TNG galaxies as the ground truth. |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2310.20675-b31b1b.svg)](https://arxiv.org/abs/arXiv:2310.20675) | **NOEMA reveals the true nature of luminous red JWST z>10 galaxy  candidates**  |
|| R. A. Meyer, et al. -- incl., <mark>F. Walter</mark> |
|*Appeared on*| *2023-11-01*|
|*Comments*| *6 pages. Submitted to A&A Letters. Comments welcome*|
|**Abstract**| The first year of JWST has revealed a surprisingly large number of luminous galaxy candidates beyond $z>10$. While some galaxies are already spectroscopically confirmed, there is mounting evidence that a subsample of the candidates with particularly red inferred UV colors are in fact lower redshift contaminants.These interlopers are often found to be `HST-dark' or `optically-faint' galaxies at $z\sim2-6$, a population key to understanding dust-obscured star formation throughout cosmic time. This paper demonstrates the complementarity of ground-based mm-interferometry and JWST infrared imaging to unveil the true nature of red 1.5-2.0 $\mu$m dropouts that have been selected as ultra-high-redshift galaxy candidates. We present NOEMA Polyfix follow-up observations of four JWST red 1.5-2.0 $\mu$m dropouts selected by Yan et al. 2023 as ultra-high-redshift candidates in the PEARLS field. The new NOEMA observations constrain the rest-frame far-infrared continuum emission and efficiently discriminate between intermediate- and high-redshift solutions. We report $>10\sigma$ NOEMA continuum detections of all our target galaxies at observed frequencies of $\nu$=236 and 252 GHz, with FIR slopes indicating a redshift $z<5$. We model their optical-to-FIR spectral energy distribution (SED) with multiple SED codes, and find that they are not $z>10$ galaxies but instead dust-obscured, massive star-forming galaxies at $z\sim 2-4$. The contribution to the cosmic star-formation rate density of such sources is not negligible at $z\simeq 3.5$ ($\phi\gtrsim(1.9-4.4)\times10^{-3}\ \rm{cMpc}^{-3}$), in line with previous studies of optically-faint/sub-millimeter galaxies. This work showcases a new way to select intermediate- to high-redshift dust-obscured galaxies in JWST fields with minimal wavelength coverage to open a new window on obscured star-formation at intermediate redshifts .[abridged] |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: '69117' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2310.19963-b31b1b.svg)](https://arxiv.org/abs/arXiv:2310.19963) | **Demonstrating Agreement between Radio and Fluorescence Measurements of  the Depth of Maximum of Extensive Air Showers at the Pierre Auger Observatory**  |
|| P. A. Collaboration, et al. |
|*Appeared on*| *2023-11-01*|
|*Comments*| *Submitted to Phys. Rev. Lett*|
|**Abstract**| We show, for the first time, radio measurements of the depth of shower maximum ($X_\text{max}$) of air showers induced by cosmic rays that are compared to measurements of the established fluorescence method at the same location. Using measurements at the Pierre Auger Observatory we show full compatibility between our radio and the previously published fluorescence data set, and between a subset of air showers observed simultaneously with both radio and fluorescence techniques, a measurement setup unique to the Pierre Auger Observatory. Furthermore, we show radio $X_\text{max}$ resolution as a function of energy and demonstrate the ability to make competitive high-resolution $X_\text{max}$ measurements with even a sparse radio array. With this, we show that the radio technique is capable of cosmic-ray mass composition studies, both at Auger and at other experiments. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2310.19966-b31b1b.svg)](https://arxiv.org/abs/arXiv:2310.19966) | **Radio Measurements of the Depth of Air-Shower Maximum at the Pierre  Auger Observatory**  |
|| P. A. Collaboration, et al. |
|*Appeared on*| *2023-11-01*|
|*Comments*| *Submitted to Phys. Rev. D*|
|**Abstract**| The Auger Engineering Radio Array (AERA), part of the Pierre Auger Observatory, is currently the largest array of radio antenna stations deployed for the detection of cosmic rays, spanning an area of $17$ km$^2$ with 153 radio stations. It detects the radio emission of extensive air showers produced by cosmic rays in the $30-80$ MHz band. Here, we report the AERA measurements of the depth of the shower maximum ($X_\text{max}$), a probe for mass composition, at cosmic-ray energies between $10^{17.5}$ to $10^{18.8}$ eV, which show agreement with earlier measurements with the fluorescence technique at the Pierre Auger Observatory. We show advancements in the method for radio $X_\text{max}$ reconstruction by comparison to dedicated sets of CORSIKA/CoREAS air-shower simulations, including steps of reconstruction-bias identification and correction, which is of particular importance for irregular or sparse radio arrays. Using the largest set of radio air-shower measurements to date, we show the radio $X_\text{max}$ resolution as a function of energy, reaching a resolution better than $15$ g cm$^{-2}$ at the highest energies, demonstrating that radio $X_\text{max}$ measurements are competitive with the established high-precision fluorescence technique. In addition, we developed a procedure for performing an extensive data-driven study of systematic uncertainties, including the effects of acceptance bias, reconstruction bias, and the investigation of possible residual biases. These results have been cross-checked with air showers measured independently with both the radio and fluorescence techniques, a setup unique to the Pierre Auger Observatory. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2310.20161-b31b1b.svg)](https://arxiv.org/abs/arXiv:2310.20161) | **Sulphur isotopes toward Sagittarius B2 extended envelope in the Galactic  Center**  |
|| Q. Li, et al. -- incl., <mark>J. Li</mark> |
|*Appeared on*| *2023-11-01*|
|*Comments*| *20 pages, 7 figures, accepted by PASJ*|
|**Abstract**| The isotopic ratios are good tools for probing the stellar nucleosynthesis and chemical evolution. We performed high-sensitivity mapping observations of the J=7-6 rotational transitions of OCS, OC34S, O13CS, and OC33S toward the Galactic Center giant molecular cloud, Sagittarius B2 (Sgr B2) with IRAM 30m telescope. Positions with optically thin and uncontaminated lines are chosen to determine the sulfur isotope ratios. A 32S/34S ratio of 17.1\pm0.9 was derived with OCS and OC34S lines, while 34S/33S ratio of 6.8\pm1.9 was derived directly from integrated intensity ratio of OC34S and OC33S. With independent and accurate measurements of 32S/34S ratio, our results confirm the termination of the decreasing trend of 32S/34S ratios toward the Galactic Center, suggesting a drop in the production of massive stars at the Galactic centre. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images

    The figures referenced in `md` (markdown ``[Fig...](path)`` links and
    ``<img src="path" .../>`` tags) are copied below `directory`, keeping their
    relative path. Online figures are not copied.

    :param md: markdown text to export
    :param md_fname: name of the markdown file to write inside `directory`
    :param directory: destination directory (created if missing, parents included)
    :raises RuntimeError: if `directory` exists and is not a directory
    """
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        # makedirs rather than mkdir: the target may be nested (e.g. '_build/html/')
        os.makedirs(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        # Test the URL scheme, not a substring: a local file whose name merely
        # contains "http" must still be copied.
        if fname.lstrip().lower().startswith(('http://', 'https://')):
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        os.makedirs(destdir, exist_ok=True)
        shutil.copy(fname, destfname)
    # Explicit UTF-8 so that the export does not depend on the platform default.
    with open(os.path.join(directory, md_fname), 'w', encoding='utf-8') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    for fk in fig_fnames:
        print("    + " + os.path.join(directory, fk))

In [7]:
# Write one `<paper_id>.md` file per processed paper into `_build/html/`
# (export_markdown_summary also copies the figures the text refers to).
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

exported in  _build/html/2310.19866.md
    + _build/html/tmp_2310.19866/./figures/115_dopplerfit.png
exported in  _build/html/2310.19904.md
    + _build/html/tmp_2310.19904/./img/results/kde.png
    + _build/html/tmp_2310.19904/./img/results/kde_TNG50.png
    + _build/html/tmp_2310.19904/./img/results/kde_TNG100.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_TNG100_z.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_TNG100_i_band_mag_dust_apparent.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_TNG100_petro_90_light.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_HSC_TNG100_photoz.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_HSC_TNG100_i_cmodel_mag_ge.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_HSC_TNG100_petro_90_light.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_TNG50_z.png
    + _build/html/tmp_2310.19904/./img/results/umap_compare_TNG50_i_band_mag_dust_apparent.p

## Display the papers

Not necessary but allows for a quick check.

In [8]:
# Render each generated summary; a plain loop replaces the list comprehension
# that was only used for its side effects.
for _paper_id, _markdown_text in documents:
    display(Markdown(_markdown_text))

 to $2\sigma$, mean absolute error of $14.2$ km s$^{-1}$, and bias of $0.2$ km s$^{-1}$. (*fig:koester-falcon*)

</div>
<div id="div_fig3">

<img src="tmp_2310.19866/./figures/115_dopplerfit.png" alt="Fig4" width="100%"/>

**Figure 4. -** Main sequence radial velocity fit using templates from MaStar. Radial velocity is calculated from a template spectrum via $\chi^2$ minimization. _Red:_ MaStar template spectrum. _Black:_ Observed main sequence spectrum from SDSS-IV. (*fig:ms_rv*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2310.19866"></div>

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\ap}{\color{magenta}}$
$\newcommand{\todo}{\color{green}}$
$\newcommand{\MSUN}{\rm{M}_{\odot}}$
$\newcommand{\logmstar}{\log(M_{\star}/\mathrm{M}_{\odot})}$
$\newcommand{\cb}{\textcolor{Crimson}}$
$\newcommand{\thebibliography}{\DeclareRobustCommand{\VAN}[3]{##3}\VANthebibliography}$</div>



<div id="title">

# ERGO-ML -- Comparing IllustrisTNG and HSC galaxy images via contrastive learning

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2310.19904-b31b1b.svg)](https://arxiv.org/abs/2310.19904)<mark>Appeared on: 2023-11-01</mark> -  _26 pages, 17 figures. Submitted to MNRAS. Comments are welcome. Highlights in Fig. 3, 7 and 13_

</div>
<div id="authors">

<mark>L. Eisert</mark>, et al. -- incl., <mark>A. Pillepich</mark>

</div>
<div id="abstract">

**Abstract:** Modern cosmological hydrodynamical galaxy simulations provide tens of thousands of reasonably realistic synthetic galaxies across cosmic time. However, quantitatively assessing the level of realism of simulated universes in comparison to the real one is difficult. In this paper of the ERGO-ML series (Extracting Reality from Galaxy Observables with Machine Learning), we utilize contrastive learning to directly compare a large sample of simulated and observed galaxies based on their stellar-light images. This eliminates the need to specify summary statistics and allows to exploit the whole information content of the observations. We produce survey-realistic galaxy mock datasets resembling real Hyper Suprime-Cam (HSC) observations using the cosmological simulations TNG50 and TNG100. Our focus is on galaxies with stellar masses between $10^9$ and $10^{12} M_\odot$ at $z=0.1-0.4$ . This allows us to evaluate the realism of the simulated TNG galaxies in comparison to actual HSC observations. We apply the self-supervised contrastive learning method NNCLR to the images from both simulated and observed datasets (g, r, i - bands). This results in a 256-dimensional representation space, encoding all relevant observable galaxy properties. Firstly, this allows us to identify simulated galaxies that closely resemble real ones by seeking similar images in this multi-dimensional space. Even more powerful, we quantify the alignment between the representations of these two image sets, finding that the majority ( $\gtrsim 70$ per cent) of the TNG galaxies align well with observed HSC images. However, a subset of simulated galaxies with larger sizes, steeper Sersic profiles, smaller Sersic ellipticities, and larger asymmetries appears unrealistic. We also demonstrate the utility of our derived image representations by inferring properties of real HSC galaxies using simulated TNG galaxies as the ground truth.

</div>

<div id="div_fig1">

<img src="tmp_2310.19904/./img/results/kde.png" alt="Fig6.1" width="6%"/><img src="tmp_2310.19904/./img/results/kde_TNG50.png" alt="Fig6.2" width="6%"/><img src="tmp_2310.19904/./img/results/kde_TNG100.png" alt="Fig6.3" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_TNG100_z.png" alt="Fig6.4" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_TNG100_i_band_mag_dust_apparent.png" alt="Fig6.5" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_TNG100_petro_90_light.png" alt="Fig6.6" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_HSC_TNG100_photoz.png" alt="Fig6.7" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_HSC_TNG100_i_cmodel_mag_ge.png" alt="Fig6.8" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_HSC_TNG100_petro_90_light.png" alt="Fig6.9" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_TNG50_z.png" alt="Fig6.10" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_TNG50_i_band_mag_dust_apparent.png" alt="Fig6.11" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_TNG50_petro_90_light.png" alt="Fig6.12" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_HSC_TNG50_photoz.png" alt="Fig6.13" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_HSC_TNG50_i_cmodel_mag_ge.png" alt="Fig6.14" width="6%"/><img src="tmp_2310.19904/./img/results/umap_compare_HSC_TNG50_petro_90_light.png" alt="Fig6.15" width="6%"/>

**Figure 6. -** ** How well do the representations of the observed and simulated galaxy images align to each other?** We compare the distributions of TNG50/100 and HSC images in the corresponding 2D-UMAP mapping of the 256-dimensional representations. In the uppermost panel we show KDE density plots of TNG100 (TNG50) images in blue (orange). Additionally the HSC sets that are matched against TNG100/TNG50, respectively, are shown in light blue/orange in two additional plots. We see that there is a large overlap among the three sets but also a slight offset and differences in point density.
    In the lower two panels we further investigate the offset in a visual way: we show the 2D hexbins of the UMAPs coloured bin-wise according to the median value of the three common properties used for the galaxy-sample matching: from left to right, redshift, i-band magnitude, Petrosian radius. In each row we show one of the datasets: from top to bottom, TNG100, HSC matched to TNG100, TNG50, HSC matched to TNG50. We also include the contours covering 80 per cent of the datasets from the uppermost panel: TNG100 in blue, TNG50 in orange and in light orange/light blue the subsets of HSC matched to TNG50/TNG100. (*fig:kde_umap_compare*)

</div>
<div id="div_fig2">

<img src="tmp_2310.19904/./img/results/TNG100_mass.png" alt="Fig3.1" width="12%"/><img src="tmp_2310.19904/./img/results/TNG50_mass.png" alt="Fig3.2" width="12%"/><img src="tmp_2310.19904/./img/results/TNG100_color.png" alt="Fig3.3" width="12%"/><img src="tmp_2310.19904/./img/results/TNG50_color.png" alt="Fig3.4" width="12%"/><img src="tmp_2310.19904/./img/results/TNG100_fraction_disk_stars.png" alt="Fig3.5" width="12%"/><img src="tmp_2310.19904/./img/results/TNG50_fraction_disk_stars.png" alt="Fig3.6" width="12%"/><img src="tmp_2310.19904/./img/results/TNG100_half_mass_rad_physical.png" alt="Fig3.7" width="12%"/><img src="tmp_2310.19904/./img/results/TNG50_half_mass_rad_physical.png" alt="Fig3.8" width="12%"/>

**Figure 3. -** ** Are the representations related to observable features?** Each panel shows a 2D hexbin histogram of TNG100 galaxy images (left panels) and TNG50 galaxy images (right panels) in the 2D UMAP parameter space. The UMAP mapping itself was trained using the 256-dimensional representations of TNG50/100 and HSC test galaxies. Note that because of the smaller sample size of TNG50, we choose a larger bin size in the right panels. The bins are coloured according to the median image/galaxy property in each bin. We show from top to bottom, and in the continuation figures: total stellar mass, fraction of disk stars, integrated galaxy colour index (g-r), stellar half-mass radius, Sérsic half-light radius, Sérsic index, Sérsic ellipticity, asymmetry of the light distribution, concentration of the light, smoothness of the light distribution, Gini-M20 bulge parameter and Gini-M20 merger parameter. The UMAP, and therefore the representations upon which it is based, are clearly related to observable properties of the TNG50/100 galaxies. (*fig:tng_umaps*)

</div>
<div id="div_fig3">

<img src="tmp_2310.19904/./img/results/images_TNG100.png" alt="Fig7" width="100%"/>

**Figure 7. -** ** What images underlie the UMAP representations?** We align TNG100 galaxy images to their positions in the UMAP representations from Figure \ref{fig:kde_umap_compare}. For each node on a rectangular grid in 2D UMAP space, we choose the closest data point (i.e. image representation) and plot its corresponding image. As a guideline, we report the contours as in Figure \ref{fig:kde_umap_compare} covering $80$ per cent of the datasets: TNG100 in blue, TNG50 in orange, and the corresponding HSC matched samples in light blue and light orange. With this visual inspection, we see that the galaxy structures are indeed well grouped together. (*fig:umap_images_TNG100*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2310.19904"></div>

# Create HTML index

In [9]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

# List the exported markdown files whose timestamp falls within the last `days` days.
files = glob('_build/html/*.md')
days = 7
now = datetime.today()
# Compare timezone-aware datetimes: subtracting a naive UTC timestamp from the
# naive local `now` would shift every age by the local UTC offset.
now_utc = now.astimezone(timezone.utc)
res = []
for fk in files:
    # NOTE(review): st_ctime is used although the message below says "modified";
    # confirm whether st_mtime was intended.
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc)
    delta = now_utc - modified
    if delta <= timedelta(days=days):
        res.append(fk)
# File names in reverse alphabetical order
res = sorted(res, reverse=True)
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

314  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days

    :param lst: iterable of ``(filename, date)`` pairs. Dates are compared as
                text against ``str()`` of a ``datetime.date``, so they are
                expected to be written as ``YYYY-MM-DD``.
    :param days: size of the window, counted back from today
    :returns: generator of filenames, most recent date first
    """
    # Local import: the function then does not depend on what the global
    # name `datetime` is bound to (the module or the class of the same name).
    from datetime import date, timedelta

    # The cutoff is the same for every entry: compute it once.
    cutoff = str(date.today() - timedelta(days=days))
    for fname, appeared in sorted(lst, key=lambda x: x[1], reverse=True):
        if appeared < cutoff:
            # Entries are in descending date order: all remaining ones are
            # older than the cutoff as well.
            break
        yield fname

def extract_appearance_dates(lst_file):
    """ Extract the "Appeared on" date of each file

    Every file is read line by line up to the first line containing
    ``Appeared on``. Files without such a line do not appear in the result.

    :param lst_file: iterable of paths of the files to scan
    :returns: list of ``(filename, date)`` tuples, where date is the stripped
              text found between ``Appeared on:`` and ``</mark>`` on that line
    """
    def get_date(line):
        """ Return the text located between 'Appeared on:' and '</mark>' """
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    dates = []
    # Iterate over the argument (not over a module-level list) so that the
    # result depends only on what the caller passed in.
    for fname in lst_file:
        with open(fname, 'r') as f:
            for line in f:
                if "Appeared on" in line:
                    dates.append((fname, get_date(line)))
                    # Only the first occurrence counts.
                    break
    return dates

from glob import glob
# Generated pages: only files carrying the `.md` extension (the dot is part
# of the pattern so that names merely ending in "md" are not picked up).
lst = glob('_build/html/*.md')
days = 7
# (filename, date) pairs, then the filenames that fall inside the window
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(npub, f" publications in the last {days:d} days.")

6  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Build the HTML markup of a carousel holding `npub` slides

    :param npub: number of slides; their ids run from ``slide1`` to
                 ``slide<npub>``
    :returns: HTML text, one element per line
    """
    cell_template = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    # Opening tag of the container, written over two lines
    header = ["""  <div class="carousel" """,
              """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""]
    cells = [cell_template.format(k=number) for number in range(1, npub + 1)]
    return '\n'.join(header + cells + ["  </div>"])

def create_grid(npub=4):
    """ Build the HTML markup of a flat grid holding `npub` slides

    :param npub: number of grid items; their ids run from ``slide1`` to
                 ``slide<npub>``
    :returns: HTML text, one element per line
    """
    cell_template = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    pieces = ["""  <div class="grid"> """]
    pieces.extend(cell_template.format(k=number) for number in range(1, npub + 1))
    pieces.append("  </div>")
    return '\n'.join(pieces)

In [12]:
# Build the 7-day archive page: one carousel slide per selected document.
carousel = create_carousel(npub)
# Documents are referenced by bare file name; os.path.basename handles
# whichever path separator the file listing produced.
docs = ', '.join('"{0:s}"'.format(os.path.basename(k)) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# Fill in the placeholders of the template.
page = page.replace("{%-- carousel:s --%}", carousel)\
           .replace("{%-- suptitle:s --%}",  "7-day archives" )\
           .replace("{%-- docs:s --%}", docs)\
           .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [13]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), " publications in the last day.")

# Build the daily page: one carousel slide per selected document.
carousel = create_carousel(npub)
# Documents are referenced by bare file name; os.path.basename handles
# whichever path separator the file listing produced.
docs = ', '.join('"{0:s}"'.format(os.path.basename(k)) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# Fill in the placeholders of the template.
page = page.replace("{%-- carousel:s --%}", carousel)\
           .replace("{%-- suptitle:s --%}",  "Daily" )\
           .replace("{%-- docs:s --%}", docs)\
           .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

4  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

# Number of papers requested for the grid
npub = 6
# Walk the dates from the latest to the earliest and keep at most `npub`
# documents (the limit follows `npub` instead of repeating the number).
res = [k[0] for k in islice(reversed(sorted(dates, key=lambda x: x[1])), npub)]
print(len(res), f" {npub} publications selected.")
# Fewer documents than requested may be available: size the grid on what was
# actually selected so that no slide is left without a document.
npub = len(res)

grid = create_grid(npub)
# Documents are referenced by bare file name; os.path.basename handles
# whichever path separator the file listing produced.
docs = ', '.join('"{0:s}"'.format(os.path.basename(k)) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

with open("grid_template.html", "r") as tpl:
    page = tpl.read()

# Fill in the placeholders of the template.
page = page.replace("{%-- grid-content:s --%}", grid)\
           .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
           .replace("{%-- docs:s --%}", docs)\
           .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
