# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    pass

class AffiliationError(RuntimeError):
    pass

def validation(source: str):
    """Raises error paper during parsing of source file
    
    Allows checks before parsing TeX code.
    
    Raises AffiliationWarning
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        raise AffiliationError("mpia.affiliation_verifications: " + check)

        
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper: Arxiv paper
    :returns: markdown text
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    txt = f"""<img src={url}"https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names
We then get all new papers from Arxiv and match authors

In [3]:
# get list from MPIA website
# it automatically filters identified non-scientists :func:`mpia.filter_non_scientists`
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [k[1] for k in mpia_authors]   # initials + fullname
new_papers = get_new_papers()
# add manual references
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [mpia.get_initials(k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, normed_mpia_authors, verbose=True)
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

S. Scheithauer  ->  S. Scheithauer  |  ['S. Scheithauer']
J. Li  ->  J. Li  |  ['J. Li']
Zhang  ->  X. Zhang  |  ['Zhang']
Zhang  ->  X. Zhang  |  ['Zhang']
J. L.  ->  J. Li  |  ['J. Li']
Zhang  ->  X. Zhang  |  ['Zhang']
Zhang  ->  X. Zhang  |  ['Zhang']
G. Guiglion  ->  G. Guiglion  |  ['G. Guiglion']
M. Hobson  ->  M. Hobson  |  ['M. Hobson']


Arxiv has 71 new papers today
          5 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [4]:
documents = []
failed = []
for paper in tqdm(candidates):
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                normed_mpia_authors, verbose=True)
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/5 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2311.03472


extracting tarball to tmp_2311.03472...

 done.
















Found 59 bibliographic references in tmp_2311.03472/polarization_v3.bbl.
syntax error in line 4: '=' expected
Retrieving document from  https://arxiv.org/e-print/2311.03494


extracting tarball to tmp_2311.03494...

 done.
Retrieving document from  https://arxiv.org/e-print/2311.03635



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)


extracting tarball to tmp_2311.03635...

 done.
Retrieving document from  https://arxiv.org/e-print/2311.04146


extracting tarball to tmp_2311.04146...

 done.



  exec(code_obj, self.user_global_ns, self.user_ns)

  exec(code_obj, self.user_global_ns, self.user_ns)




Found 82 bibliographic references in tmp_2311.04146/main.bbl.
syntax error in line 165: '=' expected
Retrieving document from  https://arxiv.org/e-print/2311.04153


extracting tarball to tmp_2311.04153...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper. see `logs-{today date}.md` 

In [5]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"


with open(logfile, 'w') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Sucessful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2311.03472-b31b1b.svg)](https://arxiv.org/abs/arXiv:2311.03472) | **Polarization analysis of the VLTI and GRAVITY**  |
|| G. Collaboration, et al. -- incl., <mark>S. Scheithauer</mark> |
|*Appeared on*| *2023-11-08*|
|*Comments*| *Accepted by A&A*|
|**Abstract**| The goal of this work is to characterize the polarization effects of the VLTI and GRAVITY. This is needed to calibrate polarimetric observations with GRAVITY for instrumental effects and to understand the systematic error introduced to the astrometry due to birefringence when observing targets with a significant intrinsic polarization. By combining a model of the VLTI light path and its mirrors and dedicated experimental data, we construct a full polarization model of the VLTI UTs and the GRAVITY instrument. We first characterize all telescopes together to construct a UT calibration model for polarized targets. We then expand the model to include the differential birefringence. With this, we can constrain the systematic errors for highly polarized targets. Together with this paper, we publish a standalone Python package to calibrate the instrumental effects on polarimetric observations. This enables the community to use GRAVITY to observe targets in a polarimetric observing mode. We demonstrate the calibration model with the galactic center star IRS 16C. For this source, we can constrain the polarization degree to within 0.4 % and the polarization angle within 5 deg while being consistent with the literature. Furthermore, we show that there is no significant contrast loss, even if the science and fringe-tracker targets have significantly different polarization, and we determine that the phase error in such an observation is smaller than 1 deg, corresponding to an astrometric error of 10 {\mu}as. With this work, we enable the use of the polarimetric mode with GRAVITY/UTs and outline the steps necessary to observe and calibrate polarized targets. We demonstrate that it is possible to measure the intrinsic polarization of astrophysical sources with high precision and that polarization effects do not limit astrometric observations of polarized targets. |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2311.04146-b31b1b.svg)](https://arxiv.org/abs/arXiv:2311.04146) | **Galaxy Spectra neural Network (GaSNet). II. Using Deep Learning for  Spectral Classification and Redshift Predictions**  |
|| F. Zhong, et al. -- incl., <mark>G. Guiglion</mark> |
|*Appeared on*| *2023-11-08*|
|*Comments*| *23 pages and 31 figures. The draft has been submitted to MNRAS*|
|**Abstract**| Large sky spectroscopic surveys have reached the scale of photometric surveys in terms of sample sizes and data complexity. These huge datasets require efficient, accurate, and flexible automated tools for data analysis and science exploitation. We present the Galaxy Spectra Network/GaSNet-II, a supervised multi-network deep learning tool for spectra classification and redshift prediction. GaSNet-II can be trained to identify a customized number of classes and optimize the redshift predictions for classified objects in each of them. It also provides redshift errors, using a network-of-networks that reproduces a Monte Carlo test on each spectrum, by randomizing their weight initialization. As a demonstration of the capability of the deep learning pipeline, we use 260k Sloan Digital Sky Survey spectra from Data Release 16, separated into 13 classes including 140k galactic, and 120k extragalactic objects. GaSNet-II achieves 92.4% average classification accuracy over the 13 classes (larger than 90% for the majority of them), and an average redshift error of approximately 0.23% for galaxies and 2.1% for quasars. We further train/test the same pipeline to classify spectra and predict redshifts for a sample of 200k 4MOST mock spectra and 21k publicly released DESI spectra. On 4MOST mock data, we reach 93.4% accuracy in 10-class classification and an average redshift error of 0.55% for galaxies and 0.3% for active galactic nuclei. On DESI data, we reach 96% accuracy in (star/galaxy/quasar only) classification and an average redshift error of 2.8% for galaxies and 4.8% for quasars, despite the small sample size available. GaSNet-II can process ~40k spectra in less than one minute, on a normal Desktop GPU. This makes the pipeline particularly suitable for real-time analyses of Stage-IV survey observations and an ideal tool for feedback loops aimed at night-by-night survey strategy optimization. |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2311.03494-b31b1b.svg)](https://arxiv.org/abs/arXiv:2311.03494) | **The Effects of the Local Environment on a Compact Radio Interferometer  I: Cross-coupling in the Tianlai Dish Pathfinder Array**  |
|| J. Kwak, et al. -- incl., <mark>J. Li</mark> |
|*Appeared on*| *2023-11-08*|
|*Comments*| *20 pages, 22 figures, accepted for publication by JAI*|
|**Abstract**| The visibilities measured by radio astronomical interferometers include non-astronomical correlated signals that arise from the local environment of the array. These correlated signals are especially important in compact arrays such as those under development for 21\,cm intensity mapping. The amplitudes of the contaminated visibilities can exceed the expected 21\,cm signal and represent a significant systematic effect. We study the receiver noise radiated by antennas in compact arrays and develop a model for how it couples to other antennas. We apply the model to the Tianlai Dish Pathfinder Array (TDPA), a compact array of 16, 6-m dish antennas. The coupling model includes electromagnetic simulations, measurements with a network analyzer, and measurements of the noise of the receivers. We compare the model to drift-scan observations with the array and set requirements on the level of antenna cross-coupling for 21\,cm intensity mapping instruments. We find that for the TDPA, cross-coupling would have to be reduced by TBD orders of magnitude in order to contribute negligibly to the visibilities. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2311.03635-b31b1b.svg)](https://arxiv.org/abs/arXiv:2311.03635) | **A Phase-resolved View of the Low-frequency Quasiperiodic Oscillations  from the Black Hole Binary MAXI J1820+070**  |
|| Shui, et al. -- incl., <mark>Zhang</mark>, <mark>Zhang</mark>, <mark>J. L.</mark>, <mark>Zhang</mark>, <mark>Zhang</mark> |
|*Appeared on*| *2023-11-08*|
|*Comments*| *Accepted for publication in The Astrophysical Journal*|
|**Abstract**| Although low-frequency quasiperiodic oscillations (LFQPOs) are commonly detected in the X-ray light curves of accreting black hole X-ray binaries, their origin still remains elusive. In this study, we conduct phase-resolved spectroscopy in a broad energy band for LFQPOs in MAXI J1820+070 during its 2018 outburst, utilizing Insight-HXMT observations. By employing the Hilbert-Huang transform method, we extract the intrinsic quasiperiodic oscillation (QPO) variability, and obtain the corresponding instantaneous amplitude, phase, and frequency functions for each data point. With well-defined phases, we construct QPO waveforms and phase-resolved spectra. By comparing the phase-folded waveform with that obtained from the Fourier method, we find that phase folding on the phase of the QPO fundamental frequency leads to a slight reduction in the contribution of the harmonic component. This suggests that the phase difference between QPO harmonics exhibits time variability. Phase-resolved spectral analysis reveals strong concurrent modulations of the spectral index and flux across the bright hard state. The modulation of the spectral index could potentially be explained by both the corona and jet precession models, with the latter requiring efficient acceleration within the jet. Furthermore, significant modulations in the reflection fraction are detected exclusively during the later stages of the bright hard state. These findings provide support for the geometric origin of LFQPOs and offer valuable insights into the evolution of the accretion geometry during the outburst in MAXI J1820+070. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2311.04153-b31b1b.svg)](https://arxiv.org/abs/arXiv:2311.04153) | **Kernel-, mean- and noise-marginalised Gaussian processes for exoplanet  transits and $H_0$ inference**  |
|| N. Kroupa, D. Yallup, W. Handley, <mark>M. Hobson</mark> |
|*Appeared on*| *2023-11-08*|
|*Comments*| *17 pages, 11 figures, submitted to Monthly Notices of the Royal Astronomical Society*|
|**Abstract**| Using a fully Bayesian approach, Gaussian Process regression is extended to include marginalisation over the kernel choice and kernel hyperparameters. In addition, Bayesian model comparison via the evidence enables direct kernel comparison. The calculation of the joint posterior was implemented with a transdimensional sampler which simultaneously samples over the discrete kernel choice and their hyperparameters by embedding these in a higher-dimensional space, from which samples are taken using nested sampling. This method was explored on synthetic data from exoplanet transit light curve simulations. The true kernel was recovered in the low noise region while no kernel was preferred for larger noise. Furthermore, inference of the physical exoplanet hyperparameters was conducted. In the high noise region, either the bias in the posteriors was removed, the posteriors were broadened or the accuracy of the inference was increased. In addition, the uncertainty in mean function predictive distribution increased due to the uncertainty in the kernel choice. Subsequently, the method was extended to marginalisation over mean functions and noise models and applied to the inference of the present-day Hubble parameter, $H_0$, from real measurements of the Hubble parameter as a function of redshift, derived from the cosmologically model-independent cosmic chronometer and {\Lambda}CDM-dependent baryon acoustic oscillation observations. The inferred $H_0$ values from the cosmic chronometers, baryon acoustic oscillations and combined datasets are $H_0$ = 66$\pm$6 km/s/Mpc, $H_0$ = 67$\pm$10 km/s/Mpc and $H_0$ = 69$\pm$6 km/s/Mpc, respectively. The kernel posterior of the cosmic chronometers dataset prefers a non-stationary linear kernel. Finally, the datasets are shown to be not in tension with ln(R)=12.17$\pm$0.02. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images"""
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        os.mkdir(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        try:
            os.makedirs(destdir)
        except FileExistsError:
            pass
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    [print("    + " + os.path.join(directory,fk)) for fk in fig_fnames]

In [7]:
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

exported in  _build/html/2311.03472.md
    + _build/html/tmp_2311.03472/./figures/GRAVITY_fit.png
    + _build/html/tmp_2311.03472/./figures/vlti_pol_paper_fig2.png
    + _build/html/tmp_2311.03472/./figures/ev_phaseerror.png
exported in  _build/html/2311.04146.md
    + _build/html/tmp_2311.04146/./paper/fig/new_pipeline_2.png
    + _build/html/tmp_2311.04146/./paper/fig/SDSS_one2one_MC.png
    + _build/html/tmp_2311.04146/./paper/fig/SDSS_exgla.png


## Display the papers

Not necessary but allows for a quick check.

In [8]:
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$</div>



<div id="title">

# Polarization analysis of the VLTI and GRAVITY

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2311.03472-b31b1b.svg)](https://arxiv.org/abs/2311.03472)<mark>Appeared on: 2023-11-08</mark> -  _Accepted by A&A_

</div>
<div id="authors">

G. Collaboration, et al. -- incl., <mark>S. Scheithauer</mark>

</div>
<div id="abstract">

**Abstract:** The goal of this work is to characterize the polarization effects of the VLTI and GRAVITY. This is needed to calibrate polarimetric observations with GRAVITY for instrumental effects and to understand the systematic error introduced to the astrometry due to birefringence when observing targets with a significant intrinsic polarization. By combining a model of the VLTI light path and its mirrors and dedicated experimental data, we construct a full polarization model of the VLTI UTs and the GRAVITY instrument. We first characterize all telescopes together to construct a UT calibration model for polarized targets. We then expand the model to include the differential birefringence. With this, we can constrain the systematic errors for highly polarized targets. Together with this paper, we publish a standalone Python package to calibrate the instrumental effects on polarimetric observations. This enables the community to use GRAVITY to observe targets in a polarimetric observing mode. We demonstrate the calibration model with the galactic center star IRS 16C. For this source, we can constrain the polarization degree to within 0.4 % and the polarization angle within 5 deg while being consistent with the literature. Furthermore, we show that there is no significant contrast loss, even if the science and fringe-tracker targets have significantly different polarization, and we determine that the phase error in such an observation is smaller than 1 deg, corresponding to an astrometric error of 10 {\mu}as. With this work, we enable the use of the polarimetric mode with GRAVITY/UTs and outline the steps necessary to observe and calibrate polarized targets. We demonstrate that it is possible to measure the intrinsic polarization of astrophysical sources with high precision and that polarization effects do not limit astrometric observations of polarized targets. 

</div>

<div id="div_fig1">

<img src="tmp_2311.03472/./figures/GRAVITY_fit.png" alt="Fig18" width="100%"/>

**Figure 18. -** Measured polarization with GRAVITY in the different observing modes. The left column shows the measurement in the off-axis mode, and the right column in the on-axis mode. In the top row, the linear polarization filter is used for the input light source; in the bottom, it is not. In all plots, the Stokes Q data points are shown in red/orange and the Stokes U in grey. The data is the average over all four GRAVITY beams. The results from the fitted model are shown in black lines. (*fig:grav_full*)

</div>
<div id="div_fig2">

<img src="tmp_2311.03472/./figures/vlti_pol_paper_fig2.png" alt="Fig12" width="100%"/>

**Figure 12. -** Simplified version of the VLTI light path from \autoref{fig:sketch_path} to show the modeling and the experimental setup. The black rectangles show where the laser is launched and where the polarimeter is mounted. The names of the mirrors used in the text are given. The color of the mirror number shows the grouping which was used for the fitting. Grey mirrors are not fitted in our calibration model. Each colored group is located in one common plane: M4-M8 are in one vertical plane, and M10 - M18 are in one horizontal plane. (*fig:sketch_exp*)

</div>
<div id="div_fig3">

<img src="tmp_2311.03472/./figures/ev_phaseerror.png" alt="Fig20" width="100%"/>

**Figure 20. -** Error in the visibility phases due to differential birefringence for all telescope positions. The left two columns show the error for the first polarization P1 and the right two for the second polarization P2. (*fig:ev_phaseerr*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2311.03472"></div>

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\zhong}[1]{{\color{black}{#1}}}$
$\newcommand{\zhongblue}[1]{{\color{black}{#1}}}$
$\newcommand{\nic}[1]{{\color{magenta}{#1}}}$</div>



<div id="title">

# Galaxy Spectra neural Network (GaSNet). II. Using Deep Learning for Spectral Classification and Redshift Predictions

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2311.04146-b31b1b.svg)](https://arxiv.org/abs/2311.04146)<mark>Appeared on: 2023-11-08</mark> -  _23 pages and 31 figures. The draft has been submitted to MNRAS_

</div>
<div id="authors">

F. Zhong, et al. -- incl., <mark>G. Guiglion</mark>

</div>
<div id="abstract">

**Abstract:** Large sky spectroscopic surveys have reached the scale of photometric surveys in terms of sample sizes and data complexity. These huge datasets require efficient, accurate, and flexible automated tools for data analysis and science exploitation. We present the Galaxy Spectra Network/GaSNet-II, a supervised multi-network deep learning tool for spectra classification and redshift prediction. GaSNet-II can be trained to identify a customized number of classes and optimize the redshift predictions for classified objects in each of them.It also provides redshift errors, using a network-of-networks that reproduces a Monte Carlo test on each spectrum, by randomizing their weight initialization.As a demonstration of the capability of the deep learning pipeline, we use 260k Sloan Digital Sky Survey spectra from Data Release 16, separated into 13 classes including 140k galactic, and 120k extragalactic objects.GaSNet-II achieves 92.4 \% average classification accuracy over the 13 classes (larger than 90 \% for the majority of them), and an average redshift error of approximately 0.23 \% for galaxies and 2.1 \% for quasars.We further train/test the same pipeline to classify spectra and predict redshifts for a sample of 200k 4MOST mock spectra and 21k publicly released DESI spectra. On 4MOST mock data, we reach 93.4 \% accuracy in 10-class classification and an average redshift error of 0.55 \% for galaxies and 0.3 \% for active galactic nuclei. On DESI data, we reach 96 \% accuracy in (star/galaxy/quasar only) classification and an average redshift error of 2.8 \% for galaxies and 4.8 \% for quasars, despite the small sample size available. GaSNet-II can process $\sim40$ k spectra in less than one minute, on a normal Desktop GPU. This makes the pipeline particularly suitable for real-time analyses of Stage-IV survey observations and an ideal tool for feedback loops aimed at night-by-night survey strategy optimization.

</div>

<div id="div_fig1">

<img src="tmp_2311.04146/./paper/fig/new_pipeline_2.png" alt="Fig30" width="100%"/>

**Figure 30. -** _ Panel a)_: the general structure of the multi-networks pipeline. $\it ResNet\_P$ is used as a classifier and  $\it ResNet\_7-12$ is used for redshift prediction of extragalactic targets (note that $\it ResNet\_0-6$ are missing because we do not need to predict the redshift of stars). One of the advantages of this structure is that it is simple and controllable, and can be trained and predicted in parallel. _ Panel b)_: the detailed description of single sub-network $\it ResNet_i$(bottom figures) architecture, made by small blocks. The input of the network is 5001-pixel spectrum flux, and the output is the probability or redshift. The difference between classification ($\it n=13,  softmax$) and redshift prediction ($\it n=1,  None$) is the output dimension and the activation in the last layer. A feature-extract block $\it Block(n)$ and a fully connected block $\it Dense(n)$ are shown. $\it cov1d$ is the 1-D convolution layer. In one $\it cov1d$ rectangle, $5$ is the kernel size; $/3$ is the stride size; $n$ is the number of channels. $\it relu$, $\it softmax$ are the activate function, $\it None$ represents no activate function here, that means liner. The left $\it cov1d$ in the $\it Block(n)$ shortcut is used to match the shape. $\it pool1d$ is a 1-D Maxpooling layer. As a schematic, the top right panel shows how to predict the redshift error of the label 7 (GALAXY\_nan) subclass in parallel. Though 10 (customized) same sub-networks, trained by the same data but with different initial weights, 10 different redshifts were obtained from a single spectrum input. The expectation and error can be calculated. Other redshift errors are obtained in the same way. (*fig: full pipeline*)

</div>
<div id="div_fig2">

<img src="tmp_2311.04146/./paper/fig/SDSS_one2one_MC.png" alt="Fig32" width="100%"/>

**Figure 32. -** The mean redshift predictions and errors of the 6 extragalactic SDSS subclasses. The error bar of each sample point represents the standard deviation obtained from the MC estimation of 10 sub-networks. In the top left of each main panel the subclass name, MAE, $\Delta z$, and the GF are displayed. The points in the top panels display the mean of the distribution of the $\overline{z}_p$ residuals ($\overline{z}_p-z_t$) with respect to the true values ($z_t$) in each bin, and error bars corresponding mean $\sigma_z$ values (see text).
     (*fig: redshift error*)

</div>
<div id="div_fig3">

<img src="tmp_2311.04146/./paper/fig/SDSS_exgla.png" alt="Fig2" width="100%"/>

**Figure 2. -** Example spectra of SDSS extragalactic sub-classes,
    as listed in Table \ref{Table:1}. We can clearly see the different features characterizing the different classes. From top to bottom, in particular, we can notice the increasing importance of the emission lines that play an important role in redshift prediction. The `nan' type spectra generally lack such emission lines, although they might still contain some low-SNR ones, which are hard to see. This means that the `nan' sample might overlap with other emission line classes. QSOs also show a power-law continuum that does not carry any redshift information. (*fig: spectra_2*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2311.04146"></div>

# Create HTML index

In [9]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now.today() - modified
    if delta <= timedelta(days=days):
        res.append((delta.seconds, fk))
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

319  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days """
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= str(datetime.date.today() - datetime.timedelta(days=days)):
            yield fname

def extract_appearance_dates(lst_file):
    dates = []

    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    for fname in lst:
        with open(fname, 'r') as f:
            found_date = False
            for line in f:
                if not found_date:
                    if "Appeared on" in line:
                        found_date = True
                        dates.append((fname, get_date(line)))
                else:
                    break
    return dates

from glob import glob
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

6  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Generate the HTML code for a carousel with `npub` slides """
    carousel = ["""  <div class="carousel" """,
                """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""
                ]
    
    item_str = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        carousel.append(item_str.format(k=k))
    carousel.append("  </div>")
    return '\n'.join(carousel)

def create_grid(npub=4):
    """ Generate the HTML code for a flat grid with `npub` slides """
    grid = ["""  <div class="grid"> """,
                ]
    
    item_str = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        grid.append(item_str.format(k=k))
    grid.append("  </div>")
    return '\n'.join(grid)

In [12]:
carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "7-day archives" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [13]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "Daily" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(carousel, docs, slides)
# print(page)
with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

3  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

npub = 6
res = [k[0] for k in (islice(reversed(sorted(dates, key=lambda x: x[1])), 6))]
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
