# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big: raise PIL's pixel-count limit to 1e9 pixels.
# NOTE(review): presumably this avoids PIL's large-image ("decompression bomb")
# guard on big paper figures -- confirm against the Pillow documentation.
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    """Warning category for affiliation-related issues."""

class AffiliationError(RuntimeError):
    """Error raised by :func:`validation` when the affiliation check fails."""

def validation(source: str) -> None:
    """Check the affiliations in a source file before it is parsed.

    Runs :func:`mpia.affiliation_verifications` on ``source`` so that a paper
    can be rejected before any TeX parsing happens.

    :param source: content handed over to ``mpia.affiliation_verifications``
    :raises AffiliationError: if the verification returns anything other than
                              ``True``; the returned value is appended to the
                              error message.
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        # str() keeps the intended AffiliationError even if the check returns a
        # non-string value (e.g. False); plain concatenation would raise TypeError.
        raise AffiliationError("mpia.affiliation_verifications: " + str(check))

        
# Report every AffiliationWarning each time it is issued (filter action 'always')
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com

    :param paper_id: arXiv identifier of the paper (e.g. ``2403.07058``)
    :returns: HTML snippet (usable inside markdown): a ``<div id="qrcode">``
              wrapping an ``<img>`` served by api.qrserver.com
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    target = f"https://arxiv.org/abs/{paper_id}"
    # Quote the whole `src` value. Previously only the arXiv URL was wrapped in
    # quotes, leaving `src` unquoted and putting literal '"' characters in the
    # data encoded by the QR code.
    return f'<div id="qrcode"><img src="{url}{target}"></div>'

## get list of arxiv paper candidates

We use the MPIA mitarbeiter (staff) list webpage from mpia.de to get the author names.
We then get all new papers from arXiv and match their authors against that list.

In [3]:
# Fetch the staff list from the MPIA website.
# Identified non-scientists are filtered automatically, see :func:`mpia.filter_non_scientists`
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [entry[1] for entry in mpia_authors]   # initials + fullname

# Today's arXiv listing, plus any manually added references
new_papers = get_new_papers()
add_paper_refs = []
new_papers.extend(list(map(get_paper_from_identifier, add_paper_refs)))

candidates = []
for paperk in new_papers:
    # Compare the author list using initials
    author_initials = list(map(mpia.get_initials, paperk['authors']))
    hl_authors = highlight_authors_in_list(author_initials, normed_mpia_authors, verbose=True)
    # a highlighted author carries a 'mark' tag in its string
    has_match = any('mark' in hl for hl, _ in zip(hl_authors, paperk['authors']))
    # the paper keeps the highlighted author list whether or not it is selected
    paperk['authors'] = hl_authors
    if not has_match:
        continue
    # only papers with at least one matching author are kept
    candidates.append(paperk)

print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))
print("""          {0:,d} with possible author matches""".format(len(candidates)))

S. Li  ->  S. Li  |  ['S. Li']
H. Beuther  ->  H. Beuther  |  ['H. Beuther']
P. Gaikwad  ->  P. Gaikwad  |  ['P. Gaikwad']
Arxiv has 54 new papers today
          2 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [4]:
# Process every candidate paper: fetch its source, parse it and build its markdown.
# documents: list of (paper_id, markdown text) for papers processed successfully
# failed:    list of (paper, reason string) for papers that were rejected or crashed
documents = []
failed = []
for paper in tqdm(candidates):
    # normalized identifier: lower case, without the 'arxiv:' prefix
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    
    folder = f'tmp_{paper_id}'

    try:
        # fetch the source only when the folder is not already on disk
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            # `validation` is handed to the parser; it raises AffiliationError
            # when the affiliation check fails
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            # NOTE(review): `msg` is built but never used in this cell -- possibly
            # meant for an AffiliationWarning; confirm intent.
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well:
        # if the parsed author count differs from the arXiv listing, use the
        # (already highlighted) arXiv author list instead
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                normed_mpia_authors, verbose=True)
        # fall back to the arXiv abstract when none was parsed
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        # comment line: arXiv badge + appearance date (+ arXiv comments if any)
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations (best effort: on failure the error is printed and
        # the markdown is kept without replaced citations)
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        # any other failure: warn and record the paper as failed, then move on
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/2 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2403.07058


extracting tarball to tmp_2403.07058... done.


S. Li  ->  S. Li  |  ['S. Li']
H. Beuther  ->  H. Beuther  |  ['H. Beuther']


Found 84 bibliographic references in tmp_2403.07058/arxiv.bbl.
Retrieving document from  https://arxiv.org/e-print/2403.07498


extracting tarball to tmp_2403.07498...

 done.


P. Gaikwad  ->  P. Gaikwad  |  ['P. Gaikwad']


Found 74 bibliographic references in tmp_2403.07498/main.bbl.
syntax error in line 19: unbalanced braces


### Export the logs

Throughout, we also keep track of the logs per paper. See `_build/html/logs/log-{today's date}.md`.

In [5]:
import datetime

today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"

# Make sure the log directory exists; open() below fails otherwise.
os.makedirs(os.path.dirname(logfile), exist_ok=True)

with open(logfile, 'w') as logs:
    # Header and summary counts
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))

    # Successful papers
    logs.write("## Successful papers\n\n")
    display(Markdown("## Successful papers"))
    # `documents` stores ids normalized as lower case without the 'arxiv:' prefix;
    # normalize the candidate identifier the same way before comparing
    success = {k[0] for k in documents}
    for candid in candidates:
        if candid['identifier'].lower().replace('arxiv:', '') in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    # Failed papers, grouped by reason (sorting puts equal reasons next to each other)
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        # affiliation rejections are shown in green, anything else in red
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        # write one sub-heading per distinct reason
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # every failure is both logged and displayed
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2403.07058-b31b1b.svg)](https://arxiv.org/abs/arXiv:2403.07058) | **The ALMA Survey of 70 $μ$m Dark High-mass Clumps in Early Stages  (ASHES). XI. Statistical Study of Early Fragmentation**  |
|| K. Morii, et al. -- incl., <mark>S. Li</mark>, <mark>H. Beuther</mark> |
|*Appeared on*| *2024-03-13*|
|*Comments*| *Accepted for Publication in ApJ. 19 pages, 7 figures, 4 tables*|
|**Abstract**| Fragmentation during the early stages of high-mass star formation is crucial for understanding the formation of high-mass clusters. We investigated fragmentation within thirty-nine high-mass star-forming clumps as part of the Atacama Large Millimeter/submillimeter Array (ALMA) Survey of 70 $\mu$m Dark High-mass Clumps in Early Stages (ASHES). Considering projection effects, we have estimated core separations for 839 cores identified from the continuum emission and found mean values between 0.08 and 0.32 pc within each clump. We find compatibility of the observed core separations and masses with the thermal Jeans length and mass, respectively. We also present sub-clump structures revealed by the 7 m-array continuum emission. Comparison of the Jeans parameters using clump and sub-clump densities with the separation and masses of gravitationally bound cores suggests that they can be explained by clump fragmentation, implying the simultaneous formation of sub-clumps and cores within rather than a step-by-step hierarchical fragmentation. The number of cores in each clump positively correlates with the clump surface density and the number expected from the thermal Jeans fragmentation. We also find that the higher the fraction of protostellar cores, the larger the dynamic range of the core mass, implying that the cores are growing in mass as the clump evolves. The ASHES sample exhibits various fragmentation patterns: aligned, scattered, clustered, and sub-clustered. Using the Q-parameter, which can help to distinguish between centrally condensed and subclustered spatial core distributions, we finally find that in the early evolutionary stages of high-mass star formation, cores tend to follow a subclustered distribution. |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2403.07498-b31b1b.svg)](https://arxiv.org/abs/arXiv:2403.07498) | **FLAME: Fitting Lyα Absorption lines using Machine learning**  |
|| P. Jalan, V. Khaire, M. Vivek, <mark>P. Gaikwad</mark> |
|*Appeared on*| *2024-03-13*|
|*Comments*| *Submitted to A&A*|
|**Abstract**| We introduce FLAME, a machine learning algorithm designed to fit Voigt profiles to HI Lyman-alpha (Ly$\alpha$) absorption lines using deep convolutional neural networks. FLAME integrates two algorithms: the first determines the number of components required to fit Ly$\alpha$ absorption lines, and the second calculates the Doppler parameter $b$, the HI column density N$_{\rm HI}$, and the velocity separation of individual components. For the current version of FLAME, we trained it on low-redshift Ly$\alpha$ forests observed with the Far Ultraviolet gratings of the Cosmic Origin Spectrograph (COS) aboard the Hubble Space Telescope (HST). Drawing on this data, we trained FLAME on $\sim$ $10^6$ simulated Voigt profiles, forward-modeled to Ly$\alpha$ absorption lines observed with HST-COS, to classify lines as either single or double components and then determine Voigt profile fitting parameters. FLAME shows impressive accuracy on the simulated data by identifying more than 98% (90%) of single (double) component lines. It determines $b$ values within $\approx \pm{8}~(15)$ km s$^{-1}$ and log $N_{\rm HI}/ {\rm cm}^2$ values within $\approx \pm 0.3~(0.8)$ for 90% of the single (double) component lines. However, when applied to real data, FLAME's component classification accuracy drops by $\sim$ 10%. Despite this, there is a reasonable agreement between the $b$ and N$_{\rm HI}$ distributions obtained from traditional Voigt profile fitting methods and FLAME's predictions. Our mock HST-COS data analysis, designed to emulate real data parameters, demonstrated that FLAME could achieve consistent accuracy comparable to its performance with simulated data. This finding suggests that the drop in FLAME's accuracy when used on real data primarily arises from the difficulty of replicating the full complexity of real data in the training sample. |

## Failed papers

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images

    Figure paths are collected from markdown links of the form ``[Fig...](path)``
    and from ``<img src="path" ... />`` tags. Every path that does not contain
    ``http`` is copied into ``directory`` with its relative sub-directories kept.
    The markdown text is then written to ``directory/md_fname``.

    :param md: markdown content to export
    :param md_fname: name of the markdown file to create inside ``directory``
    :param directory: output directory (created, including parents, if missing)
    :raises RuntimeError: if ``directory`` exists but is not a directory
    """
    import os
    import shutil
    import re

    if os.path.exists(directory) and not os.path.isdir(directory):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if not os.path.exists(directory):
        print(f"creating directory {directory:s}")
        # makedirs (not mkdir) so that nested output paths such as
        # '_build/html/' work even when the parent does not exist yet
        os.makedirs(directory, exist_ok=True)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        os.makedirs(destdir, exist_ok=True)
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    # list every referenced figure (online ones included, as before)
    for fk in fig_fnames:
        print("    + " + os.path.join(directory, fk))

In [7]:
# Write each generated summary (and the figures it references) into the HTML build tree
for paper_id, md in documents:
    md_name = f"{paper_id:s}.md"
    export_markdown_summary(md, md_name, '_build/html/')

exported in  _build/html/2403.07058.md
    + _build/html/tmp_2403.07058/./fig_cl_subcl_core.png
    + _build/html/tmp_2403.07058/./G2428_12m7m_25105_class.png
exported in  _build/html/2403.07498.md
    + _build/html/tmp_2403.07498/./Fig/CNN_btrue_bpred_2p7M_COS_p005.png
    + _build/html/tmp_2403.07498/./Fig/CNN_Ntrue_Npred_2p7M_COS_p005.png
    + _build/html/tmp_2403.07498/./Fig/bmlcnn_2p7M_COS_p005.png
    + _build/html/tmp_2403.07498/./Fig/Nmlcnn_2p7M_COS_p005.png
    + _build/html/tmp_2403.07498/./Fig/b1_histo_allstudy_kde.png
    + _build/html/tmp_2403.07498/./Fig/N1_histo_allstudy_kde.png
    + _build/html/tmp_2403.07498/./Fig/confusion_matrix_Classification_test_roll_1M_final8a_p0.5.png


## Display the papers

Not necessary but allows for a quick check.

In [8]:
# Render every generated summary inline (second item of each entry is the markdown text)
for entry in documents:
    display(Markdown(entry[1]))

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\xmark}{\ding{55}}$
$\newcommand{\red}[1]{\textcolor{red}{#1}}$
$\newcommand{\kms}{\mbox{km s^{-1}}}$
$\newcommand{\x}{\mbox{\times}}$
$\newcommand{\Msun}{\mbox{M_{\odot}}}$
$\newcommand{\Lsun}{\mbox{L_{\odot}}}$</div>



<div id="title">

# The ALMA Survey of 70 $\mu$m Dark High-mass Clumps in Early Stages (ASHES). XI. \\Statistical Study of Early Fragmentation

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2403.07058-b31b1b.svg)](https://arxiv.org/abs/2403.07058)<mark>Appeared on: 2024-03-13</mark> -  _Accepted for Publication in ApJ. 19 pages, 7 figures, 4 tables_

</div>
<div id="authors">

K. Morii, et al. -- incl., <mark>S. Li</mark>, <mark>H. Beuther</mark>

</div>
<div id="abstract">

**Abstract:** Fragmentation during the early stages of high-mass star formation is crucial for understanding the formation of high-mass clusters.We investigated fragmentation within thirty-nine high-mass star-forming clumps as part of the Atacama Large Millimeter/submillimeter Array (ALMA) Survey of 70 $\mu$ m Dark High-mass Clumps in Early Stages (ASHES).Considering projection effects, we have estimated core separations for 839 cores identified from the continuum emission and found mean values between 0.08 and 0.32 pc within each clump.We find compatibility of the observed core separations and masses with the thermal Jeans length and mass, respectively.We also present sub-clump structures revealed by the 7 m-array continuum emission.Comparison of the Jeans parameters using clump and sub-clump densities with the separation and masses of gravitationally bound cores suggests that they can be explained by clump fragmentation, implying the simultaneous formation of sub-clumps and cores within rather than a step-by-step hierarchical fragmentation.The number of cores in each clump positively correlates with the clump surface density and the number expected from the thermal Jeans fragmentation.We also find that the higher the fraction of protostellar cores, the larger the dynamic range of the core mass, implying that the cores are growing in mass as the clump evolves.The ASHES sample exhibits various fragmentation patterns: aligned, scattered, clustered, and sub-clustered.Using the $\mathcal{Q}$ -parameter, which can help to distinguish between centrally condensed and subclustered spatial core distributions, we finally find that in the early evolutionary stages of high-mass star formation, cores tend to follow a subclustered distribution.

</div>

<div id="div_fig1">

<img src="tmp_2403.07058/./fig_cl_subcl_core.png" alt="Fig1" width="100%"/>

**Figure 1. -** Continuum images for G024.524-00.139 obtained by (left) a single-dish telescope  ([Schuller, Menten and Contreras 2009]()) , and the gray circles correspond to sub-clumps.
    The three different colors of the crosses indicate the gravitational states of cores; bound cores (red), unbound cores (blue), and cores without detections of dense gas tracers such as $N_2$D$^+$ and DCO$^+$(gray).
    The blue line shows thermal Jeans fragmentation with $T$ = 15 K and $n(\mathrm{H_2})$ = [10$^2$, 10$^6$] cm$^{-3}$, and the blue shaded region corresponds to the same density range but with T = [10, 30] K.
    The five squares correspond to $n(\mathrm{H_2})$ of 10$^2$, 10$^3$, 10$^4$, 10$^5$, and 10$^6$ cm$^{-3}$ from right to left.
    The green-shaded region shows turbulent Jeans fragmentation to the same density and temperature range but $\sigma$ = [0.8, 2.7] km s$^{-1}$. (*fig:M-nns*)

</div>
<div id="div_fig3">

<img src="tmp_2403.07058/./G2428_12m7m_25105_class.png" alt="Fig6" width="100%"/>

**Figure 6. -** ALMA 1.3 mm continuum image of (left) G024.010+00.489  and (right) G028.273--00.167. The circle size represents the core mass, and the position is centered at the continuum peak of each core. The three different colors have meanings the same as Figure \ref{fig:M-nns}.  (*fig:cont_massdyn*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2403.07058"></div>

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\lya}{Ly\alpha }$
$\newcommand{\lyb}{Ly\beta }$
$\newcommand{\h2}{H_2}$
$\newcommand{\hi}{H{\sc i}}$
$\newcommand{\ovi}{O~{\sc vi}~ }$
$\newcommand{\ovii}{O~{\sc vii}~ }$
$\newcommand{\oviii}{O~{\sc viii}~ }$
$\newcommand{\nv}{N~{\sc v}~}$
$\newcommand{\niii}{N~{\sc iii}\lambda989~ }$
$\newcommand{\ovia}{O~{\sc vi}\lambda1031~ }$
$\newcommand{\ovib}{O~{\sc vi}\lambda1037~ }$
$\newcommand{\oviab}{O~{\sc vi}\lambda\lambda1031,1037~ }$
$\newcommand{\cii}{C~{\sc ii}~}$
$\newcommand{\ciis}{C~{\sc ii*}~}$
$\newcommand{\mgii}{Mg~{\sc ii}~}$
$\newcommand{\civ}{C~{\sc iv}~}$
$\newcommand{\civa}{C~{\sc iv}\lambda1548~}$
$\newcommand{\tcr}{\textcolor{red}}$
$\newcommand{\tcg}{\textcolor{brown}}$
$\newcommand{\kms}{km s^{-1}~}$
$\newcommand{\cms}{cm^2~}$</div>



<div id="title">

# FLAME: Fitting $\lya$ Absorption lines using Machine learning

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2403.07498-b31b1b.svg)](https://arxiv.org/abs/2403.07498)<mark>Appeared on: 2024-03-13</mark> -  _Submitted to A&A_

</div>
<div id="authors">

P. Jalan, V. Khaire, M. Vivek, <mark>P. Gaikwad</mark>

</div>
<div id="abstract">

**Abstract:** We introduce FLAME, a machine learning algorithm designed to fit Voigt profiles to H ${\sc i}$ Lyman-alpha ( $\lya$ ) absorption lines using deep convolutional neural networks. FLAME integrates two algorithms: the first determines the number of components required to fit $\lya$ absorption lines, and the second calculates the Doppler parameter $b$ , the H ${\sc i}$ column density N $_{\rm HI}$ , and the velocity separation of individual components. For the current version of FLAME, we trained it on low-redshift $\lya$ forests observed with the Far Ultraviolet gratings of the Cosmic Origin Spectrograph (COS) aboard the Hubble Space Telescope (HST). Drawing on this data, we trained FLAME on $\sim$ $10^6$ simulated Voigt profiles, forward-modeled to mimic $\lya$ absorption lines observed with HST-COS, to classify lines as either single or double components and then determine Voigt profile fitting parameters.FLAME shows impressive accuracy on the simulated data by identifying more than 98 \% (90 \% ) of single (double) component lines. It determines $b$ values within $\approx \pm{8} (15)$ km s $^{-1}$ and log $N_{\rm HI}/ {\rm cm}^2$ values within $\approx \pm 0.3 (0.8)$ for 90 \% of the single (double) component lines. However, when applied to real data, FLAME's component classification accuracy drops by $\sim$ 10 \% . Despite this, there is a reasonable agreement between the $b$ and N $_{\rm HI}$ distributions obtained from traditional Voigt profile fitting methods and FLAME's predictions. Our mock HST-COS data analysis, designed to emulate real data parameters, demonstrated that FLAME could achieve consistent accuracy comparable to its performance with simulated data. This finding suggests that the drop in FLAME's accuracy when used on real data primarily arises from the difficulty of replicating the full complexity of real data in the training sample. 
Nevertheless, FLAME's performance validates the use of machine learning for Voigt profile fitting, underscoring the significant potential of machine learning for detailed analysis of absorption lines.

</div>

<div id="div_fig1">

<img src="tmp_2403.07498/./Fig/CNN_btrue_bpred_2p7M_COS_p005.png" alt="Fig4.1" width="25%"/><img src="tmp_2403.07498/./Fig/CNN_Ntrue_Npred_2p7M_COS_p005.png" alt="Fig4.2" width="25%"/><img src="tmp_2403.07498/./Fig/bmlcnn_2p7M_COS_p005.png" alt="Fig4.3" width="25%"/><img src="tmp_2403.07498/./Fig/Nmlcnn_2p7M_COS_p005.png" alt="Fig4.4" width="25%"/>

**Figure 4. -** The upper panel compares the actual and predicted values for the two parameters, $b$ and N. The lower panel exhibits the distribution of the differences between the predicted and true values, with markers for the 90\% and 68\% percentiles. (*fig-result2*)

</div>
<div id="div_fig2">

<img src="tmp_2403.07498/./Fig/b1_histo_allstudy_kde.png" alt="Fig15.1" width="50%"/><img src="tmp_2403.07498/./Fig/N1_histo_allstudy_kde.png" alt="Fig15.2" width="50%"/>

**Figure 15. -** The histograms show the distribution of $b$ and log N for a single absorption line estimated by the CNN model in this work (green) overplotted with distributions from \citepalias{Danforth2016}(purple) and  \citetalias{Prakash2017}(red). The distributions for the corresponding mocks are shown in dotted lines. (*fig-result2_real1*)

</div>
<div id="div_fig3">

<img src="tmp_2403.07498/./Fig/confusion_matrix_Classification_test_roll_1M_final8a_p0.5.png" alt="Fig2" width="100%"/>

**Figure 2. -** Confusion matrix for the predictions for the number of absorption lines using the CNN (as shown in Fig. \ref{fig-classify_network}). The CNN was trained on 1.6 million lines and tested on 400K samples, with an equal number of single and double lines. We find the Sensitivity=97.47\%, Specificity=89.92\%, Precision=89.64\% and Negative Predictive Value=97.55\%. (*fig-confusion_matrix*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2403.07498"></div>

# Create HTML index

In [9]:
# Import the module itself: binding the global name `datetime` to the class would
# break code that relies on `datetime.date` / `datetime.timedelta` (get_last_n_days).
import datetime
from datetime import timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
# Timezone-aware UTC on both sides of the subtraction. Previously a naive local
# "now" was compared with a naive UTC timestamp, shifting the age by the UTC offset.
now = datetime.datetime.now(timezone.utc)
res = []
for fk in files:
    # NOTE(review): st_ctime is not the content-modification time on every
    # platform -- confirm whether st_mtime was intended.
    stat_result = os.stat(fk).st_ctime
    modified = datetime.datetime.fromtimestamp(stat_result, tz=timezone.utc)
    delta = now - modified
    if delta <= timedelta(days=days):
        # total_seconds() keeps whole days, unlike the `.seconds` attribute
        res.append((delta.total_seconds(), fk))
# newest first, ordering by file name (reverse alphabetical)
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")

386  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days

    :param lst: iterable of ``(fname, date)`` pairs, where ``date`` is an ISO
                formatted string (``YYYY-MM-DD``) so that string comparison
                follows chronological order
    :param days: how many days back from today are included
    :returns: generator over the matching ``fname``, most recent date first
    """
    # Imported locally so the function still works when another cell has rebound
    # the global name `datetime` to the `datetime.datetime` class.
    import datetime as _dt

    # the cutoff does not change during the loop: compute it once
    cutoff = str(_dt.date.today() - _dt.timedelta(days=days))
    for fname, date in sorted(lst, key=lambda x: x[1], reverse=True):
        if date >= cutoff:
            yield fname

def extract_appearance_dates(lst_file):
    """ Extract the appearance date recorded in each exported document

    Each file is scanned for its first line containing ``Appeared on``; the
    date is the text between ``Appeared on:`` and ``</mark>`` on that line.
    Files without such a line are left out of the result.

    :param lst_file: iterable of file names to read
    :returns: list of ``(fname, date string)`` pairs, in input order
    """
    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    dates = []
    # iterate over the argument (the previous code read the global `lst`)
    for fname in lst_file:
        with open(fname, 'r') as f:
            for line in f:
                if "Appeared on" in line:
                    dates.append((fname, get_date(line)))
                    # only the first occurrence per file counts
                    break
    return dates

from glob import glob
# Read the appearance date of every exported summary, then keep the last week
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = [fname for fname in get_last_n_days(dates, days)]
npub = len(res)
print(npub, f" publications in the last {days:d} days.")

9  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Build the HTML of a carousel container holding `npub` slide cells

    Cells are numbered from 1; each wraps a ``<div id="slide{k}">`` placeholder.
    """
    cell_template = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    opening = [
        """  <div class="carousel" """,
        """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">""",
    ]
    cells = [cell_template.format(k=index) for index in range(1, npub + 1)]
    closing = ["  </div>"]
    return '\n'.join(opening + cells + closing)

def create_grid(npub=4):
    """ Build the HTML of a flat grid container holding `npub` slide items

    Items are numbered from 1; each wraps a ``<div id="slide{k}">`` placeholder.
    """
    item_template = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    items = [item_template.format(k=index) for index in range(1, npub + 1)]
    return '\n'.join(["""  <div class="grid"> """] + items + ["  </div>"])

In [12]:
# Fill the daily template with the documents selected for the 7-day archive
carousel = create_carousel(npub)
docs = ', '.join('"{0:s}"'.format(path.split('/')[-1]) for path in res)
slides = ', '.join(f'"slide{idx}"' for idx in range(1, npub + 1))

with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# substitute the template placeholders one after the other
for placeholder, value in (
        ("{%-- carousel:s --%}", carousel),
        ("{%-- suptitle:s --%}", "7-day archives"),
        ("{%-- docs:s --%}", docs),
        ("{%-- slides:s --%}", slides)):
    page = page.replace(placeholder, value)

with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [13]:
# Same page as the 7-day archive, restricted to the last day
days = 1
res = [fname for fname in get_last_n_days(dates, days)]
npub = len(res)
print(npub, f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join('"{0:s}"'.format(path.split('/')[-1]) for path in res)
slides = ', '.join(f'"slide{idx}"' for idx in range(1, npub + 1))

with open("daily_template.html", "r") as tpl:
    page = tpl.read()

# substitute the template placeholders one after the other
for placeholder, value in (
        ("{%-- carousel:s --%}", carousel),
        ("{%-- suptitle:s --%}", "Daily"),
        ("{%-- docs:s --%}", docs),
        ("{%-- slides:s --%}", slides)):
    page = page.replace(placeholder, value)

with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

2  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

max_pub = 6   # upper bound on the number of papers shown in the grid
# most recent appearance dates first; reversed(sorted(...)) keeps the original tie order
res = [k[0] for k in islice(reversed(sorted(dates, key=lambda x: x[1])), max_pub)]
# Use the number of documents actually found, so that the grid slots, the slide
# ids and the document list always have the same length (also with < max_pub files).
npub = len(res)
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
