# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    pass

class AffiliationError(RuntimeError):
    pass

def validation(source: str):
    """Raises error paper during parsing of source file
    
    Allows checks before parsing TeX code.
    
    Raises AffiliationWarning
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        raise AffiliationError("mpia.affiliation_verifications: " + check)

        
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper: Arxiv paper
    :returns: markdown text
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    txt = f"""<img src={url}"https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names
We then get all new papers from Arxiv and match authors

In [3]:
# deal with the author list and edge cases of people that cannot be consistent on their name  

def filter_non_scientists(name: str) -> bool:
    """ Loose filter on expected authorships

    removing IT, administration, technical staff
    :param name: name
    :returns: False if name is not a scientist
    """
    remove_list = ['Wolf', 'Licht', 'Binroth', 'Witzel', 'Jordan',
                   'Zähringer', 'Scheerer', 'Hoffmann', 'Düe',
                   'Hellmich', 'Enkler-Scharpegge', 'Witte-Nguy',
                   'Dehen', 'Beckmann', 'Jager', 'Jäger'
                  ]

    for k in remove_list:
        if k in name:
            return False
    return True

def add_author_to_list(author_list: list) -> list:
    """ Add author to list if not already in list
    
    :param author: author name
    :param author_list: list of authors
    :returns: updated list of authors
    """
    add_list = ['T. Henning']

    for author in add_list:
        if author not in author_list:
            author_list.append(author)
    return author_list

# get list from MPIA website
# filter for non-scientists (mpia.get_mpia_mitarbeiter_list() does some filtering)
mpia_authors = [k[1] for k in mpia.get_mpia_mitarbeiter_list() if filter_non_scientists(k[1])]
# add some missing author because of inconsistencies in their MPIA name and author name on papers
mpia_authors = add_author_to_list(mpia_authors)

In [4]:
new_papers = get_new_papers()
# add manual references
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [mpia.get_initials(k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, mpia_authors, verbose=True)
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

J. Li  ->  J. Li  |  ['J. Li']
J. Lian  ->  J. Lian  |  ['J. Lian']
S. Savvidou  ->  S. Savvidou  |  ['S. Savvidou']


Arxiv has 59 new papers today
          5 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [5]:
documents = []
failed = []
for paper in tqdm(candidates):
    # debug crap
    paper['identifier'] = paper['identifier'].lower().replace('arxiv:', '').replace(r'\n', '').strip()
    paper_id = paper['identifier']
    
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                mpia_authors, verbose=True)
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/5 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2406.05198


extracting tarball to tmp_2406.05198...

 done.
Retrieving document from  https://arxiv.org/e-print/2406.05604


extracting tarball to tmp_2406.05604... done.
Retrieving document from  https://arxiv.org/e-print/2406.06219


extracting tarball to tmp_2406.06219... done.


S. Savvidou  ->  S. Savvidou  |  ['S. Savvidou']






Found 50 bibliographic references in tmp_2406.06219/main.bbl.
Retrieving document from  https://arxiv.org/e-print/2406.06315


extracting tarball to tmp_2406.06315...

 done.
Retrieving document from  https://arxiv.org/e-print/2406.06319



  exec(code_obj, self.user_global_ns, self.user_ns)
'PosixPath' object is not subscriptable


extracting tarball to tmp_2406.06319...

 done.



  exec(code_obj, self.user_global_ns, self.user_ns)
'PosixPath' object is not subscriptable


### Export the logs

Throughout, we also keep track of the logs per paper. see `logs-{today date}.md` 

In [6]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"


with open(logfile, 'w') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Sucessful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2406.06219-b31b1b.svg)](https://arxiv.org/abs/2406.06219) | **Mind the gap: Distinguishing disc substructures and their impact on the composition of the inner disc**  |
|| J. Mah, <mark>S. Savvidou</mark>, B. Bitsch |
|*Appeared on*| *2024-06-11*|
|*Comments*| *13 pages, 9 figures, accepted for publication in A&A*|
|**Abstract**|            Improved observational technologies have enabled the resolution of substructures and the measurement of chemical abundances in protoplanetary discs. Understanding the chemical composition of the inner disc allows us to infer the building blocks available for planet formation. Recently, the depletion of water in the inner disc has been suggested to be linked to the presence of substructures like gaps and rings further out in the disc. We investigate this hypothesis further by running 1D semi-analytical models of a protoplanetary disc with a gap to understand the combined effects of disc viscosity, gap depth, gap location and gap formation time on the composition of the inner disc. Our results show that for a specific value of disc viscosity, the simulation outcome can be classified into three regimes: shallow gap, 'traffic jam', and deep gap. While deep gaps may already be distinguishable with moderate resolution, shallow gaps remains a challenge to resolve with current capabilities. On the other hand, discs with traffic jams have a higher chance of being resolved when observed with high resolution but may appear as an intensity enhancement or even featureless when observed with moderate to low angular resolution. In this regard, information on the inner disc composition is useful because it can help to infer the existence of traffic jams or distinguish them from deep gaps: Discs with deep gaps are expected to have a low water content and thus high C/O ratio in the inner disc due to the effective blocking of pebbles, discs with shallow gaps would show the opposite trend, and discs with traffic jam would have a constant -- albeit low -- inward flux of water-rich pebbles resulting in a moderate water content and sub-stellar C/O ratios. Finally, we find that the effectiveness of gaps as pebble barriers diminishes quickly when they form late, as most of the pebbles already drifted inwards.         |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2406.05604-b31b1b.svg)](https://arxiv.org/abs/2406.05604) | **The size of the Milky Way galaxy**  |
|| <mark>J. Lian</mark>, et al. |
|*Appeared on*| *2024-06-11*|
|*Comments*| *30 pages, 4figures*|
|**Abstract**|            The size of a galaxy is one of the fundamental parameters that reflects its growth and assembly history. Traditionally, the size of the Milky Way has been characterized by the scale length of the disk, based on the assumption of an exponential density profile. Earlier scale length measurements suggest the Milky Way is an overly compact galaxy, compared to similar galaxies of its mass. These size measurements, however, ignore the presence of the bulge, and the assumption of a single-exponential disk profile faces growing challenges from the recent observations. The half-light radius is an alternative size measurement that is independent of the galaxy density profile and has been widely used to quantify the size of external galaxies. Here we report the half-light radius of the Milky Way, derived from a new measurement of the age-resolved Galactic surface brightness profile in an unprecedentedly wide radial range from ${\rm R=0}$ to 17~kpc. We find a broken surface brightness profile with a nearly flat distribution between 3.5 and 7.5 kpc, which results in a half-light radius of 5.75$\pm$0.38 kpc, significantly larger than the scale-length inferred from the canonical single-exponential disk profile but in good consistency with local disk galaxies of similar mass. Because our density profile can be decomposed by stellar age and extrapolated backwards in time, we can also confirm that the size history of the Milky Way is broadly consistent with high-redshift galaxies but with systematically smaller size at each look back time. Our results suggest that the Milky Way is a typical disk galaxy regarding its size and has likely experienced inefficient secular size growth.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: '69117' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2406.05198-b31b1b.svg)](https://arxiv.org/abs/2406.05198) | **On the Interpretation of Mid-Infrared Absorption Lines of Gas-Phase H$_2$O as Observed by JWST/MIRI**  |
|| <mark>J. Li</mark>, A. Boogert, A. G. G. M. Tielens |
|*Appeared on*| *2024-06-11*|
|*Comments*| *Accepted for publication in ApJS. 26 pages, 23 figures. Comments are more than welcome!*|
|**Abstract**|            Ro-vibrational absorption lines of H$_2$O in the 5-8 $\mu$m wavelength range selectively probe gas against the mid-infrared continuum emitting background of the inner regions of YSOs and AGN and deliver important information about these warm, dust-obscured environments. JWST/MIRI detects these lines in many lines of sight at a moderate spectral resolving power of $R\sim3500$ (FWHM of 85 km/s). Based on our analysis of high-resolution SOFIA/EXES observations, we find that the interpretation of JWST/MIRI absorption spectra can be severely hampered by the blending of individual transitions and the lost information on the intrinsic line width or the partial coverage of the background continuum source. In this paper, we point out problems such as degeneracy that arise in deriving physical properties from an insufficiently resolved spectrum. This can lead to differences in the column density by two orders of magnitude. We emphasize the importance of weighting optically thin and weak lines in spectral analyses and provide recipes for breaking down the coupled parameters. We also provide an online tool to generate the H$_2$O absorption line spectra that can be compared to observations.         |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2406.06315-b31b1b.svg)](https://arxiv.org/abs/2406.06315) | **Inference of the Mass Composition of Cosmic Rays with energies from $\mathbf{10^{18.5}}$ to $\mathbf{10^{20}}$ eV using the Pierre Auger Observatory and Deep Learning**  |
|| P. A. Collaboration, et al. |
|*Appeared on*| *2024-06-11*|
|*Comments*| *submitted to Phys. Rev. Lett., 10 pages, 3 figures, 1 table*|
|**Abstract**|            We present measurements of the atmospheric depth of the shower maximum $X_\mathrm{max}$, inferred for the first time on an event-by-event level using the Surface Detector of the Pierre Auger Observatory. Using deep learning, we were able to extend measurements of the $X_\mathrm{max}$ distributions up to energies of 100 EeV ($10^{20}$ eV), not yet revealed by current measurements, providing new insights into the mass composition of cosmic rays at extreme energies. Gaining a 10-fold increase in statistics compared to the Fluorescence Detector data, we find evidence that the rate of change of the average $X_\mathrm{max}$ with the logarithm of energy features three breaks at $6.5\pm0.6~(\mathrm{stat})\pm1~(\mathrm{sys})$ EeV, $11\pm 2~(\mathrm{stat})\pm1~(\mathrm{sys})$ EeV, and $31\pm5~(\mathrm{stat})\pm3~(\mathrm{sys})$ EeV, in the vicinity to the three prominent features (ankle, instep, suppression) of the cosmic-ray flux. The energy evolution of the mean and standard deviation of the measured $X_\mathrm{max}$ distributions indicates that the mass composition becomes increasingly heavier and purer, thus being incompatible with a large fraction of light nuclei between 50 EeV and 100 EeV.         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error 'PosixPath' object is not subscriptable</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-2406.06319-b31b1b.svg)](https://arxiv.org/abs/2406.06319) | **Measurement of the Depth of Maximum of Air-Shower Profiles with energies between $\mathbf{10^{18.5}}$ and $\mathbf{10^{20}}$ eV using the Surface Detector of the Pierre Auger Observatory and Deep Learning**  |
|| P. A. Collaboration, et al. |
|*Appeared on*| *2024-06-11*|
|*Comments*| *submitted to Phys. Rev. D, 28 pages, 18 figures, 5 tables*|
|**Abstract**|            We report an investigation of the mass composition of cosmic rays with energies from 3 to 100 EeV (1 EeV=$10^{18}$ eV) using the distributions of the depth of shower maximum $X_\mathrm{max}$. The analysis relies on ${\sim}50,000$ events recorded by the Surface Detector of the Pierre Auger Observatory and a deep-learning-based reconstruction algorithm. Above energies of 5 EeV, the data set offers a 10-fold increase in statistics with respect to fluorescence measurements at the Observatory. After cross-calibration using the Fluorescence Detector, this enables the first measurement of the evolution of the mean and the standard deviation of the $X_\mathrm{max}$ distributions up to 100 EeV. Our findings are threefold: (1.) The evolution of the mean logarithmic mass towards a heavier composition with increasing energy can be confirmed and is extended to 100 EeV. (2.) The evolution of the fluctuations of $X_\mathrm{max}$ towards a heavier and purer composition with increasing energy can be confirmed with high statistics. We report a rather heavy composition and small fluctuations in $X_\mathrm{max}$ at the highest energies. (3.) We find indications for a characteristic structure beyond a constant change in the mean logarithmic mass, featuring three breaks that are observed in proximity to the ankle, instep, and suppression features in the energy spectrum.         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error 'PosixPath' object is not subscriptable</p> |

## Export documents

We now write the .md files and export relevant images

In [7]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images"""
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        os.mkdir(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        try:
            os.makedirs(destdir)
        except FileExistsError:
            pass
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    [print("    + " + os.path.join(directory,fk)) for fk in fig_fnames]

In [8]:
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

exported in  _build/html/2406.06219.md
    + _build/html/tmp_2406.06219/./bump-10au_uf5_3e-4.png
    + _build/html/tmp_2406.06219/./Intensity_plot_traffic_1Myr.png
    + _build/html/tmp_2406.06219/./bump-10au_uf5_water.png


## Display the papers

Not necessary but allows for a quick check.

In [9]:
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$</div>



<div id="title">

# Mind the gap: Distinguishing disc substructures and their impact on the composition of the inner disc

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2406.06219-b31b1b.svg)](https://arxiv.org/abs/2406.06219)<mark>Appeared on: 2024-06-11</mark> -  _13 pages, 9 figures, accepted for publication in A&A_

</div>
<div id="authors">

J. Mah, <mark>S. Savvidou</mark>, B. Bitsch

</div>
<div id="abstract">

**Abstract:** Improved observational technologies have enabled the resolution of substructures and the measurement of chemical abundances in protoplanetary discs. Understanding the chemical composition of the inner disc allows us to infer the building blocks available for planet formation. Recently, the depletion of water in the inner disc has been suggested to be linked to the presence of substructures like gaps and rings further out in the disc. We investigate this hypothesis further by running 1D semi-analytical models of a protoplanetary disc with a gap to understand the combined effects of disc viscosity, gap depth, gap location and gap formation time on the composition of the inner disc (water abundance, C/O, O/H and C/H ratios). Our results show that for a specific value of disc viscosity, the simulation outcome can be classified into three regimes: shallow gap, 'traffic jam', and deep gap. While deep gaps may already be distinguishable with moderate resolution (FWHM $\sim$ 10 AU), shallow gaps remains a challenge to resolve with current capabilities. On the other hand, discs with traffic jams have a higher chance of being resolved when observed with high resolution (FWHM $\lesssim$ 5 AU) but may appear as an intensity enhancement or even featureless when observed with moderate to low angular resolution (FWHM $\gtrsim$ 10 AU). In this regard, information on the inner disc composition is useful because it can help to infer the existence of traffic jams or distinguish them from deep gaps: Discs with deep gaps are expected to have a low water content and thus high C/O ratio in the inner disc due to the effective blocking of pebbles, discs with shallow gaps would show the opposite trend (water-rich and low C/O ratio), and discs with traffic jam would have a constant -- albeit low -- inward flux of water-rich pebbles resulting in a moderate water content and sub-stellar C/O ratios. Finally, we find that the effectiveness of gaps as pebble barriers diminishes quickly when they form late $(t_{\rm gap} \gtrsim 0.1 {\rm Myr})$ , as most of the pebbles already drifted inwards.

</div>

<div id="div_fig1">

<img src="tmp_2406.06219/./bump-10au_uf5_3e-4.png" alt="Fig1" width="100%"/>

**Figure 1. -** Time evolution of the normalised water vapour abundance, C/O, O/H and C/H abundance ratios at $r = 0.5 {\rm AU}$ as a function of gap depth for a disc with a gap at 10 AU. We plot here the results for disc viscosity $\alpha = 3\times10^{-4}$ and pebble fragmentation velocity $u_{\rm frag} = 5 {\rm ms}^{-1}$. Regimes I, II and III correspond respectively to the scenario of shallow gap, traffic jam, and deep gap. Circles and triangles are selected examples from the three regimes at different time snapshots where we further investigate their observability (see Section \ref{sec:results_observability}). (*fig:bump-10au_3e-4*)

</div>
<div id="div_fig2">

<img src="tmp_2406.06219/./Intensity_plot_traffic_1Myr.png" alt="Fig2" width="100%"/>

**Figure 2. -** Solids surface density (top) and normalised intensity (bottom) as a function of orbital distance at 1 Myr for three gap depths (marked by circles in Fig. \ref{fig:bump-10au_3e-4}). We compare the unconvolved intensity with the ones convolved with three different beams. The gray dashed lines show the evaporation fronts that mainly cause the spikes in the intensity. (*fig:intensity_plot_1Myr*)

</div>
<div id="div_fig3">

<img src="tmp_2406.06219/./bump-10au_uf5_water.png" alt="Fig3" width="100%"/>

**Figure 3. -** Time evolution of the water vapour abundance at $r = 0.5 {\rm AU}$ normalised to the gas disc's initial value at this location (($H_2$O/H)$_{\rm gas,0} = 3.54\times10^{-4}$), as a function of disc viscosity and the time of gap formation, where the insertion time of the gap increases from left to right. The gap is located at 10 AU. (*fig:bump-10au_water*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2406.06219"></div>

# Create HTML index

In [10]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now.today() - modified
    if delta <= timedelta(days=days):
        res.append((delta.seconds, fk))
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

90  publications files modified in the last 7 days.


In [11]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days """
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= str(datetime.date.today() - datetime.timedelta(days=days)):
            yield fname

def extract_appearance_dates(lst_file):
    dates = []

    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    for fname in lst:
        with open(fname, 'r') as f:
            found_date = False
            for line in f:
                if not found_date:
                    if "Appeared on" in line:
                        found_date = True
                        dates.append((fname, get_date(line)))
                else:
                    break
    return dates

from glob import glob
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

8  publications in the last 7 days.


In [12]:
def create_carousel(npub=4):
    """ Generate the HTML code for a carousel with `npub` slides """
    carousel = ["""  <div class="carousel" """,
                """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""
                ]
    
    item_str = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        carousel.append(item_str.format(k=k))
    carousel.append("  </div>")
    return '\n'.join(carousel)

def create_grid(npub=4):
    """ Generate the HTML code for a flat grid with `npub` slides """
    grid = ["""  <div class="grid"> """,
                ]
    
    item_str = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        grid.append(item_str.format(k=k))
    grid.append("  </div>")
    return '\n'.join(grid)

In [13]:
carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "7-day archives" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [14]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "Daily" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(carousel, docs, slides)
# print(page)
with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

3  publications in the last day.


In [15]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

npub = 6
res = [k[0] for k in (islice(reversed(sorted(dates, key=lambda x: x[1])), 6))]
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
