# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Some figures shipped with papers are really big: raise PIL's pixel-count
# limit to one billion pixels.
Image.MAX_IMAGE_PIXELS = 1_000_000_000

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    """Warning category for affiliation-related issues."""

class AffiliationError(RuntimeError):
    """Error raised by :func:`validation` when the affiliation check fails."""

def validation(source: str):
    """Check the affiliations found in a paper source before it is parsed.

    Meant to be given as the ``validation`` callback of
    :class:`latex.LatexDocument`, so that checks run before parsing TeX code.

    :param source: content handed over to :func:`mpia.affiliation_verifications`
    :raises AffiliationError: when :func:`mpia.affiliation_verifications`
        returns anything other than ``True``; the returned value is
        appended to the error message.
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        # f-string formatting keeps this safe even if `check` is not a string
        raise AffiliationError(f"mpia.affiliation_verifications: {check}")

        
# Always show affiliation warnings (do not silence repeated occurrences)
warnings.simplefilter(action='always', category=AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com

    :param paper_id: arXiv identifier of the paper (e.g. ``2401.11826``)
    :returns: HTML snippet (``<div id="qrcode">`` wrapping an ``<img>`` tag)
              that can be embedded in the markdown text
    """
    from urllib.parse import quote

    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    # The encoded page address is passed as a query parameter: percent-encode it
    # and quote the whole `src` attribute (not only the payload) so that the
    # QR code contains the bare URL, without stray quote characters.
    target = quote(f"https://arxiv.org/abs/{paper_id}", safe="")
    return f'<div id="qrcode"><img src="{url}{target}"></div>'

## get list of arxiv paper candidates

We use the staff (Mitarbeiter) list published on the mpia.de website to get the author names.
We then fetch all new papers from arXiv and match their authors against that list.

In [3]:
# Staff list from the MPIA website
# (identified non-scientists are filtered automatically, see :func:`mpia.filter_non_scientists`)
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [entry[1] for entry in mpia_authors]   # initials + fullname

# Today's arXiv listing, optionally extended with manually added references
add_paper_refs = []
new_papers = get_new_papers()
new_papers.extend([get_paper_from_identifier(ref) for ref in add_paper_refs])

candidates = []
for entry in new_papers:
    # Compare the author list, reduced to initials, with the MPIA list
    initials = [mpia.get_initials(name) for name in entry['authors']]
    hl_authors = highlight_authors_in_list(initials, normed_mpia_authors, verbose=True)
    # a highlighted author carries a 'mark' tag
    has_match = any('mark' in hl for hl, _ in zip(hl_authors, entry['authors']))
    entry['authors'] = hl_authors
    # only select paper if an author matched our list
    if has_match:
        candidates.append(entry)

print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))
print("""          {0:,d} with possible author matches""".format(len(candidates)))

X. Zhang  ->  X. Zhang  |  ['X. Zhang']
A. Gould  ->  A. Gould  |  ['A. Gould']
K. Jahnke  ->  K. Jahnke  |  ['K. Jahnke']


R. Zhang  ->  R. Zhang  |  ['R. Zhang']
Arxiv has 92 new papers today
          4 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [4]:
# documents: (paper_id, markdown text) of every paper that was parsed
# failed:    (paper, reason) of every paper that was rejected or crashed
documents = []
failed = []
for paper in tqdm(candidates):
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    folder = f'tmp_{paper_id}'

    try:
        # only retrieve the source when its folder does not exist yet
        if not os.path.isdir(folder):
            folder = retrieve_document_source(paper_id, folder)

        try:
            doc = latex.LatexDocument(folder, validation=validation)
        except AffiliationError as affilerror:
            # the affiliation check failed: record the reason and move on
            failed.append((paper, "affiliation error: " + str(affilerror)))
            continue

        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors],
                normed_mpia_authors, verbose=True)
        # fall back on the arXiv abstract when none was extracted from the source
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']

        # NOTE: the "Appeared on: ...</mark>" text is what the index cells
        # look for when extracting the appearance date of a document
        doc.comment = (get_markdown_badge(paper_id) +
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"

        full_md = doc.generate_markdown_text()

        full_md += get_markdown_qrcode(paper_id)

        # replace citations (best effort: a failure only gets printed)
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)

        documents.append((paper_id, full_md))
    except Exception as e:
        # any other problem: warn, record the reason and continue with the next paper
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/4 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2401.11134
extracting tarball to tmp_2401.11134...

 done.
Retrieving document from  https://arxiv.org/e-print/2401.11329


extracting tarball to tmp_2401.11329...

 done.


Unable to locate Ghostscript on paths


Retrieving document from  https://arxiv.org/e-print/2401.11826


extracting tarball to tmp_2401.11826...

 done.


K. Jahnke  ->  K. Jahnke  |  ['K. Jahnke']


Found 168 bibliographic references in tmp_2401.11826/main.bbl.
Retrieving document from  https://arxiv.org/e-print/2401.12006


extracting tarball to tmp_2401.12006...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper; see `_build/html/logs/log-{today's date}.md`.

In [5]:
import datetime
import os

today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"

# The log directory may not exist yet (e.g. on a fresh checkout)
os.makedirs(os.path.dirname(logfile), exist_ok=True)

# Explicit encoding: titles and abstracts may contain non-ASCII characters
with open(logfile, 'w', encoding='utf-8') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Successful papers\n\n")
    display(Markdown("## Successful papers"))
    success = {k[0] for k in documents}
    for candid in candidates:
        # same identifier normalisation as when `documents` was filled
        if candid['identifier'].lower().replace('arxiv:', '') in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    # group identical reasons together so each one gets a single sub-header
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        # affiliation rejections are expected (green); anything else is a real error (red)
        color = 'green' if 'affiliation' in reason else 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')

        # all failures are displayed here as well as written to the logs
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2401.11826-b31b1b.svg)](https://arxiv.org/abs/arXiv:2401.11826) | **Tracing the rise of supermassive black holes: A panchromatic search for  faint, unobscured quasars at z > 6 with COSMOS-Web and other surveys**  |
|| I. T. Andika, et al. -- incl., <mark>K. Jahnke</mark> |
|*Appeared on*| *2024-01-23*|
|*Comments*| *Submitted to the Astronomy & Astrophysics journal. 25 pages, 10 figures, and 4 tables. We welcome comments from the reader*|
|**Abstract**| We report the identification of 64 new candidates of compact galaxies, potentially hosting faint quasars with bolometric luminosities of $L_\mathrm{bol} = 10^{43}$--10$^{46}$ erg s$^{-1}$, residing in the reionization epoch within the redshift range of $6 \lesssim z \lesssim 8$. These candidates were selected by harnessing the rich multiband datasets provided by the emerging JWST-driven extragalactic surveys, focusing on COSMOS-Web, as well as JADES, UNCOVER, CEERS, and PRIMER. Our search strategy includes two stages: applying stringent photometric cuts to catalog-level data and detailed spectral energy distribution fitting. These techniques effectively isolate the quasar candidates while mitigating contamination from low-redshift interlopers, such as brown dwarfs and nearby galaxies. The selected candidates indicate physical traits compatible with low-luminosity active galactic nuclei, likely hosting $\approx10^5$--$10^7~M_\odot$ supermassive black holes (SMBHs) living in galaxies with stellar masses of $\approx10^8$--$10^{10}~M_\odot$. The SMBHs selected in this study, on average, exhibit elevated mass compared to their hosts, with the mass ratio distribution slightly higher than those of galaxies in the local universe. As with other high-$z$ studies, this is at least in part due to the selection method for these quasars. An extensive Monte Carlo analysis provides compelling evidence that heavy black hole seeds from the direct collapse scenario appear to be the preferred pathway to mature this specific subset of SMBHs by $z\approx7$. This work underscores the significance of further spectroscopic observations, as the quasar candidates presented here offer exceptional opportunities to delve into the nature of the earliest galaxies and SMBHs formed during cosmic infancy. |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2401.11134-b31b1b.svg)](https://arxiv.org/abs/arXiv:2401.11134) | **Detection of Solar-like Oscillations in Sub-giant and Red Giant Stars  Using 2-minute Cadence TESS Data**  |
|| J. Zhou, et al. -- incl., <mark>X. Zhang</mark> |
|*Appeared on*| *2024-01-23*|
|*Comments*| **|
|**Abstract**| Based on all 2-minute cadence $TESS$ light curves from Sector 1 to 60, we provide a catalog of 8,651 solar-like oscillators, including frequency at maximum power ($\nu_{\rm max}$, with its median precision, $\sigma$=5.39\%), large frequency separation ($\Delta\nu$, $\sigma$=6.22\%), seismically derived masses, radii, and surface gravity. In this sample, we have detected 2,173 new oscillators and added 4,373 new $\Delta\nu$ measurements. Our seismic parameters are consistent with those from $Kepler$, $K2$, and previous $TESS$ data. The median fractional residual in $\nu_{\rm max}$ is $1.63\%$ with a scatter of $14.75\%$, and in $\Delta\nu$ it is $0.11\%$ with a scatter of $10.76\%$. We have detected 476 solar-like oscillators with $\nu_{\rm max}$ exceeding the $Nyquist$ frequency of $Kepler$ long-cadence data during the evolutionary phases of sub-giant and the base of the red-giant branch, which provide a valuable resource for understanding angular momentum transport. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2401.12006-b31b1b.svg)](https://arxiv.org/abs/arXiv:2401.12006) | **A comprehensive correction of the Gaia DR3 XP spectra**  |
|| B. Huang, et al. -- incl., <mark>R. Zhang</mark> |
|*Appeared on*| *2024-01-23*|
|*Comments*| *20 pages, 17 figures*|
|**Abstract**| By combining spectra from the CALSPEC and NGSL, as well as spectroscopic data from the LAMOST Data Release 7 (DR7), we have analyzed and corrected the systematic errors of the Gaia DR3 BP/RP (XP) spectra. The errors depend on the normalized spectral energy distribution (simplified by two independent ``colors'') and $G$ magnitude. Our corrections are applicable in the range of approximately $-0.5<BP-RP<2$, $3<G<17.5$ and $E(B-V)<0.8$. To validate our correction, we conduct independent tests by comparing with the MILES and LEMONY spectra. The results demonstrate that the systematic errors of $BP-RP$ and $G$ have been effectively corrected, especially in the near ultraviolet. The consistency between the corrected Gaia XP spectra and the MILES and LEMONY is better than 2 per cent in the wavelength range of $336-400$\,nm and 1 per cent in redder wavelengths. A global absolute calibration is also carried out by comparing the synthetic Gaia photometry from the corrected XP spectra with the corrected Gaia DR3 photometry. Our study opens up new possibilities for using XP spectra in many fields. A Python package is publicly available to do the corrections (https://doi.org/10.12149/101375 or https://github.com/HiromonGON/GaiaXPcorrection). |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2401.11329-b31b1b.svg)](https://arxiv.org/abs/arXiv:2401.11329) | **MOA-2022-BLG-563Lb, KMT-2023-BLG-0469Lb, and KMT-2023-BLG-0735Lb: Three  sub-Jovian-mass microlensing planets**  |
|| C. Han, et al. -- incl., <mark>A. Gould</mark> |
|*Appeared on*| *2024-01-23*|
|*Comments*| *11 pages, 7 tables, 10 figures*|
|**Abstract**| We analyze the anomalies appearing in the light curves of the three microlensing events MOA-2022-BLG-563, KMT-2023-BLG-0469, and KMT-2023-BLG-0735. The anomalies exhibit common short-term dip features that appear near the peak. From the detailed analyses of the light curves, we find that the anomalies were produced by planets accompanied by the lenses of the events. For all three events, the estimated mass ratios between the planet and host are on the order of $10^{-4}$: $q\sim 8 \times 10^{-4}$ for MOA-2022-BLG-563L, $q\sim 2.5\times 10^{-4}$ for KMT-2023-BLG-0469L, and $q\sim 1.9\times 10^{-4}$ for KMT-2023-BLG-0735L. The interpretations of the anomalies are subject to a common inner-outer degeneracy, which causes ambiguity when estimating the projected planet-host separation. We estimated the planet mass, $M_{\rm p}$, host mass, $M_{\rm h}$, and distance, $D_{\rm L}$, to the planetary system by conducting Bayesian analyses using the observables of the events. The estimated physical parameters of the planetary systems are $(M_{\rm h}/M_\odot, M_{\rm p}/M_{\rm J}, D_{\rm L}/{\rm kpc}) = (0.48^{+0.36}_{-0.30}, 0.40^{+0.31}_{-0.25}, 6.53^{+1.12}_{-1.57})$ for MOA-2022-BLG-563L, $(0.47^{+0.35}_{-0.26}, 0.124^{+0.092}_{-0.067}, 7.07^{+1.03}_{-1.19})$ for KMT-2023-BLG-0469L, and $(0.62^{+0.34}_{-0.35}, 0.125^{+0.068}_{-0.070}, 6.26^{+1.27}_{-1.67})$ for KMT-2023-BLG-0735L. According to the estimated parameters, all planets are cold planets with projected separations that are greater than the snow lines of the planetary systems, they have masses that lie between the masses of Uranus and Jupiter of the Solar System, and the hosts of the planets are main-sequence stars that are less massive than the Sun. |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error Unable to locate Ghostscript on paths</p> |

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images

    Figure paths are collected from markdown links whose label starts with
    ``Fig`` and from ``<img src="..." .../>`` tags. Every local figure is
    copied into ``directory`` under the same relative path, then ``md`` is
    written to ``directory/md_fname``.

    :param md: markdown text to export
    :param md_fname: file name of the exported markdown document
    :param directory: destination directory (created with its parents if missing)
    :raises RuntimeError: if ``directory`` exists but is not a directory
    """
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        # makedirs: the destination may be nested (e.g. `_build/html/`)
        os.makedirs(directory, exist_ok=True)

    # Non-greedy patterns so that several links on one line are all found
    fig_fnames = (re.compile(r'\[Fig[^\]]*\]\(([^)]*)\)').findall(md) +
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))

    copied = []
    for fname in fig_fnames:
        if not fname:
            # link without a target (e.g. an unresolved citation): nothing to copy
            continue
        if fname.lower().startswith(('http://', 'https://')):
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        os.makedirs(destdir, exist_ok=True)
        shutil.copy(fname, destfname)
        copied.append(fname)

    # explicit encoding: the text may contain non-ASCII characters
    with open(os.path.join(directory, md_fname), 'w', encoding='utf-8') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    for fk in copied:
        print("    + " + os.path.join(directory, fk))

In [7]:
# One markdown file (plus its figures) per successfully parsed paper
for doc_id, doc_md in documents:
    export_markdown_summary(doc_md, f"{doc_id:s}.md", '_build/html/')

exported in  _build/html/2401.11826.md
    + _build/html/tmp_2401.11826/figures/fig_MBH_MStar.png
    + _build/html/tmp_2401.11826/figures/fig_known_prop.png
    + _build/html/tmp_2401.11826/figures/fig_zphot.png


## Display the papers

Not necessary but allows for a quick check.

In [8]:
# Render the markdown text of every exported document
for _, rendered_md in documents:
    display(Markdown(rendered_md))

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\nodata}{ ~\cdots~ }$
$\newcommand{\MBH}{M_\mathrm{BH}}$
$\newcommand{\MSeed}{M_\mathrm{seed}}$
$\newcommand{\MStar}{M_*}$
$\newcommand{\MSun}{M_\odot}$
$\newcommand{\LEdd}{L_\mathrm{Edd}}$
$\newcommand{\Lbol}{L_\mathrm{bol}}$
$\newcommand{\arraystretch}{1.5}$</div>



<div id="title">

# Tracing the rise of supermassive black holes:

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2401.11826-b31b1b.svg)](https://arxiv.org/abs/2401.11826)<mark>Appeared on: 2024-01-23</mark> -  _Submitted to the Astronomy & Astrophysics journal. 25 pages, 10 figures, and 4 tables. We welcome comments from the reader_

</div>
<div id="authors">

I. T. Andika, et al. -- incl., <mark>K. Jahnke</mark>

</div>
<div id="abstract">

**Abstract:** We report the identification of 64 new candidates of compact galaxies, potentially hosting faint quasars with bolometric luminosities of $L_\mathrm{bol} = 10^{43}$ --10 $^{46}$ erg s $^{-1}$ , residing in the reionization epoch within the redshift range of $6 \lesssim z \lesssim 8$ .These candidates were selected by harnessing the rich multiband datasets provided by the emerging JWST-driven extragalactic surveys, focusing on COSMOS-Web, as well as JADES, UNCOVER, CEERS, and PRIMER.Our search strategy includes two stages: applying stringent photometric cuts to catalog-level data and detailed spectral energy distribution fitting.These techniques effectively isolate the quasar candidates while mitigating contamination from low-redshift interlopers, such as brown dwarfs and nearby galaxies.The selected candidates indicate physical traits compatible with low-luminosity active galactic nuclei, likely hosting $\approx10^5$ -- $10^7 M_\odot$ supermassive black holes (SMBHs) living in galaxies with stellar masses of $\approx10^8$ -- $10^{10} M_\odot$ .The SMBHs selected in this study, on average, exhibit elevated mass compared to their hosts, with the mass ratio distribution slightly higher than those of galaxies in the local universe.As with other high- $z$ studies, this is at least in part due to the selection method for these quasars.An extensive Monte Carlo analysis provides compelling evidence that heavy black hole seeds from the direct collapse scenario appear to be the preferred pathway to mature this specific subset of SMBHs by $z\approx7$ .Notably, most of the selected candidates might have emerged from seeds with masses of $\sim10^5 M_\odot$ , assuming a thin disk accretion with an average Eddington ratio of $f_\mathrm{Edd}=0.6\pm0.3$ and a radiative efficiency of $\epsilon = 0.2\pm0.1$ .This work underscores the significance of further spectroscopic observations, as the quasar candidates presented here offer exceptional opportunities to delve into 
the nature of the earliest galaxies and SMBHs formed during cosmic infancy.

</div>

<div id="div_fig1">

<img src="tmp_2401.11826/figures/fig_MBH_MStar.png" alt="Fig8" width="100%"/>

**Figure 8. -** 
		Relation between the black hole mass ($\MBH$) and its host galaxy stellar mass ($\MStar$).
		The red contour represents our quasar candidates at $z\gtrsim6$, where our measurements can only provide lower limits for $M_\mathrm{BH}$, considering Eddington ratio values ranging from 0.1 to 1.
		The typical statistical errors for $M_*$ are indicated in the lower right corner of the panel.
		High-$z$ quasar samples with available JWST spectroscopic data from [Harikane, Zhang and Nakajima (2023)](), [Yue, Eilers and Simcoe (2023)](), [Ding, Onoue and Silverman (2023)](), and \citet[][excluding dual AGNs]{2023arXiv230801230M} are presented with blue, green, orange, and purple circles with error bars.
		Additional AGN samples from [Larson, Finkelstein and Kocevski (2023)](), [Übler, Maiolino and Curtis-Lake (2023)](), [Stone, et. al (2023)](), [Kocevski, Onoue and Inayoshi (2023)](), [Kokorev, Fujimoto and Labbe (2023)](), and [Goulding, Greene and Setton (2023)]() are indicated with cyan circles.
		The gray dots and crosses are nearby galaxies and AGNs from [ and Kormendy (2013)]() and [ and Reines (2015)]().
		The black dashed lines mark the limits where $M_\mathrm{BH}$/$M_*$ equals 0.1 and 0.01.
		Our candidates show a slightly higher $M_\mathrm{BH}$ to $M_*$ ratios than other galaxies at $z\sim0$ with consistent properties compared to high-$z$ low-luminosity quasars.
	 (*fig:mbh_mstar*)

</div>
<div id="div_fig2">

<img src="tmp_2401.11826/figures/fig_known_prop.png" alt="Fig9" width="100%"/>

**Figure 9. -** 
		Distribution of black hole masses ($\MBH$), stellar masses ($\MStar$), and the fraction of AGN emission ($f_\mathrm{AGN}$) of known sources (see text).
		The left panel compares the lower limit $\MBH$ assuming an Eddington ratio of $f_\mathrm{Edd}=1$ that we calculated and actual values reported in the literature.
		The data points are color-coded according to the inferred $f_\mathrm{AGN}$ of each source.
		The middle panel shows the $\MStar$ from other studies versus our own measurements.
		The right panel illustrates the distribution of $f_\mathrm{AGN}$ for active and inactive galaxies.
		To compensate for the difference in sample sizes, we normalize the bin heights of the histogram, ensuring that the integral of the distribution equals unity.
	 (*fig:known_prop*)

</div>
<div id="div_fig3">

<img src="tmp_2401.11826/figures/fig_zphot.png" alt="Fig1" width="100%"/>

**Figure 1. -** 
		Comparison between $z_\mathrm{phot}$ and $z_\mathrm{spec}$.
		The number count ($N$), average bias ($|\Delta z|$), scatter ($\sigma$), and outlier fraction ($|\Delta z| > 0.15$) of all sources (blue squares) with available spectroscopic data are reported.
		The region with darker colors corresponds to a higher number of sources within the 2D histogram bins.
		We also show the metrics for a subset that satisfies our high-$z$ selection criteria (white circles with error bars).
		Samples of spectroscopically confirmed AGNs from the literature are depicted with red circles.
	 (*fig:zphot*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2401.11826"></div>

# Create HTML index

In [9]:
# Import the module (not the class) so that the name `datetime` keeps the
# same meaning as in the other cells of this notebook.
import datetime
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
# Compare timezone-aware UTC timestamps on both sides
now = datetime.datetime.now(tz=datetime.timezone.utc)
max_age = datetime.timedelta(days=days)
res = []
for fk in files:
    # st_mtime: time of the last content modification
    modified = datetime.datetime.fromtimestamp(os.stat(fk).st_mtime,
                                               tz=datetime.timezone.utc)
    if now - modified <= max_age:
        res.append(fk)
# file names in descending order
res = sorted(res, reverse=True)
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

357  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days

    :param lst: sequence of ``(fname, date)`` pairs, where ``date`` is a string
                that is compared with ``str(datetime.date)`` (``YYYY-MM-DD``)
    :param days: how many days back from today to include (cutoff day included)
    :returns: generator over the selected ``fname``, most recent date first
    """
    # The cutoff does not depend on the entry: compute it once
    cutoff = str(datetime.date.today() - datetime.timedelta(days=days))
    for fname, date in sorted(lst, key=lambda x: x[1], reverse=True):
        if date >= cutoff:
            yield fname

def extract_appearance_dates(lst_file):
    """ Extract the appearance date recorded in each document

    The date is taken from the first line containing ``Appeared on``: the text
    between ``Appeared on:`` and ``</mark>``. Files without such a line are
    left out of the result.

    :param lst_file: sequence of file names to read
    :returns: list of ``(fname, date string)`` pairs
    """
    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    dates = []
    # iterate over the argument, not over a global variable
    for fname in lst_file:
        # only an ASCII marker is searched: undecodable bytes are replaced
        with open(fname, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                if "Appeared on" in line:
                    dates.append((fname, get_date(line)))
                    break
    return dates

from glob import glob
# Only markdown files (same `*.md` pattern as in the previous cell)
lst = glob('_build/html/*.md')
days = 7
# (fname, date) of every document that records an appearance date
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

3  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Build the HTML of a carousel made of `npub` placeholder slides

    Slides are numbered from 1 and get the ids ``slide1`` ... ``slide{npub}``.

    :param npub: number of slides
    :returns: HTML text
    """
    header = [
        """  <div class="carousel" """,
        """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">""",
    ]
    cell_template = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    cells = [cell_template.format(k=index) for index in range(1, npub + 1)]
    return '\n'.join(header + cells + ["  </div>"])

def create_grid(npub=4):
    """ Build the HTML of a flat grid made of `npub` placeholder slides

    Slides are numbered from 1 and get the ids ``slide1`` ... ``slide{npub}``.

    :param npub: number of slides
    :returns: HTML text
    """
    item_template = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    items = [item_template.format(k=index) for index in range(1, npub + 1)]
    return '\n'.join(["""  <div class="grid"> """] + items + ["  </div>"])

In [12]:
# Fill the daily template with the publications selected above
carousel = create_carousel(npub)
docs = ', '.join('"{0:s}"'.format(k.split('/')[-1]) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

with open("daily_template.html", "r") as template_file:
    page = template_file.read()

# placeholders of the template, substituted in this order
substitutions = (
    ("{%-- carousel:s --%}", carousel),
    ("{%-- suptitle:s --%}", "7-day archives"),
    ("{%-- docs:s --%}", docs),
    ("{%-- slides:s --%}", slides),
)
for placeholder, value in substitutions:
    page = page.replace(placeholder, value)

with open("_build/html/index_7days.html", 'w') as html_file:
    html_file.write(page)

In [13]:
# Same page again, restricted to the last day
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join('"{0:s}"'.format(k.split('/')[-1]) for k in res)
slides = ', '.join(f'"slide{k}"' for k in range(1, npub + 1))

with open("daily_template.html", "r") as template_file:
    page = template_file.read()

# placeholders of the template, substituted in this order
substitutions = (
    ("{%-- carousel:s --%}", carousel),
    ("{%-- suptitle:s --%}", "Daily"),
    ("{%-- docs:s --%}", docs),
    ("{%-- slides:s --%}", slides),
)
for placeholder, value in substitutions:
    page = page.replace(placeholder, value)

with open("_build/html/index_daily.html", 'w') as html_file:
    html_file.write(page)

2  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

max_pub = 6   # upper limit on the number of papers shown in the grid
# most recent appearance dates first
res = [k[0] for k in islice(reversed(sorted(dates, key=lambda x: x[1])), max_pub)]
# fewer documents than `max_pub` may exist: size the grid on what was selected
npub = len(res)
print(f"{npub:d} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)

with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
