# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    pass

class AffiliationError(RuntimeError):
    pass

def validation(source: str):
    """Raises error paper during parsing of source file
    
    Allows checks before parsing TeX code.
    
    Raises AffiliationWarning
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        raise AffiliationError("mpia.affiliation_verifications: " + check)

        
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper: Arxiv paper
    :returns: markdown text
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    txt = f"""<img src={url}"https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names
We then get all new papers from Arxiv and match authors

In [3]:
# get list from MPIA website
# it automatically filters identified non-scientists :func:`mpia.filter_non_scientists`
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [k[1] for k in mpia_authors]   # initials + fullname
new_papers = get_new_papers()
# add manual references
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [mpia.get_initials(k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, normed_mpia_authors, verbose=True)
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

J. Li  ->  J. Li  |  ['J. Li']
M. Habouzit  ->  M. Habouzit  |  ['M. Habouzit']
E. Schinnerer  ->  E. Schinnerer  |  ['E. Schinnerer']
S. Stuber  ->  S. Stuber  |  ['S. Stuber']
J. Liu  ->  J. Liu  |  ['J. Liu']
M. Cecil  ->  M. Cecil  |  ['M. Cecil']
Arxiv has 54 new papers today
          5 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [4]:
documents = []
failed = []
for paper in tqdm(candidates):
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                normed_mpia_authors, verbose=True)
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/5 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/
        2405.05308
      
Retrieving document from  https://arxiv.org/e-print/
        2405.05319
      
Retrieving document from  https://arxiv.org/e-print/
        2405.05364
      
Retrieving document from  https://arxiv.org/e-print/
        2405.05798
      
Retrieving document from  https://arxiv.org/e-print/
        2405.05816
      


        2405.05308
       did not run properly
URL can't contain control characters. '/e-print/\n        2405.05308' (found at least '\n')
        2405.05319
       did not run properly
URL can't contain control characters. '/e-print/\n        2405.05319' (found at least '\n')
        2405.05364
       did not run properly
URL can't contain control characters. '/e-print/\n        2405.05364' (found at least '\n')
        2405.05798
       did not run properly
URL can't contain control characters. '/e-print/\n        2405.05798' (found at least '\n')
        2405.05816
       did not run properly
URL can't contain control characters. '/e-print/\n        2405.05816' (found at least '\n')


### Export the logs

Throughout, we also keep track of the logs per paper. see `logs-{today date}.md` 

In [5]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"


with open(logfile, 'w') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Sucessful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-
        arXiv:2405.05308
      -b31b1b.svg)](https://arxiv.org/abs/
        arXiv:2405.05308
      ) | **The Variation of the Galaxy-Wide IMF for Low-Mass Stars: Modeling and Observational Insights**  |
|| Z. Yan, et al. -- incl., <mark>J. Li</mark> |
|*Appeared on*| *2024-05-10*|
|*Comments*| *15 pages, 5 figures, accepted for publication in The Astrophysical Journal*|
|**Abstract**|            The Stellar Initial Mass Function (IMF) characterizes the mass distribution of newly formed stars in various cosmic environments, serving as a fundamental assumption in astrophysical research. Recent findings challenge the prevalent notion of a universal and static IMF, proposing instead that the IMF's shape is contingent upon the star formation environment. In this study, we analyze the galaxy-wide variation of the IMF for low-mass stars in both dwarf and massive galaxies with diverse observational methods. Despite systematic discrepancies between different approaches, an IMF model with a metallicity-dependent slope for the low-mass stars aligns with the majority of observations, indicating a high degree of uniformity in the star formation processes across the universe. We also emphasize the need for a more comprehensive understanding of the variation of the low-mass IMF, considering measurement biases and factors beyond metallicity.         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error URL can't contain control characters. '/e-print/\n        2405.05308' (found at least '\n')</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-
        arXiv:2405.05319
      -b31b1b.svg)](https://arxiv.org/abs/
        arXiv:2405.05319
      ) | **Is the James Webb Space Telescope detecting too many AGN candidates?**  |
|| <mark>M. Habouzit</mark> |
|*Appeared on*| *2024-05-10*|
|*Comments*| *11 pages, submitted to the journal*|
|**Abstract**|            In less than two years of operation, the James Webb Space Telescope (JWST) has already accelerated significantly our quest to identify active massive black holes (BHs) in the first billion years of the Universe's history. At the time of writing, about 50 AGN detections and candidates have been identified through spectroscopy, photometry, and/or morphology. Broad-line AGN are about a hundred times more numerous than the faint end of the UV-bright quasar population at z~5-6. In this paper, we compare the observational constraints on the abundance of these AGN at z~5 to the populations of AGN produced in large-scale cosmological simulations. Assuming a null fraction of obscured simulated AGN, we find that while some simulations produce more AGN than discovered so far, some others produce a similar abundance or even fewer AGN in the bolometric luminosity range probed by JWST. Keeping in mind the large uncertainty on the constraints, we discuss the implications for the theoretical modeling of BH formation and evolution in case similar constraints continue to accumulate. At the redshift of interest, the simulated AGN populations diverge the most at Lbol~1e44 erg/s (by more than a dex in the bolometric luminosity function). This regime is most affected by incompleteness in JWST surveys. However, it holds significant potential for constraining the physical processes determining the assembly of BHs (e.g., seeding, feedback from supernova and AGN) and the current abundance of broad-line AGN with >1e44.5 erg/s.         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error URL can't contain control characters. '/e-print/\n        2405.05319' (found at least '\n')</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-
        arXiv:2405.05364
      -b31b1b.svg)](https://arxiv.org/abs/
        arXiv:2405.05364
      ) | **Do spiral arms enhance star formation efficiency?**  |
|| M. Querejeta, et al. -- incl., <mark>E. Schinnerer</mark>, <mark>S. Stuber</mark> |
|*Appeared on*| *2024-05-10*|
|*Comments*| *26 pages, 16 figures. Accepted for publication in A&A*|
|**Abstract**|            Spiral arms are some of the most spectacular features in disc galaxies, and also present in our own Milky Way. It has been argued that star formation should proceed more efficiently in spiral arms as a result of gas compression. Yet, observational studies have so far yielded contradictory results. Here we examine arm/interarm surface density contrasts at ~100 pc resolution in 28 spiral galaxies from the PHANGS survey. We find that the arm/interarm contrast in stellar mass surface density (Sigma_*) is very modest, typically a few tens of percent. This is much smaller than the contrasts measured for molecular gas (Sigma_mol) or star formation rate (Sigma_SFR) surface density, which typically reach a factor of ~2-3. Yet, Sigma_mol and Sigma_SFR contrasts show a significant correlation with the enhancement in Sigma_*, suggesting that the small stellar contrast largely dictates the stronger accumulation of gas and star formation. All these contrasts increase for grand-design spirals compared to multi-armed and flocculent systems (and for galaxies with high stellar mass). The median star formation efficiency (SFE) of the molecular gas is 16% higher in spiral arms than in interarm regions, with a large scatter, and the contrast increases significantly (median SFE contrast 2.34) for regions of particularly enhanced stellar contrast (Sigma_* contrast >1.97). The molecular-to-atomic gas ratio (Sigma_mol/Sigma_atom) is higher in spiral arms, pointing to a transformation of atomic to molecular gas. In conclusion, the boost in the star formation efficiency of molecular gas in spiral arms is generally modest or absent, except for locations with exceptionally large stellar contrasts. (abridged)         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error URL can't contain control characters. '/e-print/\n        2405.05364' (found at least '\n')</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-
        arXiv:2405.05798
      -b31b1b.svg)](https://arxiv.org/abs/
        arXiv:2405.05798
      ) | **A PAge-like Unified Dark Fluid Model**  |
|| J. Wang, et al. -- incl., <mark>J. Liu</mark> |
|*Appeared on*| *2024-05-10*|
|*Comments*| *6 pages, 4 figures, 1 table*|
|**Abstract**|            The unified dark fluid model unifies dark matter and dark energy into a single component, providing an alternative and more concise framework for interpreting cosmological observations. We introduce a PAge-like Unified Dark Fluid (PUDF) model based on the PAge approximation (Huang 2020), which is parameterized by the age of the universe and an $\eta$ parameter indicating the deviation from Einstein-De Sitter Universe. The PUDF model shares many similar features of the standard Lambda cold dark matter ($\Lambda$CDM) model and can effectively describe the large-scale structure formation and late-time cosmic acceleration. We constrain the PUDF model with the Planck 2018 cosmic microwave background anisotropies, baryon acoustic oscillation measurements including those from the most recent DESI 2024, the Pantheon+ sample of Type Ia supernovae, and the Cosmic Chronometers compilation. Although the PUDF performs well in fitting all the cosmological datasets, the joint analysis of the data still favors the $\Lambda$CDM model over the PUDF model, according to the Bayesian evidence of model comparison.         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error URL can't contain control characters. '/e-print/\n        2405.05798' (found at least '\n')</p> |


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-
        arXiv:2405.05816
      -b31b1b.svg)](https://arxiv.org/abs/
        arXiv:2405.05816
      ) | **Time-dependent long-term hydrodynamic simulations of the inner protoplanetary disk III: The influence of photoevaporation**  |
|| <mark>M. Cecil</mark>, L. Gehrig, D. Steiner |
|*Appeared on*| *2024-05-10*|
|*Comments*| *15 pages, 10 figures, to be published in Astronomy & Astrophysics*|
|**Abstract**|            The final stages of a protoplanetary disk are essential for our understanding of the formation and evolution of planets. Photoevaporation is an important mechanism that contributes to the dispersal of an accretion disk and has significant consequences for the disk's lifetime. However, the combined effects of photoevaporation and star-disk interaction have not been investigated in previous studies. We combined an implicit disk evolution model with a photoevaporative mass-loss profile. By including the innermost disk regions down to 0.01 AU, we could calculate the star-disk interaction, the stellar spin evolution, and the transition from an accreting disk to the propeller regime self-consistently. Starting from an early Class II star-disk system, we calculated the long-term evolution of the system until the disk becomes almost completely dissolved. Photoevaporation has a significant effect on disk structure and evolution. The radial extent of the dead zone decreases, and the number of episodic accretion events (outbursts) is reduced by high stellar X-ray luminosities. Reasonable accretion rates in combination with photoevaporative gaps are possible for a dead zone that is still massive enough to develop episodic accretion events. Furthermore, the stellar spin evolution during the Class II evolution is less affected by the star-disk interaction in the case of high X-ray luminosities. Our results suggest that the formation of planets, especially habitable planets, in the dead zone is strongly impaired in the case of strong X-ray luminosities. Additionally, the importance of the star-disk interaction during the Class II phase with respect to the stellar spin evolution is reduced.         |
|<p style="color:red"> **ERROR** </p>| <p style="color:red">latex error URL can't contain control characters. '/e-print/\n        2405.05816' (found at least '\n')</p> |

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images"""
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        os.mkdir(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        try:
            os.makedirs(destdir)
        except FileExistsError:
            pass
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    [print("    + " + os.path.join(directory,fk)) for fk in fig_fnames]

In [7]:
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

## Display the papers

Not necessary but allows for a quick check.

In [8]:
[display(Markdown(k[1])) for k in documents];

# Create HTML index

In [9]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now.today() - modified
    if delta <= timedelta(days=days):
        res.append((delta.seconds, fk))
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

414  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days """
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= str(datetime.date.today() - datetime.timedelta(days=days)):
            yield fname

def extract_appearance_dates(lst_file):
    dates = []

    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    for fname in lst:
        with open(fname, 'r') as f:
            found_date = False
            for line in f:
                if not found_date:
                    if "Appeared on" in line:
                        found_date = True
                        dates.append((fname, get_date(line)))
                else:
                    break
    return dates

from glob import glob
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

2  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Generate the HTML code for a carousel with `npub` slides """
    carousel = ["""  <div class="carousel" """,
                """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""
                ]
    
    item_str = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        carousel.append(item_str.format(k=k))
    carousel.append("  </div>")
    return '\n'.join(carousel)

def create_grid(npub=4):
    """ Generate the HTML code for a flat grid with `npub` slides """
    grid = ["""  <div class="grid"> """,
                ]
    
    item_str = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        grid.append(item_str.format(k=k))
    grid.append("  </div>")
    return '\n'.join(grid)

In [12]:
carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "7-day archives" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [13]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "Daily" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(carousel, docs, slides)
# print(page)
with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

0  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

npub = 6
res = [k[0] for k in (islice(reversed(sorted(dates, key=lambda x: x[1])), 6))]
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
