# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    pass

class AffiliationError(RuntimeError):
    pass

def validation(source: str):
    """Raises error paper during parsing of source file
    
    Allows checks before parsing TeX code.
    
    Raises AffiliationWarning
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        raise AffiliationError("mpia.affiliation_verifications: " + check)

        
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper: Arxiv paper
    :returns: markdown text
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    txt = f"""<img src={url}"https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names
We then get all new papers from Arxiv and match authors

In [3]:
# get list from MPIA website
# it automatically filters identified non-scientists :func:`mpia.filter_non_scientists`
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [k[1] for k in mpia_authors]   # initials + fullname
new_papers = get_new_papers()
# add manual references
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [mpia.get_initials(k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, normed_mpia_authors, verbose=True)
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

Arxiv has 56 new papers today
          3 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [4]:
documents = []
failed = []
for paper in tqdm(candidates[:-1]):
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                normed_mpia_authors, verbose=True)
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/2 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2304.06074


extracting tarball to tmp_2304.06074...

 done.


list index out of range
Retrieving document from  https://arxiv.org/e-print/2304.06693


extracting tarball to tmp_2304.06693...

 done.


### Export the logs

Throughout, we also keep track of the logs per paper. see `logs-{today date}.md` 

In [5]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"


with open(logfile, 'w') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Sucessful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2304.06074-b31b1b.svg)](https://arxiv.org/abs/arXiv:2304.06074) | **Apocenter pile-up and arcs: a narrow dust ring around HD 129590**  |
|| <mark>J. Olofsson</mark>, et al. |
|*Appeared on*| *2023-04-14*|
|*Comments*| *Accepted for publication in A&A, abstract shortened*|
|**Abstract**| Observations of debris disks have significantly improved over the past decades, both in terms of sensitivity and spatial resolution. At near-infrared wavelengths, new observing strategies and post-processing algorithms allow us to drastically improve the final images, revealing faint structures in the disks. These structures inform us about the properties and spatial distribution of the small dust particles. We present new $H$-band observations of the disk around HD 129590, which display an intriguing arc-like structure in total intensity but not in polarimetry, and propose an explanation for the origin of this arc. Assuming geometric parameters for the birth ring of planetesimals, our model provides the positions of millions of particles of different sizes to compute scattered light images. We demonstrate that if the grain size distribution is truncated or strongly peaks at a size larger than the radiation pressure blow-out size we are able to produce an arc quite similar to the observed one. If the birth ring is radially narrow, given that particles of a given size have similar eccentricities, they will have their apocenters at the same distance from the star. Since this is where the particles will spend most of their time, this results in a "apocenter pile-up" that can look like a ring. Due to more efficient forward scattering this arc only appears in total intensity observations and remains undetected in polarimetric data. This scenario requires sharp variations either in the grain size distribution or for the scattering efficiencies $Q_\mathrm{sca}$. Alternative possibilities such as a wavy size distribution and a size-dependent phase function are interesting candidates to strengthen the apocenter pile-up. We also discuss why such arcs are not commonly detected in other systems, which can mainly be explained by the fact that most parent belts are usually broad. |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2304.06693-b31b1b.svg)](https://arxiv.org/abs/arXiv:2304.06693) | **CATS: The Hubble Constant from Standardized TRGB and Type Ia Supernova  Measurements**  |
|| D. Scolnic, et al. -- incl., <mark>S. Li</mark> |
|*Appeared on*| *2023-04-14*|
|*Comments*| *Submitted to ApJL, comments welcome*|
|**Abstract**| The Tip of the Red Giant Branch (TRGB) provides a luminous standard candle for constructing distance ladders to measure the Hubble constant. In practice its measurements via edge-detection response (EDR) are complicated by the apparent fuzziness of the tip and the multi-peak landscape of the EDR. As a result, it can be difficult to replicate due to a case-by-case measurement process. Previously we optimized an unsupervised algorithm, Comparative Analysis of TRGBs (CATs), to minimize the variance among multiple halo fields per host without reliance on individualized choices, achieving state-of-the-art $\sim$ $<$ 0.05 mag distance measures for optimal data. Further, we found an empirical correlation at 5$\sigma$ confidence in the GHOSTS halo survey between our measurements of the tip and their contrast ratios (ratio of stars 0.5 mag just below and above the tip), useful for standardizing the apparent tips at different host locations. Here, we apply this algorithm to an expanded sample of SN Ia hosts to standardize these to multiple fields in the geometric anchor, NGC 4258. In concert with the Pantheon$+$ SN Ia sample, this analysis produces a (baseline) result of $H_0= 73.22 \pm 2.06$ km/s/Mpc. The largest difference in $H_0$ between this and similar studies employing the TRGB derives from corrections for SN survey differences and local flows used in most recent SN Ia compilations but which were absent in earlier studies. SN-related differences total $\sim$ 2.0 km/s/Mpc. A smaller share, $\sim$ 1.4 km/s/Mpc, results from the inhomogeneity of the TRGB calibration across the distance ladder. We employ a grid of 108 variants around the optimal TRGB algorithm and find the median of variants is $72.94\pm1.98$ km/s/Mpc with an additional uncertainty due to algorithm choices of 0.83 km/s/Mpc. None of these TRGB variants result in $H_0$ less than 71.6 km/s/Mpc. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images"""
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        os.mkdir(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        try:
            os.makedirs(destdir)
        except FileExistsError:
            pass
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    [print("    + " + os.path.join(directory,fk)) for fk in fig_fnames]

In [7]:
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

exported in  _build/html/2304.06074.md
    + _build/html/tmp_2304.06074/./adi_dpi.png
    + _build/html/tmp_2304.06074/./pfunc_rings.png
    + _build/html/tmp_2304.06074/./all_data.png


## Display the papers

Not necessary but allows for a quick check.

In [8]:
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$
$\newcommand{\}{natexlab}$</div>



<div id="title">

# Apocenter pile-up and arcs: a narrow dust ring around HD 129590$\thanks{Based on observations made with ESO Telescopes at the Paranal Observatory under programs ID 105.20GP.001 and 109.237K.001.}$

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2304.06074-b31b1b.svg)](https://arxiv.org/abs/2304.06074)<mark>Appeared on: 2023-04-14</mark> -  _Accepted for publication in A&A, abstract shortened_

</div>
<div id="authors">

<mark>J. Olofsson</mark>, et al.

</div>
<div id="abstract">

**Abstract:** Observations of debris disks have significantly improved over the past decades, both in terms of sensitivity and spatial resolution. At near-infrared wavelengths, new observing strategies and post-processing algorithms allow us to drastically improve the final images, revealing faint structures in the disks. These structures inform us about the properties and spatial distribution of the small dust particles. We present new $H$ -band observations of the disk around the solar type star HD 129590, which display an intriguing arc-like structure in total intensity but not in polarimetry, and propose an explanation for the origin of this arc. Assuming geometric parameters for the birth ring of planetesimals, our model provides the positions of millions of particles of different sizes to compute scattered light images. The code can either produce images over the full size distribution or over several smaller intervals of grain sizes. We demonstrate that if the grain size distribution is truncated or strongly peaks at a size larger than the radiation pressure blow-out size we are able to produce an arc quite similar to the one detected in the observations. If the birth ring is radially narrow, given that particles of a given size have similar eccentricities, they will have their apocenters at the same distance from the star. Since this is where the particles will spend most of their time, this results in a "apocenter pile-up" that can look like a ring. Due to more efficient forward scattering this arc only appears in total intensity observations and remains undetected in polarimetric data, in good agreement with our observations. This scenario requires sharp variations either in the grain size distribution or for the scattering efficiencies $Q_\mathrm{sca}$ (or a combination of both). Alternative possibilities such as a wavy size distribution and a size-dependent phase function are interesting candidates to strengthen the apocenter pile-up. We also discuss why such arcs are not commonly detected in other systems, which can mainly be explained by the fact that most parent belts are usually broad.

</div>

<div id="div_fig1">

<img src="tmp_2304.06074/./adi_dpi.png" alt="Fig7" width="100%"/>

**Figure 7. -** Simulated disk images for total intensity (top panels) and polarized intensity (bottom panels) for different narrow intervals of $\beta$, and the average value of $\beta$ is reported in each panel. The value of the asymmetry parameter $g$ is set to $0.7$ for all images. The scaling is linear and adjusted to the $99.9$ percent for all frames. The images are convolved with a 2D gaussian with a standard deviation of $2$ pixels. (*fig:adi_dpi*)

</div>
<div id="div_fig2">

<img src="tmp_2304.06074/./pfunc_rings.png" alt="Fig4" width="100%"/>

**Figure 4. -** Surface brightness as a function of the scattering angle, extracted in concentric annulii (accounting for the inclination and position angle of the disk) with increasing deprojected stellocentric distances $r$(in arcsec). For clarity, the profiles have been offset and the horizontal lines show the zero point on the top panel, while no offset has been included for the bottom panel to best compare the different profiles. The fainter lines on the top panel show the best fit with an HG phase function (the $g$ values being reported in the legend). The shaded gray areas show the scattering angle that are not accessible to us for an inclination of $82^\circ$. (*fig:pfuncr*)

</div>
<div id="div_fig3">

<img src="tmp_2304.06074/./all_data.png" alt="Fig6" width="100%"/>

**Figure 6. -** SPHERE-IRDIS observations of the disk around HD 129590, showing the DI-sNMF (total intensity, left), PCA (total intensity, top right) and $Q_\phi$(polarized intensity, bottom right) reductions. The detection of the birth ring and the presence of a faint arc are highlighted by two arrows on the left panel. The inset on the left panel also shows the DI-sNMF reduction with the birth ring (best fit from \citealp{Olofsson2022}) and the arc highlighted. The location of the birth ring is also shown on the $Q_\phi$ image as a solid black line. For each image, the central mask has a radius of $0.125$\arcsec, and the color scale is linear. North is up, east is left and the pixel scale is of $12.26$ mas. (*fig:data*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2304.06074"></div>

# Create HTML index

In [9]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now.today() - modified
    if delta <= timedelta(days=days):
        res.append((delta.seconds, fk))
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

187  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days """
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= str(datetime.date.today() - datetime.timedelta(days=days)):
            yield fname

def extract_appearance_dates(lst_file):
    dates = []

    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    for fname in lst:
        with open(fname, 'r') as f:
            found_date = False
            for line in f:
                if not found_date:
                    if "Appeared on" in line:
                        found_date = True
                        dates.append((fname, get_date(line)))
                else:
                    break
    return dates

from glob import glob
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

2  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Generate the HTML code for a carousel with `npub` slides """
    carousel = ["""  <div class="carousel" """,
                """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""
                ]
    
    item_str = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        carousel.append(item_str.format(k=k))
    carousel.append("  </div>")
    return '\n'.join(carousel)

def create_grid(npub=4):
    """ Generate the HTML code for a flat grid with `npub` slides """
    grid = ["""  <div class="grid"> """,
                ]
    
    item_str = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        grid.append(item_str.format(k=k))
    grid.append("  </div>")
    return '\n'.join(grid)

In [12]:
carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "7-day archives" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [13]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "Daily" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(carousel, docs, slides)
# print(page)
with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

1  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

npub = 6
res = [k[0] for k in (islice(reversed(sorted(dates, key=lambda x: x[1])), 6))]
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
