# MPIA Arxiv on Deck 2

Contains the steps to produce the paper extractions.

In [1]:
# Imports
import os
from IPython.display import Markdown, display
from tqdm.notebook import tqdm
import warnings
from PIL import Image 

# requires arxiv_on_deck_2

from arxiv_on_deck_2.arxiv2 import (get_new_papers, 
                                    get_paper_from_identifier,
                                    retrieve_document_source, 
                                    get_markdown_badge)
from arxiv_on_deck_2 import (latex,
                             latex_bib,
                             mpia,
                             highlight_authors_in_list)

# Sometimes images are really big
Image.MAX_IMAGE_PIXELS = 1000000000 

In [2]:
# Some useful definitions.

class AffiliationWarning(UserWarning):
    pass

class AffiliationError(RuntimeError):
    pass

def validation(source: str):
    """Raises error paper during parsing of source file
    
    Allows checks before parsing TeX code.
    
    Raises AffiliationWarning
    """
    check = mpia.affiliation_verifications(source, verbose=True)
    if check is not True:
        raise AffiliationError("mpia.affiliation_verifications: " + check)

        
warnings.simplefilter('always', AffiliationWarning)


def get_markdown_qrcode(paper_id: str):
    """ Generate a qrcode to the arxiv page using qrserver.com
    
    :param paper: Arxiv paper
    :returns: markdown text
    """
    url = r"https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="
    txt = f"""<img src={url}"https://arxiv.org/abs/{paper_id}">"""
    txt = '<div id="qrcode">' + txt + '</div>'
    return txt

## get list of arxiv paper candidates

We use the MPIA mitarbeiter list webpage from mpia.de to get author names
We then get all new papers from Arxiv and match authors

In [3]:
# get list from MPIA website
# it automatically filters identified non-scientists :func:`mpia.filter_non_scientists`
mpia_authors = mpia.get_mpia_mitarbeiter_list()
normed_mpia_authors = [k[1] for k in mpia_authors]   # initials + fullname
new_papers = get_new_papers()
# add manual references
add_paper_refs = []
new_papers.extend([get_paper_from_identifier(k) for k in add_paper_refs])

candidates = []
for paperk in new_papers:
    # Check author list with their initials
    normed_author_list = [mpia.get_initials(k) for k in paperk['authors']]
    hl_authors = highlight_authors_in_list(normed_author_list, normed_mpia_authors, verbose=True)
    matches = [(hl, orig) for hl, orig in zip(hl_authors, paperk['authors']) if 'mark' in hl]
    paperk['authors'] = hl_authors
    if matches:
        # only select paper if an author matched our list
        candidates.append(paperk)
print("""Arxiv has {0:,d} new papers today""".format(len(new_papers)))        
print("""          {0:,d} with possible author matches""".format(len(candidates)))

K. El-Badry  ->  K. El-Badry  |  ['K. El-Badry']
M. Samland  ->  M. Samland  |  ['M. Samland']
X. Zhang  ->  X. Zhang  |  ['X. Zhang']
Arxiv has 62 new papers today
          3 with possible author matches


# Parse sources and generate relevant outputs

From the candidates, we do the following steps:
* get their tarball from ArXiv (and extract data)
* find the main .tex file: find one with \documentclass{...} (sometimes it's non trivial)
* Check affiliations with :func:`validation`, which uses :func:`mpia.affiliation_verifications`
* If passing the affiliations: we parse the .tex source
   * inject sub-documents into the main (flatten the main document)
   * parse structure, extract information (title, abstract, authors, figures...)
   * handles `\graphicspath` if provided
* Generate the .md document.

In [4]:
documents = []
failed = []
for paper in tqdm(candidates[:-1]):
    paper_id = paper['identifier'].lower().replace('arxiv:', '')
    
    folder = f'tmp_{paper_id}'

    try:
        if not os.path.isdir(folder):
            folder = retrieve_document_source(f"{paper_id}", f'tmp_{paper_id}')
        
        try:
            doc = latex.LatexDocument(folder, validation=validation)    
        except AffiliationError as affilerror:
            msg = f"ArXiv:{paper_id:s} is not an MPIA paper... " + str(affilerror)
            failed.append((paper, "affiliation error: " + str(affilerror) ))
            continue
        
        # Hack because sometimes author parsing does not work well
        if (len(doc.authors) != len(paper['authors'])):
            doc._authors = paper['authors']
        else:
            # highlight authors (FIXME: doc.highlight_authors)
            # done on arxiv paper already
            doc._authors = highlight_authors_in_list(
                [mpia.get_initials(k) for k in doc.authors], 
                normed_mpia_authors, verbose=True)
        if (doc.abstract) in (None, ''):
            doc._abstract = paper['abstract']
            
        doc.comment = (get_markdown_badge(paper_id) + 
                       "<mark>Appeared on: " + paper['date'] + "</mark> - ")
        if paper['comments']:
            doc.comment += " _" + paper['comments'] + "_"
        
        full_md = doc.generate_markdown_text()
        
        full_md += get_markdown_qrcode(paper_id)
        
        # replace citations
        try:
            bibdata = latex_bib.LatexBib.from_doc(doc)
            full_md = latex_bib.replace_citations(full_md, bibdata)
        except Exception as e:
            print(e)
        
        documents.append((paper_id, full_md))
    except Exception as e:
        warnings.warn(latex.LatexWarning(f"{paper_id:s} did not run properly\n" +
                                         str(e)
                                        ))
        failed.append((paper, "latex error " + str(e)))

  0%|          | 0/2 [00:00<?, ?it/s]

Retrieving document from  https://arxiv.org/e-print/2304.13750


extracting tarball to tmp_2304.13750...

 done.
Retrieving document from  https://arxiv.org/e-print/2304.14063


extracting tarball to tmp_2304.14063...

 done.


M. Samland  ->  M. Samland  |  ['M. Samland']


Found 83 bibliographic references in tmp_2304.14063/trap4vapp.bbl.


### Export the logs

Throughout, we also keep track of the logs per paper. see `logs-{today date}.md` 

In [5]:
import datetime
today = str(datetime.date.today())
logfile = f"_build/html/logs/log-{today}.md"


with open(logfile, 'w') as logs:
    # Success
    logs.write(f'# Arxiv on Deck 2: Logs - {today}\n\n')
    logs.write("""* Arxiv had {0:,d} new papers\n""".format(len(new_papers)))
    logs.write("""    * {0:,d} with possible author matches\n\n""".format(len(candidates)))
    logs.write("## Sucessful papers\n\n")
    display(Markdown("## Successful papers"))
    success = [k[0] for k in documents]
    for candid in candidates:
        if candid['identifier'].split(':')[-1] in success:
            display(candid)
            logs.write(candid.generate_markdown_text() + '\n\n')

    ## failed
    logs.write("## Failed papers\n\n")
    display(Markdown("## Failed papers"))
    failed = sorted(failed, key=lambda x: x[1])
    current_reason = ""
    for paper, reason in failed:
        if 'affiliation' in reason:
            color = 'green'
        else:
            color = 'red'
        data = Markdown(
                paper.generate_markdown_text() + 
                f'\n|<p style="color:{color:s}"> **ERROR** </p>| <p style="color:{color:s}">{reason:s}</p> |'
               )
        if reason != current_reason:
            logs.write(f'### {reason:s} \n\n')
            current_reason = reason
        logs.write(data.data + '\n\n')
        
        # only display here the important errors (all in logs)
        # if color in ('red',):
        display(data)

## Successful papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2304.14063-b31b1b.svg)](https://arxiv.org/abs/arXiv:2304.14063) | **Applying a temporal systematics model to vector Apodizing Phase Plate  coronagraphic data: TRAP4vAPP**  |
|| P. Liu, et al. -- incl., <mark>M. Samland</mark> |
|*Appeared on*| *2023-04-28*|
|*Comments*| *15 pages, 10 figures, accepted to A&A*|
|**Abstract**| The vector Apodizing Phase Plate (vAPP) is a pupil plane coronagraph that suppresses starlight by forming a dark hole in its point spread function (PSF). The unconventional and non-axisymmetrical PSF arising from the phase modification applied by this coronagraph presents a special challenge to post-processing techniques. We aim to implement a recently developed post-processing algorithm, temporal reference analysis of planets (TRAP) on vAPP coronagraphic data. The property of TRAP that uses non-local training pixels, combined with the unconventional PSF of vAPP, allows for more flexibility than previous spatial algorithms in selecting reference pixels to model systematic noise. Datasets from two types of vAPPs are analysed: a double grating-vAPP (dgvAPP360) that produces a single symmetric PSF and a grating-vAPP (gvAPP180) that produces two D-shaped PSFs. We explore how to choose reference pixels to build temporal systematic noise models in TRAP for them. We then compare the performance of TRAP with previously implemented algorithms that produced the best signal-to-noise ratio (S/N) in companion detections in these datasets. We find that the systematic noise between the two D-shaped PSFs is not as temporally associated as expected. Conversely, there is still a significant number of systematic noise sources that are shared by the dark hole and the bright side in the same PSF. We should choose reference pixels from the same PSF when reducing the dgvAPP360 dataset or the gvAPP180 dataset with TRAP. In these datasets, TRAP achieves results consistent with previous best detections, with an improved S/N for the gvAPP180 dataset. |

## Failed papers


|||
|---:|:---|
| [![arXiv](https://img.shields.io/badge/arXiv-arXiv:2304.13750-b31b1b.svg)](https://arxiv.org/abs/arXiv:2304.13750) | **Sodium enhancement in evolved CVs**  |
|| N. Yamaguchi, et al. -- incl., <mark>K. El-Badry</mark> |
|*Appeared on*| *2023-04-28*|
|*Comments*| **|
|**Abstract**| We present follow-up spectroscopy of 21 cataclysmic variables (CVs) with evolved secondaries and ongoing or recently-terminated mass transfer. Evolutionary models predict that the secondaries should have anomalous surface abundances owing to nuclear burning in their cores during their main-sequence evolution and subsequent envelope stripping by their companion white dwarfs. To test these models, we measure sodium (Na) abundances of the donors from the Fraunhofer "D" doublet. Accounting for interstellar absorption, we find that {\it all} objects in our sample have enhanced Na abundances. We measure 0.3 $\lesssim$ [Na/H] $\lesssim$ 1.5 dex across the sample, with a median [Na/H] = 0.956 dex, i.e., about an order of magnitude enhancement over solar values. To interpret these values, we run MESA binary evolution models of CVs in which mass transfer begins just as the donor leaves the main sequence. These generically predict Na enhancement in donors with initial donor masses $\gtrsim 1\,M_{\odot}$, consistent with our observations. In the models, Na enrichment occurs in the donors' cores via the NeNa cycle near the end of their main-sequence evolution. Na-enhanced material is exposed when the binaries reach orbital periods of a few hours. Donors with higher initial masses are predicted to have higher Na abundances at fixed orbital period owing to their higher core temperatures during main-sequence evolution. The observed [Na/H] values are on average $\approx$0.3 dex higher than predicted by the models. Surface abundances of evolved CV donors provide a unique opportunity to study nuclear burning products in the cores of intermediate-mass stars. |
|<p style="color:green"> **ERROR** </p>| <p style="color:green">affiliation error: mpia.affiliation_verifications: 'Heidelberg' keyword not found.</p> |

## Export documents

We now write the .md files and export relevant images

In [6]:
def export_markdown_summary(md: str, md_fname:str, directory: str):
    """Export MD document and associated relevant images"""
    import os
    import shutil
    import re

    if (os.path.exists(directory) and not os.path.isdir(directory)):
        raise RuntimeError(f"a non-directory file exists with name {directory:s}")

    if (not os.path.exists(directory)):
        print(f"creating directory {directory:s}")
        os.mkdir(directory)

    fig_fnames = (re.compile(r'\[Fig.*\]\((.*)\)').findall(md) + 
                  re.compile(r'\<img src="([^>\s]*)"[^>]*/>').findall(md))
    for fname in fig_fnames:
        if 'http' in fname:
            # No need to copy online figures
            continue
        destdir = os.path.join(directory, os.path.dirname(fname))
        destfname = os.path.join(destdir, os.path.basename(fname))
        try:
            os.makedirs(destdir)
        except FileExistsError:
            pass
        shutil.copy(fname, destfname)
    with open(os.path.join(directory, md_fname), 'w') as fout:
        fout.write(md)
    print("exported in ", os.path.join(directory, md_fname))
    [print("    + " + os.path.join(directory,fk)) for fk in fig_fnames]

In [7]:
for paper_id, md in documents:
    export_markdown_summary(md, f"{paper_id:s}.md", '_build/html/')

exported in  _build/html/2304.14063.md
    + _build/html/tmp_2304.14063/./figures/vAPP180_reference_pixels.png
    + _build/html/tmp_2304.14063/./figures/vAPP180_contrast_curve.png
    + _build/html/tmp_2304.14063/./figures/vAPP180_PSF.png


## Display the papers

Not necessary but allows for a quick check.

In [8]:
[display(Markdown(k[1])) for k in documents];

<div class="macros" style="visibility:hidden;">
$\newcommand{\ensuremath}{}$
$\newcommand{\xspace}{}$
$\newcommand{\object}[1]{\texttt{#1}}$
$\newcommand{\farcs}{{.}''}$
$\newcommand{\farcm}{{.}'}$
$\newcommand{\arcsec}{''}$
$\newcommand{\arcmin}{'}$
$\newcommand{\ion}[2]{#1#2}$
$\newcommand{\textsc}[1]{\textrm{#1}}$
$\newcommand{\hl}[1]{\textrm{#1}}$
$\newcommand{\footnote}[1]{}$</div>



<div id="title">

# Applying a temporal systematics model to vector Apodizing Phase Plate coronagraphic data: TRAP4vAPP

</div>
<div id="comments">

[![arXiv](https://img.shields.io/badge/arXiv-2304.14063-b31b1b.svg)](https://arxiv.org/abs/2304.14063)<mark>Appeared on: 2023-04-28</mark> -  _15 pages, 10 figures, accepted to A&A_

</div>
<div id="authors">

P. Liu, et al. -- incl., <mark>M. Samland</mark>

</div>
<div id="abstract">

**Abstract:** The vector Apodizing Phase Plate (vAPP) is a pupil plane coronagraph that suppresses starlight by forming a dark hole in its point spread function (PSF).   The unconventional and non-axisymmetrical PSF arising from the phase modification applied by this coronagraph presents a special challenge to post-processing techniques. We aim to implement a recently developed post-processing algorithm, temporal reference analysis of planets (TRAP) on vAPP coronagraphic data. The property of TRAP that uses non-local training pixels, combined with the unconventional PSF of vAPP, allows for more flexibility than previous spatial algorithms in selecting reference pixels to model systematic noise. Datasets from two types of vAPPs are analysed: a double grating-vAPP (dgvAPP360) that produces a single symmetric PSF and a grating-vAPP (gvAPP180) that produces two D-shaped PSFs. We explore how to choose reference pixels to build temporal systematic noise models in TRAP for them. We then compare the performance of TRAP with previously implemented algorithms that produced the best signal-to-noise ratio (S/N) in companion detections in these datasets. We find that the systematic noise between the two D-shaped PSFs is not as temporally associated as expected. Conversely, there is still a significant number of systematic noise sources that are shared by the dark hole and the bright side in the same PSF. We should choose reference pixels from the same PSF when reducing the dgvAPP360 dataset or the gvAPP180 dataset with TRAP. In these datasets, TRAP achieves results consistent with previous best detections, with an improved S/N for the gvAPP180 dataset.

</div>

<div id="div_fig1">

<img src="tmp_2304.14063/./figures/vAPP180_reference_pixels.png" alt="Fig4" width="100%"/>

**Figure 4. -** Six reference pixel designs for an assumed planet position (cyan asterisk) in the upper dark hole of a gvAPP180 PSF, as shown by the white pixels. Upper dark: choosing reference pixels from the dark side of the same PSF; upper bright: choosing reference pixels from the bright side of the same PSF; lower dark: choosing reference pixels from the dark side of the complementary PSF; lower bright: choosing reference pixels from the bright side of the complementary PSF; upper dark+bright: choosing reference pixels from the dark and bright sides of the same PSF; joined dark holes: choosing reference pixels from the joined dark holes.
     (*fig:gvAPP180_reference_pixels*)

</div>
<div id="div_fig2">

<img src="tmp_2304.14063/./figures/vAPP180_contrast_curve.png" alt="Fig9" width="100%"/>

**Figure 9. -** 5$\sigma$ contrast curves of the gvAPP180 datasets with TRAP. The contrast curve is calculated as five times the median of a three-pixel-wide annulus as a function of separation in the normalised uncertainty map. Left panel: the contrast curves of the upper dark region of cube A of the HR 2562 dataset reduced by choosing reference pixels exclusively from the dark hole (labelled as `dark') or both dark and bright sides (labelled as `dark and bright'). The detection significance of HR 2562 B is also marked in the figure. Right panel: the contrast curves of the dark region of the Altair dataset reduced by choosing reference pixels only from the dark hole or both dark and bright sides. The contrast curve of the transition region is also compared in the right panel, which is not much worse than that of the dark region.
           (*Fig:gvAPP180_contrast_curve*)

</div>
<div id="div_fig3">

<img src="tmp_2304.14063/./figures/vAPP180_PSF.png" alt="Fig2" width="100%"/>

**Figure 2. -** On-sky coronagraphic PSF from the gvAPP180 mounted on MagAO/Clio2 at 3.94 $\mu$m. One PSF (upper PSF) is above another PSF (lower PSF) with a leakage term in the middle. The red circles in dashed lines show the IWA and OWA. The dark holes of the two PSFs complement the FoV of each other. The four different regions of the two PSFs we defined in this work are marked with white text. The colourbar shows the flux intensity after background subtraction. (*fig:gvAPP180_PSF*)

</div><div id="qrcode"><img src=https://api.qrserver.com/v1/create-qr-code/?size=100x100&data="https://arxiv.org/abs/2304.14063"></div>

# Create HTML index

In [9]:
from datetime import datetime, timedelta, timezone
from glob import glob
import os

files = glob('_build/html/*.md')
days = 7
now = datetime.today()
res = []
for fk in files:
    stat_result = os.stat(fk).st_ctime
    modified = datetime.fromtimestamp(stat_result, tz=timezone.utc).replace(tzinfo=None)
    delta = now.today() - modified
    if delta <= timedelta(days=days):
        res.append((delta.seconds, fk))
res = [k[1] for k in reversed(sorted(res, key=lambda x:x[1]))]
npub = len(res)
print(len(res), f" publications files modified in the last {days:d} days.")
# [ print('\t', k) for k in res ];

198  publications files modified in the last 7 days.


In [10]:
import datetime
from glob import glob

def get_last_n_days(lst, days=1):
    """ Get the documents from the last n days """
    sorted_lst = sorted(lst, key=lambda x: x[1], reverse=True)
    for fname, date in sorted_lst:
        if date >= str(datetime.date.today() - datetime.timedelta(days=days)):
            yield fname

def extract_appearance_dates(lst_file):
    dates = []

    def get_date(line):
        return line\
            .split('Appeared on:')[-1]\
            .split('</mark>')[0].strip()

    for fname in lst:
        with open(fname, 'r') as f:
            found_date = False
            for line in f:
                if not found_date:
                    if "Appeared on" in line:
                        found_date = True
                        dates.append((fname, get_date(line)))
                else:
                    break
    return dates

from glob import glob
lst = glob('_build/html/*md')
days = 7
dates = extract_appearance_dates(lst)
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last {days:d} days.")

4  publications in the last 7 days.


In [11]:
def create_carousel(npub=4):
    """ Generate the HTML code for a carousel with `npub` slides """
    carousel = ["""  <div class="carousel" """,
                """       data-flickity='{ "autoPlay": 10000, "adaptiveHeight": true, "resize": true, "wrapAround": true, "pauseAutoPlayOnHover": true, "groupCells": 1 }' id="asyncTypeset">"""
                ]
    
    item_str = """    <div class="carousel-cell"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        carousel.append(item_str.format(k=k))
    carousel.append("  </div>")
    return '\n'.join(carousel)

def create_grid(npub=4):
    """ Generate the HTML code for a flat grid with `npub` slides """
    grid = ["""  <div class="grid"> """,
                ]
    
    item_str = """    <div class="grid-item"> <div id="slide{k}" class="md_view">Content {k}</div> </div>"""
    for k in range(1, npub + 1):
        grid.append(item_str.format(k=k))
    grid.append("  </div>")
    return '\n'.join(grid)

In [12]:
carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "7-day archives" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
with open("_build/html/index_7days.html", 'w') as fout:
    fout.write(page)

In [13]:
# redo for today
days = 1
res = list(get_last_n_days(dates, days))
npub = len(res)
print(len(res), f" publications in the last day.")

carousel = create_carousel(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("daily_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- carousel:s --%}", carousel)\
               .replace("{%-- suptitle:s --%}",  "Daily" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(carousel, docs, slides)
# print(page)
with open("_build/html/index_daily.html", 'w') as fout:
    fout.write(page)

2  publications in the last day.


In [14]:
# Create the flat grid of the last N papers (fixed number regardless of dates)
from itertools import islice 

npub = 6
res = [k[0] for k in (islice(reversed(sorted(dates, key=lambda x: x[1])), 6))]
print(len(res), f" {npub} publications selected.")

grid = create_grid(npub)
docs = ', '.join(['"{0:s}"'.format(k.split('/')[-1]) for k in res])
slides = ', '.join([f'"slide{k}"' for k in range(1, npub + 1)])

with open("grid_template.html", "r") as tpl:
    page = tpl.read()
    page = page.replace("{%-- grid-content:s --%}", grid)\
               .replace("{%-- suptitle:s --%}",  f"Last {npub:,d} papers" )\
               .replace("{%-- docs:s --%}", docs)\
               .replace("{%-- slides:s --%}", slides)
    
# print(grid, docs, slides)
# print(page)
with open("_build/html/index_npub_grid.html", 'w') as fout:
    fout.write(page)

6  6 publications selected.
