In [1]:
import re

import xmltodict

In [2]:
with open('./export-refxml.xml') as fd:
    doc = xmltodict.parse(fd.read())

In [3]:
records = doc['records']
records.keys()

odict_keys(['@retrieved', '@selected', '@start', '@xmlns', '@xmlns:xsi', '@xsi:schemaLocation', 'record'])

In [4]:
entries = records['record']
print(type(entries))
print(entries[0].keys())

<class 'list'>
odict_keys(['bibcode', 'title', 'author', 'journal', 'pubdate', 'link', 'url', 'DOI', 'eprintid'])


## Formatting

### Title

In [5]:
title = entries[0]['title']
print(title)

Understanding the HERA Phase I receiver system with simulations and its impact on the detectability of the EoR delay power spectrum


### Name

In [6]:
def get_lastname_initials(authorlist):
    lastname_initials = []
    for name in authorlist:
        l, fmi = name.split(', ', 1)
        fmi_split = fmi.split(' ', 1)
        try:
            f, mi = fmi.split(' ', 1)
        except ValueError:
            f = fmi
            mi = ''
        fi = f[0].upper() + '.'
        lastname_initials.append((l + ', ' + fi + ' ' + mi).strip())
    return lastname_initials

In [7]:
def parse_authorlist(authorlist, mode=0, n_authors=10, myname='Lastname, F. M.'):
    """
    authorlist: list
        List of strings of names in "Lastname, Firstname M." format.
    mode: {0, 1, 2, 3} or {'first_author_only', 'print_n_author', 'print_to_myname', 'print_all'}
        0: 'first_author_only'
        1: 'print_n_author'
        2: 'print_to_myname'
        3: 'print_all'
        Other values will 'print_all'
    n_author: int, optional
        Number of authors to print to for 'print_n_author' mode.
    myname: str, optional
        Your name in "Lastname, F. M." format. 
        Requied if `mode=2`.
    """
    authorlist_li = get_lastname_initials(authorlist)
    
    # First author; et al., [n authors]
    if (mode == 0 or mode == 'first_author_only'):  
        authorlist_str = '{:s}; et al., [{:d} authors]'.format(
            authorlist_li[0], len(authorlist_li) - 1
        )
    # First author; second author; ...; nth author; et al., [n authors]
    elif (mode == 1 or mode == 'print_n_authors'):
        authorlist_str = '{:s}; et al., [{:d} authors]'.format(
            '; '.join(authorlist_li[:n_authors]), len(authorlist_li) - n_authors
        )
    # First author; second; ...; myname; et al., [n authors]
    elif (mode == 2 or mode == 'print_to_myname'):
        try:
            mypos = authorlist_li.index(myname)
            authorlist_str = '{:s}; et al., [{:d} authors]'.format(
                '; '.join(authorlist_li[:mypos+1]), len(authorlist_li) - mypos
            )
        except ValueError:
            raise ValueError("'{:s}' is not in the author list".format(myname))
    # Print all names
    else:
        authorlist_str = '; '.join(authorlist_li)
    return authorlist_str

In [8]:
parse_authorlist(entries[0]['author'], mode=2, n_authors=5, myname='Beardsley, A. P.')

'Fagnoni, N.; de Lera Acedo, E.; DeBoer, D. R.; Abdurashidova, Z.; Aguirre, J. E.; Alexander, P.; Ali, Z. S.; Balfour, Y.; Beardsley, A. P.; et al., [62 authors]'

### Year

In [9]:
def parse_year(pubdate):
    return pubdate.split()[-1]

In [10]:
parse_year(entries[0]['pubdate'])

'2021'

### Journal

In [11]:
def parse_journal(journal):
    # Use re to repalce unwanted words.
    # See https://stackoverflow.com/a/6117124
    rep = {'Volume': '', 'Vol.': '', 'vol.': '',
           'Issue': '',
           'article': '', 'id.': ''}
    rep = dict((re.escape(k), v) for k, v in rep.items()) 
    pattern = re.compile("|".join(rep.keys()))
    journal = pattern.sub(lambda m: rep[re.escape(m.group(0))], journal)
    # Strip out spaces
    journal = [i.strip() for i in journal.split(',')]
    if len(journal) >= 3:
        journal_str = ', '.join(journal[:3])
    else:
        journal_str = journal
    return journal_str

In [12]:
parse_journal(entries[0]['journal'])

'Monthly Notices of the Royal Astronomical Society, 500, 1'

### All together

In [23]:
def parse_entries(entry, str_pattern='\\texttt{{{title}}}, {author}, {year}, {journal}',
                  mode=0, n_authors=10, myname='Lastname, F. M.', bold_myname=True):
    """
    authorlist: list
        List of strings of names in "Lastname, Firstname M." format.
    mode: {0, 1, 2, 3} or {'first_author_only', 'print_n_author', 'print_to_myname', 'print_all'}
        0: 'first_author_only'
        1: 'print_n_author'
        2: 'print_to_myname'
        3: 'print_all'
        Other values will 'print_all'
    n_author: int, optional
        Number of authors to print to for 'print_n_author' mode.
    myname: str, optional
        Your name in "Lastname, F. M." format. 
        Requied if `mode=2` or if `bold_myname=True`.
    bold_myname: {True, False}, optional
        If true will bold `myname` with `\textbf` if the `myname` exists.
    """
    title = entry['title']
    author = parse_authorlist(
        entry['author'], mode=mode, n_authors=n_authors, myname=myname
    )
    if bold_myname:
        author = author.replace(myname, '\\textbf{{{:s}}}'.format(myname))
    year = parse_year(entry['pubdate'])
    journal = parse_journal(entry['journal'])
    return str_pattern.format(title=title, author=author, year=year, journal=journal)

In [38]:
str_pattern='{{\\textit{{{title}}}\\newline{{}}{author}, {year}, {journal}}}'
entries_str = []
n_entries = len(entries)
for i in range(n_entries):
    s = parse_entries(
        entries[i], str_pattern=str_pattern, 
        mode=0, myname='Kittiwisit, P.', bold_myname=True
    )
    s = '\\cvitem{{({:d})}}'.format(n_entries - i) + s
    print(s)
    entries_str.append(s)

\cvitem{(21)}{\textit{Understanding the HERA Phase I receiver system with simulations and its impact on the detectability of the EoR delay power spectrum}\newline{}Fagnoni, N.; et al., [69 authors], 2021, Monthly Notices of the Royal Astronomical Society, 500, 1}
\cvitem{(20)}{\textit{Redundant-baseline calibration of the hydrogen epoch of reionization array}\newline{}Dillon, J. S.; et al., [78 authors], 2020, Monthly Notices of the Royal Astronomical Society, 499, 4}
\cvitem{(19)}{\textit{Measuring HERA's Primary Beam in Situ: Methodology and First Results}\newline{}Nunhokee, C. D.; et al., [66 authors], 2020, The Astrophysical Journal, 897, 1}
\cvitem{(18)}{\textit{Detection of cosmic structures using the bispectrum phase. II. First results from application to cosmic reionization using the Hydrogen Epoch of Reionization Array}\newline{}Thyagarajan, N.; et al., [71 authors], 2020, Physical Review D, 102, 2}
\cvitem{(17)}{\textit{Imaging and Modeling Data from the Hydrogen Epoch of Rei

In [39]:
for i in entries_str:
    print(i)

\cvitem{(21)}{\textit{Understanding the HERA Phase I receiver system with simulations and its impact on the detectability of the EoR delay power spectrum}\newline{}Fagnoni, N.; et al., [69 authors], 2021, Monthly Notices of the Royal Astronomical Society, 500, 1}
\cvitem{(20)}{\textit{Redundant-baseline calibration of the hydrogen epoch of reionization array}\newline{}Dillon, J. S.; et al., [78 authors], 2020, Monthly Notices of the Royal Astronomical Society, 499, 4}
\cvitem{(19)}{\textit{Measuring HERA's Primary Beam in Situ: Methodology and First Results}\newline{}Nunhokee, C. D.; et al., [66 authors], 2020, The Astrophysical Journal, 897, 1}
\cvitem{(18)}{\textit{Detection of cosmic structures using the bispectrum phase. II. First results from application to cosmic reionization using the Hydrogen Epoch of Reionization Array}\newline{}Thyagarajan, N.; et al., [71 authors], 2020, Physical Review D, 102, 2}
\cvitem{(17)}{\textit{Imaging and Modeling Data from the Hydrogen Epoch of Rei

In [28]:
'{{\\texttt{{{title}}}, {author}, {year}, {journal}}}'

'{{\\texttt{{{title}}}, {author}, {year}, {journal}}}'