# Publications markdown generator for academicpages

Takes a set of bibtex of publications and converts them for use with [academicpages.github.io](academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). 

The core python code is also in `pubsFromBibs.py`. 
Run either from the `markdown_generator` folder after replacing updating the publist dictionary with:
* bib file names
* specific venue keys based on your bib file preferences
* any specific pre-text for specific files
* Collection Name (future feature)

TODO: Make this work with other databases of citations, 
TODO: Merge this with the existing TSV parsing solution

In [1]:
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex 
from time import strptime
import string
import html
import os
import re

ModuleNotFoundError: No module named 'pybtex'

In [6]:
#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
publist = {
    "journal":{
        "file": "pubs.bib",
        "venuekey" : "journal",
        "venue-pretext" : "",
        "collection" : {"name":"publications",
                        "permalink":"/publication/"}
    },
#     "proceeding": {
#         "file" : "proceedings.bib",
#         "venuekey": "booktitle",
#         "venue-pretext": "In the proceedings of ",
#         "collection" : {"name":"publications",
#                         "permalink":"/publication/"}
        
#     }
}

In [18]:
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;"
    }

def html_escape(text):
    """Produce entities within text."""
    return ""#.join(html_escape_table.get(c,c) for c in text)

In [19]:
def stripchar_plain(s):
    return s#.replace("{", "").replace("}","").replace("\\","").replace(" ","-").replace("^","-").replace("--","-").replace('"','')
def stripchar_rich(s):
    return s#.replace("{", "").replace("}","").replace("\\","").replace("^","-").replace("--","-").replace("$","").replace('"','')


In [20]:
for pubsource in publist:
    parser = bibtex.Parser()
    bibdata = parser.parse_file(publist[pubsource]["file"])
    bibdata.entries

In [21]:
for f in os.listdir('../_publications'):
    os.remove(os.path.join('../_publications', f))

    
for pubsource in publist:
    parser = bibtex.Parser()
    bibdata = parser.parse_file(publist[pubsource]["file"])

    #loop through the individual references in a given bibtex file
    for bib_id in bibdata.entries:
        #reset default date
        pub_year = "1900"
        pub_month = "01"
        pub_day = "01"
        
        b = bibdata.entries[bib_id].fields
        
        try:
            pub_year = f'{b["year"]}'

            #todo: this hack for month and day needs some cleanup
            if "month" in b.keys(): 
                if(len(b["month"])<3):
                    pub_month = "0"+b["month"]
                    pub_month = pub_month[-2:]
                elif(b["month"] not in range(12)):
                    tmnth = strptime(b["month"][:3],'%b').tm_mon   
                    pub_month = "{:02d}".format(tmnth) 
                else:
                    pub_month = str(b["month"])
            if "day" in b.keys(): 
                pub_day = str(b["day"])

                
            pub_date = pub_year+"-"+pub_month+"-"+pub_day
            
            #strip out {} as needed (some bibtex entries that maintain formatting)
            clean_title = stripchar_plain(b["title"])

            
            url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
            url_slug = url_slug.replace("--","-")

            md_filename = stripchar_plain(str(pub_date) + "-" + url_slug + ".md")
            html_filename = stripchar_plain(str(pub_date) + "-" + url_slug)

            
            
            #########################################
            # citation authors
            author_citation = ""
            authors = bibdata.entries[bib_id].persons["author"]
            for author in authors:
                author_citation += " "\
                            +stripchar_rich(author.first_names[0])+" "\
                            +stripchar_rich(author.last_names[0])+", "
                
            if len(authors) > 10:
                author_citation = stripchar_rich(authors[0].first_names[0])+" "\
                            +stripchar_rich(authors[0].last_names[0])+" and others, "

            ##########################################
            # Build Citation from text
            citation = ""

            # citation title
            citation += "\"" + stripchar_rich(b["title"]) + "\","
            
            # citation author list
            citation += author_citation 
            
            # add venue logic depending on citation type
            venue = publist[pubsource]["venue-pretext"]+stripchar_rich(b[publist[pubsource]["venuekey"]])
            
            if venue == 'prd':
                venue = '<strong>PRD</strong>'
            if venue == 'prl':
                venue = '<strong>PRL</strong>'
            if venue == 'arXiv e-prints':
                venue = '<em>arXiv preprint</em>'

            citation += " " + html_escape(venue)
            citation += ", " + pub_year

            ##########################################
            ## YAML variables
            md = "---\ntitle: \""   + stripchar_rich(b["title"])+ '"\n'

            md += """authors: """   + html_escape(author_citation)[:-2] + '\n'

            md += """collection: """ +  publist[pubsource]["collection"]["name"]

            md += """\npermalink: """ + publist[pubsource]["collection"]["permalink"]  + html_filename
            
            note = False
            if "note" in b.keys():
                if len(str(b["note"])) > 5:
                    md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
                    note = True

            md += "\ndate: " + str(pub_date) 

            md += "\nvenue: '" + html_escape(venue) + "'"
            
            url = False
            if "url" in b.keys():
                if len(str(b["url"])) > 5:
                    md += "\npaperurl: '" + b["url"] + "'"
                    url = True
            
            md += "\ncitation: '" + citation + ", '"

            md += "\neprint: '" + html_escape(b["eprint"]) +"'\n"

            md += "---"
                
            md_filename = os.path.basename(md_filename)

            with open("../_publications/" + md_filename, 'w') as f:
                f.write(md)
            print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
        # field may not exist for a reference
        except KeyError as e:
            print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', b["title"][:30],"..."*(len(b['title'])>30),"\"")
            continue


SUCESSFULLY PARSED 2022arXiv220704137A: " {DarkNews: a Python-based event generator for heavy neutral  ... "
SUCESSFULLY PARSED 2022arXiv220607100K: " {Dipole-Coupled Neutrissimo Explanations of the MiniBooNE Ex ... "
SUCESSFULLY PARSED 2022arXiv220512273A: " {A New Way To Seek Out Dark Neutrino Sectors And To Boldly E ... "
SUCESSFULLY PARSED 2022arXiv220102603H: " {Dark sectors in neutron-shining-through-a-wall and nuclear  ... "
SUCESSFULLY PARSED 2022PhRvL.128x1802A: " {MicroBooNE and the {\ensuremath{\nu}}$_{e}$ Interpretation  ... "
SUCESSFULLY PARSED 2021arXiv210903831A: " {Heavy neutral leptons below the kaon mass at hodoscopic det ... "
SUCESSFULLY PARSED 2020RPPh...83l4201A: " {New opportunities at the next-generation neutrino experimen ... "
SUCESSFULLY PARSED 2020arXiv201202142H: " {Novel multi-lepton signatures of dark sectors in light meso ... "
SUCESSFULLY PARSED 2020PhRvD.102e5016H: " {Pair production of dark particles in meson decays}  "
SUCESSFULLY PARSED 2021PhRvD.10

In [15]:
parser = bibtex.Parser()
bibdata = parser.parse_file(publist[pubsource]["file"])

#loop through the individual references in a given bibtex file
for bib_id in bibdata.entries:
    #reset default date
    pub_year = "1900"
    pub_month = "01"
    pub_day = "01"

    b = bibdata.entries[bib_id].fields

In [16]:
import urllib.request, json
import numpy as np

In [8]:
#!/usr/bin/env python3

"""
Given an author identified by his/her BAI, this Python3 script counts the number of
citations and the number of citations excluding self cites in the Inspirehep database
(https://inspirehep.net/) for each author's paper.
Additionally, it allows saving a snapshot for later detection of new/removed papers and
change in the number of citations of individual papers
Built on & inspired by https://github.com/efranzin/python
"""

AUTHOR             = 'M.Hostert.1'
MAX_NUM_PAPERS     = 1000       #Number of papers requested from INSPIRE-HEP
SHORT_TITLE_LENGTH = 50         #Shorten long paper titles
NEED_WRITE_CONFIRM = True       #Whether to ask user for permission to save to disk
FILENAME           = 'old_biblio.npy'

# Import the modules to open and reading URLs and the JSON encoder

# Open the INSPIRE-HEP profile
inspirehep_profile = 'https://inspirehep.net/api/literature?sort=mostrecent&size=' + \
                        str(MAX_NUM_PAPERS) + '&q=a%20' + AUTHOR

# Load the data
data     = json.loads(urllib.request.urlopen(inspirehep_profile).read())
num_hits = data['hits']['total']

# Data type to store paper id, beginning of the title, number of citations and number of
# citations without self-citations
bibliography_dtype = np.dtype([
                        ('id',          np.int64),
                        ('title',       np.unicode_, SHORT_TITLE_LENGTH),
                        ('cits',        np.int64),
                        ('cits_noself', np.int64),
                    ])

# Fill in information about author's papers from the website response
biblio = np.zeros(num_hits, dtype = bibliography_dtype)

for i in range(num_hits):
    biblio[i]['id']          = data['hits']['hits'][i]['id']
    biblio[i]['title']       = data['hits']['hits'][i]['metadata']['titles'][0]['title']
    biblio[i]['cits']        = data['hits']['hits'][i]['metadata']['citation_count']
    biblio[i]['cits_noself'] = data['hits']['hits'][i]['metadata']['citation_count_without_self_citations']

# Print the total number of citations and the total number of citations excluding self cites
print(
        '\nTotal number of citations: ', 
        sum(biblio['cits']), 
        '; Excluding self cites: ', 
        sum(biblio['cits_noself']), 
        '\n',
        sep=''
    )

# Function to save current snapshot of the author's citations
def save_snapshot():
    """
    Saves a current snapshot of the bibliography. 
    If NEED_WRITE_CONFIRM is True, asks the user for permission first.
    """

    if NEED_WRITE_CONFIRM:
        rewrite = input('\nDo you want to save a snapshot [y/n]? ')
        if rewrite != 'y':
            print('Not saved.')
            return

    np.save(FILENAME, biblio)
    print('Saved.')
    return

#If snapshot does not exist, create it (potentially confirming with the user) and exit
from os.path import exists
if not exists(FILENAME):
    save_snapshot()
    exit()

#Load snapshot
old_biblio = np.load(FILENAME)

#Get a set of paper IDs that were added/removed/stayed
new_paper_ids = set(    biblio['id'])
old_paper_ids = set(old_biblio['id'])

added_paper_ids   = new_paper_ids.difference(old_paper_ids)
removed_paper_ids = old_paper_ids.difference(new_paper_ids)
stayed_paper_ids  = new_paper_ids.intersection(old_paper_ids)

#Keep track of whether we had any changes
changes_present = False

#Print information about papers that were added or removed
for i in removed_paper_ids:
    changes_present = True

    idx       = np.argmax(old_biblio['id'] == i)
    title     = old_biblio[idx]['title'] 
    num_cites = old_biblio[idx]['cits']

    if num_cites == 1:
        print('Removed paper: "' + title + '" with ' +  str(num_cites) + ' citation')
    else:
        print('Removed paper: "' + title + '" with ' +  str(num_cites) + ' citations')

for i in added_paper_ids:
    changes_present = True

    idx       = np.argmax(biblio['id'] == i)
    title     = biblio[idx]['title'] 
    num_cites = biblio[idx]['cits']

    if num_cites == 1:
        print('Added paper: "' + title + '" with ' +  str(num_cites) + ' citation')
    else:
        print('Added paper: "' + title + '" with ' +  str(num_cites) + ' citations')

#For papers not added or removed, check if number of citations has changed
for i in stayed_paper_ids:

    idx_old       = np.argmax(old_biblio['id'] == i)
    idx_new       = np.argmax(    biblio['id'] == i)
    title         = biblio[idx_new]['title'] 
    num_new_cites = biblio[idx_new]['cits'] - old_biblio[idx_old]['cits']

    if num_new_cites != 0:
        changes_present = True

        if   num_new_cites == 1:
            print('1 new citation: "' + title + '"')
        elif num_new_cites == -1:
            print('1 citation removed: "' + title + '"')
        elif num_new_cites  > 1:
            print(str(num_new_cites) + ' new citations: "' + title + '"')
        elif num_new_cites  < -1:
            print(str(abs(num_new_cites)) + ' citations removed: "' + title + '"')

#Save current snapshot if anything changed (potentially confirming with the user)
if changes_present:
    save_snapshot()


Total number of citations: 543; Excluding self cites: 299

Not saved.


FileNotFoundError: [Errno 2] No such file or directory: 'old_biblio.npy'

: 

In [None]:
import os
import requests
response = requests.get(f"https://inspirehep.net/api/literature?q=author:{ref}&format=bibtex")
    if response.status_code == 200:
        f.write((response.content).decode("utf-8") )
        added_ids.append(ref)
    else:
        print(f"Could not find Inspire entry for texkey={ref}.")


MY_AUTHOR_ID = '1621061'