# Publications markdown generator for academicpages

Takes a set of bibtex of publications and converts them for use with [academicpages.github.io](academicpages.github.io). This is an interactive Jupyter notebook ([see more info here](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/what_is_jupyter.html)). 

The core python code is also in `pubsFromBibs.py`. 
Run either from the `markdown_generator` folder after replacing updating the publist dictionary with:
* bib file names
* specific venue keys based on your bib file preferences
* any specific pre-text for specific files
* Collection Name (future feature)

TODO: Make this work with other databases of citations, 
TODO: Merge this with the existing TSV parsing solution

In [51]:
from pybtex.database.input import bibtex
import pybtex.database.input.bibtex 
from time import strptime
import string
import html
import os
import bibtexparser

import re

In [3]:
#todo: incorporate different collection types rather than a catch all publications, requires other changes to template
publist = {
    "journal":{
        "file": "/home/mojko/Documents/PhD_Projects/my_papers.bib",
        "venuekey" : "journal",
        "venue-pretext" : "",
        "collection" : {"name":"publications",
                        "permalink":"/publication/"}
    } 
}

In [52]:
def load_bibtex(filename):
    # Open the BibTeX file
    with open(filename) as bibtex_file:
        # Use bibtexparser to parse the file
        bib_database = bibtexparser.load(bibtex_file)
        
    # Convert to a dictionary
    entries_dict = {}
    for entry in bib_database.entries:
        # Extract the ID or use a unique identifier
        entry_id = entry['ID']
        # Store each entry in the dictionary
        entries_dict[entry_id] = entry

    return entries_dict

In [4]:
html_escape_table = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;"
    }

def html_escape(text):
    """Produce entities within text."""
    return "".join(html_escape_table.get(c,c) for c in text)

In [94]:
import re

def latex_to_unicode(text):
    # Mapping of LaTeX to Unicode
    mappings = {
        r"\\'e": "é", r"\\'a": "á", r"\\'i": "í", r"\\'o": "ó", r"\\'u": "ú",
        r"\\`e": "è", r"\\`a": "à", r"\\`i": "ì", r"\\`o": "ò", r"\\`u": "ù",
        r"\\^e": "ê", r"\\^a": "â", r"\\^i": "î", r"\\^o": "ô", r"\\^u": "û",
        r'\\"e': "ë", r'\\"a': "ä", r'\\"i': "ï", r'\\"o': "ö", r'\\"u': "ü", r"\\'y": "ý",
        r"\\~n": "ñ", r"\\c{c}": "ç", r"\\v{s}": "š", r"\\v{z}": "ž",
        r"\\ae": "æ", r"\\oe": "œ", r"\\aa": "å", r"\\o": "ø", r"\\ss": "ß",
        # Adding mappings for the braces version
        r"\\'{e}": "é", r"\\'{a}": "á", r"\\'{i}": "í", r"\\'{o}": "ó", r"\\'{u}": "ú", r"\\'{y}": "ý",r"\\'{i}": "í",
        r"\\`{e}": "è", r"\\`{a}": "à", r"\\`{i}": "ì", r"\\`{o}": "ò", r"\\`{u}": "ù",
        r"\\^{e}": "ê", r"\\^{a}": "â", r"\\^{i}": "î", r"\\^{o}": "ô", r"\\^{u}": "û",
        r'\\"{e}': "ë", r'\\"{a}': "ä", r'\\"{i}': "ï", r'\\"{o}': "ö", r'\\"{u}': "ü",
    }

    # Regular expression to match LaTeX special characters
    regex = re.compile(
        r"\\['`^\"~]{?[a-zA-Z]}?|\\[cvo]['{][a-zA-Z]{1,2}['}]?|\\ss|\\[aeo]e|\\aa|\\o"
    )
    
    def replace(match):
        # Default to the original if no mapping found
        return mappings.get(match.group(0), match.group(0))
    
    return regex.sub(replace, text)


def filter_alphabetic(input_string):
    # Use a list comprehension to filter only alphabetic characters
    filtered_string = ''.join([char for char in input_string if char.isalpha() or char ==' ' or char ==','])
    return filtered_string

In [95]:
file = "/home/mojko/Documents/PhD_Projects/my_papers.bib"

In [96]:
#for pubsource in publist:
parser = bibtex.Parser()
#bibdata = parser.parse_file(file)
bibdata = load_bibtex(file)

In [97]:
bibdata

{'Mutny2018b': {'year': '2018',
  'url': 'https://las.inf.ethz.ch/files/Mutny2018b.pdf',
  'title': 'Efficient High Dimensional Bayesian Optimization with Additivity and Quadrature Fourier Features',
  'month': 'December',
  'booktitle': 'Neural and Information Processing Systems (NeurIPS)',
  'author': "Mutn{\\'y}, Mojmir and Krause, Andreas",
  'abstract': 'We develop an efficient and provably no-regret Bayesian optimization (BO) algorithm for optimization of black-box functions in high dimensions. We assume a generalized additive model with possibly overlapping variable groups. When the groups do not overlap, we are able to provide the first provably no-regret\\emph {polynomial time}(in the number of evaluations of the acquisition function) algorithm for solving high dimensional BO. To make the optimization efficient and feasible, we introduce a novel deterministic Fourier Features approximation based on numerical integration with detailed analysis for the squared exponential kernel

In [98]:
bibdata['Mutny2018b']

{'year': '2018',
 'url': 'https://las.inf.ethz.ch/files/Mutny2018b.pdf',
 'title': 'Efficient High Dimensional Bayesian Optimization with Additivity and Quadrature Fourier Features',
 'month': 'December',
 'booktitle': 'Neural and Information Processing Systems (NeurIPS)',
 'author': "Mutn{\\'y}, Mojmir and Krause, Andreas",
 'abstract': 'We develop an efficient and provably no-regret Bayesian optimization (BO) algorithm for optimization of black-box functions in high dimensions. We assume a generalized additive model with possibly overlapping variable groups. When the groups do not overlap, we are able to provide the first provably no-regret\\emph {polynomial time}(in the number of evaluations of the acquisition function) algorithm for solving high dimensional BO. To make the optimization efficient and feasible, we introduce a novel deterministic Fourier Features approximation based on numerical integration with detailed analysis for the squared exponential kernel. The error of this a

In [99]:

#loop through the individual references in a given bibtex file
for b in bibdata.values():
    #reset default date
    pub_year = "1900"
    pub_month = "01"
    pub_day = "01"
    
    #b = bibdata.values[bib_id].fields
    #print (b)
    try:
        pub_year = f'{b["year"]}'

        # #todo: this hack for month and day needs some cleanup
        if "month" in b.keys(): 
            if(len(b["month"])<3):
                pub_month = "0"+b["month"]
                pub_month = pub_month[-2:]
            elif(b["month"] not in range(12)):
                tmnth = strptime(b["month"][:3],'%b').tm_mon   
                pub_month = "{:02d}".format(tmnth) 
            else:
                pub_month = str(b["month"])
        if "day" in b.keys(): 
            pub_day = str(b["day"])

        pub_date = pub_year+"-"+pub_month+"-"+pub_day
        print (pub_date)
        #strip out {} as needed (some bibtex entries that maintain formatting)
        clean_title = b["title"].replace("{", "").replace("}","").replace("\\","").replace(" ","-")    

        url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
        url_slug = url_slug.replace("--","-")

        md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--","-")
        html_filename = (str(pub_date) + "-" + url_slug).replace("--","-")

        #Build Citation from text
        citation = ""
        
        for author in bibdata.entries[bib_id].persons["author"]:
            print (author.first_names[0], author.last_names[0])
            citation += f" {author.first_names[0]} {author.last_names[0]}, "
            #citation += author.first_names[0]+" "+author.last_names[0]+", "

            #citation title
        citation = citation + html_escape(b["title"].replace("{", "").replace("}","").replace("\\","")) + ".\""

        #add venue logic depending on citation type
        if 'booktitle' in b.keys():
            venue =b['booktitle']
        elif 'journal' in b.keys():
            venue =b['journal']
        elif b['type']=='preprint':
            venue = 'preprint'
        elif b['type']=='phd':
            venue = 'PhD Thesis, ETH Zurich'
        else:
            venue = ''
        #venue = ''
        #print (b)
        citation = citation + ", " + pub_year + ". " + venue 
       
        ## YAML variables
        md = "---\ntitle: \""   + html_escape(b["title"].replace("{", "").replace("}","").replace("\\","")) + '"\n'
        
        #md += """collection: """ +  "publication"

        md += """\npermalink: """ +"/publication/"  + html_filename
        
        note = False
        # if "note" in b.keys():
        #     if len(str(b["note"])) > 5:
        #         md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
        #         note = True

        md += "\ndate: " + str(pub_date) 
        md += "\nvenue: '" + html_escape(venue) + "'"
        
        url = False
        if "url" in b.keys():
            if len(str(b["url"])) > 5:
                md += "\nurl: '" + b["url"] + "'"
                url = True

        #md += "\ncitation: '" + html_escape(citation) + "'"

        md += "\n---\n"
        md+="\n"+citation

        if 'abstract' in b.keys():
            md += "\n\n**Abstract**: " + b['abstract']
        

        if url:
            md += "\n\n[Full text here](" + b["url"] + "){:target=\"_blank\"}\n" 
        else:
            md += "\n\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"


        md +="<!--more-->"
        md_filename = os.path.basename(md_filename)

        with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
            f.write(md)
        #print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
    # field may not exist for a reference
    except KeyError as e:
        print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', b["title"][:30],"..."*(len(b['title'])>30),"\"")
        continue

2018-12-01


AttributeError: 'dict' object has no attribute 'entries'

In [102]:

#loop through the individual references in a given bibtex file
for b in bibdata.values():
    #reset default date
    pub_year = "1900"
    pub_month = "01"
    pub_day = "01"
    
    #b = bibdata.values[bib_id].fields
    #print (b)
    print (b)
    try:
        pub_year = f'{b["year"]}'

        # #todo: this hack for month and day needs some cleanup
        if "month" in b.keys(): 
            if(len(b["month"])<3):
                pub_month = "0"+b["month"]
                pub_month = pub_month[-2:]
            elif(b["month"] not in range(12)):
                tmnth = strptime(b["month"][:3],'%b').tm_mon   
                pub_month = "{:02d}".format(tmnth) 
            else:
                pub_month = str(b["month"])
        if "day" in b.keys(): 
            pub_day = str(b["day"])

        pub_date = pub_year+"-"+pub_month+"-"+pub_day
        print (pub_date)
        #strip out {} as needed (some bibtex entries that maintain formatting)
        clean_title = b["title"].replace("{", "").replace("}","").replace("\\","").replace(" ","-")    

        url_slug = re.sub("\\[.*\\]|[^a-zA-Z0-9_-]", "", clean_title)
        url_slug = url_slug.replace("--","-")

        md_filename = (str(pub_date) + "-" + url_slug + ".md").replace("--","-")
        html_filename = (str(pub_date) + "-" + url_slug).replace("--","-")

        #Build Citation from text
        citation = ""
        
        # for author in b["author"]:
        #     #print (author.first_names[0], author.last_names[0])
        #     #citation += f" {author.first_names[0]} {author.last_names[0]}, "
        #     citation =+str(author) + ", "
        #     #citation += author.first_names[0]+" "+author.last_names[0]+", "

            #citation title
        citation = b["author"]
        citation = filter_alphabetic(citation) + ", " + html_escape(b["title"].replace("{", "").replace("}","").replace("\\","")) + ".\""

        #add venue logic depending on citation type
        if 'booktitle' in b.keys():
            venue =b['booktitle']
        elif 'journal' in b.keys():
            venue =b['journal']
        elif b['type']=='preprint':
            venue = 'preprint'
        elif b['type']=='phd':
            venue = 'PhD Thesis, ETH Zurich'
        else:
            venue = ''
        #venue = ''
        #print (b)
        citation = citation + ", " + pub_year + ". " + venue 
       
        ## YAML variables
        md = "---\ntitle: \""   + html_escape(b["title"].replace("{", "").replace("}","").replace("\\","")) + '"\n'
        
        #md += """collection: """ +  "publication"

        md += """\npermalink: """ +"/publication/"  + html_filename
        
        note = False
        # if "note" in b.keys():
        #     if len(str(b["note"])) > 5:
        #         md += "\nexcerpt: '" + html_escape(b["note"]) + "'"
        #         note = True

        md += "\ndate: " + str(pub_date) 
        md += "\nvenue: '" + html_escape(venue) + "'"
        
        url = False
        if "url" in b.keys():
            if len(str(b["url"])) > 5:
                md += "\nurl: '" + b["url"] + "'"
                url = True

        #md += "\ncitation: '" + html_escape(citation) + "'"

        md += "\n---\n"
        md+="\n"+citation

        if 'abstract' in b.keys():
            md += "\n\n**Abstract**: " + b['abstract']
        

        if url:
            md += "\n\n[Full text here](" + b["url"] + "){:target=\"_blank\"}\n" 
        else:
            md += "\n\nUse [Google Scholar](https://scholar.google.com/scholar?q="+html.escape(clean_title.replace("-","+"))+"){:target=\"_blank\"} for full citation"


        md +="<!--more-->"
        md_filename = os.path.basename(md_filename)
        print (md)
        with open("../_publications/" + md_filename, 'w', encoding="utf-8") as f:
            f.write(md)
        #print(f'SUCESSFULLY PARSED {bib_id}: \"', b["title"][:60],"..."*(len(b['title'])>60),"\"")
    # field may not exist for a reference
    except KeyError as e:
        print(f'WARNING Missing Expected Field {e} from entry {bib_id}: \"', b["title"][:30],"..."*(len(b['title'])>30),"\"")
        continue

{'year': '2018', 'url': 'https://las.inf.ethz.ch/files/Mutny2018b.pdf', 'title': 'Efficient High Dimensional Bayesian Optimization with Additivity and Quadrature Fourier Features', 'month': 'December', 'booktitle': 'Neural and Information Processing Systems (NeurIPS)', 'author': "Mutn{\\'y}, Mojmir and Krause, Andreas", 'abstract': 'We develop an efficient and provably no-regret Bayesian optimization (BO) algorithm for optimization of black-box functions in high dimensions. We assume a generalized additive model with possibly overlapping variable groups. When the groups do not overlap, we are able to provide the first provably no-regret\\emph {polynomial time}(in the number of evaluations of the acquisition function) algorithm for solving high dimensional BO. To make the optimization efficient and feasible, we introduce a novel deterministic Fourier Features approximation based on numerical integration with detailed analysis for the squared exponential kernel. The error of this approxi