In [1]:
### STEP 1. Preamble
import coloredlogs
import ipdb
import json
import logging
import os
import pylatex
import pylatexenc
from pylatexenc import latexencode  # line is necessary
import re
import subprocess
import tkinter
import tkinter.filedialog

verbose = True

### STEP 2. Set up logger
log = logging.getLogger(__name__)
if verbose:
    coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level=logging.DEBUG, logger=log)
else:
    coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO, logger=log)

In [2]:
### STEP 3. Read JSON infile
tkinter.Tk().withdraw() ## Prevent root window
in_fn = tkinter.filedialog.askopenfilename()

output_fn = in_fn.strip(".json")

with open(in_fn, 'r') as file:
    masterDict = json.load(file)   

In [1]:
### STEP 4. Writing results to output LaTeX document
action = "writing output to LaTeX file"
log.info("%s" % action)
geometry_options = {'tmargin': '2cm', 'lmargin': '2cm', 'bmargin': '2cm', 'rmargin': '2cm'}
doc = pylatex.Document(geometry_options=geometry_options)#, inputenc = 'utf8x')#, fontenc='T1')
with doc.create(pylatex.Section('Results of literature mining')):
    
    for uid, article in masterDict.items():
        if uid == 'pubmed_query':
            continue
        if article['article_result_paragraphs']:
            with doc.create(pylatex.Subsection(article['article_title'])):
                doc.append('Year: %s | PubMed ID: %s | PubMedCentral ID: %s\n\n' % (article['article_publdate']['Year'], article['article_pmid'], article['article_pmcid']))
                doc.append('Keywords: %s\n\n' % (article['article_keywords']))
                doc.append('MeSH: %s\n\n' % (article['article_mesh']))
                
                # Fuse all paragraphs into single string
                main_text = '\n\n'.join(article['article_result_paragraphs'])

                # Conducting case-insensitive string replacements
                for gene in article['found_genes']:
                    if gene.casefold() in main_text.casefold():
                        gene_repl = '___' + gene + '|||'
                        main_text = re.sub(gene, gene_repl, main_text, flags=re.IGNORECASE)
                for toxin in article['found_toxins']:
                    if toxin.casefold() in main_text.casefold():
                        toxin_repl = '$$$' + toxin + '!!!'
                        main_text = re.sub(toxin, toxin_repl, main_text, flags=re.IGNORECASE)
                main_text = main_text.replace(" gene ", ' +++'+"gene"+'=== ')
                main_text = main_text.replace(" gene,", ' +++'+"gene"+'===,')
                main_text = main_text.replace(" gene.", ' +++'+"gene"+'===.')
                    
                # Enforce a character encoding, if any
                #main_text = main_text.encode('utf8')
                main_text = pylatexenc.latexencode.unicode_to_latex(main_text)
                doc.append(pylatex.utils.NoEscape(main_text))  ## NoEscape tells pylatex your string is already encoded as latex and not to encode it again

                #doc.append(main_text)

#doc.packages.append(pylatex.Package('xcolor'))compiling LaTeX file to PDF
doc.preamble.append(pylatex.Command('usepackage', 'helvet'))
doc.preamble.append(pylatex.NoEscape('\\usepackage[rgb,dvipsnames]{xcolor}'))
doc.preamble.append(pylatex.NoEscape('\\usepackage[utf8]{inputenc}'))
doc.preamble.append(pylatex.NoEscape(r'\renewcommand{\familydefault}{\sfdefault}'))

NameError: name 'log' is not defined

In [2]:
'''
# Until a better solution has been found:
# START
doc.preamble.append(pylatex.Command('usepackage', 'newunicodechar'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{²}{\ensuremath{{}^2}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{ }{\,}')) ## (U+2009)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{ }{\,}')) ## (U+2005)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{ }{\,}')) ## (U+200A)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{​}{\,}')) ## (U+200B)
doc.preamble.append(pylatex.NoEscape(r"\newunicodechar{′}{'}"))
doc.preamble.append(pylatex.NoEscape(r"\newunicodechar{ʹ}{'}")) ## (U+02B9)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{≈}{$\approx$}')) ## (U+2248)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{∼}{$\sim$}')) ## (U+223C)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{−}{\ensuremath{-}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{∶}{\ensuremath{:}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{≤}{\ensuremath{\leq}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{≥}{\ensuremath{\geq}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{α}{\ensuremath{\alpha}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{β}{\ensuremath{\beta}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{Δ}{\ensuremath{\Delta}}')) ## (U+0394)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{∆}{\ensuremath{\Delta}}')) ## (U+2206)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{△}{\ensuremath{\Delta}}')) ## (U+25B3)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{λ}{\ensuremath{\lambda}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{μ}{\ensuremath{\mu}}'))
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{ε}{\ensuremath{\epsilon}}')) ## (U+03B5)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{κ}{\ensuremath{\kappa}}')) ## (U+03BA)
doc.preamble.append(pylatex.NoEscape(r'\newunicodechar{γ}{\ensuremath{\gamma}}')) ## (U+03B3)
# END
'''

'\n# Until a better solution has been found:\n# START\ndoc.preamble.append(pylatex.Command(\'usepackage\', \'newunicodechar\'))\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{²}{\\ensuremath{{}^2}}\'))\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{\u2009}{\\,}\')) ## (U+2009)\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{\u2005}{\\,}\')) ## (U+2005)\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{\u200a}{\\,}\')) ## (U+200A)\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{\u200b}{\\,}\')) ## (U+200B)\ndoc.preamble.append(pylatex.NoEscape(r"\newunicodechar{′}{\'}"))\ndoc.preamble.append(pylatex.NoEscape(r"\newunicodechar{ʹ}{\'}")) ## (U+02B9)\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{≈}{$\x07pprox$}\')) ## (U+2248)\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{∼}{$\\sim$}\')) ## (U+223C)\ndoc.preamble.append(pylatex.NoEscape(r\'\newunicodechar{−}{\\ensuremath{-}}\'))\ndoc.preamble.append(pylatex.NoEscape(r\'\new

In [None]:
#doc.generate_pdf(output_fn, clean_tex=False)
doc.generate_tex(output_fn)

In [4]:
### STEP 5. Writing results to output LaTeX document
action = "postprocessing the LaTeX file"
log.info("%s" % action)
with open(output_fn+'.tex', 'r') as file:
    tex_doc = file.read()
    # Gene highlights
    tex_doc = tex_doc.replace('\_\_\_', '\colorbox{blue!30}{')
    tex_doc = tex_doc.replace('|||', '}')
    # Toxin highlights
    tex_doc = tex_doc.replace('\$\$\$', '\colorbox{red!30}{')
    tex_doc = tex_doc.replace('!!!', '}')
    # Keyword highlights
    tex_doc = tex_doc.replace('+++', '\colorbox{OliveGreen!30}{')
    tex_doc = tex_doc.replace('===', '}')
    ## Special unicode characters
    tex_doc = tex_doc.replace('Ⅱ', 'II')
    
with open(output_fn+'.tex', 'w') as file:
    file.write(tex_doc)
    
action = "compiling LaTeX file to PDF"
log.info("%s" % action)
subprocess.run(["pdflatex", output_fn+'.tex'], 
               stderr=subprocess.DEVNULL,
               stdout=subprocess.DEVNULL)

2022-07-28 09:24:10 [INFO] postprocessing the LaTeX file
2022-07-28 09:24:10 [INFO] compiling LaTeX file to PDF


CompletedProcess(args=['pdflatex', '/home/mgruenst/tmp/results/litMiningPubmed_Results_2022_07_28_0911.tex'], returncode=0)