#### Step 1. Set up the environment.

In [None]:
# import modules
import os
import re
import json
import random
import tkinter
import zipfile
import textwrap
try:
    import requests
    import html2text
except ModuleNotFoundError as err:
    print('Warning: %s.'%err)
import ipywidgets as iw
from collections import OrderedDict
from IPython.display import display, clear_output, Markdown

# locations of the inputs and outputs, all relative to the working directory
records_file = os.path.join('.', 'savedrecs.txt')
out_dir = os.path.join('.', 'results')
refs_dir, eqns_dir = (os.path.join(out_dir, sub) for sub in ('refs', 'eqns'))
results_file, summary_file = (
    os.path.join(out_dir, name) for name in ('results.zip', 'summary.csv'))

# an existing archive marks a finished run; the cache directories are always ensured
processing_complete = os.path.isfile(results_file)
for cache_dir in (refs_dir, eqns_dir):
    os.makedirs(cache_dir, exist_ok=True)

# configure http header and silence the warnings caused by verify=False
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Gecko/20100101 Firefox/60.0'}
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# test connection status: any request failure (no route, dns, ssl, timeout)
# switches to offline mode, and the timeout keeps the probe from hanging
try:
    requests.get('https://www.google.com/', verify=False, timeout=10)
    online = True
except requests.RequestException:
    print('Connection Error: Proceeding in offline mode.')
    online = False

# configure box alignment
align_center = iw.Layout(align_items='center')

print('Environment setup complete.')

if processing_complete:
    def make_repr(button):
        """Clear the completion flag so the processing cells run again.

        Click handler of repr_button; the button argument is not used.
        """
        global processing_complete
        processing_complete = False
        print('Allowing to reprocess results.')

    # reload the saved records and unpack every other archive member
    # (the equation images) into the equations directory
    print('All processing is complete. Extracting data... ', end='')
    with zipfile.ZipFile(results_file, 'r') as zf:
        for fi in zf.namelist():
            if fi == 'records.json':
                with zf.open(fi, 'r') as f:
                    recs = json.load(f)
            else:
                zf.extract(fi, eqns_dir)
    print('done.')

    processing_complete_message = iw.HTML('''
        <div class="alert alert-block alert-info" style="font-weight: 600">
        Processing is complete &mdash; run step 6 to view results.<br></div>
        ''')

    # `style` is not a Layout trait, so it is no longer passed to iw.Layout
    repr_button = iw.Button(description='Allow to reprocess results',
                            layout=iw.Layout(width='200px'))
    repr_button.on_click(make_repr)

    display(iw.VBox([processing_complete_message, repr_button], layout=align_center))


#### Step 2. Download reference records from Web of Science.

In [None]:
if processing_complete:
    # remind the user that saved results are already loaded
    display(iw.VBox(children=[processing_complete_message], layout=align_center))

# search query, written over several lines and flattened to a single line
query = ' '.join('''
((TS=("network neuroscience")) OR ((TS=("connectom*")) AND (TS=("analy*") OR
TS=("model*"))) OR ((TS=("*brain*") OR TS=("*cort*")) AND (TS=("network theor*") OR
TS=("network analy*") OR TS=("network topolog*") OR TS=("network control*") OR
TS=("graph theor*") OR TS=("complex network*")))) AND (SO=("nature") OR
SO=("science") OR SO=("nature communications") OR
SO=("proceedings of the national academy of sciences of the united states of america")
OR SO=("nature neuroscience") OR SO=("neuron") OR SO=("elife") OR
SO=("plos biology") OR SO=("brain") OR SO=("biological psychiatry")) AND
(PY=("2014") OR PY=("2015") OR PY=("2016") OR PY=("2017") OR PY=("2018")) AND
(DT=("article"))
'''.split('\n'))

# structured version: break lines around grouping parentheses and the
# OR/AND operators, indenting by two spaces per open group
query_structured = indent = ''
for pos, ch in enumerate(query):
    piece = ch
    prev2, prev1 = query[pos-2:pos-1], query[pos-1:pos]
    next1, next2 = query[pos+1:pos+2], query[pos+2:pos+3]

    # break after an opening parenthesis (unless it wraps a quoted term)
    # and after the last letter of an operator
    opens_group = ch == '(' and next1 != '"'
    ends_operator = prev1 + ch == 'OR' or prev2 + prev1 + ch == 'AND'
    if opens_group or ends_operator:
        if ch == '(':
            indent += '  '
        piece = piece + '\n' + (indent[:-1] if next1 == ' ' else indent)

    # break before a closing parenthesis (unless it wraps a quoted term)
    # and before the first letter of an operator
    closes_group = ch == ')' and prev1 != '"'
    starts_operator = ch + next1 == 'OR' or ch + next1 + next2 == 'AND'
    if closes_group or starts_operator:
        if ch == ')':
            indent = indent[:-2]
        piece = '\n' + indent + piece

    query_structured += piece


def _replace_in_quotes(text, old, new):
    """Replace old with new inside the double-quoted parts of text only."""
    parts = text.split('"')
    parts[1::2] = [part.replace(old, new) for part in parts[1::2]]
    return '"'.join(parts)


# compressed version: wrap the text, shielding the spaces inside quoted
# terms with underscores so that no term is split across lines
query_compressed = _replace_in_quotes(query, ' ', '_')
query_compressed = textwrap.fill(query_compressed, width=84, break_long_words=False, break_on_hyphens=False)
query_compressed = _replace_in_quotes(query_compressed, '_', ' ')

# define titles and values of tabs; the 'Search query' entry is a placeholder
# whose tab is filled with the query widget instead of text
shot = '![screenshot](screenshots/step_2%s.png)'
export_text = '\n'.join([
    '',
    'Export all references:',
    '',
    '- Select the _Records_ radio button and enter the values `1` and _`total number of records`_ in the text fields.',
    '- Choose the _Author, Title, Source, Abstract_ option from the _Record Content_ drop-down menu.',
    '- Choose the _Plain Text_ option from the _File Format_ drop-down menu.',
    '- Press the _Send_ button and save the output to: `%s`',
    '',
    shot%'e',
    '']) % os.path.abspath(records_file)

substeps = [
    ['Step 2A',
     'Navigate to the [webofknowledge.com](http://www.webofknowledge.com) website and '
     'click on the _Advanced Search_ link. ' + shot%'a'],
    ['Search query', []],
    ['Step 2B',
     'Paste the _`Search query`_ into the text area and press the _Search_ button.' + shot%'b'],
    ['Step 2C',
     'Click on the number in the _Results_ column to view the results.' + shot%'c'],
    ['Step 2D',
     'Choose the _Save to Other File Formats_ option from the main drop-down menu.' + shot%'d'],
    ['Step 2E', export_text]]

# render both query layouts as preformatted text, one inner tab each
query_menu_structured = iw.Output()
query_menu_compressed = iw.Output()
for out, body in ((query_menu_structured, query_structured),
                  (query_menu_compressed, query_compressed)):
    out.append_display_data(Markdown('<pre>'+body+'</pre>'))
query_menu = iw.Tab(children=[query_menu_structured, query_menu_compressed])
query_menu.set_title(0, 'Structured format')
query_menu.set_title(1, 'Compressed format')

# one outer tab per substep; the second slot holds the query widget
range_l = range(len(substeps))
menu_children = [query_menu if i==1 else iw.Output() for i in range_l]
menu = iw.Tab(children=menu_children)
for i, (title, body) in enumerate(substeps):
    menu.set_title(i, title)
    if i != 1:
        menu.children[i].append_display_data(Markdown(body))

display(menu)


#### Step 3. Parse reference records and get article text.

In [None]:
if processing_complete:
    display(iw.VBox([processing_complete_message], layout=align_center))

else:
    # seconds to wait for any single http request
    timeout = 60

    # get reference metadata
    try:
        with open(records_file, 'r') as f:
            recs = f.readlines()
    except OSError:
        display(iw.VBox([iw.HTML('''
                        <div class="alert alert-block alert-danger" style="font-weight: 600">
                        Cannot open %s<br>
                        Make sure to download this file as described in the previous cell.</div>
                        '''%os.path.abspath(records_file))], layout=align_center))
        # nothing below can run without the records, so stop the cell here
        raise

    # separate metadata by record (the first and last two lines are skipped)
    recs = (''.join(recs[2:-2])).split('\n\n')
    recs = [ri.strip().split('\n') for ri in recs]

    # convert metadata to dictionaries: a line starting with a letter opens a
    # new field whose tag is its first two characters, other lines continue
    # the current field
    for i, ri in enumerate(recs):
        ri = ['<<<'+li[:2]+'>>>'+li[2:] if li[:1].isalpha() else li for li in ri]
        ri = ('\n'.join(ri)).split('<<<')[1:]
        ri = [li.split('>>>', 1) for li in ri]
        recs[i] = OrderedDict(ri)

    # drop empty records (e.g. from an empty export)
    recs = [ri for ri in recs if ri]

    # remove duplicates (records with matching dois), keeping the last
    # occurrence; records without a doi are never treated as duplicates
    seen = set()
    unique = []
    for ri in reversed(recs):
        key = ri.get('DI')
        if key is not None:
            if key in seen:
                continue
            seen.add(key)
        unique.append(ri)
    recs = unique[::-1]

    # loop over references
    for i, ri in enumerate(recs):
        # define accession number, ref filename and journal name
        acc = ri['UT'].split(':')[1].strip()
        ref = os.path.join(refs_dir, 'ref%s.htm'%acc)
        jnl = ' '.join(ri['SO'].upper().replace('\n', ' ').split())

        # download if article doesn't exist; cached articles need no network
        print('Article %d/%d'%(i+1, len(recs)))
        if not os.path.isfile(ref):
            # try to get doi from pubmed (more reliable)
            doi = None
            if online and 'PM' in ri:
                try:
                    idc = 'https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids='+ri['PM'].strip()
                    xml = requests.get(idc, headers=hdr, verify=False, timeout=timeout).content.decode()
                    doi = re.findall(r'(?s)doi="(.*?)"', xml)[0]
                except (IndexError, UnicodeDecodeError, requests.RequestException):
                    doi = None

            # get doi directly from record if pubmed fails
            if doi is None:
                if 'DI' not in ri:
                    raise KeyError('record %s has no DOI and cannot be downloaded'%acc)
                doi = ri['DI'].strip()

            # get url from pii if elsevier
            if jnl in {'BIOLOGICAL PSYCHIATRY', 'NEURON'} and online:
                aid = requests.get('https://dx.doi.org/'+doi, verify=False, timeout=timeout).url.split('/')[-1]
                pii = aid[:5]+'-'+aid[5:9]+'('+aid[9:11]+')'+aid[11:16]+'-'+aid[16]
                if jnl == 'BIOLOGICAL PSYCHIATRY':
                    url = 'https://www.biologicalpsychiatryjournal.com/article/'+pii+'/fulltext'
                else:
                    url = 'https://www.cell.com/neuron/fulltext/'+pii

            # get url from doi otherwise
            else:
                url = 'https://dx.doi.org/'+doi

            # download and save the raw html
            # NOTE(review): the response status is not checked, so an error
            # page would be cached as the article text
            print('Saving to ref: '+ref)
            get = requests.get(url, headers=hdr, verify=False, timeout=timeout)
            with open(ref, 'wb') as f:
                f.write(get.content)

        # store accession number, journal name and html
        ri['UT'] = acc
        ri['SO'] = jnl
        with open(ref, 'rb') as f:
            # undecodable bytes are replaced instead of aborting the run
            ri['HTM'] = f.read().decode(errors='replace')

    print('\n\nArticle text loaded.')
    

#### Step 4. Clean article text formatting.

In [None]:
if processing_complete:
    display(iw.VBox([processing_complete_message], layout=align_center))

else:
    # define auxiliary functions
    def notitle(si):
        """Remove title="..." / title='...' attributes from an html tag."""
        return re.sub(r'(?s)title=(".*?"|\'.*?\')', '', si)

    def dat2src(si):
        """Rename data-src attributes to src."""
        return si.replace('data-src', 'src')

    def isimg(si, jnl):
        """Tell whether the markdown link si ('[alt](url)') is an equation image.

        True when the alt text starts with 'Math', or when the alt text is
        empty and the journal is neither ELIFE nor SCIENCE.
        """
        return (si[1:5]=='Math') or (si[1]==']' and jnl not in {'ELIFE', 'SCIENCE'})

    def img2url(si, jnl):
        """Keep the image marker '!' for equation images and drop it otherwise."""
        return '!'+si if isimg(si, jnl) else si

    def natrhdg(si):
        """Demote '## ' headings to '### '."""
        return si.replace('## ', '### ')

    # html to markdown conversion
    def htm2txt(si):
        """Convert html to markdown without line wrapping, emphasis or autolinks."""
        text_maker = html2text.HTML2Text()
        text_maker.body_width = 0
        text_maker.ignore_emphasis = True
        text_maker.use_automatic_links = False
        return text_maker.handle(si)

    for i, ri in enumerate(recs):
        # get journal name and text
        print('Article %d/%d'%(i+1, len(recs)))
        jnl = ri['SO']
        txt = ri['HTM']

        # remove tables, remove link titles and fix image tags
        txt = re.sub(r'(?s)<table.*?</table>', ' ', txt)
        txt = re.sub(r'(?s)(<a(.*?)>)', lambda fi: notitle(fi.group()), txt)
        txt = re.sub(r'(?s)<img.*?>', lambda fi: dat2src(fi.group()), txt)

        # protect headings and math formatting
        txt = txt.replace('</h2>','</h2>\n\n')
        txt = re.sub(r'(?s)(</){,1}mml:', r'\g<1>', txt)
        txt = re.sub(r'(?s)<math.*?>', '<math>', txt)

        # convert to markdown
        if '<math>' in txt:
            # extract math formatting: each chunk after a <math> tag is cut at
            # its first </math>, so malformed markup cannot misalign the pairs
            t0, *chunks = txt.split('<math>')
            txtm, rest = [], []
            for ti in chunks:
                mi, _, ai = ti.partition('</math>')
                txtm.append(('<math>'+mi+'</math>').replace('>', '> '))
                rest.append(ai)
            txt = htm2txt('[math]'.join([t0] + rest))

            # recover math formatting
            txt = txt.split('[math]')
            t0 = txt.pop(0)
            txt = ''.join([t0] + [''.join(ti) for ti in zip(txtm, txt)])
        else:
            txt = htm2txt(txt)

        # remove inline images except equations
        txt = re.sub(r'(?s)\!(\[.*?\]\(.*?\))', lambda fi: img2url(fi.group(1), jnl), txt)

        # format equations
        eqn_style = 'alt="Equation" style="display:inline-block; vertical-align:bottom"'
        txt = re.sub(r'(?s)\!\[.*?\]\((.*?)\)', r'<img src="\g<1>" '+eqn_style+'>', txt)

        # remove all links
        txt = txt.split('[')
        t0 = txt.pop(0)
        txt = [re.sub(r'(?s)\]\(.*?\)', ']', ti) for ti in txt]
        txt = '['.join([t0] + txt)

        # journal-specific formatting fixes
        if jnl == 'BRAIN':
            # remove duplicate elements
            idx = [fi.start() for fi in re.finditer(r'(?s)\n\n(?:Figure|Table)\s[0-9]{1,2}\n\n', txt)]
            dbl = [txt[a:b] for a, b in zip(idx[:-1], idx[1:]) if txt[a:b] in txt[b:]]
            for di in dbl:
                txt = ('\n'+di+'\n').join(filter(None, txt.split(di)))
            # fix figure and table captions
            txt = re.sub(r'(?s)(Figure\s[0-9]{1,2}\n*?)\[.*?\]\n\n', r'\g<1>', txt)

        elif jnl == 'BIOLOGICAL PSYCHIATRY':
            # fix inline citations
            txt = re.sub(r'(?s)\n*?\[([0-9].*?)\].*?See\sall\sReferences', r'(\g<1>)', txt)
            txt = re.sub(r'(?s)\(+[0-9]{1,3}\)+\(*([0-9]{1,3})\)*', r'(\g<1>)', txt)
            # remove inline table of contents
            txt = re.sub(r'(?s)Jump\sto\sSection.*?(#|\Z)', r'\g<1>', txt)
            # fix figure caption duplications
            cpt = r'Figure\s[0-9]{1,2}[A-Z]{,1}'
            txt = re.sub(r'(?s)(\['+cpt+r'\])'+cpt, r'\g<1>', txt)
            cpt = r'(?:Figures{,1}\sS|Tables{,1}\sS|Supplement\s)[0-9]{1,2}[A-Z]{,1}'
            txt = re.sub(r'(?s)('+cpt+')'+cpt, r'\g<1>', txt)

        elif jnl == 'NEURON':
            # fix inline citations
            yyyy = '[12][90][0-9][0-9][a-z]{,1}'
            txt = re.sub(r'(?s)(\({,1}'+yyyy+r'\){,1})\n\n.{,1000}?\[Google\sScholar\]', r'\g<1> ', txt)
            # fix figure caption display
            txt = re.sub(r'(?s)(\n\n(?:Figure|Table)\s[0-9]{1,2})([A-Za-z]{2,})', r'\g<1>. \g<2>', txt)
            txt = re.sub(r'(\[Download \(PPT\)\]).*?\n', r'\g<1>', txt)

        elif jnl == 'PLOS BIOLOGY':
            # fix image urls
            txt = txt.replace('<img src="article/', '<img src="https://journals.plos.org/plosbiology/article/')

        elif jnl == 'NATURE':
            # demote the first Editorial Summary heading by one level and
            # promote the '## ' headings that follow it
            head, sep, tail = txt.partition('\n# Editorial Summary')
            if sep:
                txt = head + '\n## Editorial Summary' + tail.replace('\n## ', '\n# ')

        # clean up space and escape asterisks
        txt = re.sub(r'(?s)\s+(\]|\))', r'\g<1>', txt)
        txt = re.sub(r'(?s)(\[|\()\s+', r'\g<1>', txt)
        txt = re.sub(r'(?s)(\]|\))\s+(\[|\()', r'\g<1> \g<2>', txt)
        txt = re.sub(r'(?s)\s+(\.|:|;|,)', r'\g<1>', txt)
        txt = re.sub(r'(?s)(;|,)\s+', r'\g<1> ', txt)
        txt = re.sub(r'(?s)(\n\n)\n+', r'\g<1>', txt)
        txt = txt.replace('*', r'\*')

        ri['TXT'] = txt

    print('\n\nArticle formatting cleaned.')
    

#### Step 5. Filter sections, highlight relevant terms, and download equation images.

In [None]:
if processing_complete:
    display(iw.VBox([processing_complete_message], layout=align_center))

else:
    # set of article sections to be removed
    null = {r'ABOUT THIS ARTICLE', r'ABSTRACT', r'ACCESSION NUMBERS', r'ACCESSIONS', 
            r'ACKNOWLEDGEMENT', r'ACKNOWLEDGEMENTS', r'ACKNOWLEDGEMENTS AND DISCLOSURES', 
            r'ACKNOWLEDGMENTS', r'ACKNOWLEDGMENTS AND DISCLOSURES', r'ADDITIONAL INFORMATION', 
            r'AFFILIATIONS', r'APPENDIX A SUPPLEMENTARY MATERIAL', 
            r'APPENDIX A SUPPLEMENTARY MATERIALS', r'APPENDIX A SUPPLEMENTARY METERIALS', 
            r'APPENDIX A SUPPORTING INFORMATION', r'ARTICLE AND AUTHOR INFORMATION', 
            r'ARTICLE INFO', r'ARTICLE OUTLINE', r'ARTICLE RELATED AUDIO', 
            r'ARTICLE RELATED VIDEO', r'ARTICLE TOOLS', r'ARTICLES', r'AUTHOR CONTRIBUTIONS', 
            r'AUTHOR INFORMATION', r'AUTHOR NOTES', r'AUTHOR RESPONSE', r'AUTHOR SUMMARY', 
            r'AUTHORS', r'AVAILABILITY OF DATA AND CODE', 
            r'BE THE FIRST TO READ NEW ARTICLES FROM ELIFE', r'CHANGE HISTORY', 
            r'CITATION MANAGER FORMATS', r'CODE AVAILABILITY', r'COMMENTS', 
            r'COMPETING INTERESTS', r'CONCLUSION', r'CONCLUSIONS', r'CONFLICT OF INTEREST', 
            r'CONSORTIA', r'COPYRIGHT NOTICE', r'CORRESPONDING AUTHOR', 
            r'CORRESPONDING AUTHORS', r'CSV FILES', r'DATA AVAILABILITY', r'DECISION LETTER', 
            r'DISCUSSION', r'DOWNLOAD LINKS', r'EDITORIAL SUMMARY', 
            r'ELECTRONIC SUPPLEMENTARY MATERIAL', r'ELIFE DIGEST', r'EXCEL FILES', 
            r'EXTENDED DATA', r'EXTENDED DATA FIGURES', r'EXTENDED DATA FIGURES AND TABLES', 
            r'EXTENDED DATA TABLES', r'FIGURES', r'FOOTNOTES', r'FUNDING', r'FURTHER READING', 
            r'HIGHLIGHTS', r'INFORMATION', r'INTEGRATED SUPPLEMENTARY INFORMATION', 
            r'INTRODUCTION', r'JUMP TO SECTION', r'KEYWORDS', r'LINKED ARTICLES', r'MAIN', 
            r'MAIN MENU', r'METRICS', r'NATURE', r'NATURE COMMUNICATIONS', 
            r'NATURE COMMUNICATIONS MENU', r'NATURE MENU', r'NATURE NEUROSCIENCE', 
            r'NATURE NEUROSCIENCE MENU', r'NATURECOM SITEMAP', r'NEW RESEARCH IN', 
            r'NMHS SOURCE CODE', r'NOT PERMITTED', r'PDF FILES', r'PERMITTED', r'PNAS PORTALS', 
            r'REFERENCES', r'REFERENCES AND NOTES', r'RELATED ARTICLES', 
            r'RIGHTS AND PERMISSIONS', r'SEARCH', r'SI DISCUSSION', 
            r'SIGN UP FOR ARTICLE ALERTS', r'SIGNIFICANCE', r'SOURCE DATA', r'SUMMARY', 
            r'SUMMARY AND CONCLUSIONS', r'SUPPLEMENTAL INFORMATION', r'SUPPLEMENTARY DATA', 
            r'SUPPLEMENTARY FIGURES', r'SUPPLEMENTARY INFORMATION', r'SUPPLEMENTARY MATERIAL', 
            r'SUPPLEMENTARY MATERIALS', r'SUPPORTING INFORMATION', r'USER MENU', r'VIDEOS', 
            r'YOU MAY ALSO BE INTERESTED IN', r'ZIP FILES', r'[SCIENCE]', 
            r'[SHOW]ARTICLE INFO'}

    # regex fragments of the terms to be highlighted
    term = {r'centralit', r'clubs{,1}\b', r'cluster[ie]', r'communit', r'controllab', r'cores{,1}\b', 
            r'degree', r'distan', r'divers', r'efficien', r'flexib', r'graph', r'hub', r'modular', 
            r'module', r'motif', r'participat', r'paths{,1}\b', r'strength', r'topolog', r'world'}

    def spanbgc(i):
        """Return an opening span tag: light yellow when i is truthy, pink otherwise."""
        return '<span style="background-color: '+('lightyellow' if i else 'pink')+'">'

    # highlighting pattern and replacement (built once for all articles): a
    # term preceded by whitespace or a hyphen, extended to the end of the word;
    # the terms are sorted so that the pattern is the same on every run
    fnd = r'(?is)([\s\-](?:'+'|'.join(sorted(term))+r')[a-zA-Z\-]*)'
    rpl = spanbgc(0)+r'\g<1></span>'

    for i, ri in enumerate(recs):
        # get journal name and article text
        print('Article %d/%d'%(i+1, len(recs)))
        jnl = ri['SO']
        sec = ri['TXT']
        acc = ri['UT']

        # download equations and modify markdown code; only images after the
        # first '## ' heading are considered (none if there is no heading)
        for ui in re.findall(r'(?s)<img.*?src="(.*?)"', sec.partition('## ')[2]):
            # skip site-relative urls
            if re.match(r'/\w', ui):
                continue
            url = re.sub(r'(?s)^(https{,1}:){,1}/*', 'http://', ui)
            eqn = os.path.join(eqns_dir, re.sub(r'(?s)(\W|_)+', '_', 'ref%s_'%acc+url))
            if not os.path.isfile(eqn):
                print('Saving eqn: '+url)
                get = requests.get(url, headers=hdr, verify=False, timeout=60)
                with open(eqn, 'wb') as f:
                    f.write(get.content)
            # a function replacement inserts the local path verbatim, so
            # backslashes in the path need no escaping
            sec = re.sub(r'(?s)(<img.*?src=)"'+re.escape(ui)+'"',
                         lambda fi: fi.group(1)+'"'+eqn+'"', sec)

        # disable autolinks
        sec = sec.replace('http://', 'h<span>tt</span>p://')
        sec = sec.replace('https://', 'h<span>tt</span>ps://')
        sec = sec.replace('www.', 'w<span>ww</span>.')

        # split text by paragraphs
        sec = re.sub(' *\n *', '\n', sec).split('\n\n')
        sec = [pi.strip() for pi in sec]

        # highlight terms in pink
        for j, pi in enumerate(sec):
            # exclude headings
            if pi.startswith('#'):
                continue
            # protect math: only the text outside <math>...</math> is searched
            if '<math>' in pi:
                t0, *chunks = pi.split('<math>')
                out = [re.sub(fnd, rpl, ' '+t0)]
                for ti in chunks:
                    mi, _, ai = ti.partition('</math>')
                    out.append('<math>'+mi+'</math>'+re.sub(fnd, rpl, ' '+ai))
                sec[j] = ''.join(out)
            elif '$$' in pi:
                txt = pi.split('$$')
                # an odd number of pieces means the $$ delimiters are balanced
                if len(txt) % 2:
                    txt, txtm = txt[0::2], txt[1::2]
                    txt = [re.sub(fnd, rpl, ' '+ti) for ti in txt]
                    txtm = ['$$'+re.sub(r'\\+', r'\\', ti)+'$$' for ti in txtm]
                    sec[j] = ''.join(txt[:1] + [''.join(ti) for ti in zip(txtm, txt[1:])])
                else:
                    sec[j] = re.sub(fnd, rpl, ' '+pi)
            else:
                sec[j] = re.sub(fnd, rpl, ' '+pi)
        # highlight paragraphs in yellow
        sec = [spanbgc(1)+pi+'</span>' if spanbgc(0) in pi else pi for pi in sec]

        # split text by sections and clean up; text before the first '## '
        # heading is dropped and each section becomes [NAME, body]
        sec = ['<<<'+pi+'>>>' if pi.startswith('## ') else pi for pi in sec]
        sec = ('\n\n'.join(sec)).split('<<<')[1:]
        sec = [si.split('>>>', 1) for si in sec]
        sec = [[re.sub(r'[0-9#:\.!\\\n]', '', si[0]).strip().upper(), si[1]] for si in sec]

        # include the main section of nature journals papers
        # for short papers that have no results sections
        main = {''}
        if 'NATURE' in jnl:
            head = {si[0] for si in sec} - null
            if (len(head) <= 2) and ('RESULTS' not in head):
                main = {'MAIN'}

        # filter and store article text
        drop = null - main
        tru = [si for si in sec if si[0] not in drop]
        tru = '\n\n'.join(['## '+si[0]+'\n\n'+si[1] for si in tru])
        ri['TRU'] = tru

    print('\n\nArticle text filtered and key terms highlighted.')

    # # optionally print the null and term sets
    # for h in [null, term]:
    #     ci = 0
    #     print('{', end='')
    #     for i, ni in enumerate(sorted(h)):
    #         ci += len(ni) + 4
    #         if ci > 80:
    #             print('\n            ', end='')
    #             ci = len(ni) + 4
    #         print('r\'%s\''%ni, end=', ' if i+1 < len(h) else '}')
    #     print('\n\n')

    # # optionally print frequency of terms
    # word = {ki: 0 for ki in term}
    # for ri in recs:
    #     for wi in re.findall(r'(?s)'+re.escape(spanbgc(0))+'.*?</span>', ri['TRU']):
    #         for ki in word:
    #             word[ki] += (1 if re.findall('(?is)'+ki, wi) else 0)

    # k, v = zip(*word.items())
    # for vi, ki in sorted(zip(v, k))[::-1]:
    #     print('% 5d %s'%(vi, ki))

#### Step 6. Display and evaluate processed article text.

In [None]:
# # optionally filter articles by type:
# evidence_circularity = 'yes'
# recs = [ri for ri in recs if ri['Q4']==evidence_circularity]

# # optionally display only highlighted paragraphs
# for i, ri in enumerate(recs):
#     recs[i]['TRU'] = '\n\n'.join([pi for pi in ri['TRU'].split('\n\n') if pi.strip().startswith('<')])

if not processing_complete:
    # shuffle article order and start with an example (when it is present)
    random.shuffle(recs)
    example = next((k for k, ri in enumerate(recs) if ri['UT']=='000389746000001'), None)
    if example is not None:
        recs.insert(0, recs.pop(example))

    # specify that processing is now complete
    processing_complete = True

# define constants
n = len(recs)

# screen height in pixels; the temporary Tk root is hidden and destroyed
# again, and a default is used when no display is available
try:
    root = tkinter.Tk()
except tkinter.TclError:
    h = 1080
else:
    try:
        root.withdraw()
        h = root.winfo_screenheight()
    finally:
        root.destroy()

# store value function
def store_value(button):
    """Record a changed answer and update the dependent answer widgets.

    Registered as the 'value' observer of every widget in iwa, so `button`
    is the change notification rather than a button; only its 'owner' entry
    (the widget whose value changed) is read.
    """
    # index of record (the article selector is 1-based)
    i = paper_field.value - 1
    
    # store the new value in the current record under the widget's
    # description ('Q1'..'Q4')
    ki = button['owner'].description
    vi = button['owner'].value
    recs[i][ki] = vi
    
    # 0-based question index, taken from the digit in the description;
    # answering Q1 or Q2 resets the following question
    # NOTE(review): assigning .value below presumably re-enters this handler
    # for the assigned widget whenever its value changes -- confirm
    di = int(ki[1]) - 1
    if di < 2:
        iwa[di+1].value = 'not answered'
    # set bottom decision: Q4 copies Q2 unless Q2 is 'yes', and always copies Q3
    if (di == 1 and vi != 'yes') or di == 2:
        iwa[3].value = vi
    # specify final assessment: a 'no' to Q1 makes Q4 not applicable,
    # any other Q1 answer resets Q4
    if di == 0:
        if vi == 'no':
            iwa[3].value = 'not applicable'
        else:
            iwa[3].value = 'not answered'
    # the following question is enabled only after a 'yes'
    if di < 2:
        iwa[di+1].disabled = vi != 'yes'
        
# change paper with button
def button_press(button):
    """Move the article selector one step back or forward, staying within 1..n."""
    step = {'Previous article': -1, 'Next article': 1}.get(button.description)
    if step is None:
        return
    target = paper_field.value + step
    paper_field.value = max(target, 1) if step < 0 else min(target, n)

# change paper when the value of paper_field changes
def change_paper(value):
    """Show the article selected in paper_field and restore its evaluation.

    Observer of paper_field's value; value['new'] is the 1-based index of
    the article to show.
    """
    ri = recs[value['new']-1]

    # change text; the abstract is shown only when the record has one
    parts = ['`Article '+ri['UT']+'`']
    if 'AB' in ri:
        parts.append('`'+ri['AB'].replace("`", "'")+'`')
    parts.append(ri['TRU'])
    text.clear_output()
    text.append_display_data(Markdown('\n\n'.join(parts)))

    # restore or reset evaluations: the stored answers are copied before any
    # widget is touched, because assigning a widget value runs store_value,
    # which overwrites the answers to later questions in the record
    saved = [ri.get(ai.description, 'not answered') for ai in iwa]
    for ai, vi in zip(iwa, saved):
        ai.value = vi

    # reset tabs and set button status
    tabs.selected_index = 0
    prev_button.disabled = paper_field.value == 1
    next_button.disabled = paper_field.value == n

# change viewing box height
def change_height(value):
    """Resize both tab panes to value['new'] percent of the screen height h."""
    pixels = '%dpx' % round(h * value['new'] / 100)
    for pane in (text, evln):
        pane.layout.height = pixels

# navigation controls: previous/next buttons, a 1-based article selector
# and a slider for the height of the text window
prev_button = iw.Button(description='Previous article')
next_button = iw.Button(description='Next article')
paper_field = iw.BoundedIntText(
    description='Article:', value=n, min=1, max=n, step=1,
    layout=iw.Layout(width='150px'),
    style={'description_width': 'initial'},
    continuous_update=False)
height_slider = iw.IntSlider(
    description='Text-window height:', value=100, min=30, max=100, step=1,
    style={'description_width': 'initial'},
    continuous_update=True)

# output area for the article text and the row holding the controls
text = iw.Output()
menu = iw.HBox(children=[prev_button, next_button, paper_field, height_slider],
               layout=iw.Layout(justify_content='space-around'))

# define widget behavior
for nav_button in (prev_button, next_button):
    nav_button.on_click(button_press)
paper_field.observe(change_paper, names='value')
height_slider.observe(change_height, names='value')

# create evaluation widget and define behavior
def hsub(M):
    """Format two-character symbols as an italic letter with a subscript.

    Returns a tuple with one html string per item of M: the first character
    in italics followed by the second character as a subscript.
    """
    return tuple('<i>'+m[0]+'</i><sub>'+m[1]+'</sub>' for m in M)

# html text of the four evaluation questions
ques = [('<ol start="1"><li>'
         'Presence of at least one network-neuroscience model.'
         '</li></ol>'),
        ('<hr><ol start="2"><li>'
         'Acceptance of at least one %s, where:'
         '<ul>'
         '<li>%s is a network-neuroscience model of the studied data.</li>'
         '<li>%s includes a feature %s that represents some function %s.</li>'
         '<li>There is no strong known mechanistic link between %s and %s.</li>'
         '</ul>'
         '</li></ol>')%hsub(('M1', 'M1', 'M1', 'X1', 'F1', 'X1', 'F1')),
        ('<hr><ol start="3"><li>'
         'No test of %s against at least one %s, where:'
         '<ul>'
         '<li>%s is a model of the same studied data.</li>'
         '<li>%s includes only features with known mechanistic links to function.</li>'
         '<li>%s is known, or likely to explain, %s as a redundant feature.</li>'
         '</ul>'
         '</li></ol>')%hsub(('M1', 'M0', 'M0', 'M0', 'M0', 'X1')),
        ('<hr><ol start="4"><li>Evidence of circular network-neuroscience analysis.</li></ol>')]
answ = ['not answered', 'no', 'yes', 'unclear', 'not applicable']

# one question label and one answer widget per question; Q1 offers the first
# three answers, Q2 and Q3 the first four and Q4 all five, and every
# question except Q1 starts disabled
iwq = []
iwa = []
for i, qi in enumerate(ques):
    iwq.append(iw.HTML(value=qi))
    iwa.append(iw.ToggleButtons(options=answ[:i+(3 if i < 2 else 2)],
                                description='Q%d'%(i+1),
                                style={'description_width': '0px'}))
    if i:
        iwa[-1].disabled = True
    iwa[-1].observe(store_value, names='value')

evln = iw.VBox([iwq[0], iwa[0],
                iwq[1], iwa[1],
                iwq[2], iwa[2],
                iwq[3], iwa[3]])

text.append_display_data(Markdown(' '))

tabs = iw.Tab(children=[text, evln])
tabs.set_title(0, 'Methods and Results')
tabs.set_title(1, 'Evaluation')

# show the first article; assigning an unchanged value does not notify the
# observer, so it is called directly when the selector is already at 1
if paper_field.value == 1:
    change_paper({'new': 1})
else:
    paper_field.value = 1
height_slider.value = 70
display(menu)
display(tabs)


#### Step 7. Save results to a database file and a summary table.

In [None]:
def save_info(button):
    """Write the results archive and the summary table.

    Click handler of the save button; nothing happens when button is None.
    The archive holds records.json (accession number, abstract, filtered
    text and answers of every record) plus all files in the equations
    directory. The summary table has one row per record with the accession
    number and the answer to each question ('not answered' when missing).
    A PermissionError on either file is reported and does not stop the
    other file from being written.
    """
    if button is None:
        return

    ques_keys = [ai.description for ai in iwa]
    keys = {'UT', 'AB', 'TRU', *ques_keys}

    # create and write to the archive file
    try:
        with zipfile.ZipFile(results_file, 'w', zipfile.ZIP_DEFLATED) as zf:
            zf.writestr('records.json', json.dumps(
                [{ki: vi for ki, vi in ri.items() if ki in keys} for ri in recs]))
            for fi in os.listdir(eqns_dir):
                zf.write(os.path.join(eqns_dir, fi), fi)
        print(results_file+' saved;', end=' ')
    except PermissionError:
        print(results_file+' not saved: permission denied;', end=' ')

    # write the summary table; rows have exactly as many fields as the
    # header (no trailing comma)
    try:
        with open(summary_file, 'w') as f:
            f.write(','.join(['Accession Number'] + ques_keys)+'\n')
            for ri in recs:
                row = [ri['UT']] + [ri.get(ki, 'not answered') for ki in ques_keys]
                f.write(','.join(row)+'\n')
        print(summary_file+' saved.', end=' ')
    except PermissionError:
        print(summary_file+' not saved: permission denied.', end=' ')
    print('\n', end='')

# save button, wired to save_info
save_button = iw.Button(description='Save results')
save_button.on_click(save_info)

# warning that names the output directory, shown above the button
menu_messag = iw.HTML(value='''
                    <div class="alert alert-block alert-warning" style="font-weight: 600">
                    Save results to %s<br>
                    NB: this may overwrite existing files.</div>
                    ''' % os.path.abspath(out_dir))
display(iw.VBox(children=[menu_messag, save_button], layout=align_center))
