# Scientific literature extraction: PDF to JSON process

In [38]:
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams, LTTextContainer, LTTextContainer, LTTextLine, LTTextBox
from pdfminer3.pdfpage import PDFPage
from pdfminer3.pdfparser import PDFParser
from pdfminer3.pdfdocument import PDFDocument
#from pdfminer3.high_level import extract_pages
from pdfminer3.pdfdocument import PDFNoOutlines
from io import StringIO
import nltk
from nltk import ngrams
#nltk.download('punkt')

In [39]:
import os
from glob import glob

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## PDFminer

In [41]:
def extract_pdf(path):
    """Extract the text and the document metadata of a PDF file.

    Parameters
    ----------
    path : str
        Path of the PDF file to read.

    Returns
    -------
    tuple
        ``(text, lines, metadata)`` where ``text`` is everything the
        ``TextConverter`` wrote for all pages, ``lines`` is
        ``text.splitlines()`` and ``metadata`` is the ``info`` attribute of
        the parsed ``PDFDocument``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # The context managers release the file handle and the text buffer even
    # when parsing or page processing raises.
    with open(path, 'rb') as fp, StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            metadata = PDFDocument(PDFParser(fp)).info
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set and maxpages=0 are passed through
            # unchanged from the original call.
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before the `with` block closes it.
            text = retstr.getvalue()
        finally:
            device.close()
    lines = text.splitlines()
    return text, lines, metadata

def findTitles(titlekeys, lines):
    """Locate candidate section headings among the extracted lines.

    Parameters
    ----------
    titlekeys : iterable of str
        Heading names to look for; each one is compared for equality with
        the individual tokens of a line.
    lines : list of str
        Text lines of the document.

    Returns
    -------
    list
        One non-empty list of ``(name, line_index)`` tuples per hit. A line
        with any token containing ``'Abstract'`` yields
        ``[('Abstract', line_index)]``; a line with fewer than 7 tokens
        yields one tuple for every key/token pair that is equal.
    """
    found = []
    for line_no, sentence in enumerate(lines):
        # Blank lines cannot hold a heading.
        if sentence in ('', ' '):
            continue
        tokens = nltk.word_tokenize(sentence)
        if any('Abstract' in tok for tok in tokens):
            found.append([('Abstract', line_no)])
        # Only short lines are considered heading candidates.
        if len(tokens) < 7:
            matches = [(key, line_no)
                       for key in titlekeys
                       for tok in tokens
                       if key == tok]
            if matches:
                found.append(matches)
    return found

In [42]:
#titles.append([(key, n, sent_token.index(key), len(sent_token)) for key in titlekeys for tokens in Ptitle for token in tokens if key == token])

### Extract text

In [43]:
# CHANGE THIS VARIABLE: path of the PDF file to process.
file = 'input_pdf_here/A new class of bio-composite materials of unique collagen fibers.pdf'

In [44]:
# Bind the configured path to the name used by the later cells; the bare
# name on the last line echoes it as the cell output.
file_aux = file
file_aux

'input_pdf_here/A new class of bio-composite materials of unique collagen fibers.pdf'

#### Extract

In [45]:
# Run the extraction: `pdf` is the full text, `lines` the same text split
# into lines, `metadata` the document info returned by extract_pdf.
pdf, lines, metadata=extract_pdf(file_aux)

### Metadata

In [46]:
# Build a plain dictionary from the first metadata entry of the document.
# `metadata` can be an empty list (no Info dictionary in the PDF); in that
# case Gmetadata simply stays empty instead of raising IndexError.
info = metadata[0] if metadata else {}
Gmetadata = {}
for key in info:
    value = info[key]
    try:
        # Values exposing .decode() are turned into text; undecodable bytes
        # are replaced rather than raising.
        Gmetadata[key] = value.decode(errors='replace')
    except AttributeError:
        # Anything without .decode() is stored unchanged.
        Gmetadata[key] = value
Gmetadata

{'CrossMarkDomains[2]': 'sciencedirect.com',
 'CreationDate': "D:20140503133037+05'30'",
 'CrossmarkMajorVersionDate': '2010-04-23',
 'Subject': 'Journal of the Mechanical Behavior of Biomedical Materials, 36 + (2014) 71-81. doi:10.1016/j.jmbbm.2014.04.008',
 'Author': 'Mirit Sharabi',
 'Creator': 'Elsevier',
 'Keywords': 'Collagen; Bio-composite; Soft tissue; Mechanical behavior; Hyperelastic',
 'ElsevierWebPDFSpecifications': '6.4',
 'CrossmarkDomainExclusive': 'true',
 'robots': 'noindex',
 'ModDate': "D:20140503133037+05'30'",
 'doi': '10.1016/j.jmbbm.2014.04.008',
 'CrossMarkDomains[1]': 'elsevier.com',
 'Title': 'A new class of bio-composite materials of unique collagen fibers'}

### Extract Titles

#### Extract outlines (table of contents)

In [47]:
# Read the outline entries of the document. The file is opened in a `with`
# block so the handle is closed once the outlines have been fully
# materialised into a list.
with open(file_aux, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)

    # Get the outlines of the document:
    try:
        outl = list(document.get_outlines())
    except PDFNoOutlines:
        # Kept as an empty string (not a list) so that the later
        # `str(outl)` export produces the same value as before.
        outl = ''

In [48]:
# Keep only the second field of every outline entry.
# NOTE(review): presumably that field is the entry's title — confirm against pdfminer.
title_outl = [entry[1] for entry in outl]

#### Bag of words

In [50]:
# Fallback heading names used when the PDF outline is too short to be useful.
# Each heading is listed in the capitalisation variants expected in papers.
# NOTE(review): findTitles compares every key with single tokens of a line,
# so the multi-word entries below presumably never match — confirm.
keywords = ['Introduction', 'INTRODUCTION',
            'Methods', 'methods', 'METHODS',
            'Experimental', 'experimental', 'EXPERIMENTAL', 'Develop', 'develop', 'DEVELOP',
            'Result', 'result', 'RESULT', 'Results', 'results', 'RESULTS',
            'Discussion', 'discussion', 'DISCUSSION',
            'Conclusion', 'conclusion', 'CONCLUSION',
            'Supplementary Materials', 'Author Contributions',
            # lowercase variant was misspelled 'abbrevations' and could never match
            'Abbreviations', 'abbreviations', 'ABBREVIATIONS', 'List of abbreviations', 'List of Abbreviations',
            'Conflicts of interest', 'Conflicts of Interest', 'conflicts of interest', 'Conflicts Of Interest', 'CONFLICTS OF INTEREST',
            'Acknowledgements', 'acknowledgements',
            'References', 'REFERENCES', 'Notes and references', 'Notes and References']

#### Find

In [51]:
# Search with the document's own outline titles when there are more than
# five of them; otherwise fall back to the generic keyword list.
titles = findTitles(title_outl if len(title_outl) > 5 else keywords, lines)

In [52]:
# Display the detected headings as lists of (title, line index) tuples.
titles

[[('Introduction', 53)],
 [('Results', 436)],
 [('Discussion', 649)],
 [('Limitations', 887)],
 [('Conclusions', 913)]]

In [53]:
# Spot-check one extracted line.
# NOTE(review): index 26 looks like an ad-hoc probe for this sample PDF — confirm it is still needed.
lines[26]

'Accepted 10 April 2014'

In [54]:
### normalize
# Every element of `titles` is a list of (name, line_index) tuples; only the
# first tuple of each element is kept, split into two parallel lists.
title = [group[0][0] for group in titles]
NLtitle = [group[0][1] for group in titles]

# to_JSON

In [55]:
import io
import os
import json

In [56]:
def set_default(obj):
    """Convert a set into a list.

    Returns ``list(obj)`` when ``obj`` is a set and raises ``TypeError`` for
    every other type.
    """
    if not isinstance(obj, set):
        raise TypeError
    return list(obj)

In [57]:
def dumper(obj):
    """Fallback serializer passed as ``default`` to ``json.dump``.

    Returns ``obj.toJSON()`` when that call succeeds; otherwise returns
    ``obj.__dict__``. An object that has neither still raises
    ``AttributeError`` from the ``__dict__`` lookup.
    """
    try:
        return obj.toJSON()
    except Exception:
        # Best-effort fallback. `Exception` (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit propagate instead of being
        # silently converted into a __dict__ dump.
        return obj.__dict__

In [58]:
def export_as_json(filename, json_path):
    """Write the extracted document as a JSON file.

    The function reads the module-level variables ``Gmetadata``, ``outl``,
    ``title``, ``NLtitle`` and ``lines`` produced by the earlier cells.

    Parameters
    ----------
    filename : str
        Name stored under the ``'filename'`` key and echoed in the final
        message.
    json_path : str
        Destination of the JSON file. Its parent directory is created when
        it does not exist yet.

    Returns
    -------
    None
        A completion message is printed.

    Notes
    -----
    ``'Content'`` maps each heading line to the concatenation of the lines
    that follow it. If the first detected title is ``'Abstract'`` it is
    stored under the key ``'Abstract'`` and its text starts at the heading
    line itself. ``'Tables'`` is always written as an empty dictionary.
    """
    data_json = {'filename' : filename}
    data_json['Metadata'] = Gmetadata
    data_json['Outlines'] = str(outl)
    data_json['Content'] = {}
    data_json['Tables'] = {}
    content = data_json['Content']

    ##EXTRACT-TEXT
    # NOTE(review): every section stops one line before the next heading
    # (the `- 1` end bounds), so the line directly above a heading is left
    # out — kept as in the original, confirm this is intended.
    first_section = 0
    if title and title[0] == 'Abstract':
        # With a single detected title the abstract runs to the end of the text.
        end = NLtitle[1] - 1 if len(NLtitle) > 1 else len(lines)
        content['Abstract'] = ''.join(lines[NLtitle[0]:end])
        first_section = 1

    # When the first title is not 'Abstract' it is exported like any other
    # section instead of being skipped; with no titles the loop does nothing.
    last = len(NLtitle) - 1
    for pos in range(first_section, len(NLtitle)):
        start = NLtitle[pos]
        if pos == last:
            text = ''.join(lines[start + 1:])
        else:
            text = ''.join(lines[start + 1:NLtitle[pos + 1] - 1])
        content[lines[start]] = text

    # Create the output directory on demand; an empty dirname means the
    # current directory, which needs no creation.
    out_dir = os.path.dirname(json_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(json_path, 'w') as fh:
        json.dump(data_json, fh, indent=4, default=dumper)
    print(filename + '\n'+ '\n'+ 'TRANSFORMATION COMPLETED!')

In [59]:
# Derive the output name from the input PDF: take the file name, drop its
# extension, and write the JSON next to the other outputs.
pdf_path = os.path.basename(file_aux)
pdf_path = os.path.splitext(pdf_path)[0]
json_path = ''.join(('output_is_here//', pdf_path, '.json'))
export_as_json(pdf_path, json_path)

A new class of bio-composite materials of unique collagen fibers

TRANSFORMATION COMPLETED!
