In [4]:
## All imports
from io import StringIO
import tika
tika.initVM()
from tika import parser
from PyPDF2 import PdfFileReader
import fitz
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
import pdftotext
import re
import os
from nltk.tokenize import sent_tokenize
from gensim.summarization.textcleaner import split_sentences
from spacy.lang.en import English


# Blank English pipeline; the only component added to it is the 'sentencizer'
# pipe, which is what makes `doc.sents` available to get_sent_by_spacy().
nlp = English()
sbd = nlp.create_pipe("sentencizer")
nlp.add_pipe(sbd)

print("Imports Successful!")

Imports Successful!


In [5]:
## text cleaner
def clean_sent(sent):
    """Return ``sent`` with every newline character replaced by one space."""
    return re.sub(r"\n", " ", sent)

In [19]:
### All PDF extraction definitions

#tika
def convert_with_tika(file_name):
    """Extract the text of one PDF with the tika parser and append it to a file.

    Args:
        file_name: Name of the PDF inside the module-level ``file_path`` folder.

    Side effects:
        Appends the extracted text to ``output_text/tika_<file_name>.txt``
        (UTF-8) and prints a progress line. Returns nothing.
    """
    file_handle = file_path+"/"+file_name
    # NOTE(review): "a+" appends, so re-running this on the same file adds the
    # text a second time -- confirm that is intended before switching to "w".
    # The `with` block guarantees the handle is closed even if parsing raises.
    with open("output_text/tika_"+file_name+".txt","a+",encoding="utf-8") as file_writer:
        parsed = parser.from_file(file_handle)
        # "content" can be None when no text was extracted; write() would
        # raise TypeError on None, so fall back to an empty string.
        file_writer.write(parsed["content"] or "")
    print("processed file: ",file_name)
    

#pypdf2
def convert_with_pypdf2(file_name):
    """Extract the text of one PDF page by page with PyPDF2 and append it to a file.

    Args:
        file_name: Name of the PDF inside the module-level ``file_path`` folder.

    Side effects:
        Appends each page's text, in page order, to
        ``output_text/pypdf2_<file_name>.txt`` (UTF-8) and prints a progress
        line. Returns nothing.
    """
    file_handle = file_path+"/"+file_name
    # `with` closes the output file even when a page fails to extract.
    with open("output_text/pypdf2_"+file_name+".txt","a+",encoding="utf-8") as file_writer:
        pdf = PdfFileReader(file_handle)
        for page_number in range(pdf.getNumPages()):
            file_writer.write(pdf.getPage(page_number).extractText())
    print("processed file: ",file_name)

#pymupdf
def convert_with_pymupdf(file_name):
    """Extract the text of one PDF page by page with PyMuPDF and append it to a file.

    Args:
        file_name: Name of the PDF inside the module-level ``file_path`` folder.

    Side effects:
        Appends each page's ``getText("text")`` output, in page order, to
        ``output_text/pymupdf_<file_name>.txt`` (UTF-8) and prints a progress
        line. Returns nothing.
    """
    file_handle = file_path+"/"+file_name
    with open("output_text/pymupdf_"+file_name+".txt","a+",encoding="utf-8") as file_writer:
        doc = fitz.open(file_handle)
        try:
            for page_number in range(doc.pageCount):
                file_writer.write(doc.loadPage(page_number).getText("text"))
        finally:
            # The document was previously never closed; release it explicitly
            # so a long batch run does not accumulate open PDFs.
            doc.close()
    print("processed file: ",file_name)

#pdfminer.six
def convert_with_pdfminer(file_name):
    """Extract the text of one PDF with pdfminer and append it to a file.

    Args:
        file_name: Name of the PDF inside the module-level ``file_path`` folder.

    Side effects:
        Appends the text collected from all pages to
        ``output_text/pdfminer_<file_name>.txt`` (UTF-8) and prints a progress
        line. Returns nothing.
    """
    file_handle = file_path+"/"+file_name
    output_string = StringIO()
    # `with` closes the output file even if pdfminer raises while parsing.
    with open("output_text/pdfminer_"+file_name+".txt","a+",encoding="utf-8") as file_writer:
        with open(file_handle,'rb') as inp_file:
            # Named pdf_parser (not `parser`) so it does not shadow the tika
            # `parser` imported at the top of the notebook.
            pdf_parser = PDFParser(inp_file)
            doc = PDFDocument(pdf_parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
        file_writer.write(output_string.getvalue())
    print("processed file: ",file_name)

#pdftotext
def convert_with_pdftotext(file_name):
    """Extract the text of one PDF with pdftotext and append it to a file.

    Args:
        file_name: Name of the PDF inside the module-level ``file_path`` folder.

    Side effects:
        Appends the page texts, joined by a blank line, to
        ``output_text/pdftotext_<file_name>.txt`` (UTF-8) and prints a progress
        line. Returns nothing.
    """
    file_handle = file_path+"/"+file_name
    # `with` closes the output file even if the PDF cannot be read.
    with open("output_text/pdftotext_"+file_name+".txt","a+",encoding="utf-8") as file_writer:
        with open(file_handle,'rb') as inp_file:
            pdf = pdftotext.PDF(inp_file)
        file_writer.write("\n\n".join(pdf))
    print("processed file: ",file_name)

In [7]:
## All sentence boundary analysis definitions

#NLTK method
def get_sent_by_nltk(text):
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``."""
    sentences = sent_tokenize(text)
    return sentences

#Spacy method
def get_sent_by_spacy(text):
    """Split ``text`` into sentences with the module-level spaCy pipeline ``nlp``.

    Returns a list holding the ``.text`` of every span in ``doc.sents``.
    """
    return [span.text for span in nlp(text).sents]

#gensim method
def get_sent_by_gensim(text):
    """Split ``text`` into sentences using gensim's ``split_sentences``."""
    sentences = split_sentences(text)
    return sentences



In [8]:
## List the files with a given extension under a folder

file_path = "dataset"
def get_filenames(file_extension, root=None):
    """Return the names of all files under a folder that end in ``.<file_extension>``.

    Args:
        file_extension: Extension without the leading dot, e.g. ``"pdf"``.
        root: Folder to search recursively. Defaults to the module-level
            ``file_path`` as it is at call time.

    Returns:
        A list of bare file names (no directory part), in ``os.walk`` order.

    The match is on the end of the name. The earlier substring test also
    accepted names such as ``9.pdf.txt`` when asked for ``"pdf"``.

    NOTE(review): sub-folders are walked but only base names are returned, so
    a caller that joins the result onto ``file_path`` will not find files that
    live in a sub-folder.
    """
    if root is None:
        root = file_path
    suffix = "." + file_extension
    files = []
    for _dirpath, _dirnames, names in os.walk(root):
        files.extend(name for name in names if name.endswith(suffix))
    return files

In [36]:
%%time
%reload_ext memory_profiler
%memit

files_list = get_filenames("pdf")
for filen in files_list:
    convert_with_tika(filen)

peak memory: 159.46 MiB, increment: -0.30 MiB
processed file:  9.pdf
processed file:  8.pdf
processed file:  10.pdf
processed file:  6.pdf
processed file:  7.pdf
processed file:  5.pdf
processed file:  4.pdf
processed file:  1.pdf
processed file:  3.pdf
processed file:  2.pdf
CPU times: user 258 ms, sys: 152 ms, total: 410 ms
Wall time: 3.78 s


In [26]:
# Strip the markup from one TEI XML file and append the remaining text to a
# plain-text file. Both files are handled as UTF-8 and closed by `with`.
with open("test-tei/7.tei.xml","r",encoding="utf-8") as file_to_clear:
    text = file_to_clear.read()
# NOTE(review): "<ref.*>" is greedy, so on a line holding a <ref ...> tag it
# removes everything up to the LAST ">" on that line, including any body text
# in between -- confirm this is intended. The second alternative skips tags
# whose contents include "(", ")", "<", ">" or ".".
text = re.sub(r"(<ref.*>)|(<.[^(><.)]+>)","",text)
with open("output_text/grobid_7.tei.xml.txt","a+",encoding="utf-8") as file_writer:
    file_writer.write(text)

In [47]:
%%time
%reload_ext memory_profiler
%memit
file_path = "output_text/grobid"
files_list = get_filenames("txt")
for filen in files_list:
    file_handle = file_path+"/"+filen
    f_reader = open(file_handle,"r",encoding="utf-8")
    data = f_reader.read()
    sent_list_nltk = get_sent_by_nltk(data)
    sent_list_spacy = get_sent_by_spacy(data)
    sent_list_gensim = get_sent_by_gensim(data)
    print(filen.split(".")[0],",NLTK,",len(sent_list_nltk),",Spacy,",len(sent_list_spacy),",Gensim,",len(sent_list_gensim))
    # print("Spacy Count:",len(sent_list_spacy))
    # print("Gensim Count:",len(sent_list_gensim))

peak memory: 158.38 MiB, increment: 0.03 MiB
grobid_2 ,NLTK, 23 ,Spacy, 22 ,Gensim, 129
grobid_7 ,NLTK, 33 ,Spacy, 35 ,Gensim, 94
grobid_4 ,NLTK, 400 ,Spacy, 398 ,Gensim, 410
grobid_1 ,NLTK, 163 ,Spacy, 157 ,Gensim, 162
grobid_8 ,NLTK, 192 ,Spacy, 192 ,Gensim, 193
grobid_10 ,NLTK, 78 ,Spacy, 78 ,Gensim, 78
grobid_6 ,NLTK, 90 ,Spacy, 90 ,Gensim, 91
grobid_3 ,NLTK, 114 ,Spacy, 113 ,Gensim, 124
grobid_9 ,NLTK, 147 ,Spacy, 146 ,Gensim, 155
grobid_5 ,NLTK, 33 ,Spacy, 32 ,Gensim, 70
CPU times: user 644 ms, sys: 173 ms, total: 818 ms
Wall time: 1.29 s
