In [1]:
import io
import os

from pdfminer.pdfparser import PDFParser
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed 
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.pdfinterp import resolve1



In [2]:
# Generate list of curated PDFs
# The working directory is switched to the PDF folder, so the bare file names
# collected below can be opened directly by the cells that follow.
os.chdir("../MaMomePDFs")

# os.listdir returns every directory entry (hidden files, sub-folders, notes, ...).
# Keep only regular files with a .pdf extension so that a stray entry cannot
# crash the extraction loop, and sort so the processing order is reproducible.
pdfs = sorted(
    name
    for name in os.listdir(".")
    if name.lower().endswith(".pdf") and os.path.isfile(name)
)


In [3]:
#Fxn to extract text from .pdfs

def pdfextract(pdf, min_pages_to_skip_first=3):
    """Extract the text of a PDF, dropping the first page of longer documents.

    The first page (abstract / header information) is skipped only when the
    page count declared in the document's page tree is at least
    ``min_pages_to_skip_first``. Shorter documents, and documents whose page
    count cannot be determined, are extracted in full.

    Parameters
    ----------
    pdf : str
        Path of the PDF file to read.
    min_pages_to_skip_first : int, optional
        Minimum declared page count for which the first page is dropped
        (default 3).

    Returns
    -------
    str
        The extracted text of the processed pages.

    Raises
    ------
    PDFTextExtractionNotAllowed
        If the document does not permit text extraction.
    """
    with open(pdf, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)

        if not doc.is_extractable:
            # Carry the file name in the exception so a failing batch run
            # identifies the offending document.
            raise PDFTextExtractionNotAllowed(
                "Text extraction is not allowed: {}".format(pdf)
            )

        # Page count as declared by the page-tree root; None when the catalog
        # has no usable 'Pages' entry.
        page_tree = resolve1(doc.catalog.get('Pages'))
        page_count = resolve1(page_tree.get('Count')) if isinstance(page_tree, dict) else None
        skip_first_page = isinstance(page_count, int) and page_count >= min_pages_to_skip_first
        # NOTE(review): the previous filter also tested `pageNumber < doclength`,
        # which is always true. If the intent was to drop the trailing references
        # page as well, that needs an explicit `page_count - 1` bound -- confirm.

        output_string = io.StringIO()
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams(detect_vertical=True))
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Reuse the already-parsed document instead of parsing the file again.
            for page_number, page in enumerate(PDFPage.create_pages(doc)):
                if skip_first_page and page_number == 0:
                    continue
                interpreter.process_page(page)
        finally:
            device.close()

        return output_string.getvalue()

In [4]:
# Extract text from .pdfs and write to .txt files

# Output folder, relative to the current working directory (the PDF folder).
TXT_DIR = "../docproc/TxtData/"
os.makedirs(TXT_DIR, exist_ok=True)  # do not assume the folder already exists

for pdf in pdfs:
    text = pdfextract(pdf)

    # splitext strips the extension whatever its length, unlike a fixed [:-4] slice.
    txt_path = os.path.join(TXT_DIR, os.path.splitext(pdf)[0] + ".txt")

    # Extracted PDF text routinely contains non-ASCII characters; write UTF-8
    # explicitly so the platform default encoding cannot raise UnicodeEncodeError.
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(text)
