# Text Pipeline - Manaaki Whenua

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-and-Defaults" data-toc-modified-id="Imports-and-Defaults-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports and Defaults</a></span></li><li><span><a href="#Extracting-Text-from-PDF" data-toc-modified-id="Extracting-Text-from-PDF-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Extracting Text from PDF</a></span></li><li><span><a href="#Further-Preprocessing" data-toc-modified-id="Further-Preprocessing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Further Preprocessing</a></span></li><li><span><a href="#Extracting-Text-From-.pdf-Files-in-a-Folder-and-Saving-New-.txt-Files" data-toc-modified-id="Extracting-Text-From-.pdf-Files-in-a-Folder-and-Saving-New-.txt-Files-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Extracting Text From .pdf Files in a Folder and Saving New .txt Files</a></span></li></ul></div>

In [1]:
# Notebook-wide CSS tweaks: wider container, readable paragraph width,
# left-aligned tables and centred PNG outputs.
# `display`/`HTML`/`Markdown` are imported from the public `IPython.display`
# module (importing them from `IPython.core.display` is deprecated).
from IPython.display import display, HTML, Markdown as md
display(HTML(
    """<style>.container { width:80% !important; } p, ul {max-width: 40em;} """
    """.rendered_html table { margin-left: 0; } .output_subarea.output_png { """
    """display: flex; justify-content: center;}</style>"""
))

## Imports and Defaults

In [2]:
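# One-time setup: Python packages imported by this notebook.
# pip install Pillow
# pip install wand
# pip install pyocr
# pip install beautifulsoup4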

In [3]:
# import pandas as pd
# import numpy as np
import re

In [4]:
import pyocr
import pyocr.builders
import io
import codecs
import os

In [21]:
import deleteMagick #executable script that deletes leftover magick* files

In [5]:
from PIL import Image as PI
from wand.image import Image

## Extracting Text from PDF

In [6]:
def extractTextFromPDF(folder, file):
    """
    OCR a PDF and return its text as a single UTF-8 byte string.

    The PDF is rendered at 600 dpi, converted to JPEG, and every page is
    passed through the first OCR tool that pyocr reports as available.
    The per-page results are joined with a space, words hyphenated across
    a line break are re-joined, remaining line breaks become spaces and
    runs of spaces are collapsed to a single space.

    Parameters
    ----------
    folder : string
        folder containing the PDF
    file : string
        file name of the PDF inside ``folder``

    Returns
    -------
    bytes
        UTF-8 encoded text of the whole document on a single line

    Raises
    ------
    IndexError
        if pyocr finds no OCR tool on this machine
    """
    # Check for an OCR tool *before* the expensive 600 dpi rasterisation.
    # IndexError is kept (it is what `[0]` on an empty list used to raise),
    # but now carries an actionable message.
    tools = pyocr.get_available_tools()
    if not tools:
        raise IndexError('pyocr found no available OCR tool; '
                         'install one (e.g. Tesseract) and retry')
    tool = tools[0]

    pages_text = []
    # Context managers release the ImageMagick resources deterministically.
    # The source rendering is closed as soon as the JPEG copy exists so that
    # only one full-resolution copy is alive during OCR.
    with Image(filename=os.path.join(folder, file), resolution=600) as pdf_as_image:
        pdf_as_jpeg = pdf_as_image.convert('jpeg')
    with pdf_as_jpeg:
        for img in pdf_as_jpeg.sequence:
            with Image(image=img) as img_page:
                req_image = img_page.make_blob('jpeg')
            txt = tool.image_to_string(
                PI.open(io.BytesIO(req_image)),
                builder=pyocr.builders.TextBuilder()
            )
            pages_text.append(txt.encode('utf-8', 'ignore'))

    # flatten array of pages into one byte string for the whole document
    doc_text = b' '.join(pages_text)
    # remove breaks from words that wrap over two lines
    doc_text = re.sub(b'-\n', b'', doc_text)
    # replace returns with spaces
    doc_text = re.sub(b'\n', b' ', doc_text)
    # collapse every run of spaces to one space (replacing only b'  ' would
    # leave a double space behind for runs of three or more)
    doc_text = re.sub(b' {2,}', b' ', doc_text)
    return doc_text

In [7]:
# Show what is in the sample folder before converting anything.
os.listdir('PDFs')

['thesis.txt',
 'gen-2015-0194.pdf',
 'thesis (1).txt',
 'thesis.pdf',
 'thesis (1).pdf',
 '10.1007@s00300-017-2230-0.txt',
 'thesis (2).pdf',
 'greenslade2018.txt',
 'gen-2015-0194suppla.txt',
 'thesis (3).pdf',
 '10.1007@s00300-017-2230-0.pdf',
 'gen-2015-0194suppla.pdf',
 'fevo-07-00076.pdf',
 'greenslade2018.pdf']

In [29]:
# OCR a single paper; the result is one bytes object for the whole document.
document = extractTextFromPDF('PDFs',"10.1007@s00300-017-2230-0.pdf")

In [30]:
# Preview only the start of the OCR output; echoing the whole document
# floods the notebook with tens of thousands of characters.
document[:1000]

b'Polar Biology https://doi.org/10.1007/s00300-017-2230-0 ORIGINAL PAPER @\xc2\xae CrossMark Collembola of Barrientos Island, Antarctica: first census and assessment of environmental factors determining springtail distribution Natalia Enriquez\' - Pablo Tejedo? - Javier Benayas? - Bel\xc3\xa9n Albertos? - Maria Jos\xc3\xa9 Luciafiez\' Received: 21 October 2016 / Revised: 13 August 2017 / Accepted: 3 December 2017 \xc2\xa9 Springer-Verlag GmbH Germany, part of Springer Nature 2017 Abstract Barrientos Island is a small islet in the South Shetland archipelago frequently visited by Antarctic tourists. Collembola were recently used in another study developed in this site to assess the environmental conditions of two paths used by visitors, showing the importance of this soil faunal community. This motivated the realization of the first comprehensive census of Collembola from Barrientos Island. Fifty-six samples were recorded over three seasons, 2011-2013, from eight different substrate type

## Further Preprocessing

In [10]:
from bs4 import BeautifulSoup as bs

Use Beautiful Soup to strip any markup-like fragments left in the OCR output and to turn the byte string into a plain Unicode `str`.

In [11]:
# `document` is UTF-8 bytes (extractTextFromPDF encodes each page as UTF-8),
# so decode it explicitly instead of letting Beautiful Soup guess the encoding,
# and name the parser so the result does not depend on which parsers happen
# to be installed.
soup = bs(document.decode('utf-8'), 'html.parser')

In [12]:
# Pull the text content out of the soup and preview the first 1000 characters.
raw = soup.get_text()
raw[:1000]

"Polar Biology https://doi.org/10.1007/s00300-017-2230-0 ORIGINAL PAPER @® CrossMark Collembola of Barrientos Island, Antarctica: first census and assessment of environmental factors determining springtail distribution Natalia Enriquez' - Pablo Tejedo? - Javier Benayas? - Belén Albertos? - Maria José Luciafiez' Received: 21 October 2016 / Revised: 13 August 2017 / Accepted: 3 December 2017 © Springer-Verlag GmbH Germany, part of Springer Nature 2017 Abstract Barrientos Island is a small islet in the South Shetland archipelago frequently visited by Antarctic tourists. Collembola were recently used in another study developed in this site to assess the environmental conditions of two paths used by visitors, showing the importance of this soil faunal community. This motivated the realization of the first comprehensive census of Collembola from Barrientos Island. Fifty-six samples were recorded over three seasons, 2011-2013, from eight different substrate types. During the last campaign, 39

In [13]:
# Number of characters in the extracted text.
len(raw)

41087

In [14]:
# Drop the reference list: keep only the text before the last occurrence of
# 'References'. rfind() returns -1 when the word is absent, and raw[:-1]
# would silently chop the final character, so leave the text untouched then.
end = raw.rfind('References')
if end != -1:
    raw = raw[:end]

In [15]:
# Number of characters left after cutting the text at 'References'.
len(raw)

27828

In [16]:
# Check the Python type of the cleaned text.
type(raw)

str

## Extracting Text From .pdf Files in a Folder and Saving New .txt Files

In [7]:
def convertFolderofPDFsToText(folder):
    """
    Runs through a given folder and converts PDFs to cleaned text files

    Every file whose extension is ``.pdf`` (case-insensitive) is OCR'd with
    extractTextFromPDF, passed through Beautiful Soup's get_text() and
    written as UTF-8 to a file with the same base name and a ``.txt``
    extension in the same folder. Progress is printed per file.

    Parameters
    ----------
    folder : string
        location of PDF files to convert. (also target location for text files)

    Returns
    -------
    n/a

    """
    # Select on the real extension: endswith('pdf') also matched names such
    # as 'notpdf' that have no '.pdf' extension at all. Sorted so the
    # processing order is reproducible.
    filelist = sorted(x for x in os.listdir(folder)
                      if os.path.splitext(x)[1].lower() == '.pdf')
    numFiles = len(filelist)
    for i, file in enumerate(filelist):
        # Swap only the extension; re.sub('pdf', 'txt', ...) rewrote every
        # 'pdf' in the name ('pdf_notes.pdf' -> 'txt_notes.txt').
        txtfile = os.path.splitext(file)[0] + '.txt'
        print('Converting file',i+1, 'of', numFiles)
        try:
            # Extract first and open the output only once there is text to
            # write, so a failed OCR run neither truncates an existing .txt
            # nor leaves an empty one behind.
            extractedText = extractTextFromPDF(folder, file)
            # The extractor returns UTF-8 bytes; decode explicitly and name
            # the parser so the result does not depend on installed parsers.
            raw = bs(extractedText.decode('utf-8'), 'html.parser').get_text()
            with io.open(os.path.join(folder, txtfile),
                         encoding='utf-8',
                         mode='w') as f:
                f.write(raw)
        finally:
            # Remove leftover magick* temp files even when a file fails.
            deleteMagick.clean()

In [8]:
def convertPDFtoText(folder, file):
    """
    Converts one PDF to a cleaned text file

    The PDF is OCR'd with extractTextFromPDF, passed through Beautiful
    Soup's get_text() and written as UTF-8 to a file with the same base
    name and a ``.txt`` extension in the same folder.

    Parameters
    ----------
    folder : string
        location of PDF files to convert. (also target location for text files)
    file : string
        name of file to convert

    Returns
    -------
    n/a

    Raises
    ------
    ValueError
        if the output name would be identical to ``file`` (i.e. ``file``
        already ends in ``.txt``), which would overwrite the input

    """
    # Swap only the extension; re.sub('pdf', 'txt', ...) rewrote every 'pdf'
    # in the name and, for names without 'pdf', produced the input name
    # itself - which was then opened for writing and truncated.
    txtfile = os.path.splitext(file)[0] + '.txt'
    if txtfile == file:
        raise ValueError('refusing to overwrite input file: ' + file)
    try:
        # Extract first and open the output only once there is text to
        # write, so a failed OCR run does not truncate an existing .txt.
        extractedText = extractTextFromPDF(folder, file)
        # The extractor returns UTF-8 bytes; decode explicitly and name the
        # parser so the result does not depend on installed parsers.
        raw = bs(extractedText.decode('utf-8'), 'html.parser').get_text()
        with io.open(os.path.join(folder, txtfile),
                     encoding='utf-8',
                     mode='w') as f:
            f.write(raw)
    finally:
        # Remove leftover magick* temp files even when the conversion fails.
        deleteMagick.clean()

In [34]:
# Batch-convert the PDFs in 'smallPDFs'; the .txt files are written into the same folder.
convertFolderofPDFsToText('smallPDFs')

Converting file 1 of 3
Converting file 2 of 3
Converting file 3 of 3


In [35]:
# Batch-convert the PDFs in 'FraserSet'; the .txt files are written into the same folder.
convertFolderofPDFsToText('FraserSet')

Converting file 1 of 7
Converting file 2 of 7
Converting file 3 of 7
Converting file 4 of 7
Converting file 5 of 7
Converting file 6 of 7
Converting file 7 of 7


In [17]:
# convertPDFtoText('PDFs', 'fevo-07-00076.pdf')

In [18]:
# convertPDFtoText('PDFs', 'thesis (3).pdf')

In [37]:
# convertFolderofPDFsToText('PDFs')

In [22]:
# convertPDFtoText('PDFs', 'gen-2015-0194.pdf')

In [None]:
# convertPDFtoText('PDFs', 'thesis.pdf')

In [None]:
# convertPDFtoText('PDFs', 'thesis (1).pdf')

In [None]:
# convertPDFtoText('PDFs', 'thesis (2).pdf')