# Text Pipeline - Manaaki Whenua

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Imports-and-Defaults" data-toc-modified-id="Imports-and-Defaults-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Imports and Defaults</a></span></li><li><span><a href="#Extracting-Text-from-PDFs-(Revised-methodology)" data-toc-modified-id="Extracting-Text-from-PDFs-(Revised-methodology)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Extracting Text from PDFs (Revised methodology)</a></span><ul class="toc-item"><li><span><a href="#Turn-PDF-into-Raw-Text" data-toc-modified-id="Turn-PDF-into-Raw-Text-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Turn PDF into Raw Text</a></span></li><li><span><a href="#Convert-Array-of-Raw-Text-into-Cleaned-and-Flattened-Text-File" data-toc-modified-id="Convert-Array-of-Raw-Text-into-Cleaned-and-Flattened-Text-File-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Convert Array of Raw Text into Cleaned and Flattened Text File</a></span></li><li><span><a href="#Converting-Individual-Documents" data-toc-modified-id="Converting-Individual-Documents-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Converting Individual Documents</a></span><ul class="toc-item"><li><span><a href="#Original-Group-of-PDFs" data-toc-modified-id="Original-Group-of-PDFs-2.3.1"><span class="toc-item-num">2.3.1&nbsp;&nbsp;</span>Original Group of PDFs</a></span></li><li><span><a href="#Second-Group-of-PDFs" data-toc-modified-id="Second-Group-of-PDFs-2.3.2"><span class="toc-item-num">2.3.2&nbsp;&nbsp;</span>Second Group of PDFs</a></span></li></ul></li></ul></li><li><span><a href="#Initial-Process---Now-replaced-with-new-method" data-toc-modified-id="Initial-Process---Now-replaced-with-new-method-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Initial Process - Now replaced with new method</a></span><ul class="toc-item"><li><span><a href="#Extracting-Text-from-PDF" data-toc-modified-id="Extracting-Text-from-PDF-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Extracting Text from 
PDF</a></span></li><li><span><a href="#Further-Preprocessing" data-toc-modified-id="Further-Preprocessing-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Further Preprocessing</a></span></li><li><span><a href="#Extracting-Text-From-.pdf-Files-in-a-Folder-and-Saving-New-.txt-Files" data-toc-modified-id="Extracting-Text-From-.pdf-Files-in-a-Folder-and-Saving-New-.txt-Files-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Extracting Text From .pdf Files in a Folder and Saving New .txt Files</a></span></li></ul></li><li><span><a href="#Sandbox" data-toc-modified-id="Sandbox-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Sandbox</a></span></li></ul></div>

In [216]:
from IPython.core.display import display, HTML, Markdown as md
# Notebook styling: widen the container, cap paragraph/list width, left-align rendered
# tables and centre PNG outputs. The stylesheet is kept on one line so that no export
# line-wrap markers end up inside the CSS.
display(HTML("""<style>.container { width:80% !important; } p, ul {max-width: 40em;} .rendered_html table { margin-left: 0; } .output_subarea.output_png { display: flex; justify-content: center;}</style>"""))

## Imports and Defaults

In [217]:
# pip install Pillow
# pip install wand

In [218]:
import pandas as pd
import numpy as np
import re

In [219]:
import pyocr
import pyocr.builders
import io
import codecs
import os

In [220]:
import deleteMagick #executable script that deletes leftover magick* files

In [221]:
from PIL import Image as PI
from wand.image import Image

## Extracting Text from PDFs (Revised methodology)

### Turn PDF into Raw Text

In [5]:
def convertPDFtoTextArray(folder, file):
    """
    Takes a PDF document, converts it to JPEG page images and runs OCR on each page.

    The per-page text is UTF-8 encoded, saved with ``np.save`` next to the PDF under the
    PDF's name without its ``.pdf`` suffix, and also returned.

    Parameters
    ----------
    folder : string
        folder containing the PDF (also the target folder for the saved array)
    file : string
        file name of the PDF inside ``folder``

    Returns
    -------
    list of bytes
        one UTF-8 encoded string per page

    """
    try:
        pdf_as_image = Image(filename=folder+'/'+file, resolution=600)
        pdf_as_jpeg = pdf_as_image.convert('jpeg')
        del pdf_as_image
        # uses the first OCR tool pyocr reports; raises IndexError when none is installed
        tool = pyocr.get_available_tools()[0]
        pages_text = []
        for img in pdf_as_jpeg.sequence:
            img_page = Image(image=img)
            req_image = img_page.make_blob('jpeg')
            txt = tool.image_to_string(
                PI.open(io.BytesIO(req_image)),
                builder=pyocr.builders.TextBuilder()
            )
            pages_text.append(txt.encode('utf-8', 'ignore'))
        # Strip only a literal trailing ".pdf". The previous pattern '.pdf' treated the dot
        # as "any character" and matched anywhere, so "mypdf_doc.pdf" became "m_doc".
        filename = re.sub(r'\.pdf$', '', file)
        np.save(folder+'/'+filename, pages_text)
    finally:
        # ensure ImageMagick files are removed from temp folder, even when conversion fails
        deleteMagick.clean()

    return pages_text

In [6]:
def convertFolderofPDFstotextArrays(folder):
    """
    Runs convertPDFtoTextArray on every entry of ``folder`` whose name ends with 'pdf',
    printing a progress line before each file.
    """
    pdf_names = [name for name in os.listdir(folder) if name.endswith('pdf')]
    total = len(pdf_names)
    for position, pdf_name in enumerate(pdf_names, start=1):
        print('Converting file', position, 'of', total)
        convertPDFtoTextArray(folder, pdf_name)

In [786]:
# convertFolderofPDFstotextArrays('smallPDFs')

In [784]:
# convertFolderofPDFstotextArrays('FraserSet')

In [785]:
# convertFolderofPDFstotextArrays('PDFs')

In [782]:
# Manual clean-up of leftover magick* temp files (see the deleteMagick import above).
deleteMagick.clean()

### Convert Array of Raw Text into Cleaned and Flattened Text File

In [22]:
from difflib import SequenceMatcher

In [222]:
# Takes a saved array of page strings (.npy), removes headers and footers, then flattens
# the pages into one cleaned text file written next to the array.

def cleanAndSaveText(folder, file, footkeeplist=[], footdroplist=[], headkeeplist=[], headdroplist=[],
                           footthreshold=3, headthreshold=3, hiddenFooters=True, hiddenHeaders=True):
    """
    Cleans one saved page array and writes the flattened text to a .txt file.

    All keep/drop lists, thresholds and flags are passed straight through to
    convertTextArrayToText. The list defaults are never mutated here.

    Parameters
    ----------
    folder : string
        folder holding the .npy file (also the target folder for the .txt file)
    file : string
        name of the .npy file; the output name is the same with a trailing 'npy'
        replaced by 'txt'
    footkeeplist, footdroplist, headkeeplist, headdroplist : list of string
        footer/header lines to keep or to drop
    footthreshold, headthreshold : number
        score thresholds for footer/header detection
    hiddenFooters, hiddenHeaders : bool
        whether to also search the page text for detected footers/headers

    Returns
    -------
    n/a

    """
    data = convertTextArrayToText(folder, file, footkeeplist,footdroplist, headkeeplist, headdroplist,
                                  footthreshold, headthreshold, hiddenFooters, hiddenHeaders)
    # Swap only the trailing extension; an unanchored 'npy' would also rewrite
    # "npy" appearing elsewhere in the file name.
    txtfile = re.sub(r'npy$', 'txt', file)

    doc_text = ' '.join(data)

    #remove breaks from words that wrap over two lines
    doc_text = re.sub('-\n', '', doc_text)

    #replace returns with spaces (paragraph breaks are not preserved)
    doc_text = re.sub('\n', ' ', doc_text)

    #collapse every run of spaces to a single space (a single pass over '  '
    #would leave two spaces wherever three or more occurred)
    doc_text = re.sub(' {2,}', ' ', doc_text)

    # Open the output only once the text is ready, and close it even if the write fails.
    with io.open(folder+'/'+txtfile,
                 encoding='utf-8',
                 mode='w') as f:
        f.write(doc_text)

In [8]:
# # takes a array of text strings (pages) and cleans docuemnt of headers and footers then cleans up flattened text (i.e. removes ) 

# def cleanAndSaveText(folder, file, footkeeplist=[], footdroplist=[], headkeeplist=[], headdroplist=[],
#                            footthreshold=3, headthreshold=3, hiddenFooters=True, hiddenHeaders=True):
#     data = convertTextArrayToText(folder, file, footkeeplist,footdroplist, headkeeplist, headdroplist,
#                                   footthreshold, headthreshold, hiddenFooters, hiddenHeaders)
#     txtfile = re.sub('npy', 'txt', file)

#     f = io.open(folder+'/'+txtfile, 
#                 encoding='utf-8', 
#                 mode='w')

# #     doc_text = '!!!PAGE BREAK!!!'.join(data) 
#     doc_text = ' '.join(data) 

#     #remove breaks from words that wrap over two lines
#     doc_text = re.sub('-\n', '', doc_text) 

#     #replace double returns with placeholder
#     doc_text = re.sub('\n\n', '<DOUBLERETURN>', doc_text) 

#     #replace single returns with spaces
#     doc_text = re.sub('\n', ' ', doc_text) 

#     #replace double spaces with single spaces
#     doc_text = re.sub('  ', ' ', doc_text) 

#     #replace placeholder with double returns
#     doc_text = re.sub('<DOUBLERETURN>','\n\n', doc_text) 
    
#     #replace double returns with single returns
#     while doc_text.find('\n\n')!=-1:
#         doc_text = re.sub('\n\n', '\n', doc_text)

# #     doc_text = doc_text.decode()
#     f.write(doc_text)
#     f.close()

In [223]:
# Use for the full process from a PDF document to pages of text without headers or footers

def convertOnePDFtoText(folder, file, footkeeplist=[], headkeeplist=[], 
                        footthreshold=3, headthreshold=3, hiddenFooters=True, hiddenHeaders=True):
    """
    OCRs one PDF and returns its pages with detected footers and headers removed.

    Parameters
    ----------
    folder : string
        folder containing the PDF
    file : string
        file name of the PDF
    footkeeplist, headkeeplist : list of string
        footer/header lines that must not be removed
    footthreshold, headthreshold : number
        score thresholds passed to removeFooters / removeHeaders
    hiddenFooters, hiddenHeaders : bool
        whether detected footers/headers are also searched for inside the page text

    Returns
    -------
    list of string
        one decoded string per page

    """
    pages_of_text = convertPDFtoTextArray(folder, file)
    pages_of_text = decodeText(pages_of_text)
    # Keyword arguments are required here: removeFooters/removeHeaders take
    # (text, keeplist, droplist, thresh, hidden), so passing the threshold and flag
    # positionally after the keep list put them into droplist and thresh.
    pages_of_text = removeFooters(pages_of_text, keeplist=footkeeplist,
                                  thresh=footthreshold, hidden=hiddenFooters)
    pages_of_text = removeHeaders(pages_of_text, keeplist=headkeeplist,
                                  thresh=headthreshold, hidden=hiddenHeaders)
    return pages_of_text

In [224]:
# Use for turning an already processed PDF (i.e. one converted to an npy file) into pages of text without headers or footers

def convertTextArrayToText(folder, file, footkeeplist=[], footdroplist=[], headkeeplist=[],headdroplist=[], 
                           footthreshold=3, headthreshold=3, hiddenFooters=True, hiddenHeaders=True):
    """
    Loads a saved page array, decodes it and strips footers, then headers.

    Returns the list of cleaned page strings; nothing is written to disk.
    """
    encoded_pages = np.load('/'.join((folder, file)))
    decoded_pages = decodeText(encoded_pages)
    without_footers = removeFooters(decoded_pages, footkeeplist, footdroplist, footthreshold, hiddenFooters)
    return removeHeaders(without_footers, headkeeplist, headdroplist, headthreshold, hiddenHeaders)

In [225]:
def decodeText(encoded_text):
    """Decodes every UTF-8 encoded page and returns the decoded pages as a new list."""
    decoded_text = []
    for page in encoded_text:
        decoded_text.append(page.decode('utf-8'))
    return decoded_text

In [226]:
def removeFooters(text_with_footers, keeplist=[], droplist=[], thresh=2.9, hidden=True):
    """
    Detects repeated footer lines across pages and removes them.

    Parameters
    ----------
    text_with_footers : list of string
        one string per page
    keeplist : list of string
        detected footer lines (digits masked as '@') that must not be removed
    droplist : list of string
        extra strings to remove when searching inside the page text
    thresh : number
        similarity score a line must exceed to count as a footer
    hidden : bool
        when true, also search every page for the footer strings and remove
        occurrences that are not at the bottom of the page

    Returns
    -------
    list of string
        a new list of pages; the input list itself is not modified

    """
    possible_footers = findFooters(text_with_footers)
    footer_scores = scoreHeadersOrFooters (possible_footers)
    probableFooters = sortScores(footer_scores, threshold=thresh, type='footers')
    # Apply keeplist before the positional deletion as well. Previously it only filtered
    # the hidden-text search, so kept lines were still cut from the bottom of the page.
    probableFooters = [x for x in probableFooters if x[0][0] not in keeplist]
    new_text = text_with_footers.copy()

    set_of_footers = set(x[0][0] for x in probableFooters) | set(droplist)
    # Longest first, then alphabetical: a set has no stable order, and removing a short
    # string first can leave fragments of a longer one that contains it.
    set_of_footers = sorted(set_of_footers, key=lambda item: (-len(item), item))
    new_text = deleteFooters(new_text, probableFooters)
    if hidden:
        new_text = removeHeadersOrFootersHiddeninText (new_text,set_of_footers)

    return new_text

In [227]:
def findFooters(pages):
    """
    Collects five footer candidates per page by scanning backwards from the page end.

    Each candidate is ``[line, start, end]``: ``line`` is the text between two newlines
    with every digit replaced by '@', ``start`` is the index of the newline before the
    line (-1 if none was found) and ``end`` is the index of the newline after it (-1 for
    the last line of the page). Runs of consecutive newlines are skipped. Each backward
    search looks at most 200 characters before ``end``.

    Once the scan runs out of newlines it restarts from the page end, so a page with
    fewer than five lines yields repeated candidates.

    Returns a list with one five-element candidate list per page.
    """
    footer_candidates = []
    for page in pages:
        lines = []
        start = -1
        end = -1
        for _ in range(5):
            while end == start:
                start = page.rfind('\n', max(0, end - 200), end)
                if start == -1:
                    # no earlier newline within reach
                    break
                elif start == end - 1:
                    # adjacent newline: step over the blank line and keep searching
                    end = start
            if end == -1:
                raw_line = page[start + 1:]
            else:
                raw_line = page[start + 1:end]
            # raw string: '\d' in a plain literal is an invalid escape sequence
            lines.append([re.sub(r'\d', '@', raw_line), start, end])
            end = start
        footer_candidates.append(lines)

    return footer_candidates

In [228]:
def scoreHeadersOrFooters(candidates):
    """
    Scores every candidate line by how similar it is to the line in the same position
    on neighbouring pages.

    ``candidates`` holds, per page, a list of at least five ``[line, start, end]``
    entries. The result holds, per page, five entries
    ``[candidate, page_index, line_index, score]``.
    """
    WIN = 8  # range of pages back and forth to compare
    weights = [1, 0.75, 0.5, 0.5, 0.5]
    page_count = len(candidates)
    scores = []
    for j in range(page_count):
        # NOTE(review): range() excludes its end, so page j+WIN and the document's last
        # page are never compared against - confirm whether that is intended.
        neighbours = range(max(0, j - WIN), min(j + WIN, page_count - 1))
        page_scores = []
        for i, weight in enumerate(weights):
            total = 0
            for k in neighbours:
                if k != j:
                    total += SequenceMatcher(None, candidates[j][i][0], candidates[k][i][0]).ratio()
                # NOTE(review): the weight is applied on every pass of this loop, so
                # weights below 1 compound. The thresholds used in this notebook were
                # chosen against this behaviour, so it is preserved as is.
                total = weight * total
            page_scores.append([candidates[j][i], j, i, total])
        scores.append(page_scores)
    return scores

In [229]:
def sortScores(scores, threshold=2.5, type = 'headers/footers'):
    """
    Returns, in page order, every scored line whose score (element 3) is strictly
    greater than ``threshold``. The selection is printed, labelled with ``type``.
    """
    deleteList = [line for page in scores for line in page if line[3] > threshold]
    print('Will delete these', type, ':', deleteList, '\n')
    return deleteList

In [230]:
def removeHeaders(text_with_headers, keeplist=[], droplist=[],thresh=2.9, hidden=True):
    """
    Detects repeated header lines across pages and removes them.

    Parameters
    ----------
    text_with_headers : list of string
        one string per page
    keeplist : list of string
        detected header lines (digits masked as '@') that must not be removed
    droplist : list of string
        extra strings to remove when searching inside the page text
    thresh : number
        similarity score a line must exceed to count as a header
    hidden : bool
        when true, also search every page for the header strings and remove
        occurrences that are not at the top of the page

    Returns
    -------
    list of string
        a new list of pages; the input list itself is not modified

    """
    possible_headers = findHeaders(text_with_headers)
    header_scores = scoreHeadersOrFooters (possible_headers)
    probable_headers = sortScores(header_scores, threshold=thresh, type='headers')
    # Apply keeplist before the positional deletion as well. Previously it only filtered
    # the hidden-text search, so kept lines were still cut from the top of the page.
    probable_headers = [x for x in probable_headers if x[0][0] not in keeplist]
    new_text = text_with_headers.copy()

    set_of_headers = set(x[0][0] for x in probable_headers) | set(droplist)
    # Longest first, then alphabetical: a set has no stable order, and removing a short
    # string first can leave fragments of a longer one that contains it.
    set_of_headers = sorted(set_of_headers, key=lambda item: (-len(item), item))
    new_text = deleteHeaders(new_text, probable_headers)
    if hidden:
        new_text = removeHeadersOrFootersHiddeninText (new_text,set_of_headers)

    return new_text

In [231]:
def deleteFooters(text_pages, footers_to_delete, keeplist=[]):
    """
    Cuts the given footer lines out of their pages, in place.

    Parameters
    ----------
    text_pages : list of string
        one string per page; modified in place
    footers_to_delete : list
        entries whose element 0 is ``[line, start, end]`` and whose element 1 is the
        page index. ``start`` is the index of the newline before the footer (-1 when
        the footer begins the page), ``end`` the index of the newline after it (-1 when
        the footer runs to the end of the page).
    keeplist : list of string
        footer lines to leave untouched

    Returns
    -------
    list of string
        the same ``text_pages`` list

    """
    spans = []
    seen = set()
    for line in footers_to_delete:
        text, start, end = line[0]
        if text in keeplist:
            # skip the entry completely; it must not reuse offsets from another entry
            continue
        span = (line[1], start, end)
        if span not in seen:  # the same span can be reported more than once
            seen.add(span)
            spans.append(span)

    # Delete from the back of each page first so earlier offsets stay valid.
    spans.sort(key=lambda span: span[1], reverse=True)

    for pagenum, start, end in spans:
        page = text_pages[pagenum]
        if start >= len(page):
            # offset no longer exists on this page
            continue
        # Walk back over the newlines in front of the footer. `cut` ends on the last
        # character to keep, hence the +1 in the slice (the previous slice dropped it).
        cut = start
        while cut >= 0 and page[cut] == '\n':
            cut -= 1
        tail = '' if end == -1 else page[end:]
        text_pages[pagenum] = page[:cut + 1] + tail
    return text_pages

In [232]:
def removeHeadersOrFootersHiddeninText(text_pages, possibleHiddenItems):
    """
    Removes header/footer strings that appear inside the body of a page.

    An item is removed from a page only when it occurs exactly once on that page.
    The item and the newlines directly around it are replaced by a single space.

    Parameters
    ----------
    text_pages : list of string
        one string per page; modified in place
    possibleHiddenItems : iterable of string
        strings to search for

    Returns
    -------
    list of string
        the same ``text_pages`` list

    """
    print(possibleHiddenItems, '\n')
    for option in possibleHiddenItems:
        if not option:
            # an empty string is "found" on every page and carries no information
            continue
        for i, page in enumerate(text_pages):
            position = page.find(option)
            # not present, or present more than once: leave the page alone
            if position == -1 or position != page.rfind(option):
                continue
            start = position
            end = start + len(option)
            while start > 0 and page[start-1] == '\n':
                start -= 1
            # `end` already points at the first character after the item, so that is the
            # one to test (testing end+1 could swallow a real character or leave a newline)
            while end < len(page) and page[end] == '\n':
                end += 1
            print(option, 'deleted from page', i, 'position', position)
            text_pages[i] = page[:start] + ' ' + page[end:]
    return text_pages

In [233]:
def findHeaders(pages):
    """
    Collects five header candidates per page by scanning forwards from the page start.

    Each candidate is ``[line, start, end]``: ``line`` is ``page[start:end]`` with every
    digit replaced by '@', ``start`` is the index of the line's first character and
    ``end`` is the index of the newline that ends it. Blank lines are skipped. Each
    search looks at most 200 characters ahead of ``start``.

    When no newline is found the candidate is ``['', start, -1]`` and the scan restarts
    from the top of the page, so short pages yield repeated candidates.

    Returns a list with one five-element candidate list per page.
    """
    header_candidates = []
    for page in pages:
        lines = []
        start = 0
        end = 0
        for _ in range(5):
            while end - start < 1:
                end = page.find('\n', start, start + 200)
                if end == -1:
                    # no newline within reach
                    break
                elif start == end:
                    # blank line: move past it and search again
                    start = end + 1
            if end == -1:
                line = ''
            else:
                # raw string: '\d' in a plain literal is an invalid escape sequence
                line = re.sub(r'\d', '@', page[start:end])
            lines.append([line, start, end])
            start = end + 1
        header_candidates.append(lines)

    return header_candidates

In [234]:
def deleteHeaders(text_pages, headers_to_delete, keeplist=[]):
    """
    Cuts the given header lines, and the newlines that follow them, out of their pages.

    Parameters
    ----------
    text_pages : list of string
        one string per page; modified in place
    headers_to_delete : list
        entries whose element 0 is ``[line, start, end]`` and whose element 1 is the
        page index. ``start`` is the index of the header's first character and ``end``
        the index of the newline that ends it. Entries with ``end == -1`` (no line was
        found) are ignored.
    keeplist : list of string
        header lines to leave untouched

    Returns
    -------
    list of string
        the same ``text_pages`` list

    """
    spans = []
    seen = set()
    for line in headers_to_delete:
        text, start, end = line[0]
        if text in keeplist or end == -1:
            # kept line, or a "nothing found" placeholder: nothing to delete
            continue
        span = (line[1], start, end)
        if span not in seen:  # the same span can be reported more than once
            seen.add(span)
            spans.append(span)

    # Offsets refer to the untouched page, so delete from the back first. Deleting the
    # first header first would shift every later header on that page.
    spans.sort(key=lambda span: span[1], reverse=True)

    for pagenum, start, end in spans:
        page = text_pages[pagenum]
        end += 1  # step past the newline that ends the header
        while end < len(page) and page[end] == '\n':
            end += 1
        text_pages[pagenum] = page[:start] + page[end:]
    return text_pages

### Converting Individual Documents

#### Original Group of PDFs

In [258]:
# Names in the 'PDFs' folder that end with 'npy'; the order is whatever os.listdir returns.
npyfiles  = [x for x in os.listdir('PDFs') if x.endswith('npy')]
npyfiles

['thesis (2).npy',
 'thesis (3).npy',
 '10.1007@s00300-017-2230-0.npy',
 'fevo-07-00076.npy',
 'greenslade2018.npy',
 'gen-2015-0194suppla.npy',
 'gen-2015-0194.npy',
 'thesis.npy',
 'thesis (1).npy']

In [259]:
# Per-document keep/drop lists used by the cleanAndSaveText calls below.
# Footer strings to drop (passed as footdroplist); the second is the first minus its last character.
fevofootdroplist=['Frontiers in Ecology and Evolution | www.frontiersin.org',
                  'Frontiers in Ecology and Evolution | www.frontiersin.or']
# Header strings to drop (passed as headdroplist).
fevoheaddroplist = ['Spatial Diversity of Antarctic Springtails',
                    ' | g 1', ' | g ', 
                   ]
# Header strings to drop (passed as headdroplist); includes the full notice and its fragments.
gen2015headdroplist = ['The 6th International Barcode of Life Conference Downloaded from cdnsciencepub.com by 151.210.131.92 on 11/30/20 For personal use only.',
                      'Genome Vol. 59, 2016', 'Beet et al.', 'The 6th International Barcode of Life Conference Downloaded from cdnscience',
                      'pub.com by 151.210.131.92 on 11/30/20','For personal use only.']
# Lines that must survive footer removal (passed as footkeeplist).
thesis2footkeeplist=['Cape Bird, Antarctica','Miers Valley, Antarctica','Cape Crozier, Antarctica',
                     'Granite Harbour, Antarctica','Marble Point, Antarctica','Antarctica']
# Lines that must survive header removal (passed as headkeeplist). '@' stands for a digit,
# matching the digit masking done in findHeaders/findFooters.
thesis2headkeeplist=['TABLE OF CONTENTS','TABLE OF UNITS AND ABBREVIATIONS','LIST OF TABLES',
                     'LIST OF FIGURES',  'LIST OF APPENDICES','CHAPTER @', 'Taxon','REFERENCES @@']

In [271]:
# cleanAndSaveText('PDFs', npyfiles[0], footkeeplist=thesis2footkeeplist, headkeeplist=thesis2headkeeplist)

In [272]:
# cleanAndSaveText('PDFs', npyfiles[1], footthreshold=9, hiddenHeaders=False) #some line breaks remain

In [273]:
# cleanAndSaveText('PDFs', npyfiles[2])

In [274]:
# cleanAndSaveText('PDFs', npyfiles[3], footdroplist=fevofootdroplist, headdroplist=fevoheaddroplist) 
# #some manual cleaning needed after running this 

In [275]:
# cleanAndSaveText('PDFs', npyfiles[4])

In [276]:
# cleanAndSaveText('PDFs', npyfiles[5])

In [277]:
# cleanAndSaveText('PDFs', npyfiles[6], headdroplist=gen2015headdroplist)

In [278]:
# cleanAndSaveText('PDFs', npyfiles[7], headthreshold=5)

In [279]:
# cleanAndSaveText('PDFs', npyfiles[8], footthreshold=5.5, hiddenFooters=False, headthreshold=10)

#### Second Group of PDFs

In [237]:
# Names in the 'FraserSet' folder that end with 'npy'; the order is whatever os.listdir returns.
npyfiles2  = [x for x in os.listdir('FraserSet') if x.endswith('npy')]
npyfiles2

['Archer2017_Article_EndolithicMicrobialDiversityIn.npy',
 'Fraser2018_Article_EvidenceOfPlantAndAnimalCommun.npy',
 'source.npy',
 'fmicb-10-01018.npy',
 's42003-018-0260-y.npy',
 'summer-activity-patterns-for-mosses-and-lichens-in-maritime-antarctica.npy',
 'fmicb-07-01642.npy']

In [239]:
# Header strings to drop (passed as headdroplist) for the second group of documents.
archerheaddroplist = ['Polar Biol (2017) 40:997—1006']

Fraser2018headdroplist = ['Polar Biol (2018) 41:417-421']

# NOTE(review): the second line repeats entries from thesis2headkeeplist - confirm they
# are meant to be dropped for this document.
sourceheaddroplist = ['T. C. Hawes', 'Downloaded by [University of Cambridge] at 05:22 08 April 2016',
                      'LIST OF FIGURES',  'LIST OF APPENDICES','CHAPTER @', 'Taxon','REFERENCES @@']

In [247]:
# cleanAndSaveText('FraserSet', npyfiles2[0], headdroplist=archerheaddroplist)

In [248]:
# cleanAndSaveText('FraserSet', npyfiles2[1], headthreshold=2, footthreshold=1, headdroplist=Fraser2018headdroplist)

In [249]:
# cleanAndSaveText('FraserSet', npyfiles2[2], headdroplist=sourceheaddroplist, headthreshold=1)
# #some additional headers deleted

In [250]:
# cleanAndSaveText('FraserSet', npyfiles2[3], headthreshold=2, footthreshold=1)

In [251]:
# cleanAndSaveText('FraserSet', npyfiles2[4])

In [252]:
# cleanAndSaveText('FraserSet', npyfiles2[5], footthreshold=2, 
#                  headthreshold=2, headdroplist=['BURKHARD SCHROETER et al.'])

In [253]:
# cleanAndSaveText('FraserSet', npyfiles2[6], headthreshold=2, footthreshold=2)

## Initial Process - Now replaced with new method 

This initial process had some flaws. Headers and footers were not removed, which caused issues with tokenisation and named entity recognition. Additionally, the Beautiful Soup tool used to clean the text removed entire pages.

### Extracting Text from PDF

In [100]:
def extractTextFromPDF(folder, file):
    """
    Legacy extraction: OCRs every page of a PDF and returns the whole document as one
    flattened UTF-8 byte string.

    The PDF is loaded as an image, converted to JPEG and passed page by page to the
    first OCR tool pyocr reports. The pages are joined with spaces, hyphenated line
    wraps are rejoined, and line breaks and double spaces are replaced by single spaces.
    The page count is printed before and after the JPEG conversion.
    """
    source = Image(filename=folder+'/'+file, resolution=600)
    print('number of pages:', len(source.sequence))
    jpeg_pages = source.convert('jpeg')
    print('number of pages:', len(jpeg_pages.sequence))
    del source
    ocr_tool = pyocr.get_available_tools()[0]

    encoded_pages = []
    for page in jpeg_pages.sequence:
        blob = Image(image=page).make_blob('jpeg')
        recognised = ocr_tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            builder=pyocr.builders.TextBuilder()
        )
        encoded_pages.append(recognised.encode('utf-8', 'ignore'))

    # flatten the pages into one byte string for the whole document
    flattened = b' '.join(encoded_pages)
    del encoded_pages

    substitutions = (
        (b'-\n', b''),     # rejoin words that wrap over two lines
        (b'\n', b' '),     # single returns become spaces
        (b'  ', b' '),     # double spaces become single spaces (one pass)
        (b'\n\n', b'\n'),  # kept from the original; no returns are left by this point
    )
    for pattern, replacement in substitutions:
        flattened = re.sub(pattern, replacement, flattened)
    return flattened

In [101]:
# Run the legacy extraction on one article; `document` is the flattened text as bytes.
document = extractTextFromPDF('PDFs',"10.1007@s00300-017-2230-0.pdf")

number of pages: 13
number of pages: 13
1
number of pages: 1
2
number of pages: 2
3
number of pages: 3
4
number of pages: 4
5
number of pages: 5
6
number of pages: 6
7
number of pages: 7
8
number of pages: 8
9
number of pages: 9
10
number of pages: 10
11
number of pages: 11
12
number of pages: 12
13
number of pages: 13
number of pages: 13


In [106]:
document[:1000].decode('utf-8')

"Polar Biology https://doi.org/10.1007/s00300-017-2230-0 ORIGINAL PAPER @® CrossMark Collembola of Barrientos Island, Antarctica: first census and assessment of environmental factors determining springtail distribution Natalia Enriquez' - Pablo Tejedo? - Javier Benayas? - Belén Albertos? - Maria José Luciafiez' Received: 21 October 2016 / Revised: 13 August 2017 / Accepted: 3 December 2017 © Springer-Verlag GmbH Germany, part of Springer Nature 2017 Abstract Barrientos Island is a small islet in the South Shetland archipelago frequently visited by Antarctic tourists. Collembola were recently used in another study developed in this site to assess the environmental conditions of two paths used by visitors, showing the importance of this soil faunal community. This motivated the realization of the first comprehensive census of Collembola from Barrientos Island. Fifty-six samples were recorded over three seasons, 2011-2013, from eight different substrate types. During the last campaign,"

### Further Preprocessing

In [33]:
from bs4 import BeautifulSoup as bs

Using Beautiful Soup to strip further unwanted characters from the document.

In [34]:
# Parse the extracted bytes with Beautiful Soup (no parser is named in this call).
soup = bs(document)

In [35]:
# Pull the plain text back out of the parsed document and preview the first 1000 characters.
raw = soup.get_text()
raw[:1000]

"Polar Biology\nhttps://doi.org/10.1007/s00300-017-2230-0\n\nORIGINAL PAPER\n\n@® CrossMark\n\nCollembola of Barrientos Island, Antarctica: first census\nand assessment of environmental factors determining springtail\ndistribution\n\nNatalia Enriquez' - Pablo Tejedo? - Javier Benayas? - Belén Albertos? - Maria José Luciafiez'\n\nReceived: 21 October 2016 / Revised: 13 August 2017 / Accepted: 3 December 2017\n© Springer-Verlag GmbH Germany, part of Springer Nature 2017\n\nAbstract\n\nBarrientos Island is a small islet in the South Shetland archipelago frequently visited by Antarctic tourists. Collembola were\nrecently used in another study developed in this site to assess the environmental conditions of two paths used by visitors,\nshowing the importance of this soil faunal community. This motivated the realization of the first comprehensive census of\nCollembola from Barrientos Island. Fifty-six samples were recorded over three seasons, 2011-2013, from eight different\nsubstrate types.

In [13]:
len(raw)

41087

In [14]:
# Cut the text at the last occurrence of 'References'. rfind returns -1 when the word is
# absent, and raw[:-1] would silently drop the final character, so only slice on a hit.
end = raw.rfind('References')
if end != -1:
    raw = raw[:end]

In [15]:
len(raw)

27828

In [16]:
type(raw)

str

### Extracting Text From .pdf Files in a Folder and Saving New .txt Files

In [7]:
def convertFolderofPDFsToText (folder):
    """
    Runs through a given folder and converts PDFs to cleaned text files
    
    Each PDF is passed to extractTextFromPDF, the result is run through Beautiful Soup's
    get_text(), and the text is written to a .txt file with the same base name.
    
    Parameters
    ----------
    folder : string
        location of PDF files to convert. (also target location for text files)
        
    Returns
    -------
    n/a
    
    """
    
    filelist = [x for x in os.listdir(folder) if x.endswith('pdf')]
    numFiles = len(filelist)
    for i, file in enumerate(filelist):
        # Swap only the trailing extension; an unanchored 'pdf' would also rewrite
        # "pdf" appearing elsewhere in the file name.
        txtfile = re.sub(r'pdf$', 'txt', file)
        print('Converting file',i+1, 'of', numFiles)
        try:
            extractedText = extractTextFromPDF(folder, file)
            # no parser is named here, so Beautiful Soup picks its default
            soup = bs(extractedText)
            raw = soup.get_text()
            # Open the output only once the text exists, so a failed extraction does not
            # leave an empty .txt behind, and close the file even if the write fails.
            with io.open(folder+'/'+txtfile,
                         encoding='utf-8',
                         mode='w') as f:
                f.write(raw)
        finally:
            # remove leftover ImageMagick temp files whether or not the file converted
            deleteMagick.clean()

In [8]:
def convertPDFtoText (folder, file):
    """
    Converts one PDF to a cleaned text file
    
    The text returned by extractTextFromPDF is written as-is (no Beautiful Soup pass)
    to a .txt file with the same base name as the PDF.
    
    Parameters
    ----------
    folder : string
        location of PDF files to convert. (also target location for text files)
    file : string
        name of file to convert
        
    Returns
    -------
    n/a
    
    """
    
    # Swap only the trailing extension, not every "pdf" in the name.
    txtfile = re.sub(r'pdf$', 'txt', file)
    try:
        extractedText = extractTextFromPDF(folder, file)
        # extractTextFromPDF returns UTF-8 bytes and the file is opened in text mode, so
        # decode first. The previous code wrote `raw`, a name this function never
        # assigns, so it either failed or wrote text from some other notebook cell.
        with io.open(folder+'/'+txtfile,
                     encoding='utf-8',
                     mode='w') as f:
            f.write(extractedText.decode('utf-8'))
    finally:
        # remove leftover ImageMagick temp files whether or not the file converted
        deleteMagick.clean()

In [34]:
# Convert every PDF in 'smallPDFs' with the legacy folder pipeline.
convertFolderofPDFsToText('smallPDFs')

Converting file 1 of 3
Converting file 2 of 3
Converting file 3 of 3


In [35]:
# Convert every PDF in 'FraserSet' with the legacy folder pipeline.
convertFolderofPDFsToText('FraserSet')

Converting file 1 of 7
Converting file 2 of 7
Converting file 3 of 7
Converting file 4 of 7
Converting file 5 of 7
Converting file 6 of 7
Converting file 7 of 7


In [17]:
# convertPDFtoText('PDFs', 'fevo-07-00076.pdf')

In [18]:
# convertPDFtoText('PDFs', 'thesis (3).pdf')

In [37]:
# convertFolderofPDFsToText('PDFs')

In [22]:
# convertPDFtoText('PDFs', 'gen-2015-0194.pdf')

In [None]:
# convertPDFtoText('PDFs', 'thesis.pdf')

In [None]:
# convertPDFtoText('PDFs', 'thesis (1).pdf')

In [None]:
# convertPDFtoText('PDFs', 'thesis (2).pdf')

## Sandbox

In [1017]:
def saveArraysAsText(folder):
    """
    Writes every saved page array (.npy) in ``folder`` out as a text file, without
    removing headers or footers.

    Pages are joined with a '!!!PAGE BREAK!!!' marker. Hyphenated line wraps are
    rejoined, single returns become spaces, runs of spaces are collapsed and double
    returns (paragraph breaks) are preserved.

    Parameters
    ----------
    folder : string
        location of the .npy files (also target location for the text files)

    Returns
    -------
    n/a

    """
    filelist = [x for x in os.listdir(folder) if x.endswith('npy')]
    numFiles = len(filelist)
    for i, file in enumerate(filelist):
        data = np.load(folder+'/'+file)
        # Swap only the trailing extension, not every "npy" in the name.
        txtfile = re.sub(r'npy$', 'txt', file)
        print('Converting file',i+1, 'of', numFiles)

        doc_text = b'\n\n!!!PAGE BREAK!!!\n\n'.join(data)

        #remove breaks from words that wrap over two lines
        doc_text = re.sub(b'-\n', b'', doc_text)

        #protect double returns with a placeholder
        doc_text = re.sub(b'\n\n', b'<DOUBLERETURN>', doc_text)

        #replace single returns with spaces
        doc_text = re.sub(b'\n', b' ', doc_text)

        #collapse every run of spaces to a single space
        doc_text = re.sub(b' {2,}', b' ', doc_text)

        #restore the double returns
        doc_text = re.sub(b'<DOUBLERETURN>', b'\n\n', doc_text)

        # Open the output only once the text is ready, and close it even if the write fails.
        with io.open(folder+'/'+txtfile,
                     encoding='utf-8',
                     mode='w') as f:
            f.write(doc_text.decode())

In [None]:
def loadAndSaveArrayAsText(folder, file):
    """
    Writes one saved page array (.npy) out as a text file, without removing headers
    or footers.

    Pages are joined with a '!!!PAGE BREAK!!!' marker. Hyphenated line wraps are
    rejoined, single returns become spaces, runs of spaces are collapsed and double
    returns (paragraph breaks) are preserved.

    Parameters
    ----------
    folder : string
        location of the .npy file (also target location for the text file)
    file : string
        name of the .npy file to convert

    Returns
    -------
    n/a

    """
    data = np.load(folder+'/'+file)
    # Swap only the trailing extension, not every "npy" in the name.
    txtfile = re.sub(r'npy$', 'txt', file)
    # The previous message used `i` and `numFiles`, which do not exist in this function.
    print('Converting file', file)

    doc_text = b'\n\n!!!PAGE BREAK!!!\n\n'.join(data)

    #remove breaks from words that wrap over two lines
    doc_text = re.sub(b'-\n', b'', doc_text)

    #protect double returns with a placeholder
    doc_text = re.sub(b'\n\n', b'<DOUBLERETURN>', doc_text)

    #replace single returns with spaces
    doc_text = re.sub(b'\n', b' ', doc_text)

    #collapse every run of spaces to a single space
    doc_text = re.sub(b' {2,}', b' ', doc_text)

    #restore the double returns
    doc_text = re.sub(b'<DOUBLERETURN>', b'\n\n', doc_text)

    # Open the output only once the text is ready, and close it even if the write fails.
    with io.open(folder+'/'+txtfile,
                 encoding='utf-8',
                 mode='w') as f:
        f.write(doc_text.decode())

In [109]:
def extractTextFromPDF_test(folder, file):
    """
    Sandbox variant of the extraction step: OCRs each page of a PDF and returns the
    pages as a list of UTF-8 encoded byte strings, without flattening or saving anything.
    """
    source = Image(filename=folder+'/'+file, resolution=600)
    jpeg_pages = source.convert('jpeg')
    del source
    ocr_tool = pyocr.get_available_tools()[0]

    def ocr_page(page):
        # one page image -> JPEG blob -> OCR text -> UTF-8 bytes
        blob = Image(image=page).make_blob('jpeg')
        recognised = ocr_tool.image_to_string(
            PI.open(io.BytesIO(blob)),
            builder=pyocr.builders.TextBuilder()
        )
        return recognised.encode('utf-8', 'ignore')

    return [ocr_page(page) for page in jpeg_pages.sequence]