In [1]:
import os
import re
import io
import pickle
import pandas as pd

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [2]:
# Locations of the raw inputs, relative to the notebook's working directory
input_dir = 'textbooks/'
metadata_file = 'metadata.csv'

metadata = pd.read_csv(metadata_file)

# Each sub-directory of input_dir is treated as one ISBN. Plain files that
# happen to live next to them (e.g. OS metadata files) are ignored, because
# later cells call os.listdir() on every entry. Sorting makes the processing
# order reproducible, since os.listdir() returns entries in arbitrary order.
isbns = sorted(
    entry for entry in os.listdir(input_dir)
    if os.path.isdir(os.path.join(input_dir, entry))
)

In [3]:
# Functions for extracting data from the PDFs
def normalize_whitespace(s):
    """Collapse every run of whitespace characters in ``s`` into one space.

    Leading and trailing runs are collapsed to a single space as well; they
    are not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', s)

def get_toc(pdf_path):
    """Return the outline (bookmarks) of a PDF as ``(level, title)`` tuples.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of ``(level, title)`` pairs in the order produced by
        ``PDFDocument.get_outlines()``.

    Raises:
        Nothing is caught here: errors from ``open``, from parsing the
        document, or from ``get_outlines()`` propagate to the caller.
        NOTE(review): pdfminer is documented to raise ``PDFNoOutlines`` for
        documents without an outline -- confirm whether callers would rather
        receive an empty list.
    """
    # The context manager guarantees the file handle is released even when
    # parsing fails; previously the handle was never closed.
    with open(pdf_path, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))
        # Build the full list before leaving the `with` block so that the
        # outline is completely read while the file is still open.
        return [
            (level, title)
            for (level, title, _dest, _action, _structelem) in document.get_outlines()
        ]

def convert_pdf_to_txt(pdf_path):
    """Extract the text of a PDF, one whitespace-normalized string per page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of strings, one per page in the order yielded by
        ``PDFPage.get_pages``; each string has been passed through
        ``normalize_whitespace``.

    Raises:
        Nothing is caught here: errors from opening or processing the PDF
        propagate to the caller, after the converter and the text buffer
        have been closed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pages_text = []
    try:
        # `with` closes the PDF even if a page fails to process
        with open(pdf_path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                # Remember where this page's text will start in the shared
                # buffer (zero on the first page)
                page_start = retstr.tell()

                # Render the page; the converter appends its text to the buffer
                interpreter.process_page(page)

                # Rewind to the start of this page and read to the end, which
                # also leaves the cursor at the end for the next page's writes
                retstr.seek(page_start, 0)
                pages_text.append(normalize_whitespace(retstr.read()))
    finally:
        # Release the converter and buffer on both success and failure
        device.close()
        retstr.close()

    return pages_text

In [4]:
# Names of the per-ISBN cache files written next to each PDF
toc_file = 'toc.pkl'
text_file = 'text.pkl'

for isbn in isbns:

    isbn_dir = os.path.join(input_dir, isbn)
    if not os.path.isdir(isbn_dir):
        # Stray files inside input_dir are not ISBN folders
        continue
    isbn_files = os.listdir(isbn_dir)

    # Filter to PDFs by extension. The previous pattern '.pdf' had an
    # unescaped dot and no anchor, so it matched any name containing
    # "<any char>pdf". Sorting makes the choice of PDF deterministic.
    pdf_files = sorted(fn for fn in isbn_files if fn.lower().endswith('.pdf'))
    if not pdf_files:
        print("No PDF found for ISBN: {}; skipping".format(isbn))
        continue
    pdf_file = pdf_files[0]

    filepath = os.path.join(isbn_dir, pdf_file)

    # Each output is cached on disk; extraction only runs when the
    # corresponding pickle is missing, so re-running the cell is cheap.
    if toc_file not in isbn_files:
        # Get the table of contents
        print("Extracting table of contents for ISBN: {}".format(isbn))
        toc = get_toc(filepath)
        with open(os.path.join(isbn_dir, toc_file), 'wb') as fp:
            pickle.dump(toc, fp, protocol=pickle.HIGHEST_PROTOCOL)

    if text_file not in isbn_files:
        # Extract the text from the PDF
        print("Extracting text for ISBN: {}".format(isbn))
        text_list = convert_pdf_to_txt(filepath)
        with open(os.path.join(isbn_dir, text_file), 'wb') as fp:
            pickle.dump(text_list, fp, protocol=pickle.HIGHEST_PROTOCOL)

Extracting text for ISBN: 9781429219617
Extracting table of contents for ISBN: 9781429242301
Extracting text for ISBN: 9781429242301
Extracting table of contents for ISBN: 9781429298643
Extracting text for ISBN: 9781429298643
Extracting table of contents for ISBN: 9781429298902
Extracting text for ISBN: 9781429298902
Extracting table of contents for ISBN: 9781464126147
Extracting text for ISBN: 9781464126147
Extracting table of contents for ISBN: 9781464135958
Extracting text for ISBN: 9781464135958
Extracting table of contents for ISBN: 9781464140815
Extracting text for ISBN: 9781464140815
Extracting table of contents for ISBN: 9781464154072
Extracting text for ISBN: 9781464154072
Extracting table of contents for ISBN: 9781464171703
Extracting text for ISBN: 9781464171703
