In [1]:
import re
import numpy as np
from pathlib import Path
from bs4 import BeautifulSoup
from nltk import tokenize, download
from pdfminer.high_level import extract_text
from textwrap import TextWrapper

In [2]:
# PDF to process; a relative path, so it is looked up from the current working directory.
file_path = 'thinkdsp.pdf'

In [3]:
def read_pdf(file_path, max_char_len=140):
    """Extract the text of a PDF as a flat list of width-limited phrases.

    The extracted text is split on newlines; each line has em dashes
    replaced by hyphens and all non-ASCII characters removed, is split
    into sentences, and every sentence is wrapped so that no phrase is
    longer than ``max_char_len`` characters.

    Parameters
    ----------
    file_path : str
        Path of the PDF to read. It is passed to ``extract_text`` and its
        file name is used to build the returned title.
    max_char_len : int, optional
        Maximum width of each returned phrase. Defaults to 140, the value
        that used to be hard-coded.

    Returns
    -------
    text_list : list of list of str
        A single-element outer list whose only item contains every phrase
        in document order.
    txt_title : str
        File name without its extension, lower-cased, with spaces replaced
        by underscores.
    """
    # Requested on every call; quiet=True suppresses the progress output.
    download('punkt', quiet=True)
    wrapper = TextWrapper(max_char_len, fix_sentence_endings=True)

    book = extract_text(file_path)

    # NOTE(review): the extracted PDF text is run through an HTML parser;
    # confirm that text resembling markup (e.g. containing '<') survives.
    input_text = BeautifulSoup(book, "html.parser").text

    phrases = []
    for paragraph in input_text.split('\n'):
        # Keep em dashes as plain hyphens before dropping non-ASCII characters.
        paragraph = paragraph.replace('—', '-')
        paragraph = re.sub(r'[^\x00-\x7f]', "", paragraph)

        # Wrap each sentence to the maximum character limit and collect the
        # resulting phrases directly into one flat list.
        for sentence in tokenize.sent_tokenize(paragraph):
            phrases.extend(wrapper.wrap(sentence))

    # Parse out title from imported file path
    txt_title = Path(file_path).stem.lower().replace(' ', '_')

    # Callers expect the phrases nested inside a one-element outer list.
    return [phrases], txt_title

In [4]:
# Extract the wrapped phrases and the title derived from the file name.
corpus, title = read_pdf(file_path)

In [5]:
# The outer list has a single entry; the inner list holds every wrapped phrase.
np.shape(corpus)

(1, 195)

In [9]:
# Spot-check the second phrase extracted from the document.
corpus[0][1]

'Digital Signal Processing in Python'

In [10]:
# Title is the file name stem, lower-cased, with spaces replaced by underscores.
title

'thinkdsp'

In [12]:
# NOTE(review): this displays the whole nested list; a slice such as
# corpus[0][:20] would keep the cell output short.
corpus

[['Think DSP',
  'Digital Signal Processing in Python',
  'Version 1.1.4',
  '  Think DSP',
  'Digital Signal Processing in Python',
  'Version 1.1.4',
  'Allen B. Downey',
  'Green Tea Press',
  'Needham, Massachusetts',
  ' Copyright c(cid:13) 2014 Allen B. Downey.',
  'Green Tea Press',
  '9 Washburn Ave',
  'Needham MA 02492',
  'Permission is granted to copy, distribute, and/or modify this document under',
  'the terms of the Creative Commons Attribution-NonCommercial-ShareAlike',
  '4.0 International License, which is available at http://creativecommons.',
  'org/licenses/by-nc-sa/4.0/.',
  'The LATEX source for this book is available from http://think-dsp.com.',
  ' Preface',
  'Signal processing is one of my favorite topics.',
  'It is useful in many areas',
  'of science and engineering, and if you understand the fundamental ideas, it',
  'provides insight into many things we see in the world, and especially the things',
  'we hear.',
  'But unless you studied electrical or me

In [13]:
# Regression checks pinned to this specific PDF and the current wrapping settings.
assert title == "thinkdsp"
# One outer list containing 195 phrases.
assert np.shape(corpus) == (1, 195)
# First two phrases of the extracted text.
assert corpus[0][0] == 'Think DSP'
assert corpus[0][1] == 'Digital Signal Processing in Python'