In [38]:
import pdfplumber
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
import re
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import bookChapterPageNumbers as chaps

In [28]:
# get counts and remove stop words
def vectorizeText(inputText):
    """Print a bag-of-words count table for the given text.

    English stop words are dropped by the vectorizer itself.

    Parameters
    ----------
    inputText : str or iterable of str
        The document(s) to count. A single string is treated as one document.

    Returns
    -------
    None
        The frame summary, its columns and the frame itself are printed.
    """
    # CountVectorizer rejects a bare string (it expects an iterable of
    # documents), so wrap a single string into a one-document list.
    if isinstance(inputText, str):
        inputText = [inputText]

    cv = CountVectorizer(stop_words='english')
    X = cv.fit_transform(inputText)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and only fall back on installs that predate it.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()

    df_cv = pd.DataFrame(X.toarray(), columns=feature_names)
    df_cv.info()
    print(df_cv.columns)
    print(df_cv)

# get term frequencies and remove stop words
def vectorizeTextIDF(inputText):
    """Print a TF-IDF weight table for the given text.

    English stop words are dropped by the vectorizer itself.

    Parameters
    ----------
    inputText : str or iterable of str
        The document(s) to weight. A single string is treated as one document.

    Returns
    -------
    None
        The frame summary, its columns and the frame itself are printed.
    """
    # TfidfVectorizer rejects a bare string (it expects an iterable of
    # documents), so wrap a single string into a one-document list.
    if isinstance(inputText, str):
        inputText = [inputText]

    cv_tfidf = TfidfVectorizer(stop_words='english')
    # Fit on the argument; the previous code read an unrelated global
    # (page_sentences) and ignored what the caller passed in.
    X_tfidf = cv_tfidf.fit_transform(inputText)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and only fall back on installs that predate it.
    if hasattr(cv_tfidf, 'get_feature_names_out'):
        feature_names = cv_tfidf.get_feature_names_out()
    else:
        feature_names = cv_tfidf.get_feature_names()

    df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
    df_tfidf.info()
    print(df_tfidf.columns)
    print(df_tfidf)


def cleanText(page_text):
    """Normalise raw page text into a list of stemmed word tokens.

    Steps, in order: replace ASCII punctuation with spaces, lower-case,
    blank out any token containing a digit, word-tokenize, keep only purely
    alphabetic tokens, drop stop words, and Porter-stem what remains.

    Parameters
    ----------
    page_text : str
        Text to clean.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # remove punctuation (every character in string.punctuation becomes a space)
    clean_text = re.sub('[%s]' % re.escape(string.punctuation), ' ', page_text)
    # make lower case
    clean_text = clean_text.lower()

    # remove numbers, including words that merely contain a digit.
    # Raw string: '\w' and '\d' are invalid escapes in a normal literal and
    # trigger a warning on recent Python versions.
    clean_text = re.sub(r'\w*\d\w*', ' ', clean_text)

    # tokenize into individual words. Sentence tokenizing is not possible at
    # this point because the punctuation it relies on has already been removed.
    clean_text = word_tokenize(clean_text)

    # drop any token that is not purely alphabetic (e.g. punctuation that
    # survived the substitution above and came back as its own token)
    clean_text = [word for word in clean_text if word.isalpha()]

    # remove stop words (can do this manually or automatically with the vectorizer)
    clean_text = [word for word in clean_text if word not in stop_words]

    # stem words
    clean_text = [porter.stem(word) for word in clean_text]

    return clean_text
    

    

In [43]:
# PDF files to analyse, one path per book.
books = [
    "../StormlightArchiveBooks/The_Way_of_Kings_The_Stormlight_Archive_Book_1.pdf",
]
# Chapter page tables from bookChapterPageNumbers; entry i is used for books[i].
book_chapters_pages = [
    chaps.book_1_chapter_pages,
    chaps.book_2_chapter_pages,
    chaps.book_3_chapter_pages,
    chaps.book_4_chapter_pages,
]

In [47]:
# Report, for every chapter of every book, the span of pdf.pages indices it covers.
for i, book in enumerate(books):
    with pdfplumber.open(book) as pdf:
        # chapter page table that belongs to this book
        this_book_chaps_pages = book_chapters_pages[i]

        # each chapter runs from its own table entry up to the entry after it,
        # so the final entry only serves as an end marker
        for chap in range(len(this_book_chaps_pages) - 1):
            # pdf.pages starts at 0, so shift the page number down by one
            startingPage = this_book_chaps_pages[chap] - 1
            # shift down by one for indexing and by one more so the first
            # page of the next chapter is not included
            endingPage = this_book_chaps_pages[chap + 1] - 2
            print('chap: ', chap, ' starting page:', startingPage, ' ending page: ', endingPage)
    
        
        
#         this_page = pdf.pages[17]
#         page_text = this_page.extract_text()
#         print(page_text)

#         cleaned_page_text = cleanText(page_text)
#         print(cleaned_page_text)

#         recombined_text = ' '.join(cleaned_page_text)
#         print(recombined_text)

    #     vectorizeText(recombined_text)
    

chap:  0  starting page: 12  ending page:  17
chap:  1  starting page: 18  ending page:  36
chap:  2  starting page: 37  ending page:  50
chap:  3  starting page: 51  ending page:  66
chap:  4  starting page: 67  ending page:  82
chap:  5  starting page: 83  ending page:  95
chap:  6  starting page: 96  ending page:  109
chap:  7  starting page: 110  ending page:  131
chap:  8  starting page: 132  ending page:  150
chap:  9  starting page: 151  ending page:  172
chap:  10  starting page: 173  ending page:  180
chap:  11  starting page: 181  ending page:  188
chap:  12  starting page: 189  ending page:  200
chap:  13  starting page: 201  ending page:  207
chap:  14  starting page: 208  ending page:  212
chap:  15  starting page: 213  ending page:  220
chap:  16  starting page: 221  ending page:  244
chap:  17  starting page: 245  ending page:  255
chap:  18  starting page: 256  ending page:  267
chap:  19  starting page: 268  ending page:  296
chap:  20  starting page: 297  ending page: