In [144]:
from PyPDF2 import PdfReader
import os
import pandas as pd

In [129]:
class process_pdf:
    def __init__(self,path) -> None:
        self.path = path

    def create_outline(self):
        '''Get outline from PyPDF2 reader object. Note: the outline contains only top 1 level and ingores sublevels under the chapters.
        Args:
        reader
        Returns:
        (list) : titles of chapters
        '''
        outline_first_level = []
        for element in self.reader.outline:
            if isinstance(element, dict):
                outline_first_level.append(element["/Title"]) 
        return outline_first_level


    def pdf_to_txt_by_chapters(self):
        '''Reads textbooks in pdf format and convert them into txt and create dataset as output containing chapters.
        Args:
        path: where multiple textbooks in pdf format are stored
        Returns:
        (dataframe):
                columns:
                    filename (str) - name of pdf file
                    author (str) - author of a textbook
                    title (str) - title of a textbook
                    chapter title (str) - title of a chapter of a textbook
                    text (str) - text of a chapter
        '''
        data = pd.DataFrame(columns = ["filename","author","title","chapter_title","text"])
        i=-1
        all_chapters_num = 0

        for book in os.listdir(self.path):
            self.reader = PdfReader(self.path+book)
            try:
                author = self.reader.getDocumentInfo()["/Author"]
            except:
                author = "unknown"
            try:
                title = self.reader.getDocumentInfo()["/Title"]
            except:
                title = "unknown"

            outline_first_level = self.create_outline()

            all_chapters_num+=len(outline_first_level)

            chapter_num = 0
            chapter_title = outline_first_level[chapter_num]
            intro = False

            for page in self.reader.pages:
                page_text = page.extract_text() + "\n"
                excerpt = page_text[:100].replace("\n"," ")
                excerpt = excerpt.replace(".","")
                excerpt = excerpt.upper()

                if "CONTENT" in chapter_title.upper():
                    #i+=1
                    #data.loc[i,:] = [book[:-4], author, title, chapter_title, page_text]
                    chapter_num+=1
                    chapter_title = outline_first_level[chapter_num]
                    #intro = True            

                elif (chapter_title.upper() in excerpt) and ("CONTENT" not in excerpt): 
                    i+=1
                    data.loc[i,:] = [book[:-4], author, title, chapter_title, page_text]
                    chapter_num+=1
                    if chapter_num >= len(outline_first_level):
                        pass
                    else:
                        chapter_title = outline_first_level[chapter_num]
                    intro = True
                if intro:
                    data.loc[i,"text"]+=page_text
        print("Total number of chapters: ", all_chapters_num)

        return data

In [130]:
path = "../dat/books/all_books/"
processpdf = process_pdf(path)
data = processpdf.pdf_to_txt_by_chapters()

Total number of chapters:  105


In [132]:
data = (data.loc[
        ~data.chapter_title.str.contains
        ("|".join(['Preface', 'Acknowledgements','Acknowledgments','Appendix','Some Prerequisite Topics', 'Exercise',
       'Photo Credits', 'Index',"A Complex Numbers", 'B Proofs','C Mathematical Induction', 'D Polynomials',
       'Online Resources', 'Midterm', 'Sample Final Exam', 'Movie Scripts']))
        ]
    )
data.shape
data.to_json("data_chapters.json")

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import re

In [142]:
class utils:
    def __init__(self) -> None:
        ...

    @staticmethod
    def remove_repeated_sentences(text, times = 4):
        text_tokenized = sent_tokenize(text)
        freqs = Counter(text_tokenized)
        garbage = [el for el in freqs.elements() if freqs[el]>=times]
        text = [i for i in text_tokenized if i not in garbage]
        return " ".join(text)

    #def remove_formulas(text):
    #    return 
    def remove_new_line(text):
        return re.sub("\n"," ",text)
        
    def text_only(text):
        regex=u"[A-Za-z]+"
        regex = re.compile(regex)
        text = " ".join(regex.findall(text))
        return text

    def remove_stopwords(text):
        text = text.lower()
        word_tokens = word_tokenize(text)
        stop_words = set(stopwords.words('english'))
        return " ".join([w for w in word_tokens if not w in stop_words])

    def lemmatize_text(text):
        lemmatizer = WordNetLemmatizer()
        word_tokens = word_tokenize(text)
        return " ".join([lemmatizer.lemmatize(w) for w in word_tokens])

In [147]:
data.text = data.text.apply(utils.remove_repeated_sentences)
data.text = data.text.apply(utils.remove_new_line)

In [148]:
data.text

2      Chapter SLE Systems of Linear Equations We wil...
3      Chapter V Vectors We have worked extensively i...
4      Chapter M Matrices We have made frequent use o...
5      Chapter VS Vector Spaces We now have a computa...
6      Chapter D Determinants The determinant is a fu...
                             ...                        
111    7. Linear Transformations IfVandWare vector sp...
112    8. Orthogonality In Section 5.3we introduced t...
113    9. Change of Basis IfAis an m×nmatrix, the cor...
114    10. Inner Product Spaces 10.1 Inner Products a...
115    11. Canonical Forms Given a matrix A, the effe...
Name: text, Length: 76, dtype: object