This notebook reads in all the books and pre-processes them (cleaning, tokenizing, stop-word removal and lemmatizing) so that they are ready for different ML models. It then exports the processed data to a CSV file (`all_books.csv`) that the other notebooks load for that analysis.

In [4]:
# import all needed libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

In [95]:
# class that processes each book
class Book():
    """One book loaded from an ``<author>/<title>.txt`` path and pre-processed.

    The pipeline is run step by step by the caller, in this order:
    ``read`` -> ``process`` -> ``tokenize`` -> ``lemmatize`` -> ``add_book``.
    Each step stores its result on the instance and the next step reads it.

    ``tokenize`` and ``lemmatize`` rely on the module-level names
    ``word_tokenize``, ``stopwords`` and ``lemmatizer``; ``all_books`` relies
    on ``pd``.
    """

    # Registry shared by every instance: one dict per book added via add_book().
    _all_books = []

    # Every character in this string is deleted from each line by process().
    _SYMBOLS = """“”<>{}[]&*^%$#@~`_+=-""'':;(),.?!''\n"""
    # Deletion table built once so process() makes a single pass per line.
    _SYMBOL_TABLE = str.maketrans('', '', _SYMBOLS)

    # Column order of the DataFrame returned by all_books().
    _COLUMNS = ['author', 'title', 'text']

    def __init__(self,filepath):
        """Record the path and derive author and title from it.

        Args:
            filepath: '/'-separated path whose first component is the author
                directory and whose second component is the book file name,
                e.g. ``'austen/Emma-JA.txt'``.

        Raises:
            IndexError: if ``filepath`` contains no ``'/'``.
        """
        # Kept on the instance so read() does not depend on a global variable.
        self.filepath = filepath
        parts = filepath.split('/')
        self.author = parts[0]
        file_name = parts[1]
        # Remove the extension as a suffix. str.rstrip('.txt') strips any
        # trailing '.', 't' or 'x' characters, which turned 'cardbox.txt'
        # into 'cardbo'.
        if file_name.endswith('.txt'):
            file_name = file_name[:-len('.txt')]
        self.title = file_name

    def read(self):
        """Load the book's lines (newlines included) into ``self.text``."""
        with open(self.filepath) as f:
            self.text = f.readlines()

    def process(self):
        """Delete punctuation/formatting characters and lower-case each line.

        Reads ``self.text`` and stores the cleaned lines, in the same order,
        in ``self.cleaned_book``.
        """
        self.cleaned_book = [
            line.translate(self._SYMBOL_TABLE).lower() for line in self.text
        ]

    def tokenize(self):
        """Tokenize the cleaned text and drop English stop words.

        Joins ``self.cleaned_book`` with single spaces, tokenizes it with
        ``word_tokenize`` and stores the tokens that are not in NLTK's English
        stop-word list in ``self.filtered_book``.
        """
        tokens = word_tokenize(' '.join(self.cleaned_book))
        stop_words = set(stopwords.words("english"))
        self.filtered_book = [w for w in tokens if w not in stop_words]

    def lemmatize(self):
        """Lemmatize every token of ``self.filtered_book``.

        Uses the module-level ``lemmatizer`` with no part-of-speech argument
        and stores the result in ``self.lemmatized_book``.
        """
        self.lemmatized_book = [
            lemmatizer.lemmatize(w) for w in self.filtered_book
        ]

    def add_book(self):
        """Append this book's title, author and lemmatized text to the registry.

        Calling it more than once for the same book adds it more than once.
        """
        Book._all_books.append({
            'title':self.title,
            'author':self.author,
            'text':self.lemmatized_book,
        })

    @classmethod
    def all_books(cls):
        """Return every registered book as a DataFrame.

        Returns:
            A ``pd.DataFrame`` with the columns ``author``, ``title``, ``text``
            (one row per add_book() call); an empty frame with those columns
            when nothing has been registered yet.
        """
        # Passing columns= selects and orders the dict keys, and also yields a
        # well-formed empty frame instead of a KeyError on an empty registry.
        return pd.DataFrame(cls._all_books, columns=cls._COLUMNS)

In [96]:
# Build the list of book paths from list.txt (one path per line; the list can be
# generated with bash). Adjust PROJECT_PREFIX to your own checkout so that each
# entry is reduced to the '<author>/<title>.txt' form.
PROJECT_PREFIX = "/Users/<myname>/mod3_project/"

# The with-block closes the file handle once the lines have been read.
with open("list.txt", "r") as f:
    path_list = [line.replace(PROJECT_PREFIX, "").replace("\n", "") for line in f]

# Drop blank lines (e.g. a trailing empty line) so they are not treated as paths.
path_list = [path for path in path_list if path.strip()]

In [97]:
# Reads in all books and runs each one through the full Book pipeline.
# The shared registry is emptied first so that re-running this cell does not
# append every book a second time (which would duplicate rows in the export).
Book._all_books.clear()

# NOTE: the loop variable is deliberately still called `filepath`; renaming it is
# only safe if Book.read() does not look up a global of that name.
for filepath in path_list:
    book = Book(filepath)
    print(filepath)  # progress log: one line per book
    book.read()
    book.process()
    book.tokenize()
    book.lemmatize()
    book.add_book()

austen/Emma-JA.txt
austen/LS-JA.txt
austen/Mansfield-JA.txt
austen/Northranger_Abbey-JA.txt
austen/PP_JA.txt
austen/Persuasion-JA.txt
austen/SS-JA.txt
dickens/dickens-american-631.txt
dickens/dickens-battle-630.txt
dickens/dickens-childs-629-copy.txt
dickens/dickens-chimes-379.txt
dickens/dickens-christmas-125.txt
dickens/dickens-cricket-127.txt
dickens/dickens-david-626.txt
dickens/dickens-dombey-622.txt
dickens/dickens-hard-625.txt
dickens/dickens-haunted-633.txt
dickens/dickens-holiday-623.txt
dickens/dickens-hunted-624.txt
dickens/dickens-master-634.txt
dickens/dickens-mystery-636.txt
dickens/dickens-old-628.txt
dickens/dickens-oliver-627.txt
dickens/dickens-pickwick-635.txt
dickens/dickens-pictures-632.txt
dickens/dickens-tale-126.txt
doyle/agrange.txt
doyle/b-p_plan.txt
doyle/bascombe.txt
doyle/beryl.txt
doyle/blanced.txt
doyle/blkpeter.txt
doyle/bluecar.txt
doyle/cardbox.txt
doyle/caseide.txt
doyle/charles.txt
doyle/copper.txt
doyle/creeping.txt
doyle/crookman.txt
doyle/danceman

In [98]:
# Sanity check: display the combined author/title/text table to make sure the
# processed data looks ok.
Book.all_books()

Unnamed: 0,author,title,text
0,austen,Emma-JA,"[volume, chapter, emma, woodhouse, handsome, c..."
1,austen,LS-JA,"[lady, susan, vernon, mr, vernon, langford, de..."
2,austen,Mansfield-JA,"[chapter, thirty, year, ago, miss, maria, ward..."
3,austen,Northranger_Abbey-JA,"[advertisement, authoress, northanger, abbey, ..."
4,austen,PP_JA,"[chapter, 1, truth, universally, acknowledged,..."
5,austen,Persuasion-JA,"[chapter, 1, sir, walter, elliot, kellynch, ha..."
6,austen,SS-JA,"[chapter, 1, family, dashwood, long, settled, ..."
7,dickens,dickens-american-631,"[chapter, going, away, shall, never, forget, o..."
8,dickens,dickens-battle-630,"[chapter, part, first, upon, time, matter, lit..."
9,dickens,dickens-childs-629-copy,"[chapter, ancient, england, roman, look, map, ..."


In [108]:
# Exports all books to one csv ('all_books.csv') for use in other notebooks for analysis.
# NOTE(review): to_csv is called with its defaults, so the row index is written as an
# extra unnamed first column and each 'text' list is written in its string form --
# presumably the downstream notebooks account for both; confirm before changing this.
Book.all_books().to_csv('all_books.csv')

In [100]:
# Bind the combined author/title/text DataFrame to `df`.
df = Book.all_books()