This notebook reads in all the books and pre-processes them so they are ready for running different ML models on the data. It then exports the processed data to a CSV file, which other notebooks load to perform that analysis.

In [58]:
# import all needed libraries
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lrsterngmail.com/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lrsterngmail.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lrsterngmail.com/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
# class that processes each book
class Book():
    """One book on disk, taken through cleaning, tokenizing and lemmatizing.

    The stages are meant to be called in this order::

        book = Book('author_dir/Title.txt')
        book.read()
        book.process()
        book.tokenize()
        book.lemmatize()
        book.add_book()

    Every book passed to ``add_book`` is collected in the class-level
    ``_all_books`` list, which ``Book.all_books()`` returns as a DataFrame.
    """

    # shared by all instances: one dict per book added via add_book()
    _all_books = []

    # punctuation / formatting characters that process() deletes
    _SYMBOLS = """<>{}[]&*^%$#@~`_+=-""'':;(),.?!''\n"""
    # translation table mapping each of those characters to "delete"
    _STRIP_TABLE = str.maketrans('', '', _SYMBOLS)

    def __init__(self, filepath):
        """Remember the file path and derive author and title from it.

        Args:
            filepath: path of the form '<author>/<title>.txt'. The first
                '/'-separated component becomes ``author`` and the second,
                minus a trailing '.txt', becomes ``title``.

        Raises:
            IndexError: if ``filepath`` contains no '/'.
        """
        self.filepath = filepath
        parts = filepath.split('/')
        title = parts[1]
        # Remove the extension as a suffix. rstrip('.txt') strips any trailing
        # '.', 't' or 'x' characters and would turn 'Hamlet.txt' into 'Hamle'.
        if title.endswith('.txt'):
            title = title[:-len('.txt')]
        self.title = title
        self.author = parts[0]

    def read(self):
        """Read the file into ``self.text`` as a list of lines (newlines kept)."""
        # Open the path given to this instance rather than whatever a global
        # variable named `filepath` currently holds.
        with open(self.filepath) as f:
            self.text = f.readlines()

    def process(self):
        """Delete punctuation/formatting characters and lowercase each line.

        Reads ``self.text`` and stores the result in ``self.cleaned_book``
        (a list of strings, one per input line).
        """
        self.cleaned_book = [
            line.translate(self._STRIP_TABLE).lower() for line in self.text
        ]

    def tokenize(self):
        """Tokenize the cleaned text and drop English stop words.

        Reads ``self.cleaned_book`` and stores the remaining tokens in
        ``self.filtered_book``.
        """
        joined_book = ' '.join(self.cleaned_book)
        stop_words = set(stopwords.words("english"))
        self.filtered_book = [
            w for w in word_tokenize(joined_book) if w not in stop_words
        ]

    def lemmatize(self):
        """Lemmatize every token of ``self.filtered_book`` into ``self.lemmatized_book``."""
        self.lemmatized_book = [
            lemmatizer.lemmatize(w) for w in self.filtered_book
        ]

    def add_book(self):
        """Append this book's title, author and lemmatized text to the shared list."""
        Book._all_books.append({
            'title': self.title,
            'author': self.author,
            'text': self.lemmatized_book,
        })

    @classmethod
    def all_books(cls):
        """Return all added books as a DataFrame with columns author, title, text.

        The columns are passed to the constructor so that the result is an
        empty three-column frame when no book has been added yet; selecting
        the columns from an empty frame would raise KeyError.
        """
        return pd.DataFrame(cls._all_books, columns=['author', 'title', 'text'])

In [96]:
# Paths of the book .txt files to process, one entry per book.
# Book.__init__ splits each path on '/' and takes the first component as the
# author and the second as the title, so entries have the form
# '<author>/<title>.txt'.
list_of_file_paths = ['vep_shakespeare_tcp_v2_txt/AllsWellThatEndsWell.txt']

In [None]:
# Run every listed book through the Book pipeline, stage by stage:
# read -> process -> tokenize -> lemmatize -> register in the shared list.
for filepath in list_of_file_paths:
    book = Book(filepath)
    book.read()
    book.process()
    book.tokenize()
    book.lemmatize()
    book.add_book()

In [97]:
# Sanity check: display the DataFrame of every book added so far
# (columns author, title, text) to confirm the data looks right.
Book.all_books()

Unnamed: 0,author,title,text
0,vep_shakespeare_tcp_v2_txt,AllsWellThatEndsWell,"[delivering, son, bury, second, husband, going..."


In [None]:
# Export all processed books to a single CSV for use in the analysis notebooks.
# The output file is named 'all_books' (no '.csv' extension) and the path is
# relative, so it lands in the current working directory.
# NOTE(review): to_csv is called with its defaults, so presumably the row index
# is written as an extra leading column — confirm downstream readers expect it.
Book.all_books().to_csv('all_books')