### General Library Imports

In [24]:
import re
import os
import random
import numpy as np
import pandas as pd 
from pathlib import Path  
from gensim import *
np.random.seed(0)

# Preprocessing

In [25]:
# Cleaning and Tokenizing Words for train texts

train_folder = Path.cwd().joinpath("rawtextfiles")

clean_books = {}

for filename in train_folder.iterdir():
    with open(str(filename), encoding='utf-8', errors='ignore') as fhand:
        text = fhand.readlines()
        clean_text = []
        for line in text:
            clean_line = re.sub('[^A-Za-z0-9 ]+', '', line)
            words = clean_line.split(" ")
            for word in words[:]:
                if len(word) != 0:
                    clean_text.append(word)
        if len(clean_text) > 100 :
            file_name = str(filename).split('-')[1]
            clean_books[file_name] = clean_books.get(file_name, []) + clean_text[350:]
        #         clean_books[file_name] = clean_text[350:]
        else:
            continue

for k, v in clean_books.items():
    if len(v) < 1:
        del clean_books[k]     

clean_books.keys()

dict_keys(['MarkTwain', 'JohnLocke', 'NathanielHawthorne', 'FScottFitzgerald', 'VirginiaWoolf', 'MaryWollstonecraft', 'JaneAustin', 'EdithWharton', 'MaryShelley', 'KateChopin', 'MargaretFuller', 'HenryDavidThoreau', 'JackLondon', 'CharlesDickens'])

In [46]:
# Checking to make sure additions were correct

# for filename in train_folder.iterdir():
#      with open(str(filename), encoding='utf-8', errors='ignore') as fhand:
#             text = fhand.readlines()
#             word_count = []
#             for line in text:
#                 clean_line = re.sub('[^A-Za-z0-9 ]+', '', line)
#                 words = clean_line.split()
#                 for word in words:
#                     if len(word) != 0:
#                         word_count.append(word)
        
#             print(len(word_count), str(filename).split('/')[7])


# len(clean_text)
# clean_text[104000:]
# file_name
print(len(clean_books['MarkTwain']))
# clean_books['MarkTwain']

183764


### Named Entity Recognition & Classification for Cleaning Text

#### Notes about this section:<br>

These steps can be time-consuming, but we deemed it an important step in preventing data leakage. While not perfect, the removal of many of the named entities (proper nouns) avoids text-specific named entites like "Frankenstein" or period-specific/work-specific named entities like 'King George' from unduly influencing the model (instead of 'style', which is admittedly a hard idea to get at). <br>

The corpus undoubtedly missed some named entities, and deleted others. Given more time, we would likely implement an ensemble method of: <br>

1) StanfordNER, <br>
2) NLTK's built-in POS tagging (based on the Penn Treebank Project model), and <br>
3) Polyglot's POS tagger. <br>

A word would only be deleted if it appeared in two of the three "name entity" lists generated by the methods. <br>

<b> Cautionary note </b> that the way this is written is in a preprocessing pipeline mode; i.e. the code as currently written overwrites the original dictionary value, so you would have to rerun prior preprocessing steps each time if you wanted to test different methods. We may change this in future updates. 



##### StanfordNER 

In [28]:
from nltk.tag import StanfordNERTagger #this is a built in nltk wrapper for StanfordNER which is written in java

classifier_doc_path = Path.cwd().joinpath("stanford_ner/classifiers/english.muc.7class.distsim.crf.ser.gz")
classifier_dir_path = Path.cwd().joinpath("stanford_ner/stanford_ner.jar")

st = StanfordNERTagger(str(classifier_doc_path), str(classifier_dir_path), encoding='utf-8')

In [29]:
word_loss_NER_stan = {}

for author in clean_books.keys():
    tagged_corpus = st.tag(clean_books[author])
    NER_corpus =[word.lower() for word, tag in tagged_corpus if tag == 'O']
    clean_books[author] = NER_corpus
    word_loss_NER_stan[author] = [len(tagged_corpus) - len(NER_corpus)]
    removed_words = [word, tag for word, tag in tagged_corpus if tag != 'O' for i in range(1,500)]
#     removed_words = []
#     while len(removed_words) < 10:
    word_loss_NER_stan[author].append(removed_words[:10])
    
word_loss_NER_stan

{'CharlesDickens': 8241,
 'EdithWharton': 8833,
 'FScottFitzgerald': 7113,
 'HenryDavidThoreau': 4178,
 'JackLondon': 4107,
 'JaneAustin': 8495,
 'JohnLocke': 422,
 'KateChopin': 3833,
 'MargaretFuller': 2481,
 'MarkTwain': 3040,
 'MaryShelley': 3372,
 'MaryWollstonecraft': 704,
 'NathanielHawthorne': 3690,
 'VirginiaWoolf': 10085}

In [54]:
word_loss_NER_percents_stan = {}

for author in word_loss_NER_stan.keys():
    word_loss_NER_percents_stan[author] = "{:.3%}".format((word_loss_NER_stan[author] / (word_loss_NER_stan[author]+len(clean_books[author]))))

word_loss_NER_percents_stan

{'CharlesDickens': '2.579%',
 'EdithWharton': '3.844%',
 'FScottFitzgerald': '3.493%',
 'HenryDavidThoreau': '2.162%',
 'JackLondon': '2.417%',
 'JaneAustin': '3.548%',
 'JohnLocke': '0.234%',
 'KateChopin': '3.171%',
 'MargaretFuller': '1.402%',
 'MarkTwain': '1.627%',
 'MaryShelley': '1.355%',
 'MaryWollstonecraft': '0.538%',
 'NathanielHawthorne': '1.965%',
 'VirginiaWoolf': '3.345%'}

##### Built-in NLTK POS Tagging

In [None]:
import nltk

In [None]:
word_loss_NER_nltk = {}

for author in clean_books.keys():
    tagged_corpus = nltk.tag.pos_tag(clean_books[author])
    NER_corpus = [word.lower() for word,tag in tagged_corpus if tag != 'NNP' and tag != 'NNPS']
    clean_books[author] = NER_corpus
    word_loss_NER_nltk[author] = len(tagged_corpus) - len(NER_corpus)
    i+=1
    print(i)
    
word_loss_NER_nltk

In [None]:
word_loss_NER_percents_nltk = {}

for author in word_loss_NER_nltk.keys():
    word_loss_NER_percents_nltk[author] = ((word_loss_NER_nltk[author] / (word_loss_NER_nltk[author]+len(clean_books[author]))) * 100)

word_loss_NER_percents_nltk

##### Polyglot POS Tagging

In [None]:
from polyglot.text import Text

In [None]:
word_loss_NER_poly = {}

for author in clean_books.keys():
    poly_text_object = Text(' '.join(clean_books[author]))
    tagged_corpus = poly_text_object.pos_tags
    NER_corpus =[word.lower() for word,tag in tagged_corpus if tag != 'PROPN']
    clean_books[author] = NER_corpus
    word_loss_NER_poly[author] = len(tagged_corpus) - len(NER_corpus)

word_loss_NER_poly

In [None]:
word_loss_NER_poly_percents = {}

for author in word_loss_NER.keys():
    word_loss_NER_poly_percents[author] = ((word_loss_NER_poly[author] / (word_loss_NER_poly[author]+len(clean_books[author]))) * 100)

word_loss_NER_poly_percents

##### Metrics for each method - Note, section not yet complete

In [31]:
values_stan_method = word_loss_NER_percents_stan.values()

metrics_stan_method_loss = [min(values_stan_method), (sum(values_stan_method)/len(values_stan_method)), max(values_stan_method)]

metrics_stan_method_loss

[0.23391553542822618, 2.2629175711682605, 3.8442957927309602]

##### Ensemble Method Placeholder

### Constructing our dataframe with 'paragraphs' from each work

In [32]:
# This code is copied and pasted from the python file "building_labeled_paragraphs.py"

desired_columns = ['text', 'author', 'sex', 'period']
master_paragraphs = pd.DataFrame(columns = desired_columns)

def create_paragraphs(corpus, author_name, para_size, num_para):
    start_index = random.randint(0, 50)
    end_index = start_index + para_size + 1
    jump_metric = ((len(corpus)) / (int(num_para)+5))
    jump_plus_minus = jump_metric / 10
    i = 0
    
    paragraphs = pd.DataFrame(columns = desired_columns)

    for x in range(int(num_para)):
        word_slice = corpus[start_index : end_index]
        string_paragraph = word_slice[0]
        for word in word_slice[1:]:
            string_paragraph = string_paragraph + ' ' + word
            
        paragraphs.loc[i] = [string_paragraph, author_name, None, None]

        jump_size = random.randint(int((jump_metric - jump_plus_minus)), int((jump_metric + jump_plus_minus)))
        start_index = random.randint(end_index, int(end_index + jump_size))
        end_index = start_index + para_size + 1
        i += 1
        
    return paragraphs

#End of Function

para_size = 150
num_para = 250

for k, v in clean_books.items():
    paragraphs = create_paragraphs(v, k, para_size, num_para)
    master_paragraphs = pd.concat([master_paragraphs, paragraphs], ignore_index=True)
    
# To add the other column values you specified, use a dictionary and map 
    
author_sex = {'KateChopin' : 'female', 'NathanielHawthorne': 'male', 'JackLondon': 'male', 'JohnLocke': 'male',
              'MargaretFuller': 'female', 'JaneAustin': 'female', 'MaryWollstonecraft': 'female', 
              'VirginiaWoolf': 'female', 'MarkTwain': 'male', 'HenryDavidThoreau': 'male',  
              'FScottFitzgerald': 'male', 'MaryShelley': 'female', 'EdithWharton': 'female', 
              'CharlesDickens': 'male'}

work_period = {'KateChopin' : 'realism', 'NathanielHawthorne': 'gothic/romantic', 'JackLondon': 'naturalism', 
               'JohnLocke': 'enlightenment', 'MargaretFuller': 'transcendentalism','JaneAustin': 'victorian', 
               'MaryWollstonecraft':'enlightenment','VirginiaWoolf': 'early_modernism', 
               'MarkTwain': 'realism', 'HenryDavidThoreau': 'transcendentalism',
               'FScottFitzgerald': 'early_modernism', 'MaryShelley': 'gothic/romantic', 
               'EdithWharton': 'naturalism', 'CharlesDickens': 'victorian'}


master_paragraphs['sex'] = master_paragraphs['author'].map(author_sex)
master_paragraphs['period'] = master_paragraphs['author'].map(work_period)

# UNCOMMENT TO CREATE NEW DOC
# Caution though. This will overwrite any existing csv of the same name. 

master_paragraphs.to_csv('{}Paragraphs_{}Words.csv'.format(num_para, para_size), mode='w+')

### Processing Extra-Validation Texts

In [37]:
ev_folder = Path.cwd().joinpath("evtextsfiles")

ev_books = {}

for filename in ev_folder.iterdir():
    with open(str(filename), encoding='utf-8', errors='ignore') as fhand:
        text = fhand.readlines()
        clean_text = []
        for line in text:
            clean_line = re.sub('[^A-Za-z0-9 ]+', '', line)
            words = clean_line.split(" ")
            for word in words[:]:
                if len(word) != 0:
                    clean_text.append(word)
        if len(clean_text) > 100 :
            file_name = str(filename).split('-')[2]
            ev_books[file_name] = ev_books.get(file_name, []) + clean_text[350:]
        #         ev_books[file_name] = clean_text[350:]
        else:
            continue

for k, v in ev_books.items():
    if len(v) < 1:
        del ev_books[k]     

ev_books.keys()

dict_keys(['JosephConrad', 'FranzKafka', 'LouisaMayAlcott', 'BenjaminFranklin', 'HarrietBeecherStowe', 'HermanMelville', 'EmilyBronte', 'LewisCarroll', 'AgathaChristie', 'GertrudeStein'])

In [39]:
ev_word_loss_NER_stan = {}

for author in ev_books.keys():
    tagged_corpus = st.tag(ev_books[author])
    NER_corpus =[word.lower() for word,tag in tagged_corpus if tag == 'O']
    ev_books[author] = NER_corpus
    ev_word_loss_NER_stan[author] = len(tagged_corpus) - len(NER_corpus)
    
ev_word_loss_NER_stan

{'AgathaChristie': 3127,
 'BenjaminFranklin': 3007,
 'EmilyBronte': 2743,
 'FranzKafka': 326,
 'GertrudeStein': 4934,
 'HarrietBeecherStowe': 6302,
 'HermanMelville': 4042,
 'JosephConrad': 236,
 'LewisCarroll': 903,
 'LouisaMayAlcott': 5197}

In [40]:
ev_word_loss_NER_percents_stan = {}

for author in ev_word_loss_NER_stan.keys():
    ev_word_loss_NER_percents_stan[author] = ((ev_word_loss_NER_stan[author] / (ev_word_loss_NER_stan[author]+len(ev_books[author]))) * 100)

ev_word_loss_NER_percents_stan

{'AgathaChristie': 4.1728385176881915,
 'BenjaminFranklin': 3.975830336365559,
 'EmilyBronte': 2.3743983934074304,
 'FranzKafka': 1.5042451088962718,
 'GertrudeStein': 5.752996595308055,
 'HarrietBeecherStowe': 3.4972058978585028,
 'HermanMelville': 1.9029683858666226,
 'JosephConrad': 0.6284618662121858,
 'LewisCarroll': 3.467875110411306,
 'LouisaMayAlcott': 2.798720461840036}

In [44]:
ev_author_sex = {'AgathaChristie': 'female', 'JosephConrad': 'male', 'FranzKafka': 'male', 
                 'BenjaminFranklin': 'male', 'GertrudeStein': 'female', 'EmilyBronte': 'female', 
                 'HarrietBeecherStowe': 'female', 'LouisaMayAlcott': 'female', 'HermanMelville': 'male', 
                 'LewisCarroll': 'male'}

ev_work_period = {'AgathaChristie' : 'early_modernism', 'JosephConrad': 'early_modernism', 
                   'FranzKafka': 'early_modernism', 'BenjaminFranklin': 'enlightenment', 
                   'GertrudeStein': 'early_modernism', 'EmilyBronte': 'victorian', 
                   'HarrietBeecherStowe':'victorian','LouisaMayAlcott': 'victorian', 
                   'HermanMelville': 'gothic/romantic', 'LewisCarroll': 'victorian'}

ev_master_paragraphs = pd.DataFrame(columns = desired_columns)

ev_num_para = 50
ev_para_size = 150

for k,v in ev_books.items():
    paragraphs = create_paragraphs(v, k, ev_para_size, ev_num_para)
    ev_master_paragraphs = pd.concat([ev_master_paragraphs, paragraphs], ignore_index=True)
    
ev_master_paragraphs['sex'] = ev_master_paragraphs['author'].map(ev_author_sex)
ev_master_paragraphs['period'] = ev_master_paragraphs['author'].map(ev_work_period)

ev_master_paragraphs.to_csv('EV_{}Paragraphs_{}Words.csv'.format(ev_num_para, ev_para_size), mode='w+')