# Finding names in book reviews

In [None]:
#import necessary packages
import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

In [None]:
from nltk.corpus import stopwords
# English stopwords from NLTK; getCapitalizedWords uses this list to discard
# capitalized words/phrases that contain a stopword.
stopword_list = stopwords.words('english')

In [None]:
# loading in files
directory = "../../aps_reviews_50/aps_reviews/"
# os.listdir returns entries in arbitrary order; sorting keeps the index of
# each review in `txts` (and the "Text #n" labels printed later) reproducible.
filenames = sorted(os.listdir(directory))
txts = []
for file in filenames:
    # NOTE(review): files are read with the platform default encoding, as
    # before - confirm the corpus encoding and pass it explicitly if known.
    with open(os.path.join(directory, file)) as f:
        txts.append(f.read())

In [None]:
# Load the pickled author/title object. The context manager closes the file
# handle, which the bare open() call left open.
# NOTE: unpickling executes arbitrary code; only load pickles you created.
with open('author_title_df.pkl', 'rb') as pkl_file:
    df_all = pickle.load(pkl_file)

In [None]:
# Load the pickled mapping keyed by author surname (its keys are used below
# to build the surname dictionary). The context manager closes the file
# handle, which the bare open() call left open.
# NOTE: unpickling executes arbitrary code; only load pickles you created.
with open('author_last_name_dict.pkl', 'rb') as pkl_file:
    authors_by_last_name = pickle.load(pkl_file)

### Step 1: Find names using titles and capitalization

In [None]:
# Comma-separated honorifics/titles that may precede a personal name.
# The string ends with a newline, which the following cell strips before
# splitting it into a list.
titles = """Doctor,Dr,Mr,Mrs,Miss,Msgr,Monsignor,Rev,Reverend,Hon,Honorable,Honourable,Prof,Professor,Madame,Madam,Lady,Lord,Sir,Dame,Master,Mistress,Chancellor,Principal,President,Pres,Warden,Dean,Regent,Rector,Provost,Director
"""

In [None]:
# Turn the comma-separated string into a list of titles.
# NOTE: this rebinds `titles` from str to list, so re-running this cell
# without re-running the previous one fails (lists have no rstrip()).
titles = titles.rstrip().split(',')

In [None]:
#title_list = r"\.? .+? |".join(titles)

In [None]:
# Regex alternation matching any title followed by at most one arbitrary
# character (typically the period of an abbreviation) and one whitespace
# character, e.g. "Mr. " or "Professor ".
# The suffix is attached to every title. Using it as the join separator left
# the final title ("Director") without it, so that title matched anywhere,
# even inside longer words, and without the trailing whitespace that the
# other alternatives capture.
# NOTE(review): the "." is unescaped, so it accepts any character, not only
# a period; kept as-is to preserve what the other titles already match.
title_list = '|'.join(title + r'.?\s' for title in titles)

In [None]:
# test text
mytext = """
    Missouri: A Bone of Contention '                 WE  aware that most of the States of the Union had their nicknames, more or less complimentary, but to name Missouri ' a bone of contention ' is a stroke of wit. It does, however, rightly describe the Missouri of the past, and vividly writes in a phrase her political history. Until the triumph of the Union armies and the close of the Civil War, Missouri was in the jaws of the watch-dogs of slavery and freedom. In war or in peace, the subiect of legislative com-                 promise or of military struggle, Missouri was an uncertain factor. Now, after -five years of national peace, her history may be calmly and impartially written. Indeed, the task has been done, and well done, and the , Lucien Carr of Harvard, may be congratulated upon his work,  is strong, unimpassioned, scholarly, and as impressed with the firm touch which comes of local knowledge as are the imprinted rocks in the cabinets at Cambridge. Long familiarity with the wealth of archaeology in the Peabody Museuml seems to have given him the power of comparison and generalization in the evolution of a commonwealth, while  acquaintance with living men enables him to blend the results of the study and the field in pleasing literary form. Five of his seventeen chapters give a luminous picture of the early French and Spanish discoveries and domination. Then follow three chapters treating of the                 Missouri. By Ldclen Carr. $ti... (American Commonwealths.) Boston:                 Itoughto., Sftfltn & Co.                 l , the compromise, and the  into the U nion of this State named after the great river which flows through it. In his treatment of the period from 1844 to i861, as well as that of war time, some readers may charge Mr. 
Carr with unduly favoring the Southern and even Confederate view; but to people living this side of the now-vanished Mason and Dixon's line, this is doubtless a benefit; for only when Northern people are able to ' put themselves in the place' of Southerners and see with Southern eyes, can they be sure that they have achieved that impartiality which is essential to the writing of final history. He shows that the Missourians were neither secessionists nor slavery propagandists. He both criticises and justifies the action of the second convention which, in the uncertain hours when other States  seceding and Missouri's Governor had been driven into exile, org  a provisional government, and  saved Missouri Irons ' the pit of political degradation into which the States in rebellion were sunk during the period of reconstruction.' Mr. Carr practically and almost abruptly ends his history at the close of the War, believing that the career of Missouri as a bone of contention ended with the abolition of slavery. The fifty years' struggle was over, the State recovered rapidly froni the wounds of the Civil War, wealth increased wonderfully, and the Negro was liberally dealt with in most if not all points relating to citizenship. Taken as a whole, this book, with its sustained interest, high average literary merit, and thorough treatment of the voluminous facts, fully justifies its place in the series of ' histories of such States as have exercised a positive influence in the shaping of the national Government, or have had a striking political - . . history.' Like the others, it has a good map and index."""  

In [None]:
from symspellpy.symspellpy import SymSpell, Verbosity 

# Build a SymSpell index from the name/frequency file.
# maximum edit distance per dictionary precalculation
max_edit_distance_dictionary = 2
prefix_length = 7
# create object
first_name_symspell = SymSpell(max_edit_distance_dictionary, prefix_length)
# load dictionary: a comma-separated file; term_index / count_index give the
# columns holding the term and its count
dictionary_path = "name_freq_dict.txt"
term_index = 0 
count_index = 1
# NOTE(review): the return value of load_dictionary is ignored; presumably it
# reports whether the file could be loaded - confirm against symspellpy docs.
first_name_symspell.load_dictionary(dictionary_path, term_index, count_index, separator=",")

In [None]:
# One comma-separated string of every key (surname) in authors_by_last_name,
# in the mapping's iteration order.
authors_last_name_list = ','.join(authors_by_last_name.keys())

In [None]:
# Write the comma-separated surnames to disk so SymSpell can build a
# dictionary from the file. The context manager closes the file even if the
# write fails.
with open("authors_last_name_list.txt", "w") as f:
    f.write(authors_last_name_list)

In [None]:
# Build a second SymSpell index, this one from the surname file written above.
# NOTE(review): the four settings assigned in this cell are not passed to
# SymSpell() or create_dictionary(); the object is constructed with the
# library defaults. Confirm the defaults are what is wanted, or pass them.
# maximum edit distance per dictionary precalculation
max_edit_distance_dictionary = 2
prefix_length = 7
# create object
author_surname_symspell = SymSpell()
# load dictionary
term_index = 0 
count_index = 1
# create_dictionary builds the index from the raw text of the file
# (presumably tokenizing it into words - confirm against symspellpy docs).
author_surname_symspell.create_dictionary('authors_last_name_list.txt')

In [None]:
from string import punctuation
# Echo string.punctuation for reference when writing the cleaning rules below.
punctuation

In [None]:
def cleanTextForNameSearch(txt):
    """
    Normalizes raw review text before it is searched for names.

    Steps, in order:
      1. Collapse every run of spaces into a single space. Newlines and
         tabs are left untouched.
      2. Delete the characters ( ) * / < = > @ [ ] ^ _ ` | ~ which should
         not occur in this dataset.
      3. Put a space on each side of every comma, semicolon, colon, double
         quote and apostrophe.

    Step 3 runs after step 1, so the result can contain double spaces next
    to the padded punctuation.

    The previous deletion pattern was a literal character sequence with an
    anchor in the middle, so it could never match and nothing was deleted.
    It is now a character class. The apostrophe that headed the old pattern
    is deliberately left out of the class: step 3 pads apostrophes, which
    would be pointless if they were deleted first.
    """
    # collapse runs of spaces
    txt = re.sub(r' +', ' ', txt)

    # delete characters that should never be in this dataset
    txt = re.sub(r'[()*/<=>@\[\]^_`|~]', '', txt)

    # pad , ; : " ' with one space on each side (single pass; equivalent to
    # padding each character separately because the padding adds only spaces)
    txt = re.sub(r'([,;:"\'])', r' \1 ', txt)
    return txt

notes:
+ reduce names down to 1880-1900
+ rule for 's

In [None]:
def removePeriodsNotFollowingTitleOrInitial(name):
    """
    Strips the trailing period from each whitespace-separated token of `name`,
    except for tokens of at most two characters (e.g. the initial "J.") and
    tokens whose text before the period is one of `titles` (e.g. "Mr.").
    The tokens are re-joined with single spaces.
    """
    def _strip_period(token):
        # Order matters: `titles` is only consulted for longer tokens that
        # actually end in a period.
        removable = len(token) > 2 and token.endswith('.') and token[:-1] not in titles
        return token[:-1] if removable else token

    return ' '.join(_strip_period(token) for token in name.split())

In [None]:
def cleanIndices(index_tuple):
    """
    Takes a (position, matched_text) pair and returns it with one trailing
    punctuation character removed from the text, if the text ends in one.
    A trailing period is not in the punctuation set and is therefore kept.
    """
    position, matched = index_tuple[0], index_tuple[1]
    trailing_punct = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~'
    trimmed = matched[:-1] if matched[-1] in trailing_punct else matched
    return (position, trimmed)

In [None]:
def isName(name):
    """
    Returns True when at least two whitespace-separated words of `name`
    start with an uppercase letter.
    """
    capitalized = 0
    for token in name.split():
        if token[0].isupper():
            capitalized += 1
    return capitalized > 1

In [None]:
# (suffix regex, number of trailing characters to drop from the match).
# Each suffix is appended to the escaped title text and tried in this order.
_TITLE_NAME_PATTERNS = (
    # one word of 2+ word characters directly followed by punctuation
    (r'\w{2,}[.,;:!?\'"]', 1),
    # shortest run up to a space followed by one of \ / ? ! ' "
    (r'.*? [\\/?!\'"]', 1),
    # shortest run up to a space followed by a lowercase letter.
    # NOTE(review): "|\Z" binds as an alternative to the whole pattern, so it
    # can only match an empty segment; it may have been meant as
    # "(?: [a-z]|\Z)". Kept unchanged to preserve existing results.
    (r'.*? [a-z]|\Z', 2),
)


def getNamesFollowingTitles(txt):
    """
    Returns names following titles - specifically capitalized titles followed
    by capitalized names. Names can be any number of words in length, and can
    include punctuation.

    Every match of `title_list` in `txt` starts a segment that runs to the
    next title match (or to the end of `txt` for the last one). The first
    pattern in _TITLE_NAME_PATTERNS that matches at the start of the segment
    supplies the candidate name. Candidates then have "'s" removed, stray
    periods stripped, and are kept only if `isName` accepts them.

    The result is a list without duplicates, in order of first occurrence
    (previously the order came from a set and varied between runs).
    """
    hits = [cleanIndices((m.start(), m.group()))
            for m in re.finditer(title_list, txt)]
    last = len(hits) - 1

    names = []
    for i, (start, title) in enumerate(hits):
        if i < last:
            segment = txt[start:hits[i + 1][0]]
            candidates = _TITLE_NAME_PATTERNS
        else:
            segment = txt[start:]
            # NOTE(review): the last title has never tried the middle
            # pattern; kept that way - confirm whether it was an oversight.
            candidates = (_TITLE_NAME_PATTERNS[0], _TITLE_NAME_PATTERNS[2])

        # The title is text copied from `txt`, not a pattern: escape it so a
        # regex metacharacter captured by the title match cannot corrupt the
        # search or raise re.error.
        prefix = re.escape(title)
        for suffix, trim in candidates:
            match = re.match(prefix + suffix, segment)
            if match is not None:
                names.append(match.group()[:-trim])
                break

    names = [name.replace("'s", "") for name in names]
    names = [removePeriodsNotFollowingTitleOrInitial(name) for name in names]
    return list(dict.fromkeys(name for name in names if isName(name)))

In [None]:
def removePunct(word):
    """
    Returns `word` reduced to its alphabetic characters, hyphens and
    en dashes; every other character is dropped.
    """
    kept = []
    for ch in word:
        if ch.isalpha() or ch == '-' or ch == '–':
            kept.append(ch)
    return ''.join(kept)

In [None]:
def _capitalizedWordsPassFilters(words, txt):
    """True when no word (punctuation removed, lowercased) is a stopword or occurs anywhere in `txt`."""
    lowered = [removePunct(word).lower() for word in words]
    return (all(word not in stopword_list for word in lowered)
            and all(word not in txt for word in lowered))


def getCapitalizedWords(txt):
    """
    Returns strings of capitalized words up to 3 words long.

    Runs of three, two and one space-separated capitalized words are
    collected, longest first. A run is dropped when any of its words, with
    punctuation removed and lowercased, is a stopword or occurs as a
    substring anywhere in `txt` (a substring test, not a whole-word test).
    Shorter runs are also dropped when they are contained in a longer run
    that was already kept.

    Before returning, punctuation is removed from every word, words equal to
    one of `titles` are dropped, and entries left empty are discarded.
    Repeated occurrences in `txt` produce repeated entries.
    """
    all_words = []

    # Longest pattern first so that shorter matches contained in an already
    # accepted phrase can be skipped.
    for pattern in (r'[A-Z]\S* [A-Z]\S* [A-Z]\S+',   # three words
                    r'[A-Z]\S* [A-Z]\S+',            # two words
                    r'[A-Z]\S+'):                    # one word
        accepted = [match for match in re.findall(pattern, txt)
                    if _capitalizedWordsPassFilters(match.split(), txt)
                    and all(match not in longer for longer in all_words)]
        all_words.extend(accepted)

    cleaned = (' '.join(removePunct(word) for word in phrase.split()
                        if removePunct(word) not in titles)
               for phrase in all_words)
    return [phrase for phrase in cleaned if len(phrase) > 0]

In [None]:
from nltk.metrics import edit_distance

In [None]:
# Sanity check of edit_distance: the two strings differ by one inserted "c".
edit_distance('voynich','voynicch')

In [None]:
# Spot check: names found after titles in the first loaded review.
getNamesFollowingTitles(cleanTextForNameSearch(txts[0]))

In [None]:
# Sample strings for trying out the capitalized-word count used by isName.
x = 'President Andrew D. White'
y = 'Lord is'

In [None]:
# Same expression as the body of isName, applied to the sample string y.
len([word for word in y.split() if word[0].isupper()])>1

In [None]:
def getSetOfNames(txt):
    """
    Returns sets of names matched by similarity.

    For every name found after a title (`getNamesFollowingTitles`) a list is
    built that starts with that name and is followed by the capitalized
    words/phrases (`getCapitalizedWords`) judged similar to it:

    * If the last part of the known name is in the author-surname
      dictionary, a candidate is similar when any of its closest SymSpell
      suggestions equals one of the known name's parts.
    * Otherwise a candidate is similar when it and some name part are both
      longer than 5 characters and their edit distance is below 3.

    Comparisons are case-insensitive: name parts are lowercased and so is
    the candidate, in both branches. The fallback used to compare the cased
    candidate, so the capital letter alone cost one edit.
    A candidate is appended at most once per occurrence in the candidate
    list, even if it matches several name parts.

    Returns a list of lists, one per known name.
    """
    known_names = getNamesFollowingTitles(txt)
    potential_names = getCapitalizedWords(txt)

    all_name_sets = []
    for known in known_names:
        name_set = [known]
        name_parts = [removePunct(part).lower() for part in known.split()
                      if removePunct(part) not in titles]

        # Explicit membership test instead of a bare except around a lookup:
        # unrelated errors raised while matching are no longer swallowed.
        surname_known = (bool(name_parts)
                         and name_parts[-1] in author_surname_symspell._words)

        if surname_known:
            for candidate in potential_names:
                suggestions = author_surname_symspell.lookup(candidate.lower(),
                                                             Verbosity.CLOSEST)
                if any(item._term in name_parts for item in suggestions):
                    name_set.append(candidate)
        else:
            for candidate in potential_names:
                if len(candidate) <= 5:
                    continue
                lowered = candidate.lower()
                if any(len(part) > 5 and edit_distance(lowered, part) < 3
                       for part in name_parts):
                    name_set.append(candidate)

        all_name_sets.append(name_set)

    return all_name_sets

In [None]:
# Cleaned copy of every loaded review, in the same order as txts.
txts_cleaned = list(map(cleanTextForNameSearch, txts))

if first part of shorter name is entirely contained in longer name
fuzzy match to edit distance of two 
look at part of string length of shorter name - if those fuzzy match
cut it off and group, check to see if second part is dictionary name

match known publishers
remove publishers
regex with name & name & name

In [None]:
# Print the name sets found in each cleaned review, one review at a time.
for num, txt in enumerate(txts_cleaned):
    print('Text #' + str(num))
    try:
        name_sets = getSetOfNames(txt)
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C / kernel interrupt
        # still stops the loop, and show what went wrong for this review.
        print('Error')
        print(repr(err))
    else:
        for x in name_sets:
            print(x)
    print()