# Finding names in book reviews

In [1]:
#import necessary packages
import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [2]:
from nltk.corpus import stopwords
stopword_list = stopwords.words('english')

In [3]:
from nltk.metrics import edit_distance

In [4]:
# loading in files
directory = "../../aps_reviews_50/aps_reviews/"
filenames = os.listdir(directory)
txts = []
for file in filenames:
    with open(directory + file) as f:
        txts.append(f.read())

this is the review_id

In [131]:
[x.split('.')[0] for x in filenames]

['136726613', '90142613', '90390665', '124486140', '126315326', '136992742', '137651039', '89668273', '136971619', '90313218', '128376070', '124688361', '124882533', '89639059', '88461015', '124190789', '136902134', '90180084', '137244878', '88721263', '137412078', '89675095', '760950213', '124217248', '853770606', '137195890', '125727674', '88797650', '89649761', '90967694', '89672866', '89674750', '89635917', '124196531', '128176513', '88477095', '125715359', '124189638', '136887509', '90554411', '90186165', '125502663', '125724646', '124740955', '124707428', '89644579', '137072896', '136685639', '126788760', '89671107']

In [5]:
df_all = pickle.load(open('author_title_df.pkl', 'rb'))

In [6]:
authors_by_last_name = pickle.load(open('author_last_name_dict.pkl', 'rb'))

In [125]:
class ReviewNameObj(str):
    review_id = ''
    review_loc = ''

### Step 1: Finding publishers

In [8]:
known_publishers = ["Charles Scribner's Sons","Scribner","Macmillan","Funk & Wagnalls","McClure, Philips","Houghton Mifflin","G.P. Putnam's Sons", "G.P. Putnam",
 "Harper & Brothers","Harper","J.B. Lippincott","J. B. Lippincott", "Doubleday, Page","Doubleday","D. Appleton","Longmans, Green",
 "Longman","Henry Holt","Holt","Adam & Charles Black"]

In [9]:
pub_ends = ['company','co','incorporated','inc','firm','press','group','publishers','publishing',
                    'publications','pub','books','ltd','limited','society','house','associates']

In [10]:
pub_ends_str = '|'.join(pub_ends)

In [11]:
pub_list = []
for pub in known_publishers:
    for end in pub_ends:
        pub_list.append(pub+' '+end)

In [12]:
pub_list_str = '|'.join(pub_list)

In [13]:
pub_associates = ['sons','son','brother','brothers']

In [14]:
# use this to sort out paper names for non-book reviews
paper_names = ['register','sentinel','times','post','review','gazette','daily','journal','tribune','chronicle',
               'news','chronicle','record']

In [16]:
def removePunct(word, dash = True):
    if dash == True:
        return ''.join([x for x in word if (x.isalnum()) or (x == '-') or (x=='–')])

In [134]:
class PublisherName(ReviewNameObj):
    """
    Object type for publisher names. Inherits string functions.
    
    Parameters
    ----------
    self.full_name : full name passed to the original init
    self.pub_count : number of names (inc. all full names, last names, sons, brothers)
    self.pub_type : type of publisher if found in pub_ends
    self.pub_names : all names (inc. all full names, last names - not including sons, brothers)
    self.pub_associates : son(s)/brother(s)
    
    """
    name_type = 'publisher'
    
    def __assign(self):
        self.pub_count = ''
        self.pub_type = ''
        self.pub_names = ''
        self.pub_associates = ''
        
        if removePunct(self.name_parts[-1]) in pub_ends:
            self.pub_type = self.name_parts[-1]
            self.pub_count = len(self.name_parts[:-1])
            self.pub_associates = [x for x in self.name_parts if x in pub_associates]
            self.pub_names = ';'.join([word.replace("'s", "") for word in self.name_parts[:-1]])
        else:
            self.pub_count = len(self.name_parts)
            self.pub_associates = [x for x in self.name_parts if x in pub_associates]
            self.pub_names = ';'.join([word.replace("'s", "") for word in self.name_parts[:-1]])
        
    def __find_variations(self):
        # in the future i will set this up to autogenerate variations inc. various pub ends 
        self.potential_variations = ''  
        
    def __init__(self, name):
        self.full_name = name
        self.name_parts = [x.lower() for x in self.full_name.split('&|and|And')]
        self.__assign()

In [18]:
pub_ends_list = '|'.join([x.capitalize()+'\.?(?!\w)' for x in pub_ends])
pub_ends_list

'Company\\.?(?!\\w)|Co\\.?(?!\\w)|Incorporated\\.?(?!\\w)|Inc\\.?(?!\\w)|Firm\\.?(?!\\w)|Press\\.?(?!\\w)|Group\\.?(?!\\w)|Publishers\\.?(?!\\w)|Publishing\\.?(?!\\w)|Publications\\.?(?!\\w)|Pub\\.?(?!\\w)|Books\\.?(?!\\w)|Ltd\\.?(?!\\w)|Limited\\.?(?!\\w)|Society\\.?(?!\\w)|House\\.?(?!\\w)|Associates\\.?(?!\\w)'

In [19]:
p_iter = re.finditer(pub_ends_list, txts[0])

In [20]:
p_indices = [(m.end(), m.group()) for m in p_iter]
p_indices
#p_indices = [cleanIndices(m) for m in k_indices]

[(452, 'Co.'), (766, 'Co.'), (2351, 'Co.'), (2844, 'Co.'), (3915, 'Co.'), (4331, 'Society'), (5380, 'Society')]

In [21]:
txts[0][:p_indices[0][0] + 1]

'                                                                                                LITERARY NOTES.                 -The Chautauqua department of " Wide Awake" is now published separately.                 -Mr. Charles Dudley Warner has written for"The North American Review " a paper on prison .                 -Mr. Vedder\'s illustrations of the "Rubfilyfit" have found a great admirer in the Queen of Italy.                 -Cassell & Co. '

In [22]:
p_indices[0][1]

'Co.'

In [23]:
for i in re.finditer("[A-Z]\S+? (and|&) .+", txts[0][:p_indices[0][0] + 1]):
    print(i.group())

Cassell & Co. 


In [135]:
def cleanIndices(index_tuple):
    """
    Removes all non-period punctuation from the string component of an index tuple.
    """
    x = index_tuple[0]
    y = index_tuple[1]
    if y[-1] in '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~':
        y = y[:-1]
    return (x,y)

In [25]:
def removeDashForPub(pub_match):
    return re.sub(r'(?<!/w)-(?!/w)', '', pub_match)

In [26]:
removeDashForPub('-E. P. Dutton & Cov')

'E. P. Dutton & Cov'

In [57]:
def cleanPubMatches(match_list):
    #print('cleaning', match_list, '...')
    cleaned_matches = []
    for match in match_list:
        index_to_start = 0
        for i, x in enumerate(match[1].split()):
            if x[0].islower() and x[0]!='&':
                index_to_start = i+1
        cleaned_matches.append(' '.join(match[1].split()[index_to_start:]))
    return cleaned_matches

In [64]:
for i, index in enumerate(p_indices):
    try:
        if (i<len(p_indices)+1):
            match = re.finditer("(?<= [^A-Z&\.])[\S]{,10} ?[A-Z][\w&. ]*?" + index[1] + '(?!\w)', 
                                txts[0][p_indices[i-1][0]:(p_indices[i][0] + 1)])
            all_matches = [(m.end(), removeDashForPub(m.group())) for m in match]
            if len(all_matches) > 0: 
                print(cleanPubMatches(all_matches))
            print()
    except:
        pass


['E. P. Dutton & Co.']

['A. S. Barnes & Co.']

['Ginn, Heath & Co.']

['E. P. Dutton & Cov', 'Thomas Y. Crowell & Co.']

['American Historical Society']

['Dunlap Society']



In [59]:
txts[0][:400]

'                                                                                                LITERARY NOTES.                 -The Chautauqua department of " Wide Awake" is now published separately.                 -Mr. Charles Dudley Warner has written for"The North American Review " a paper on prison .                 -Mr. Vedder\'s illustrations of the "Rubfilyfit" have found a great admirer i'

In [127]:
def getPublishers(txt):
    """
    Returns a list of potential publishers. Searches using pub_ends, capitalization, and associates.
    
    For reference:
    -------------
    pub_ends = ['co','company','inc','incorporated','firm','press','group', 'pub','publishers','publishing',
                    'publications','books','ltd','limited','society','house','associates']
                    
    pub_associates = ['sons','son','brother','brothers']
    
    """
    
    pubs = []
    
    # iter only works once before emptying
    p_iter = re.finditer(pub_ends_list, txts[0])
    p_indices = [(m.end(), m.group()) for m in p_iter]
    
    for i, index in enumerate(p_indices):
        try:
            if (i<len(p_indices)+1):
                match = re.finditer("(?<= [^A-Z&\.])[\S]{,10} ?[A-Z][\w&. ]*?" + index[1] + '(?!\w)', 
                                txts[0][p_indices[i-1][0]:(p_indices[i][0] + 1)])
                all_matches = [(m.end(), removeDashForPub(m.group())) for m in match]
                if len(all_matches) > 0: 
                    pubs.extend(cleanPubMatches(all_matches))
        except:
            pass

    pubs = [PublisherName(word) for word in pubs]
    
    return list(set(pubs))

In [128]:
getPublishers(txts[0])[0].review_loc

''

### Step 2: Find names using titles and capitalization

In [66]:
titles = """Doctor,Dr,Mr,Mrs,Miss,Msgr,Monsignor,Rev,Reverend,Hon,Honorable,Honourable,Prof,Professor,Madame,Madam,Lady,Lord,Sir,Dame,Master,Mistress,Princess,Prince,Duke,Duchess,Baron,Father,Chancellor,Principal,President,Pres,Warden,Dean,Regent,Rector,Provost,Director
"""

In [67]:
titles = titles.rstrip().split(',')

In [68]:
title_list = '.?\s|'.join(titles)

In [69]:
# test text
mytext = """
    Missouri: A Bone of Contention '                 WE  aware that most of the States of the Union had their nicknames, more or less complimentary, but to name Missouri ' a bone of contention ' is a stroke of wit. It does, however, rightly describe the Missouri of the past, and vividly writes in a phrase her political history. Until the triumph of the Union armies and the close of the Civil War, Missouri was in the jaws of the watch-dogs of slavery and freedom. In war or in peace, the subiect of legislative com-                 promise or of military struggle, Missouri was an uncertain factor. Now, after -five years of national peace, her history may be calmly and impartially written. Indeed, the task has been done, and well done, and the , Lucien Carr of Harvard, may be congratulated upon his work,  is strong, unimpassioned, scholarly, and as impressed with the firm touch which comes of local knowledge as are the imprinted rocks in the cabinets at Cambridge. Long familiarity with the wealth of archaeology in the Peabody Museuml seems to have given him the power of comparison and generalization in the evolution of a commonwealth, while  acquaintance with living men enables him to blend the results of the study and the field in pleasing literary form. Five of his seventeen chapters give a luminous picture of the early French and Spanish discoveries and domination. Then follow three chapters treating of the                 Missouri. By Ldclen Carr. $ti... (American Commonwealths.) Boston:                 Itoughto., Sftfltn & Co.                 l , the compromise, and the  into the U nion of this State named after the great river which flows through it. In his treatment of the period from 1844 to i861, as well as that of war time, some readers may charge Mr. Carr with unduly favoring the Southern and even Confederate view; but to people living this side of the now-vanished Mason and Dixon's line, this is doubtless a benefit; for only when Northern people are able to ' put themselves in the place' of Southerners and see with Southern eyes, can they be sure that they have achieved that impartiality which is essential to the writing of final history. He shows that the Missourians were neither secessionists nor slavery propagandists. He both criticises and justifies the action of the second convention which, in the uncertain hours when other States  seceding and Missouri's Governor had been driven into exile, org  a provisional government, and  saved Missouri Irons ' the pit of political degradation into which the States in rebellion were sunk during the period of reconstruction.' Mr. Carr practically and almost abruptly ends his history at the close of the War, believing that the career of Missouri as a bone of contention ended with the abolition of slavery. The fifty years' struggle was over, the State recovered rapidly froni the wounds of the Civil War, wealth increased wonderfully, and the Negro was liberally dealt with in most if not all points relating to citizenship. Taken as a whole, this book, with its sustained interest, high average literary merit, and thorough treatment of the voluminous facts, fully justifies its place in the series of ' histories of such States as have exercised a positive influence in the shaping of the national Government, or have had a striking political - . . history.' Like the others, it has a good map and index."""  

In [70]:
from symspellpy.symspellpy import SymSpell, Verbosity 

# maximum edit distance per dictionary precalculation
max_edit_distance_dictionary = 2
prefix_length = 7
# create object
first_name_symspell = SymSpell(max_edit_distance_dictionary, prefix_length)
# load dictionary
dictionary_path = "name_freq_dict.txt"
term_index = 0 
count_index = 1
first_name_symspell.load_dictionary(dictionary_path, term_index, count_index, separator=",")

True

In [71]:
authors_last_name_list = ','.join(list(authors_by_last_name.keys()))

In [72]:
f = open("authors_last_name_list.txt", "w")
f.write(authors_last_name_list)
f.close()

In [73]:
# maximum edit distance per dictionary precalculation
max_edit_distance_dictionary = 2
prefix_length = 7
# create object
author_surname_symspell = SymSpell()
# load dictionary
term_index = 0 
count_index = 1
author_surname_symspell.create_dictionary('authors_last_name_list.txt')

True

In [74]:
def fixInitials(initials):
    i_list = initials.split()
    i_list = [removePunct(x) for x in i_list]
    return ';'.join(i_list)

In [117]:
class AuthName(ReviewNameObj):
    """
    Object type for author names. Inherits string functions.
    
    Parameters
    ----------
    .full_name : full name originally passed to original init
    .name_parts : all name parts
    .title : title
    .first_name : first name
    .first_initial : first initial, can be autogenerated from first name
    .middle_name : middle name
    .middle_initial : middle initial(s), can be autogenerated from middle name(s)
    .initials : first and middle initial(s), can be autogenerated from first/middle name(s)
    .last_name : last name
    .name_part_count : total number of name parts passed to the original init
    
    
    """
    name_type = 'author'
        
    def __assign(self):
        self.first_name = ''
        self.first_initial = ''
        self.middle_name = ''
        self.middle_initial = ''
        self.initials = ''
        
        # just title & last name
        if self.name_part_count < 3:
            pass
        
        # title, first name/initial, last name
        elif self.name_part_count == 3:
            if (len(self.name_parts[1])) < 3:
                self.first_initial = self.name_parts[1]
            else:
                self.first_name = self.name_parts[1]
        
        # title, first name/initial, middle name/initial, last name
        elif self.name_part_count == 4:
            if (len(self.name_parts[1])) < 3 and (len(self.name_parts[2])) < 3:
                self.first_initial = self.name_parts[1]
                self.middle_initial = self.name_parts[2]
            else:
                if (len(self.name_parts[1])) < 3:
                    self.first_initial = self.name_parts[1]
                    self.middle_name = self.name_parts[2]
                elif (len(self.name_parts[2])) < 3:
                    self.first_name = self.name_parts[1]
                    self.middle_initial = self.name_parts[2]
                else:
                    self.first_name = self.name_parts[1]
                    self.middle_name = self.name_parts[2]
                
        elif self.name_part_count > 4:
            
            # all initials
            if all([len(x)<3 for x in self.name_parts[1:-1]]):
                initial_list = []
                for x in self.name_parts[1:-1]:
                    initial_list.append(x)
                self.initials = ' '.join(initial_list)
                
            # all names
            elif all([len(x)>2 for x in self.name_parts[1:-1]]):
                self.first_name = self.name_parts[1]
                middle_name_list = []
                for x in self.name_parts[1:-1]:
                    middle_name_list.append(x)
                self.middle_name = ' '.join(middle_name_list)
                
            # first initial, middle names
            elif (len(self.name_parts[1])<3 and all([len(x)>2 for x in self.name_parts[2:-1]])): 
                self.first_initial = self.name_parts[1]
                middle_name_list = []
                for x in self.name_parts[2:-1]:
                    middle_name_list.append(x)
                self.middle_name = ' '.join(middle_name_list)
                
            # first name, middle initials
            elif (len(self.name_parts[1])>2 and all([len(x)<3 for x in self.name_parts[2:-1]])): 
                self.first_name = self.name_parts[1]
                middle_initial_list = []
                for x in self.name_parts[2:-1]:
                    middle_initial_list.append(x)
                self.middle_initial = ' '.join(middle_initial_list)
                
    def __generate(self):
        if self.first_name:
            self.first_initial = self.first_name[0]
            if self.middle_name:
                self.initials = ' '.join([x[0] for x in self.name_parts[1:-1]])
                self.middle_initial = ' '.join([x[0] for x in self.middle_name.split()])
            if self.middle_initial:
                self.initials = ' '.join([x[0] for x in self.name_parts[1:-1]])
        elif self.first_initial and self.middle_initial:
            self.initials = ' '.join([x[0] for x in self.name_parts[1:-1]])   
        else:
            if self.middle_name:
                self.middle_initial = ' '.join([x[0] for x in self.middle_name.split()])          
                
    def __reformat(self):
        if self.initials:
            self.initials = fixInitials(self.initials)
        if self.middle_initial:
            self.middle_initial = fixInitials(self.middle_initial)
        if self.first_initial:
            self.first_initial = fixInitials(self.first_initial)
            
    def __init__(self, name):
        self.full_name = name
        self.name_parts = [x.lower() for x in self.full_name.split()]
        self.title = self.name_parts[0]
        self.last_name = self.name_parts[-1]
        self.name_part_count = len(self.name_parts)
        
        self.__assign()
        self.__generate()
        self.__reformat()
        
    #def combine(authname, authname):
        #pass
        
    def __repr__(self):
        #add in APS id
        return self.full_name
        
        #another useful thing to add: title gender
                    
    #__assign = assign
    #__generate = generate
    #__reformat = reformat

In [103]:
def cleanTextForNameSearch(txt):
    """
    Removes all non-newline whitespace and adds spaces around commas, semicolons, and colons.
    """
    #delete extra whitespace
    txt = re.sub(' +',' ',txt)
    
    #delete characters that should never be in this dataset (i think)
    txt = re.sub("\\'\(\)\*/<=>@\[\]^_`\|~","",txt)
    
    #adding space around certain punctuation
    txt = re.sub(',',' , ',txt)
    txt = re.sub(';',' ; ',txt)
    txt = re.sub(':',' : ',txt)
    txt = re.sub('"',' " ',txt)
    txt = re.sub("'"," ' " ,txt)
    return txt

notes:
+ reduce names down to 1880-1900
+ rule for 's

In [104]:
def removePeriodsNotFollowingTitleOrInitial(name):
    name_parts = name.split()
    cleaned_name = []
    for part in name_parts:
        if (len(part)>2) and (part.endswith('.')) and (part[:-1] not in titles):
            cleaned_name.append(part[:-1])
        else:
            cleaned_name.append(part)
    return ' '.join(cleaned_name)

In [105]:
def cleanName(name):
    return ' '.join([word for word in name.split() if (word[0].isalpha())])

In [106]:
def getNamesFollowingTitles(txt):
    """
    Returns names following titles - specifically capitalized titles followed by capitalized names.
    Names can be any number of words in length, and can include punctuation.
    
    object factory
    potentially add index from original text in dict
    
    """
    names = []
    
    # iter only works once before emptying
    iterx = re.finditer(title_list, txt)
    indices = [(m.start(), m.group()) for m in iterx]
    indices = [cleanIndices(m) for m in indices]
    
    for i, index in enumerate(indices):
        if (i<len(indices)-1):
            try:
                match = re.match(indices[i][1] + '\w{2,}[.,;:!\?\'\"]', txt[indices[i][0]:indices[i+1][0]])
                names.append(match.group()[:-1])
            except:
                try:
                    match = re.match(indices[i][1] + '.*? [a-z]|\Z', 
                                     txt[indices[i][0]:indices[i+1][0]])
                    names.append(match.group()[:-2])
                except:
                    pass
        else:
            try:
                match = re.match(indices[i][1] + '\w{2,}[.,;:!\?\'\"]', txt[indices[i][0]:])
                names.append(match.group()[:-1])
            except:
                try:
                    match = re.match(indices[i][1] + '.*? [a-z]|\Z', 
                                     txt[indices[i][0]:])
                    names.append(match.group()[:-2])
                except:
                    pass
    
    names = [word.replace("'s", "") for word in names]
    names = [removePeriodsNotFollowingTitleOrInitial(word) for word in names]
    names = [AuthName(cleanName(word)) for word in names]
    
    return list(set(names))

In [107]:
def getCapitalizedWords(txt):
    """
    Returns strings of capitalized words up 3 words long. 
    Removes words/phrases containing stopwords and words found later in the text lowercased.
    """
    #listen idk why it won't just let me put in an optional repeat either
    all_words = []
    
    #three words
    all_words.extend([match for match in re.findall('[A-Z]\S* [A-Z]\S* [A-Z]\S+', txt) if 
                      all([(removePunct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(removePunct(word).lower() not in txt) for word in match.split()])])
    
    #two words
    two_words = [match for match in re.findall('[A-Z]\S* [A-Z]\S+', txt) if 
                      all([(removePunct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(removePunct(word).lower() not in txt) for word in match.split()])
                      and all([match not in x for x in all_words])]
    
    all_words.extend(two_words)
    
    #one word
    one_words = [match for match in re.findall('[A-Z]\S+', txt) if 
                      (removePunct(match).lower() not in stopword_list) 
                      and (removePunct(match).lower() not in txt)
                      and all([match not in x for x in all_words])]
    
    all_words.extend(one_words)
    
    return [word for word in [' '.join([removePunct(y) for y in x.split() if removePunct(y) not in titles]) 
                      for x in all_words] if (len(word)>1)]

In [108]:
def consolidateNames(name_list):
    """
    Takes list of AuthNames and returns list of lists, consolidating by likely identical authors.
    """
    name_set = []
    used_indices = []
    last_names = sorted([name.last_name for name in name_list], key=len)
    
    for i, name in enumerate(last_names):
        if i not in used_indices:
            full_name = [x for x in name_list if x.last_name == name][0]
            name_holder = [full_name]
            for j, name2 in enumerate(last_names):
                full_name2 = [x for x in name_list if x.last_name == name2][0]
                if (i < (len(last_names) - 1)) and (i!=j):
                    if (edit_distance(name, name2[:len(name)+1]) < 2) and (j not in used_indices):
                        if (full_name.first_initial==full_name2.first_initial and full_name.middle_initial==full_name2.middle_initial or full_name.title==full_name2.title) or (full_name.first_initial==full_name2.first_initial or full_name.middle_initial==full_name2.middle_initial and full_name.title==full_name2.title):
                            name_holder.append(full_name2)
                            used_indices.append(j)
            used_indices.append(i)
            name_set.append(name_holder)
    
    return name_set                  

In [109]:
def getSetOfNames(txt):
    """
    Returns sets of names matched by similarity.
    """
    all_name_sets = []
    
    known_names = getNamesFollowingTitles(txt)
    known_names = consolidateNames(known_names)
    potential_names = getCapitalizedWords(txt)
    
    for known in known_names:
        name_parts = known[0].name_parts
        name_set = []
        name_set.extend(known)
   
        try:
            author_surname_symspell._words[name_parts[-1].lower()]
            for n in potential_names:
                lookup = author_surname_symspell.lookup(n.lower(), Verbosity.CLOSEST)
                if (lookup):
                    for i, x in enumerate(lookup):
                        if (lookup[i]._term in name_parts):
                            name_set.append(n)
        
        except:
            for n in potential_names:
                for p in name_parts:
                    if (len(n)>4) and (len(p)>4):
                        if (edit_distance(n,p)<3):
                            name_set.append(n)
        
        all_name_sets.append(name_set)
        
    return all_name_sets

In [110]:
txts_cleaned = [cleanTextForNameSearch(x) for x in txts]

In [115]:
getSetOfNames(txts_cleaned[0])

[[Dr. Herbert B. Adam, Dr. i. B. Adams, 'Herbert', 'Adam'], [President Andrew D. White, 'White'], [Dr. Geor6 Ebers, 'Ebers'], [Mr. Vedder, 'Vedder'], [Professor Thorold Rogers], [Mr. Charles Dudley Warner], [Dr. Parker, Dr. Joseph Parktercontinueshiseipositionuin, 'Parker'], [Dr. William M. Taylor, 'Taylor'], [Professor E. Johnsou, 'Johnsou'], [Mr. Cushing, 'Cushing'], [Mrs. Browning, 'Browning'], [Mr. Julius Chambers], [Mr. Brander Matthews], [Mr. Julian Hawthorne], [Mr. Swinburne, 'Swinburne']]

if first part of shorter name is entirely contained in longer name
fuzzy match to edit distance of two 
look at part of string length of shorter name - if those fuzzy match
cut it off and group, check to see if second part is dictionary name

match known publishers
remove publishers
regex with name & name & name

In [100]:
for num, txt in enumerate(txts_cleaned):
    try:
        print('Text #' + str(num))
        for x in getSetOfNames(txt):
            print(x)
        print()
    except:
        print('Error')
        print()

Text #0
[Dr. Herbert B. Adam, Dr. i. B. Adams, 'Herbert', 'Adam']
[President Andrew D. White, 'White']
[Dr. Geor6 Ebers, 'Ebers']
[Mr. Vedder, 'Vedder']
[Professor Thorold Rogers]
[Mr. Charles Dudley Warner]
[Dr. Parker, Dr. Joseph Parktercontinueshiseipositionuin, 'Parker']
[Dr. William M. Taylor, 'Taylor']
[Professor E. Johnsou, 'Johnsou']
[Mr. Cushing, 'Cushing']
[Mrs. Browning, 'Browning']
[Mr. Julius Chambers]
[Mr. Brander Matthews]
[Mr. Julian Hawthorne]
[Mr. Swinburne, 'Swinburne']

Text #1
[Mr. E. L. Voynich, 'Voynich', 'VWynich', 'Voyniich']

Text #2
[Miss s, Miss HowAiD Is, 'Shp', 'S8', 'G0']
[Mrs.]
[Miss Peard, 'Peard', 'Peard']
[Mrs. A. L. Wister, 'Wister']
[Lord Mttdhursm- Coming]
[Princess Mercedes, 'Mercedes']
[Sir Henry Lancaster, 'Shp', 'Lancaster', 'Lancaster', 'Lancaster', 'Lira']

Text #3
[Dr. Atwood, 'Atwood']

Text #4

Text #5
[Lord]

Text #6
[President of]
[Mr. Cook, 'Cook', 'Cook']
[Mr. Alden, 'Alden']
[Lord Clive, 'Clive']
[Mr. Crane, 'Crane', 'CRANE']
[Mr. Bix