In [1]:
import sys
sys.path.append('../')

In [2]:
from application.name_obj_classes import PubName, PersonName, remove_punct

In [3]:
from application.text_preprocessing import preprocess_text

In [4]:
import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [5]:
# loading in files
directory = "../../aps_reviews_50/aps_reviews/"
filenames = os.listdir(directory)
txts = []
for file in filenames:
    with open(directory + file) as f:
        txts.append(f.read())

In [6]:
[x.split('.')[0] for x in filenames]

['136726613', '90142613', '90390665', '124486140', '126315326', '136992742', '137651039', '89668273', '136971619', '90313218', '128376070', '124688361', '124882533', '89639059', '88461015', '124190789', '136902134', '90180084', '137244878', '88721263', '137412078', '89675095', '760950213', '124217248', '853770606', '137195890', '125727674', '88797650', '89649761', '90967694', '89672866', '89674750', '89635917', '124196531', '128176513', '88477095', '125715359', '124189638', '136887509', '90554411', '90186165', '125502663', '125724646', '124740955', '124707428', '89644579', '137072896', '136685639', '126788760', '89671107']

## Part 1: Publisher Names

In [21]:
def clean_pub_matches(match_list):
    cleaned_matches = []
    for match in match_list:
        index_to_start = 0
        for i, x in enumerate(match[1].split()):
            if x[0].islower() and x[0]!='&':
                index_to_start = i+1
        cleaned_matches.append(' '.join(match[1].split()[index_to_start:]))
    return cleaned_matches

In [7]:
def remove_dash_for_pub(pub_match):
    return re.sub(r'(?<!/w)-(?!/w)', '', pub_match)

In [8]:
pub_ends = ['company','co','incorporated','inc','firm','press','group','publishers','publishing',
                    'publications','pub','books','ltd','limited','society','house','associates']

In [9]:
pub_ends_list = '|'.join([x.capitalize()+'\.?(?!\w)' for x in pub_ends])

In [10]:
def get_publishers(review):
    """
    Takes a ReviewObj. 
    Returns a list of potential publishers. Searches using pub_ends, capitalization, and associates.
    
    For reference:
    -------------
    pub_ends = ['co','company','inc','incorporated','firm','press','group', 'pub','publishers','publishing',
                    'publications','books','ltd','limited','society','house','associates']
                    
    pub_associates = ['sons','son','brother','brothers']
    
    """
    
    pubs = []
    spans = []
    
    # iter only works once before emptying
    p_iter = re.finditer(pub_ends_list, review.cleaned_text)
    p_indices = [(m.end(), m.group()) for m in p_iter]
    
    for e, index in enumerate(p_indices):
        if (e==len(p_indices)-1):
            start_index = 0
        else:
            start_index = p_indices[e-1][0] - 1
    
        match = re.finditer("(?<= [^A-Z&\.])[\S]{,10} ?[A-Z][\w&. ]*?" + index[1] + '(?!\w)', 
                                review.cleaned_text[start_index:index[0]])
        all_matches = [(m.span(), remove_dash_for_pub(m.group())) for m in match]
        if len(all_matches) > 0: 
                pubs.extend(clean_pub_matches(all_matches))
                spans.extend([m[0] for m in all_matches])

    pubs = [PubName(word) for word in pubs]
    
    for pub in pubs:
        pub.review_id = review.review_id
    
    for e, pub in enumerate(pubs):
        pub.review_id = review.review_id
        pub.review_loc = spans[e]
    
    return pubs

## Part 1: Person Names

In [11]:
titles = """Doctor,Dr,Mr,Mrs,Miss,Msgr,Monsignor,Rev,Reverend,Hon,Honorable,Honourable,Prof,Professor,Madame,Madam,Lady,Lord,Sir,Dame,Master,Mistress,Princess,Prince,Duke,Duchess,Baron,Father,Chancellor,Principal,President,Pres,Warden,Dean,Regent,Rector,Provost,Director
"""

In [12]:
titles = titles.rstrip().split(',')

In [13]:
title_list = '\.?\s(?=[A-Z])|'.join(titles)

In [14]:
def remove_punct_not_following_title_or_initial(name):
    name_parts = name.split()
    cleaned_name = []
    for part in name_parts:
        if part[-1] in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
            if (len(part)>2) and (part[:-1] not in titles):
                cleaned_name.append(part[:-1])
            else:
                cleaned_name.append(part)
        else:
            cleaned_name.append(part)
    return ' '.join(cleaned_name)

In [15]:
def clean_name(name):
    name = remove_punct_not_following_title_or_initial(name)
    cleaned_name = []
    return ' '.join([word for word in name.split() if (word[0].isalpha())])

In [16]:
def getCapitalizedWords(txt):
    """
    Returns strings of capitalized words up 3 words long. 
    Removes words/phrases containing stopwords and words found later in the text lowercased.
    """
    #listen idk why it won't just let me put in an optional repeat either
    all_words = []
    
    #three words
    all_words.extend([match for match in re.findall('[A-Z]\S* [A-Z]\S* [A-Z]\S+', txt) if 
                      all([(remove_punct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(remove_punct(word).lower() not in txt) for word in match.split()])])
    
    #two words
    two_words = [match for match in re.findall('[A-Z]\S* [A-Z]\S+', txt) if 
                      all([(remove_punct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(remove_punct(word).lower() not in txt) for word in match.split()])
                      and all([match not in x for x in all_words])]
    
    all_words.extend(two_words)
    
    #one word
    one_words = [match for match in re.findall('[A-Z]\S+', txt) if 
                      (remove_punct(match).lower() not in stopword_list) 
                      and (remove_punct(match).lower() not in txt)
                      and all([match not in x for x in all_words])]
    
    all_words.extend(one_words)
    
    return [word for word in [' '.join([removePunct(y) for y in x.split() if remove_punct(y) not in titles]) 
                      for x in all_words] if (len(word)>1)]

In [17]:
def consolidateNames(name_list):
    """
    Takes list of AuthNames and returns list of lists, consolidating by likely identical authors.
    """
    name_set = []
    used_indices = []
    last_names = sorted([name.last_name for name in name_list], key=len)
    
    for i, name in enumerate(last_names):
        if i not in used_indices:
            full_name = [x for x in name_list if x.last_name == name][0]
            name_holder = [full_name]
            for j, name2 in enumerate(last_names):
                full_name2 = [x for x in name_list if x.last_name == name2][0]
                if (i < (len(last_names) - 1)) and (i!=j):
                    if (edit_distance(name, name2[:len(name)+1]) < 2) and (j not in used_indices):
                        if (full_name.first_initial==full_name2.first_initial and full_name.middle_initial==full_name2.middle_initial or full_name.title==full_name2.title) or (full_name.first_initial==full_name2.first_initial or full_name.middle_initial==full_name2.middle_initial and full_name.title==full_name2.title):
                            name_holder.append(full_name2)
                            used_indices.append(j)
            used_indices.append(i)
            name_set.append(name_holder)
    
    return name_set                  

In [18]:
def get_names_following_titles(review):
    """
    Returns names following titles - specifically capitalized titles followed by capitalized names.
    Names can be any number of words in length, and can include punctuation.
    
    object factory
    potentially add index from original text in dict
    
    """
    #txt = clean_text_for_name_search(txt)
    
    names = []
    spans = []
    
    txt = review.cleaned_text
    
    # iter only works once before emptying
    iterx = re.finditer(title_list, txt)
    indices = [(m.start(), m.group()) for m in iterx]
    
    
    
    for e, index in enumerate(indices):
        first_match = None
        second_match = None
        
        if (e==len(indices)-1):
            end_index = -1
        else:
            end_index = indices[e+1][0]
            
        first_match = re.match(indices[e][1] + '[A-Z]\w{1,}[,;:!?\-\'"().](?!\w)', txt[indices[e][0]:end_index])
        #first_match = re.match(indices[e][1] + '[\w\s]+?[,;:!?\-\'"()\.](?!\w)', txt[indices[e][0]:end_index])
        if first_match:
            names.append(first_match.group()[:-1])
            spans.append(indices[e][0])
        else:
            second_match = re.match(indices[e][1] + '(.*?) [a-z]|\Z', txt[indices[e][0]:end_index])
        if second_match:
            names.append(second_match.group()[:-2])
            spans.append(indices[e][0])
    
    names = [word.replace("'s", "") for word in names]
    names = [PersonName(clean_name(word)) for word in names]
    
    for e, name in enumerate(names):
        name.review_id = review.review_id
        name.review_loc = (spans[e], spans[e]+len(name))
    
    return names

## ReviewObject

In [19]:
class ReviewObj():
    
    def __findnames(self):
        self.pub_names = get_publishers(self)
        self.person_names = get_names_following_titles(self)
        
    def __init__(self, aps_id, txt):
        self.review_id = aps_id
        self.original_text = txt
        self.cleaned_text = preprocess_text(txt)
        
        self.__findnames()

In [22]:
review_ex = ReviewObj(136726613, txts[0])

In [23]:
for x in review_ex.person_names:
    print(x.getNameVariants())

['charles warner', 'charles dudley warner', 'warner', 'c d warner', 'charles d warner', 'c dudley warner', 'c warner']
['vedder']
['rogers', 'thorvald rogers', 't rogers', 'thorold rogers']
['browning']
['swinburne']
['life']
['g ebers', 'ebers', 'geo ebers', 'george ebers', 'gerry ebers', 'geor6 ebers']
['hawthorne', 'julian hawthorne', 'j hawthorne']
['j chambers', 'julius chambers', 'chambers']
['cushing']
['herbert b adam', 'herbert adam', 'h b adam', 'adam', 'h adam']
['white', 'andrew d white', 'andrew white', 'a d white', 'a white']
['b matthews', 'evander matthews', 'brandon matthews', 'matthews', 'leander matthews', 'brander matthews', 'vander matthews']
['joseph parktercontinueshiseipositionuin', 'parktercontinueshiseipositionuin', 'j parktercontinueshiseipositionuin']
['e johnsou', 'johnsou']
['malaren']
['parker']


spacy NLP object steal their code

In [110]:
for txt, filename in zip(txts, [x.split('.')[0] for x in filenames]):
    print(filename)
    docnames = get_names_following_titles(txt, filename)
    for name in docnames:
        print(name, name.review_loc)
    print()

136726613
Mr. Charles Dudley Warner (91, 116)
Mr. Vedder (184, 194)
Professor Thorold Rogers (385, 409)
Mrs. Browning (642, 655)
Mr. Swinburne (799, 812)
Dr. William M. Taylor Life (815, 841)
Dr. Geor6 Ebers (1318, 1333)
Mr. Julian Hawthorne (1585, 1605)
Mr. Julius Chambers, (1729, 1749)
Mr. Cushing (3651, 3662)
Dr. Herbert B. Adam (3950, 3969)
President Andrew D. White (4089, 4114)
Mr. Brander Matthews, (4892, 4913)
Dr. Joseph Parktercontinueshiseipositionuin (5169, 5212)
Professor E. Johnsou (5238, 5258)
Dr. Malaren (5348, 5359)
Dr. Parker (5361, 5371)

90142613
Mr. E. L. Voynich (108, 125)

90390665
Miss HowAiD Is (22, 36)
Mrs. A. L. Wister (689, 706)
Princess Mercedes (3169, 3186)
Miss Peard (4050, 4060)
Miss Peardstory (4445, 4460)
Lord Mttdhursm (4639, 4653)
Miss Peard (6432, 6442)
Sir Henry Lancaster (6487, 6506)

124486140
Dr. Atwood (1320, 1330)

126315326

136992742

137651039
Lord Clive (815, 825)
Mr. Alden (2435, 2444)
Mr. Bixby (3158, 3167)
Mr. Crane (3957, 3966)
Mr. Cook 

## Goals
Make each Person name aware of others and able to check for potential matches?