In [1]:
import sys
sys.path.append('../')

In [2]:
from application.name_obj_classes import PubName, PersonName, remove_punct

In [3]:
from application.text_preprocessing import preprocess_text

In [4]:
import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [5]:
# loading in files
directory = "../../aps_reviews_50/aps_reviews/"
filenames = os.listdir(directory)
txts = []
for file in filenames:
    with open(directory + file) as f:
        txts.append(f.read())

In [6]:
[x.split('.')[0] for x in filenames]

['136726613', '90142613', '90390665', '124486140', '126315326', '136992742', '137651039', '89668273', '136971619', '90313218', '128376070', '124688361', '124882533', '89639059', '88461015', '124190789', '136902134', '90180084', '137244878', '88721263', '137412078', '89675095', '760950213', '124217248', '853770606', '137195890', '125727674', '88797650', '89649761', '90967694', '89672866', '89674750', '89635917', '124196531', '128176513', '88477095', '125715359', '124189638', '136887509', '90554411', '90186165', '125502663', '125724646', '124740955', '124707428', '89644579', '137072896', '136685639', '126788760', '89671107']

## Part 1: Publisher Names

In [7]:
def clean_pub_matches(match_list):
    cleaned_matches = []
    for match in match_list:
        index_to_start = 0
        for i, x in enumerate(match[1].split()):
            if x[0].islower() and x[0]!='&':
                index_to_start = i+1
        cleaned_matches.append(' '.join(match[1].split()[index_to_start:]))
    return cleaned_matches

In [8]:
def remove_dash_for_pub(pub_match):
    return re.sub(r'(?<!/w)-(?!/w)', '', pub_match)

In [9]:
def get_publishers(text, aps_id):
    """
    Takes a text and its aps_id. Both inputs are required.
    Returns a list of potential publishers. Searches using pub_ends, capitalization, and associates.
    
    For reference:
    -------------
    pub_ends = ['co','company','inc','incorporated','firm','press','group', 'pub','publishers','publishing',
                    'publications','books','ltd','limited','society','house','associates']
                    
    pub_associates = ['sons','son','brother','brothers']
    
    """
    
    pubs = []
    
    # iter only works once before emptying
    p_iter = re.finditer(pub_ends_list, txts[0])
    p_indices = [(m.end(), m.group()) for m in p_iter]
    
    for i, index in enumerate(p_indices):
        try:
            if (i<len(p_indices)+1):
                match = re.finditer("(?<= [^A-Z&\.])[\S]{,10} ?[A-Z][\w&. ]*?" + index[1] + '(?!\w)', 
                                txts[0][p_indices[i-1][0]:(p_indices[i][0] + 1)])
                all_matches = [(m.end(), remove_dash_for_pub(m.group())) for m in match]
                if len(all_matches) > 0: 
                    pubs.extend(clean_pub_matches(all_matches))
        except:
            pass

    pubs = [PubName(word) for word in pubs]
    
    for pub in pubs:
        pub.review_id = aps_id
    
    return pubs

## Part 1: Person Names

In [10]:
titles = """Doctor,Dr,Mr,Mrs,Miss,Msgr,Monsignor,Rev,Reverend,Hon,Honorable,Honourable,Prof,Professor,Madame,Madam,Lady,Lord,Sir,Dame,Master,Mistress,Princess,Prince,Duke,Duchess,Baron,Father,Chancellor,Principal,President,Pres,Warden,Dean,Regent,Rector,Provost,Director
"""

In [11]:
titles = titles.rstrip().split(',')

In [37]:
title_list = '\.?\s(?=[A-Z])|'.join(titles)

In [13]:
def remove_periods_not_following_title_or_initial(name):
    name_parts = name.split()
    cleaned_name = []
    for part in name_parts:
        if (len(part)>2) and (part.endswith('.')) and (part[:-1] not in titles):
            cleaned_name.append(part[:-1])
        else:
            cleaned_name.append(part)
    return ' '.join(cleaned_name)

In [14]:
def clean_text_for_name_search(txt):
    """
    Removes all non-newline whitespace and adds spaces around commas, semicolons, and colons.
    """
    #delete extra whitespace
    txt = re.sub(' +',' ',txt)
    
    #delete characters that should never be in this dataset (i think)
    txt = re.sub("\\'\(\)\*/<=>@\[\]^_`\|~","",txt)
    
    #adding space around certain punctuation
    txt = re.sub(',',' , ',txt)
    txt = re.sub(';',' ; ',txt)
    txt = re.sub(':',' : ',txt)
    txt = re.sub('"',' " ',txt)
    txt = re.sub("'(?!s)"," ' " ,txt)
    return txt

In [15]:
def clean_name(name):
    cleaned_name = []
    return ' '.join([word for word in name.split() if (word[0].isalpha())])

In [16]:
match = re.match('.', txts[0])

In [17]:
match.span()

(0, 1)

In [18]:
def getCapitalizedWords(txt):
    """
    Returns strings of capitalized words up 3 words long. 
    Removes words/phrases containing stopwords and words found later in the text lowercased.
    """
    #listen idk why it won't just let me put in an optional repeat either
    all_words = []
    
    #three words
    all_words.extend([match for match in re.findall('[A-Z]\S* [A-Z]\S* [A-Z]\S+', txt) if 
                      all([(remove_punct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(remove_punct(word).lower() not in txt) for word in match.split()])])
    
    #two words
    two_words = [match for match in re.findall('[A-Z]\S* [A-Z]\S+', txt) if 
                      all([(remove_punct(word).lower() not in stopword_list) for word in match.split()]) 
                      and all([(remove_punct(word).lower() not in txt) for word in match.split()])
                      and all([match not in x for x in all_words])]
    
    all_words.extend(two_words)
    
    #one word
    one_words = [match for match in re.findall('[A-Z]\S+', txt) if 
                      (remove_punct(match).lower() not in stopword_list) 
                      and (remove_punct(match).lower() not in txt)
                      and all([match not in x for x in all_words])]
    
    all_words.extend(one_words)
    
    return [word for word in [' '.join([removePunct(y) for y in x.split() if remove_punct(y) not in titles]) 
                      for x in all_words] if (len(word)>1)]

In [19]:
def consolidateNames(name_list):
    """
    Takes list of AuthNames and returns list of lists, consolidating by likely identical authors.
    """
    name_set = []
    used_indices = []
    last_names = sorted([name.last_name for name in name_list], key=len)
    
    for i, name in enumerate(last_names):
        if i not in used_indices:
            full_name = [x for x in name_list if x.last_name == name][0]
            name_holder = [full_name]
            for j, name2 in enumerate(last_names):
                full_name2 = [x for x in name_list if x.last_name == name2][0]
                if (i < (len(last_names) - 1)) and (i!=j):
                    if (edit_distance(name, name2[:len(name)+1]) < 2) and (j not in used_indices):
                        if (full_name.first_initial==full_name2.first_initial and full_name.middle_initial==full_name2.middle_initial or full_name.title==full_name2.title) or (full_name.first_initial==full_name2.first_initial or full_name.middle_initial==full_name2.middle_initial and full_name.title==full_name2.title):
                            name_holder.append(full_name2)
                            used_indices.append(j)
            used_indices.append(i)
            name_set.append(name_holder)
    
    return name_set                  

In [27]:
iterx = re.finditer(title_list, txts[0])
indices = [(m.start(), m.group()) for m in iterx]

In [28]:
indices

[(91, 'Mr. '), (184, 'Mr. '), (386, 'Professor '), (643, 'Mrs. '), (802, 'Mr. '), (818, 'Dr. '), (1326, 'Dr. '), (1597, 'Mr. '), (1743, 'Mr. '), (2329, 'Dr. '), (3692, 'Mr. '), (3994, 'Dr. '), (4135, 'President '), (4950, 'Mr. '), (5232, 'Dr. '), (5302, 'Professor '), (5412, 'Dr. '), (5426, 'Dr. ')]

In [38]:
def get_names_following_titles(txt, aps_id):
    """
    Returns names following titles - specifically capitalized titles followed by capitalized names.
    Names can be any number of words in length, and can include punctuation.
    
    object factory
    potentially add index from original text in dict
    
    """
    #txt = clean_text_for_name_search(txt)
    
    names = []
    spans = []
    
    # iter only works once before emptying
    iterx = re.finditer(title_list, txt)
    indices = [(m.start(), m.group()) for m in iterx]
    
    for i, index in enumerate(indices):
        if (i<len(indices)-1):
            try:
                match = re.match(indices[i][1] + '[A-Z]\w{1,}[.,;:!\?\'\"]', txt[indices[i][0]:indices[i+1][0]])
                names.append(match.group()[:-1])
                spans.append(indices[i][0])
            except:
                try:
                    match = re.match(indices[i][1] + '.*? [a-z]|\Z', 
                                     txt[indices[i][0]:indices[i+1][0]])
                    names.append(match.group()[:-2])
                    spans.append(indices[i][0])
                except:
                    pass
        else:
            try:
                match = re.match(indices[i][1] + '[A-Z]\w{1,}[.,;:!\?\'\"]', txt[indices[i][0]:])
                names.append(match.group()[:-1])
                spans.append(indices[i][0])
            except:
                try:
                    match = re.match(indices[i][1] + '.*? [a-z]|\Z', 
                                     txt[indices[i][0]:])
                    names.append(match.group()[:-2])
                    spans.append(indices[i][0])
                except:
                    pass
    
    names = [word.replace("'s", "") for word in names]
    names = [remove_periods_not_following_title_or_initial(word) for word in names]
    names = [PersonName(clean_name(word)) for word in names]
    
    for i, name in enumerate(names):
        name.review_id = aps_id
        name.review_loc = (spans[i], spans[i]+len(name))
    
    return names

In [34]:
txts = [preprocess_text(rev) for rev in txts]

In [35]:
txts[0][91:116]

'Mr. Charles Dudley Warner'

In [41]:
txts[23][3619:3680]

'Miss Edgeworth , Scribe , N. P. Willis , Dickens , etc. , and'

spacy NLP object steal their code

In [39]:
for txt, filename in zip(txts, [x.split('.')[0] for x in filenames]):
    print(filename)
    docnames = get_names_following_titles(txt, filename)
    for name in docnames:
        print(name, name.review_loc)
    print()

136726613
Mr. Charles Dudley Warner (91, 116)
Mr. Vedder (184, 194)
Professor Thorold Rogers (386, 410)
Mrs. Browning (643, 656)
Mr. Swinburne (802, 815)
Dr. William M. Taylor Life (818, 844)
Dr. Geor6 Ebers (1326, 1341)
Mr. Julian Hawthorne (1597, 1617)
Mr. Julius Chambers (1743, 1762)
Mr. Cushing (3693, 3704)
Dr. Herbert B. Adam (3995, 4014)
President Andrew D. White (4136, 4161)
Mr. Brander Matthews (4951, 4971)
Dr. Joseph Parktercontinueshiseipositionuin (5233, 5276)
Professor E. Johnsou (5303, 5323)
Dr. Parker (5427, 5437)

90142613
Mr. E. L. Voynich (108, 125)

90390665
Miss HowAiD Is (22, 36)
Mrs. A. L. Wister (700, 717)
Princess Mercedes (3209, 3226)
Miss Peard (4101, 4111)
Miss Peard (4500, 4510)
Lord Mttdhursm- Coming (4696, 4718)
Miss Peard (6501, 6511)
Sir Henry Lancaster (6556, 6575)

124486140
Dr. Atwood (1348, 1358)

126315326

136992742

137651039
Lord Clive (822, 832)
Mr. Alden (2469, 2478)
Mr. Bixby (3199, 3208)
Mr. Crane (4005, 4014)
Mr. Cook (9998, 10006)
Mr. Cook (

## Goals
Make each Person name aware of others and able to check for potential matches?