### Notes

For Span obj:
+ build off author and publisher names
+ text
+ start char end char
+ label (author or publisher name)
+ group (make sure authors and pubs don't overlap)
+ as a group (this group's closeness to KB ids)
    + dictionary of id and then its score
    
For ReviewObj:
+ all author spans attribute
+ all pub spans attribute

Also refer to spaCy spans -> https://spacy.io/api/span
+ doc (doc object)
+ start (int)
+ end (int)
+ label (int/unicode)
+ kb_id (int/unicode)
+ vector (numpy.ndarray[ndim=1, dtype='float32'])

In [1]:
import sys
sys.path.append('../')

In [2]:
from application.name_obj_classes import PubName, PersonName
from application.review_obj_class import ReviewObj
import pandas as pd
import numpy as np
import os
%pprint

Pretty printing has been turned OFF


In [3]:
from nltk.metrics import edit_distance

In [4]:
# loading in files
# directory = "../../aps_reviews_50/aps_reviews/"
directory = "../../aps_reviews_1000/"
filenames = os.listdir(directory)

In [5]:
reviews = ((file.split('.')[0], open(directory + file).read()) for file in filenames)
# reviews = [(file.split('.')[0], open(directory + file).read()) for file in filenames]

In [6]:
# problem_texts = []
# for rev in reviews:
#     try:
#         ReviewObj(*rev)
#     except:
#         problem_texts.append(rev)

In [7]:
review_list = [ReviewObj(file, txt) for (file, txt) in reviews]

Span object will contain:
+ Review ID (parent doc)
+ Start char (int)
+ End char (int)
+ Label (person/publisher)
+ Group (int)
+ Name variations
+ VIAF but not yet

In [8]:
# review id
review_list[5].person_names[0].review_id

'89810338'

In [9]:
# start and end char
review_list[5].person_names[0].review_loc_chars

(246, 264)

In [10]:
# label
review_list[5].person_names[0].name_type

'person'

In [11]:
review_list[5].person_names[0].getNameVariants()

['w a becker', 'w becker', 'becker']

In [12]:
class NameSpan():
    """
    Object type for named entities. Currently contains both person names and publisher names.

    Parameters
    ----------
    self.name = NameObj
     - will contain either a PersonName or PubName object
    self.review_id : aps_id
    self.span : character span in review text
    self.start_char : start character in review text
    self.end_char : end character in review text
    self.label : name type
     - either "person" or "publisher"
    self.name_id : unique id for span object (review_id plus the start_char)
    self.group : grouping by similarity to other objects w the same label in review 
     - defaults to -1
     
    **things i'm planning on adding: self.collocates**

    """
    def __init__(self, NameObj):
        self.name = NameObj
        self.review_id = NameObj.review_id
        self.review_doc = '' # ReviewObj will go here?
        self.span = NameObj.review_loc_chars
        self.start_char = NameObj.review_loc_chars[0]
        self.end_char = NameObj.review_loc_chars[1]
        self.label = NameObj.name_type
        self.name_id = int(self.review_id + str(self.start_char))
        self.group = -1

In [13]:
span_test = NameSpan(review_list[5].person_names[0])

In [14]:
span_test.review_id

'89810338'

In [15]:
span_test.name_id

89810338246

In [16]:
span_test.name

Prof. W. A. BECKER

In [17]:
span_test.name.last_name

'becker'

In [18]:
spanlist = [NameSpan(x) for x in review_list[5].person_names]

In [19]:
spanlist

[<__main__.NameSpan object at 0x1a31a23fd0>, <__main__.NameSpan object at 0x1a31a21090>, <__main__.NameSpan object at 0x1a31a21110>, <__main__.NameSpan object at 0x1a31a21190>]

In [20]:
names = sorted([(span.name, span.name_id) for span in spanlist], key=lambda x: len(x[0].last_name))

In [21]:
names

[(Sir Henry S. Maine, 8981033820764), (Sir Henry S. Maine, 8981033823059), (Prof. W. A. BECKER, 89810338246), (Rev. FREDRICK METCALF, 89810338284)]

In [22]:
names[0][0].last_name

'maine'

In [23]:
group_dict = {}
used_ids = []

In [24]:
for e, (name, name_id) in enumerate(names):
    if (name_id not in used_ids):
        id_holder = [name_id]
        for name2, name_id2 in names:
            if (e < (len(name) - 1)) and (name_id!=name_id2):
                if (edit_distance(name.last_name, name2.last_name[:len(name.last_name)+1]) < 2) and (name_id2 not in used_ids):
                    if (name.first_initial==name2.first_initial and name.middle_initial==name2.middle_initial or name.title==name2.title) or (name.first_initial==name2.first_initial or name.middle_initial==name2.middle_initial and name.title==name2.title):
                        id_holder.append(name_id2)
                        used_ids.append(name_id2)
        used_ids.append(name_id)
        for x in id_holder:
            group_dict[x] = e

In [25]:
group_dict

{8981033820764: 0, 8981033823059: 0, 89810338246: 2, 89810338284: 3}

In [26]:
for span in spanlist:
    span.group = group_dict[span.name_id]

In [27]:
for span in spanlist:
    print(span.name, span.group)

Prof. W. A. BECKER 2
Rev. FREDRICK METCALF 3
Sir Henry S. Maine 0
Sir Henry S. Maine 0


In [28]:
def group_people(spanlist):
    
    names = sorted([(span.name, span.name_id) for span in spanlist], key=lambda x: len(x[0].last_name))
    group_dict = {}
    used_ids = []
    for e, (name, name_id) in enumerate(names):
        if (name_id not in used_ids):
            id_holder = [name_id]
            for name2, name_id2 in names:
                if (e < (len(name) - 1)) and (name_id!=name_id2):
                    if (edit_distance(name.last_name, name2.last_name[:len(name.last_name)+1]) < 2) and (name_id2 not in used_ids):
                        if (name.first_initial==name2.first_initial and name.middle_initial==name2.middle_initial or name.title==name2.title) or (name.first_initial==name2.first_initial or name.middle_initial==name2.middle_initial and name.title==name2.title):
                            id_holder.append(name_id2)
                            used_ids.append(name_id2)
            used_ids.append(name_id)
            for x in id_holder:
                group_dict[x] = e
    
    for span in spanlist:
        span.group = group_dict[span.name_id]
    
    return spanlist

In [29]:
spanlist = [NameSpan(x) for x in review_list[5].person_names]

In [30]:
spanlist = group_people(spanlist)

In [31]:
for span in spanlist:
    print(span.name, span.group)

Prof. W. A. BECKER 2
Rev. FREDRICK METCALF 3
Sir Henry S. Maine 0
Sir Henry S. Maine 0


+ change names to have underscores for tokenization
+ sentence tokenize first?

In [32]:
pub_sort = sorted(review_list, key=lambda x: len(x.pub_names))

In [33]:
pub_sort[-5].pub_names

['Crowvll & Co.', 'Stokes Co..', 'Lijpinicott Co', 'Century Co.', 'Lippincott Co.', 'McClurg & Co', 'Lippincott Co..', 'Lippincott Co.', 'Stokes Co.']

In [36]:
pub_sort[-3].pub_names

['Macmillan & Co.', 'Stokses Company', 'Century Company', 'Macmillan & Co.', 'Macmillan & Co.', 'Chiswvick House', 'Picture Books', 'Crowvell & Co', 'Macmillan & Co.', 'Stokes Company', 'Lippincott Company', 'Lippincott Company', 'Stokes Company', 'C. McClurg & Co.']

In [37]:
for x in pub_sort[-3].pub_names:
    print(x.pub_names)

macmillan
stokses
century
macmillan
macmillan
chiswvick
picture
crowvell
macmillan
stokes
lippincott
lippincott
stokes
c. mcclurg


In [38]:
spanlist = [NameSpan(x) for x in pub_sort[-3].pub_names]

In [39]:
pubs = sorted([(span.name, span.name_id) for span in spanlist], key=lambda x: x[0].pub_count)

In [40]:
pubs

[('Macmillan & Co.', 13711803711788), ('Stokses Company', 13711803715294), ('Century Company', 13711803720153), ('Macmillan & Co.', 13711803726241), ('Macmillan & Co.', 13711803736235), ('Chiswvick House', 13711803736949), ('Picture Books', 13711803740543), ('Crowvell & Co', 13711803743760), ('Macmillan & Co.', 13711803747854), ('Stokes Company', 13711803753326), ('Lippincott Company', 13711803754601), ('Lippincott Company', 13711803754825), ('Stokes Company', 13711803757632), ('C. McClurg & Co.', 13711803757679)]

In [41]:
for x in pubs:
    print(x[0].pub_names.split()[-1])

macmillan
stokses
century
macmillan
macmillan
chiswvick
picture
crowvell
macmillan
stokes
lippincott
lippincott
stokes
mcclurg


In [42]:
pub_ends = ['company','co','incorporated','inc','firm','press','group','publishers','publishing',
                    'publications','pub','books','ltd','limited','society','house','associates']

In [43]:
def get_fuzzy_pub_ends(pub_part):
    """
    Fuzzy matches pub ends.
    Returns pub ends closer than 2 edits away.
    """
    pub_part = ''.join([x for x in list(pub_part) if x.isalpha()]).lower()
    potential_matches = [x for x in pub_ends if (edit_distance(pub_part, x[:len(pub_part)+1]) < 2)]

    return potential_matches

In [44]:
get_fuzzy_pub_ends('co')

['company', 'co']

In [45]:
min([0,1,2,3,4,5])

0

In [46]:
def calc_edit_distances(plist,plist2):
    eds = []
    for x in plist[:-1]:
        for y in plist2[:-1]:
            eds.append(edit_distance(x,y))
    fuzzypubs1 = get_fuzzy_pub_ends(plist[-1])
    fuzzypubs2 = get_fuzzy_pub_ends(plist2[-1])
    fuzz = []
    for x in fuzzypubs1:
        for y in fuzzypubs2:
            fuzz.append(edit_distance(x,y))
    eds.append(min(fuzz))
    
    return eds

In [47]:
pubs[0][0].name_parts

['macmillan', 'co.']

In [49]:
pubs[1][0].name_parts

['stokses', 'company']

In [51]:
calc_edit_distances(pubs[0][0].name_parts, pubs[1][0].name_parts)

[9, 0]

In [50]:
np.mean(calc_edit_distances(pubs[0][0].name_parts, pubs[1][0].name_parts))

4.5

In [78]:
group_dict = {}
used_ids = []

In [57]:
for e, (pub, name_id) in enumerate(pubs):
    if (name_id not in used_ids):
        id_holder = [name_id]
        for pub2, name_id2 in pubs:
            if np.mean(calc_edit_distances(pub.name_parts, pub2.name_parts)) < 3:
                id_holder.append(name_id2)
                used_ids.append(name_id2)
        used_ids.append(name_id)
        for x in id_holder:
            group_dict[x] = e

In [53]:
group_dict

{8981033820764: 0, 8981033823059: 0, 89810338246: 2, 89810338284: 3, 13711803711788: 0, 13711803726241: 0, 13711803736235: 0, 13711803747854: 0, 13711803715294: 1, 13711803753326: 1, 13711803757632: 1, 13711803720153: 2, 13711803736949: 5, 13711803740543: 6, 13711803743760: 7, 13711803754601: 10, 13711803754825: 10, 13711803757679: 13}

In [54]:
for span in spanlist:
    span.group = group_dict[span.name_id]

In [55]:
for span in sorted(spanlist, key = lambda x: x.group):
    print(span.name, span.group)

Macmillan & Co. 0
Macmillan & Co. 0
Macmillan & Co. 0
Macmillan & Co. 0
Stokses Company 1
Stokes Company 1
Stokes Company 1
Century Company 2
Chiswvick House 5
Picture Books 6
Crowvell & Co 7
Lippincott Company 10
Lippincott Company 10
C. McClurg & Co. 13


In [58]:
def group_pubs(spanlist):
    
    pubs = sorted([(span.name, span.name_id) for span in spanlist], key=lambda x: x[0].pub_count)
    group_dict = {}
    used_ids = []
    for e, (pub, name_id) in enumerate(pubs):
        if (name_id not in used_ids):
            id_holder = [name_id]
            for pub2, name_id2 in pubs:
                if np.mean(calc_edit_distances(pub.name_parts, pub2.name_parts)) < 3:
                    id_holder.append(name_id2)
                    used_ids.append(name_id2)
            used_ids.append(name_id)
            for x in id_holder:
                group_dict[x] = e
    
    for span in spanlist:
        span.group = group_dict[span.name_id]
    
    return spanlist

In [60]:
spanlist = group_pubs(spanlist)

In [61]:
for span in sorted(spanlist, key = lambda x: x.group):
    print(span.name, span.group)

Macmillan & Co. 0
Macmillan & Co. 0
Macmillan & Co. 0
Macmillan & Co. 0
Stokses Company 1
Stokes Company 1
Stokes Company 1
Century Company 2
Chiswvick House 5
Picture Books 6
Crowvell & Co 7
Lippincott Company 10
Lippincott Company 10
C. McClurg & Co. 13


In [86]:
class NameSpanGenerator:
    """
    NameSpanGenerator.generate() takes a ReviewObj and returns all NameSpans. 
    """
    
    def generate(self):
        all_spans = []
        if self.person_names:
            per_spans = [NameSpan(x) for x in self.person_names]
            all_spans.extend(group_people(per_spans))
        if self.pub_names:
            pub_spans = [NameSpan(x) for x in self.pub_names]
            all_spans.extend(group_pubs(pub_spans))
        return all_spans

In [87]:
review_list[300]

<application.review_obj_class.ReviewObj object at 0x1a2e674dd0>

In [88]:
pub_sort[-10].person_names

[Dr Rhodes, Dr Rhodes, Dr Rhodes, Professor Lounsbury, Dame Quickly, Mr Burton, Dr Johnson, Professor Price, Mr Morse, Mr Morse, Mr Stickney, Mr Stickney, Miss Bowen, Mr Crockett, Mr Crockett, Dr Turner]

In [89]:
pub_sort[-10].pub_names

['Macmillan Company', 'Macmillan Company', 'Lodging House', 'Macmillan Company', 'McClure Phillips and Company', 'Lippincott Company', 'Macmillan Company']

In [90]:
spanex = NameSpanGenerator.generate(pub_sort[-10])

In [91]:
spanex

[<__main__.NameSpan object at 0x1a31ae5d50>, <__main__.NameSpan object at 0x1a31ae5c10>, <__main__.NameSpan object at 0x1a31ae5450>, <__main__.NameSpan object at 0x1a31ae5f10>, <__main__.NameSpan object at 0x1a31ae5250>, <__main__.NameSpan object at 0x1a31ae5e90>, <__main__.NameSpan object at 0x1a31ae5550>, <__main__.NameSpan object at 0x1a31ae5490>, <__main__.NameSpan object at 0x1a31ae5950>, <__main__.NameSpan object at 0x1a31ae5b50>, <__main__.NameSpan object at 0x1a31ae5650>, <__main__.NameSpan object at 0x1a31ae5bd0>, <__main__.NameSpan object at 0x1a31ae5290>, <__main__.NameSpan object at 0x1a31ae5710>, <__main__.NameSpan object at 0x1a31ae56d0>, <__main__.NameSpan object at 0x1a31ae5890>, <__main__.NameSpan object at 0x1a31ae5090>, <__main__.NameSpan object at 0x1a31ae5510>, <__main__.NameSpan object at 0x1a31ae5390>, <__main__.NameSpan object at 0x1a31ae5310>, <__main__.NameSpan object at 0x1a31ae5190>, <__main__.NameSpan object at 0x1a31ae50d0>, <__main__.NameSpan object at 0x

In [93]:
for span in sorted(spanex, key = lambda x: (x.label, x.group)):
    print(span.name, span.group)

Professor Price 0
Mr Morse 1
Mr Morse 1
Miss Bowen 3
Dr Rhodes 4
Dr Rhodes 4
Dr Rhodes 4
Mr Burton 7
Dr Turner 8
Dame Quickly 9
Dr Johnson 10
Mr Stickney 11
Mr Stickney 12
Mr Crockett 13
Mr Crockett 14
Professor Lounsbury 15
Macmillan Company 0
Macmillan Company 0
Macmillan Company 0
Macmillan Company 0
Lodging House 2
McClure Phillips and Company 4
Lippincott Company 5
