# Initial Setup

- import various helpers, load data, select reviews by status and category

In [4]:
import sys
sys.path.append('../')
from application.name_obj_classes import PubName, PersonName, remove_punct

from application.review_obj_class import ReviewObj

from application.text_preprocessing import preprocess_text

import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
from nltk.corpus import stopwords
%pprint

Pretty printing has been turned OFF


In [8]:
import application.models as models

In [9]:
# load full text from db
aps_details_single = models.Review().query.filter(models.Review.status.in_(('needs_crosscheck', 'done'))).filter(models.Review.review_type == 'single_focus').all()

len(aps_details_single)

1093

In [None]:
reviews_parsed = [ReviewObj(i.record_id, i.full_text) for i in aps_details_single if i.reviewed_author_name !='' and i.reviewed_author_name is not None]

In [None]:
authors = [i.reviewed_author_name for i in aps_details_single if i.reviewed_author_name !='' and i.reviewed_author_name is not None]
len(authors)

In [None]:
def make_author_candidates(review):
    titles = """Doctor,Dr,Mr,Mrs,Miss,Msgr,Monsignor,Rev,Reverend,Hon,Honorable,Honourable,Prof,Professor,Madame,Madam,Lady,Lord,Sir,Dame,Master,Mistress,Princess,Prince,Duke,Duchess,Baron,Father,Chancellor,Principal,President,Pres,Warden,Dean,Regent,Rector,Provost,Director"""
    titles = titles.rstrip().split(',')
    title_list = '\.?\s(?=[A-Z])|'.join(titles) + '\.?\s(?=[A-Z])'

    text = review.no_pubs_text.split()

    full_names = {}

    for e,i in enumerate(text):
        maybe_title = "".join([z for z in i if z.isalpha()])
        if len(maybe_title) > 0:
            if maybe_title[0].isupper() and maybe_title in titles:
                
                surname = []
                for p in [e+1, e+2, e+3, e+4, e+5]:
                    try:
                        if text[p] == "." and len(text[p-1]) > 1:
                            break
                    except:
                        pass
                    
                    try:
                        if text[p][0].isupper():
                            if text[p].lower() not in stopwords.words('english'):
                                surname.append(text[p])
                    except:
                        pass
                    try:
                        if text[p][-1] == "." and len(text[p]) > 2:
                            break
                    except:
                        pass
                if len(surname) > 0:
                    surname = " ".join(surname).replace("'s", "")
                    surname_cleaned = []
                    for s in surname:
                        if s not in '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~':
                            surname_cleaned.append(s)
                    surname_cleaned = "".join(surname_cleaned)
                    try:
                        check = full_names[surname]
                    except:
                        full_names[surname] = {}
                    try:
                        full_names[surname]['title'].append(maybe_title)
                    except:
                        full_names[surname]['title'] = [maybe_title,]
                    try:
                        full_names[surname]['surname_cleaned'].append(surname_cleaned)
                    except:
                        full_names[surname]['surname_cleaned'] = [surname_cleaned,]

    for surname in full_names.keys():
        s = surname.split()
        for e, i in enumerate(text):
            if text[e:e+len(s)] == s:
                forename = "".join([x for x in text[e-1] if x.isalpha()])
                if forename.istitle() and forename not in titles:
                    try:
                        full_names[surname]['forename'].append(forename)
                    except:
                        full_names[surname]['forename'] = [forename,]
            try:
                forenames = full_names[surname]['forename']
            except:
                full_names[surname]['forename'] = []
    
    for name in full_names.keys():
        for i in full_names[name]['forename']:
            try: 
                full_names[name]['full_name'].append(i + " " + name)
            except:
                full_names[name]['full_name'] = [i + " " + name,]
        try:
            full = full_names[surname]['full_name']
        except:
            full_names[name]['full_name'] = []
    full_name_candidates = {}

    for n in full_names.keys():
        for f in full_names[n]['full_name']:
            try:
                full_name_candidates[f] += 1
            except:
                full_name_candidates[f] = 1

    # add title and surnames
    for n,o in full_names.items():
        for i in o['surname_cleaned']:
            # check if surname in a full name
            name_part = False
            for full in full_name_candidates.keys():
                if i in full:
                    name_part = True
            if not name_part:
                try: 
                    full_name_candidates[i] += 1
                except:
                    full_name_candidates[i] = 1
    return full_name_candidates

# Screen out names that are extremely common single words like 'there'?

In [None]:
make_author_candidates(reviews_parsed[1])

In [None]:
authors[1]

In [None]:
reviews_parsed[1].cleaned_text[:100]

## Part 1: Person Names

In [None]:
known_authors = list(set(authors))
len(known_authors)

In [None]:
known_authors = list(set([z for i in known_authors for z in i.split(";")]))
len(known_authors)

In [None]:
from fuzzysearch import find_near_matches

def match_known_authors(text, k_a):
    all_matches = {}
    for can in k_a:
        
        #base fuzziness on length of author name
        if len(can) < 10:
            fuzz=0
        elif len(can) > 9 and len(can) < 20:
            fuzz=1
        else:
            fuzz=3
        matches = find_near_matches(can, text, max_l_dist=fuzz)
        if len(matches) > 0:
            match_strings = [text[m.start:m.end] for m in matches]
            for i in match_strings:
                try:
                    all_matches[can].append(i)
                except:
                    all_matches[can] = [i,]
        for k,v in all_matches.items():
            if len(v) == 1 and k == v[0]:  
                return ("found", [k,])
            
    result = list(all_matches.keys())
    if len(result) > 0:
        return ("found", result)
    else:
        return ("not found", result)

def match_surname(text, k_a):
    all_matches = {}
    for can in k_a:
        # get last item in name string
        surname = can.split()[-1]
        
        #base fuzziness on length of author name
        if len(surname) < 10:
            fuzz=0
        elif len(surname) > 9 and len(surname) < 20:
            fuzz=1
        else:
            fuzz=3
        matches = find_near_matches(surname, text, max_l_dist=fuzz)
        
        if len(matches) > 0:
            match_strings = [text[m.start:m.end] for m in matches]
            for i in match_strings:
                try:
                    all_matches[can].append(i)
                except:
                    all_matches[can] = [i,]
        for k,v in all_matches.items():
            if len(v) == 1 and k == v[0]:  
                return ("found", [k,])
    result = list(all_matches.keys())
    
    if len(result) > 0:
        return ("found", result)
    else:
        return ("not found", result)
    

In [None]:
candidates = make_author_candidates(reviews_parsed[1])
for name, score in candidates.items():
    print(name, score, match_known_authors(name, known_authors))
    #print(name, score, match_surname(name, known_authors))

In [None]:
all_author_matches = {}

# run on all reviews, return matches, novel, or no match
for review in reviews_parsed:
    
    results = {}
    #names = list(set(review.person_names))
    candidates = make_author_candidates(review)
    
    for i, score in candidates.items():
        # function to fuzzy match known authors
        result, author_candidates = match_known_authors(i, known_authors)
        if len(author_candidates) == 0:
            result, author_surname_candidates = match_surname(i, known_authors)
            if len(author_surname_candidates) > 0:
                for z in author_surname_candidates:
                    if len(z.strip()) > 2:
                        try:
                            results[i].append(z)
                        except:
                            results[i] = [z,]
        else:
            for z in author_candidates:
                if len(z.strip()) > 2:
                    try:
                        results[i].append(z)
                    except:
                        results[i] = [z,]
    
    if len(results.keys()) == 0:        
        result = "not found"

    if len(review.person_names) == 0:        
        result = "not found"
    
    
    try:
        all_author_matches["type"].append(result) 
    except:
        all_author_matches["type"] = [result,]
        
    try:
        all_author_matches["results"].append(results)
    except:
        all_author_matches["results"] = [results,]
    try:
        all_author_matches["match_number"].append(len(results.keys()))
    except:
        all_author_matches["match_number"] = [len(results.keys()),]

In [None]:
all_author_matches["results"][0]

In [None]:
# build viaf store, add columns for comparison
author_tuples = [(i.reviewed_author_name, i.reviewed_author_viaf_match) for i in aps_details_single if i.reviewed_author_name is not None and i.reviewed_author_name !='']
viaf_store = {}
count = 0
for f,g in author_tuples:
    i = f.split(";")
    uris = g.split(";")
    i = f.split(";")
    uris = g.split(";")
    tail = []
    if len(i) > len(uris):
        for m in range(len(i) - len(uris)):
            tail.append('not available')
        uris = uris+tail
    if len(uris) > len(i):
        uris = uris[:4]
        i = i[:4]
        
    for e, j in enumerate(uris):
        if j == '' or j.lower() == "not available":
            try: 
                is_in = viaf_store[i[e]]
            except:
                viaf_store[i[e]] = [count,]
                count +=1
        else:
            try:
                if j not in viaf_store[i[e]]:
                    viaf_store[i[e]].append(j)
            except:
                viaf_store[i[e]] = [j,]

In [None]:
from collections import Counter
import random

top_candidates = []
for r in all_author_matches['results']:
    top = {}
    for a,b in r.items():
        for c in b: 
            score = edit_distance(a,c)
            try:
                top[a][c] = score
            except:
                top[a] = {}
                top[a][c] = score
        #should be room to improve this
        
    best = []
    for i,j in top.items():
        lowest = ""
        score = 100
        for c,s in j.items():
            if s < score:
                lowest = c
                score = s
        best.append(lowest)
    best.sort()
    best = list(set(best))
    top_candidates.append(";".join(best))   

In [None]:
import pandas as pd

df = pd.DataFrame(all_author_matches)
df['recorded_authors'] = authors
df['top_candidates'] = top_candidates
#df['recorded_viaf'] = recorded_viaf
#df['top_result_viaf'] = top_result_viaf
df = df.drop(['results'], axis=1)
len(df.loc[(df['match_number'] == 1) & (df['recorded_authors'] == df['top_candidates'])])/len(df.loc[(df['match_number'] == 1)])
#78% accurate

In [None]:
len(df.loc[(df['match_number'] == 1)])

In [None]:
true_authors = []
for row in df.iterrows():
    true_authors.extend(row[1]['recorded_authors'].split(";"))
len(true_authors)

In [None]:
false_positives_count = []
missed_author_count = []
matched_correctly_count = []
perfect_match_count = []

for row in df.iterrows():
    candidate_list = [i.strip() for i in row[1]['top_candidates'].split(";") if i !='']
    target_list = [i.strip() for i in row[1]['recorded_authors'].split(";") if i !='']
    if len(candidate_list) > 1:
        # number of matches and number of false positives
        fp = 0
        matches = 0
        for name in candidate_list:
            if name not in target_list:
                fp += 1

            else:
                matches += 1
        false_positives_count.append(fp)
        matched_correctly_count.append(matches)

        # number of missed authors
        fn = 0
        for name in target_list:
            if name not in candidate_list:
                fn +=1
        missed_author_count.append(fn)

        # perfect match?
        if fn == 0 and fp == 0:
            perfect = True
        else:
            perfect = False

        perfect_match_count.append(perfect)
sum(false_positives_count), sum(missed_author_count), sum(matched_correctly_count)
#(149, 35, 149)
# including single match (188, 84, 309)


In [None]:
len([i for i in perfect_match_count if i == True])/len(perfect_match_count)
#0.3157894736842105
#0.3157894736842105

In [None]:
# try all of this with reviews_parsed.no_pubs_text
len(perfect_match_count)