# Initial Setup

- import various helpers, load data, select reviews by status and category

In [1]:
import sys
sys.path.append('../')

from application.name_obj_classes import PubName, PersonName, remove_punct

from application.review_obj_class import ReviewObj

from application.text_preprocessing import preprocess_text

import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [2]:
from database import *
import database.models as models

# load full text from db
aps_details_single = models.Review().query.filter(models.Review.status.in_(('needs_crosscheck', 'done'))).filter(models.Review.review_type == 'single_focus').all()

len(aps_details_single)

561

In [3]:
reviews_parsed = [ReviewObj(i.record_id, i.full_text) for i in aps_details_single if i.reviewed_author_name !='' and i.reviewed_author_name is not None]

In [4]:
authors = [i.reviewed_author_name for i in aps_details_single if i.reviewed_author_name !='' and i.reviewed_author_name is not None]
len(authors)

533

## Part 1: Person Names

In [115]:
# use for titles? 
reviews_parsed[0].no_pubs_text[:10]

'Missouri: '

In [116]:
# consolidate duplicates? 
reviews_parsed[3].person_names

[Dr. Fox, Dr. Fox]

In [117]:
known_authors = list(set(authors))
len(known_authors)

522

In [118]:
authors[1]

'Lord Byron; W. A. Lewis Bettany'

In [119]:
from fuzzysearch import find_near_matches

def match_known_authors(text, k_a):
    all_matches = {}
    for a in k_a:
        # a might be multiple
        candidates = a.split(';')
        for can in candidates:
        #base fuzziness on length of author name
            if len(can) < 10:
                fuzz=0
            elif len(can) > 9 and len(can) < 20:
                fuzz=1
            else:
                fuzz=3
            matches = find_near_matches(can, text, max_l_dist=fuzz)
            if len(matches) > 0:
                match_strings = [text[m.start:m.end] for m in matches]
                for i in match_strings:
                    try:
                        all_matches[can].append(i)
                    except:
                        all_matches[can] = [i,]
        for k,v in all_matches.items():
            if len(v) == 1 and k == v[0]:  
                return ("found", [k,])
    result = list(all_matches.keys())
    if len(result) > 0:
        return ("found", result)
    else:
        return ("not found", result)

def match_surname(text, k_a):
    all_matches = {}
    for a in k_a:
        # a might be multiple
        candidates = a.split(';')
        for can in candidates:
            surname = can.split()[-1]
            #base fuzziness on length of author name
            if len(surname) < 10:
                fuzz=0
            elif len(surname) > 9 and len(surname) < 20:
                fuzz=1
            else:
                fuzz=3
            matches = find_near_matches(surname, text, max_l_dist=fuzz)
            if len(matches) > 0:
                match_strings = [text[m.start:m.end] for m in matches]
                for i in match_strings:
                    try:
                        all_matches[can].append(i)
                    except:
                        all_matches[can] = [i,]
        for k,v in all_matches.items():
            if len(v) == 1 and k == v[0]:  
                return ("found", [k,])
    result = list(all_matches.keys())
    if len(result) > 0:
        return ("found", result)
    else:
        return ("not found", result)
    

In [120]:
for i in reviews_parsed[0].person_names:
    #print(i, match_known_authors(i, known_authors))
    print(i, match_surname(i, known_authors))

Mr. Carr ('found', ['Lucien Carr', 'Clark E. Carr'])
Mr. Carr ('found', ['Lucien Carr', 'Clark E. Carr'])


In [121]:
all_author_matches = {}

# run on all reviews, return matches, novel, or no match
for review in reviews_parsed:
    
    results = {}
    names = list(set(review.person_names))
    for i in names:
        # function to fuzzy match known authors
        result, author_candidates = match_known_authors(i, known_authors)
        if len(author_candidates) == 0:
            result, author_surname_candidates = match_surname(i, known_authors)
            if len(author_surname_candidates) > 0:
                for z in author_surname_candidates:
                    if len(z.strip()) > 2:
                        try:
                            results[i].append(z)
                        except:
                            results[i] = [z,]
        else:
            for z in author_candidates:
                if len(z.strip()) > 2:
                    try:
                        results[i].append(z)
                    except:
                        results[i] = [z,]
    
    if len(results.keys()) == 0:        
        result = "not found"

    if len(review.person_names) == 0:        
        result = "not found"
    
    
    try:
        all_author_matches["type"].append(result) 
    except:
        all_author_matches["type"] = [result,]
        
    try:
        all_author_matches["results"].append(results)
    except:
        all_author_matches["results"] = [results,]
    try:
        all_author_matches["match_number"].append(len(results.keys()))
    except:
        all_author_matches["match_number"] = [len(results.keys()),]

In [122]:
# build viaf store, add columns for comparison
author_tuples = [(i.reviewed_author_name, i.reviewed_author_viaf_match) for i in aps_details_single if i.reviewed_author_name is not None and i.reviewed_author_name !='']
viaf_store = {}
count = 0
for f,g in author_tuples:
    i = f.split(";")
    uris = g.split(";")
    i = f.split(";")
    uris = g.split(";")
    tail = []
    if len(i) > len(uris):
        for m in range(len(i) - len(uris)):
            tail.append('not available')
        uris = uris+tail
    if len(uris) > len(i):
        uris = uris[:4]
        i = i[:4]
        
    for e, j in enumerate(uris):
        if j == '' or j.lower() == "not available":
            try: 
                is_in = viaf_store[i[e]]
            except:
                viaf_store[i[e]] = [count,]
                count +=1
        else:
            try:
                if j not in viaf_store[i[e]]:
                    viaf_store[i[e]].append(j)
            except:
                viaf_store[i[e]] = [j,]
viaf_store['Lucien Carr']

['http://viaf.org/viaf/71253540']

In [123]:
from collections import Counter
import random

top_candidates = []
for r in all_author_matches['results']:
    top = []
    for c in r.values():
        top_match = random.choice(c)
        top.append(top_match) 
    top.sort()
    top = list(set(top))
    top_candidates.append(";".join(top))   

In [124]:
import pandas as pd

df = pd.DataFrame(all_author_matches)
df['recorded_authors'] = authors
df['top_candidates'] = top_candidates
#df['recorded_viaf'] = recorded_viaf
#df['top_result_viaf'] = top_result_viaf
df = df.drop(['results'], axis=1)
len(df.loc[(df['match_number'] == 1) & (df['recorded_authors'] == df['top_candidates'])])/len(df.loc[(df['match_number'] == 1)])
#75.8% accurate when one author

0.7230769230769231

In [135]:
true_authors = []
for row in df.iterrows():
    true_authors.extend(row[1]['recorded_authors'].split(";"))
len(true_authors)

584

In [141]:
false_positives_count = []
missed_author_count = []
matched_correctly_count = []
perfect_match_count = []

for row in df.iterrows():
    candidate_list = [i.strip() for i in row[1]['top_candidates'].split(";") if i !='']
    target_list = [i.strip() for i in row[1]['recorded_authors'].split(";") if i !='']
    if len(candidate_list) > 0:
        # number of matches and number of false positives
        fp = 0
        matches = 0
        for name in candidate_list:
            if name not in target_list:
                fp += 1

            else:
                matches += 1
        false_positives_count.append(fp)
        matched_correctly_count.append(matches)

        # number of missed authors
        fn = 0
        for name in target_list:
            if name not in candidate_list:
                fn +=1
        missed_author_count.append(fn)

        # perfect match?
        if fn == 0 and fp == 0:
            perfect = True
        else:
            perfect = False

        perfect_match_count.append(perfect)
sum(false_positives_count), sum(missed_author_count), sum(matched_correctly_count)
#(149, 35, 149)
# including single match (188, 84, 309)



(193, 90, 297)

In [140]:
len([i for i in perfect_match_count if i == True])/len(perfect_match_count)
#0.3157894736842105
#0.3157894736842105

0.5478260869565217