# Initial Setup

- import various helpers, load data, select reviews by status and category

In [1]:
import sys
sys.path.append('../')

from application.name_obj_classes import PubName, PersonName, remove_punct

from application.review_obj_class import ReviewObj

from application.text_preprocessing import preprocess_text

import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [2]:
from database import *
import database.models as models

# load full text from db
aps_details_single = models.Review().query.filter(models.Review.status.in_(('needs_crosscheck', 'done'))).filter(models.Review.review_type == 'single_focus').all()

len(aps_details_single)

561

In [168]:
reviews_parsed = [ReviewObj(i.record_id, i.full_text) for i in aps_details_single if i.reviewed_book_publisher !='']
publishers = [i.reviewed_book_publisher for i in aps_details_single if i.reviewed_book_publisher !='']
len(publishers)
#reviews_parsed[0].cleaned_text
#reviews_parsed[0].cleaned_toks[:10]

516

In [170]:
len(reviews_parsed) == len(publishers)

True

## Publisher Names

The overall approach here is to fuzzy match uppercase N-Grams with known publisher names, and/or key terms like company and inc. Very much like the title function but it drops anything that doesn't meet "publisher" criteria.

In [36]:
import string
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords

def remove_function_tail(sequence):
    if sequence[-1].lower() in stopwords.words('english'):
        sequence.pop()
        return remove_function_tail(sequence)
    else:
        return sequence

In [171]:
# get pub names from db
known_publishers = list(set(publishers))

In [59]:
from collections import Counter
from nltk import word_tokenize

pub_tokens = [word_tokenize(i) for i in known_publishers]
flat_list = [item for sublist in pub_tokens for item in sublist]
Counter(flat_list).most_common(100)

[('.', 102), ('&', 92), ('Co', 81), ('Company', 44), (',', 35), ('The', 25), ('and', 18), ('Publishing', 16), ('C.', 13), ('H.', 13), ("'s", 11), ('A.', 9), ('Son', 8), ('B.', 8), ('Sons', 8), ('Brothers', 7), ('Macmillan', 7), ('Lippincott', 7), ('Press', 7), ('Houghton', 7), ('E.', 6), ('D.', 6), ('Appleton', 6), ('Harper', 6), ('Scribner', 6), ('University', 6), ('Mifflin', 6), ('F.', 5), ('W.', 5), ('P.', 5), ('Dutton', 5), ('J.', 5), ('G.', 4), ('Revell', 4), ('Armstrong', 4), ('J', 4), ('Putnam', 4), ('James', 4), ('of', 4), ('R.', 4), ('Baker', 3), ('Henry', 3), ('Stokes', 3), ('Duffield', 3), ('Marshall', 3), ('McClurg', 3), ('Little', 3), ('Brown', 3), ('Lea', 3), ('Saunders', 3), ('Dodd', 3), ('Mead', 3), ('Wells', 3), ('Pub', 3), ('Funk', 3), ('Wagnalls', 3), ('McClure', 3), ('Cassell', 3), ('Society', 3), ('Bros', 3), ('Boni', 3), ('Liveright', 3), ('Charles', 3), ('John', 3), ('Sherman', 2), ('Frederick', 2), ('Knopf', 2), ('Holt', 2), ('Jones', 2), ('B', 2), ('Treat', 2),

In [60]:
pub_ends = ['company','co','incorporated','inc','firm','press','group','publishers','publishing', \
            'publications','pub','books','ltd','limited','society','house','associates', 'book', 'university']
pub_ends = [x.capitalize() for x in pub_ends]
#pub_ends_list = '|'.join([x.capitalize()+'\.?(?!\w)' for x in pub_ends])

In [222]:
test_toks = reviews_parsed[507].cleaned_toks
test_toks[:10]

['III', 'THE', 'WOMAN', 'WITH', 'EMPTY', 'HANDS', '!', '*', 'This', 'record']

In [223]:
test_text = reviews_parsed[507].cleaned_text
test_text[:10]

'III THE WO'

In [274]:
# look for fuzzy match in free text
from fuzzysearch import find_near_matches

def match_known_publishers(text, k_p):
    all_matches = {}
    for p in k_p:
        #base fuzziness on length of pubname
        if len(p) < 10:
            fuzz=0
        elif len(p) > 9 and len(p) < 20:
            fuzz=1
        else:
            fuzz=3
        matches = find_near_matches(p, text, max_l_dist=fuzz)
        if len(matches) > 0:
            match_strings = [text[m.start:m.end] for m in matches]
            for i in match_strings:
                try:
                    all_matches[p].append(c)
                except:
                    all_matches[p] = [c,]
    for k,v in all_matches.items():
        if len(v) == 1 and k == v[0]:  
            return ("found", [k,])
        else:
            result = list(all_matches.keys())
            if len(result) > 0:
                return ("found", result)
            else:
                return ("not found", result)
    

def match_pub_end_sequences(tokens, pub_ends, k_p):
    title_candidates = [list(),]
    for token in tokens:
        if token.istitle() or token in ['and', '&'] or token in string.punctuation:
            if len(title_candidates[-1]) > 0:
                if token not in string.punctuation:
                    title_candidates[-1].append(token)
            else:
                if token.istitle():
                    title_candidates[-1].append(token)
        else:
            if len(title_candidates[-1]) > 0:
                title_candidates.append(list())
    
    matches = []
    for sequence in title_candidates:
        for token in sequence:
            normed_token = token.lower().translate(str.maketrans('', '', string.punctuation))
        
            if normed_token in [i.lower() for i in pub_ends]:
                matches.append(sequence)
                break
    result = [] 
    for sequence in matches:
        r, this_result = match_known_publishers(" ".join(sequence), k_p)
        
        if len(this_result) > 0:
            result.append(this_result)
    if len([" ".join(i) for i in result]) > 0:
        return ("found", [" ".join(i) for i in result])
    else:
        return ("novel", [" ".join(i) for i in matches])

In [275]:
# tests
result, publisher_candidates = match_known_publishers(test_text, known_publishers)
if len(publisher_candidates) == 0:
    result, publisher_candidates = match_pub_end_sequences(test_toks, pub_ends, known_publishers)
    
#[" ".join(i[1]) for i in match_pub_end_sequences(test_toks, pub_ends, known_publishers)]
result, publisher_candidates 


('found', ['Dodd, Mead, and Company', 'Dodd, Mead and Company'])

In [314]:
all_publisher_matches = {}

# run on all reviews, return matches, novel, or no match
for review in reviews_parsed:
    
    result, publisher_candidates = match_known_publishers(review.cleaned_text, known_publishers)
    
    if len(publisher_candidates) == 0:        
        result, publisher_candidates = match_pub_end_sequences(review.cleaned_toks, pub_ends, known_publishers)
    
    if len(publisher_candidates) > 0:
        top_publisher_candidate = publisher_candidates[0]
    else:
        result = "not found"
        top_publisher_candidate = ''
    
    try:
        all_publisher_matches["type"].append(result) 
    except:
        all_publisher_matches["type"] = [result,]
        
    try:
        all_publisher_matches["results"].append(top_publisher_candidate)
    except:
        all_publisher_matches["results"] = [top_publisher_candidate,]
    try:
        all_publisher_matches["match_number"].append(len(publisher_candidates))
    except:
        all_publisher_matches["match_number"] = [len(publisher_candidates),]

In [315]:
# build viaf store, add columns for comparison
pub_tuples = [(i.reviewed_book_publisher, i.reviewed_book_publisher_viaf_match) for i in aps_details_single if i.reviewed_book_publisher !='']
viaf_store = {}
count = 0
for i,j in pub_tuples:
    if j == '' or not j:
        try: 
            is_in = viaf_store[i]
        except:
            viaf_store[i] = [count,]
            count +=1
    else:
        try:
            if j not in viaf_store[i]:
                viaf_store[i].append(j)
        except:
            viaf_store[i] = [j,]
viaf_store['Houghton, Mifflin & Company']

['http://viaf.org/viaf/159556432']

In [306]:
import pandas as pd
df = pd.DataFrame(all_publisher_matches)
df['recorded_publisher'] = publishers

top_result_viaf = []
for i in list(df['results']):
    try: 
        top_result_viaf.append(viaf_store[i][0])
    except: 
        top_result_viaf.append(None)
        
recorded_viaf = []
for i in list(df['recorded_publisher']):
    try: 
        recorded_viaf.append(viaf_store[i][0])
    except: 
        recorded_viaf.append(None)
        
df['recorded_viaf'] = recorded_viaf
df['top_result_viaf'] = top_result_viaf
df
#df.loc[(df['type'] == 'match') & (df['match_number'] == 1)]
len(df.loc[df['recorded_viaf'] == df['top_result_viaf']])/len(df.loc[df['type'] == 'found'])

0.9733009708737864

In [308]:
len(df.loc[df['recorded_viaf'] != df['top_result_viaf']])/len(df)

0.22286821705426357

In [309]:
skipped = len(df.loc[(df['type'] != 'found')])
correct_or_skipped = len(df.loc[df['recorded_viaf'] == df['top_result_viaf']])+len(df.loc[(df['type'] != 'found')])
total = len(df)
1-correct_or_skipped/total, skipped/total

# 20.16% of reviews, no publisher found or novel publisher suggested
# 77.71% of reviews matched and are correct
# 2.13% are incorrect ... match could be in the ranked list for six of them

(0.02131782945736438, 0.20155038759689922)

In [312]:
#df.loc[(df['recorded_viaf'] != df['top_result_viaf']) & df['match_number'] > 0]
#df.loc[(df['type'] == 'found') & (df['recorded_viaf'] != df['top_result_viaf'])]
len(df.loc[df['type'] == 'not found'])

43

In [313]:
len(df.loc[df['type'] == 'novel'])

61

In [None]:
# assess accuracy of review.pubnames

all_publisher_matches = {}

# run on all reviews, return matches, novel, or no match
for review in reviews_parsed:
    
    result, publisher_candidates = match_known_publishers(review.cleaned_text, known_publishers)
    
    if len(publisher_candidates) == 0:        
        result, publisher_candidates = match_pub_end_sequences(review.cleaned_toks, pub_ends, known_publishers)
    
    if len(publisher_candidates) > 0:
        top_publisher_candidate = publisher_candidates[0]
    else:
        result = "not found"
        top_publisher_candidate = ''
    
    try:
        all_publisher_matches["type"].append(result) 
    except:
        all_publisher_matches["type"] = [result,]
        
    try:
        all_publisher_matches["results"].append(top_publisher_candidate)
    except:
        all_publisher_matches["results"] = [top_publisher_candidate,]
    try:
        all_publisher_matches["match_number"].append(len(publisher_candidates))
    except:
        all_publisher_matches["match_number"] = [len(publisher_candidates),]