In [5]:
# Read the reference word list, one entry per line. The `with` block closes
# the file on exit, so no explicit close() is needed.
with open("../data/english_words.txt") as f:
    english = f.read()

# Kept as a dict (word -> True) so existing lookups keep working.
# Empty entries are skipped: split("\n") yields "" for a trailing newline or a
# blank line, which would otherwise register the empty string as a valid word.
ENGLISH = {word: True for word in english.split("\n") if word}

def is_english(word):
    """Return True if ``word`` is an entry of the module-level ``ENGLISH`` table.

    The match is exact: no case folding or whitespace stripping happens here.
    ``word`` must be hashable (in this notebook it is always a str).

    A plain membership test is used instead of subscripting inside a bare
    ``except``: the bare handler also swallowed unrelated failures (for
    example ``ENGLISH`` not being defined) and silently reported every word
    as non-English.
    """
    return word in ENGLISH
    
# Correction rules: tab-separated lines, first field -> second field.
correction_rules = {}
with open('../data/CorrectionRules.txt') as f:
    # The `with` block closes the file; iterate it directly instead of
    # materialising the lines and calling close() by hand.
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines (e.g. at end of file) instead of failing
            # with IndexError on fields[1].
            continue
        fields = line.split("\t")
        correction_rules[fields[0]] = fields[1]

# Hyphenation rules: tab-separated "<word>\t<correction>" lines. Besides the
# literal entries, de-hyphenated spellings of some words are registered too.
hyphen_rules = {}

with open('../data/HyphenRules.txt') as f:
    # Doing this so that unhyphenated forms get read before hyphenated ones.
    filelines = f.readlines()
    filelines.reverse()

for line in filelines:
    line = line.rstrip()
    if not line:
        # Tolerate blank lines instead of failing with IndexError on fields[1].
        continue
    fields = line.split("\t")
    word = fields[0].rstrip()
    corr = fields[1].rstrip()
    hyphen_rules[word] = corr

    stripped_word = word.replace("-", "")
    if " " in corr:
        # That's so that we split "tigermoth" as well as "tiger-moth" into "tiger moth."
        hyphen_rules[stripped_word] = corr

    if "-" in word:
        stripped_corr = corr.replace(" ", "").replace("-", "")
        # Only fills in the de-hyphenated key when it is not already present
        # and the correction differs from it by more than hyphens/spaces.
        if stripped_word != stripped_corr and stripped_word not in hyphen_rules:
            hyphen_rules[stripped_word] = corr

        ## The purpose of this is a bit obscure to me. It may be deletable.

# Fusing rules: tab-separated lines whose first field is a space-separated
# word sequence and whose second field is the replacement.
fuse_rules = {}
with open('../data/FusingRules.txt') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines instead of failing with IndexError below.
            continue
        parts = line.split("\t")
        # Keys are tuples of words (one element per space-separated word),
        # not plain strings.
        word_sequence = tuple(parts[0].rstrip().split(' '))
        fuse_rules[word_sequence] = parts[1].rstrip()

# Syncope rules: tab-separated lines, first field -> second field.
syncope_rules = {}
with open('../data/SyncopeRules.txt') as f:
    # The `with` block closes the file; no explicit close() is required.
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines instead of failing with IndexError on fields[1].
            continue
        fields = line.split("\t")
        syncope_rules[fields[0]] = fields[1]

# Variant-spelling rules: tab-separated lines, first field -> second field.
variant_rules = {}
with open('../data/VariantSpellings.txt') as f:
    # The `with` block closes the file; no explicit close() is required.
    for line in f:
        line = line.rstrip()
        if not line:
            # Tolerate blank lines instead of failing with IndexError on fields[1].
            continue
        fields = line.split("\t")
        variant_rules[fields[0]] = fields[1]

In [12]:
def substitute_by_dict(tokens, rulestore):
    """A generalized function to apply substitution rules to a list of tokens.

    Args:
        tokens: iterable of str tokens.
        rulestore: dict mapping a lowercase token to its replacement.

    Returns:
        A new list with one entry per input token. Every token is lowercased
        first; if the lowercased form is a key of ``rulestore`` its value is
        emitted, otherwise the lowercased token itself. Note that tokens are
        therefore lowercased even when no rule applies.
    """
    output = []
    for token in tokens:
        lowered = token.lower()
        # dict.get replaces the former bare `except`, which hid every kind of
        # error rather than just "no rule for this token".
        output.append(rulestore.get(lowered, lowered))
    return output

def hyphen_split(tokens):
    """Resolve hyphenated tokens using the English word check.

    Tokens without a hyphen pass through unchanged. For a hyphenated token:
      * if the parts joined together form an English word, emit the joined form;
      * otherwise, if every part is itself an English word, emit the parts as
        separate tokens;
      * otherwise emit the joined form.
    """
    resolved = []
    for token in tokens:
        if '-' not in token:
            resolved.append(token)
            continue

        parts = token.split('-')
        fused = ''.join(parts)
        # Split only when the fused spelling is unknown but all pieces are words.
        if not is_english(fused) and all(is_english(part) for part in parts):
            resolved.extend(parts)
        else:
            resolved.append(fused)
    return resolved



In [14]:
# move to tests folder
# Smoke check: noisy tokens go through every rule table in order, then through
# hyphen handling, and are compared with the expected clean tokens.
test = [
    'tbis', 'is', 'fome', 'dirty', 'ocr', 'tbat', 'i', 'want', 'to', 'test',
    'abso-lutely', 'noth-ing', 'I', 'can', 'do', 'topsy-turvy', 'bitter-sweet',
]
expected = [
    'this', 'is', 'some', 'dirty', 'ocr', 'that', 'i', 'want', 'to', 'test',
    'absolutely', 'nothing', 'i', 'can', 'do', 'topsyturvy', 'bittersweet',
]

rule_tables = (correction_rules, hyphen_rules, fuse_rules, syncope_rules, variant_rules)
for rule_table in rule_tables:
    test = substitute_by_dict(test, rule_table)
test = hyphen_split(test)

# Left as the last expression so the notebook displays the comparison result.
test == expected

True

In [15]:
import sys
sys.path.append('../')

from application.name_obj_classes import PubName, PersonName, remove_punct

from application.review_obj_class import ReviewObj

from application.text_preprocessing import preprocess_text

import os
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
from nltk.metrics import edit_distance
%pprint

Pretty printing has been turned OFF


In [16]:
from database import *
import database.models as models

# load full text from db
# Keep only single-focus reviews whose status is one of the accepted values.
accepted_statuses = ('needs_crosscheck', 'done')
aps_details_single = (
    models.Review().query
    .filter(models.Review.status.in_(accepted_statuses))
    .filter(models.Review.review_type == 'single_focus')
    .all()
)

len(aps_details_single)

561

In [17]:
# Reviewed-book titles, in the same order as aps_details_single.
titles = [review.reviewed_book_title for review in aps_details_single]
titles[:5]

['Missouri', 'The Confessions of Lord Byron', 'Carnival', 'Photographic Illustrations of Cutaneous Syphilis', 'Life and Letters of Dante Gabriel Rossetti']

In [18]:
# Build one ReviewObj per DB row from its record id and full text.
reviews_parsed = [
    ReviewObj(row.record_id, row.full_text)
    for row in aps_details_single
]
#reviews_parsed[0].cleaned_text
reviews_parsed[0].cleaned_toks[:10]

['Missouri', ':', 'A', 'Bone', 'of', 'Contention', "'", 'WE', 'aware', 'that']

In [22]:
import string
from nltk.util import ngrams
from collections import Counter
from nltk.corpus import stopwords

def remove_function_tail(sequence):
    """Strip trailing English stopwords from a token list.

    Args:
        sequence: list of str tokens. It is modified in place.

    Returns:
        The same list object, with every trailing token whose lowercase form
        is an English stopword removed. An empty list, or a list made up only
        of stopwords, comes back empty (previously this raised IndexError on
        ``sequence[-1]``).
    """
    # Build the stopword set once per call rather than once per removed token.
    function_words = set(stopwords.words('english'))
    # Iterative form of the original recursion; `sequence and ...` guards the
    # empty case.
    while sequence and sequence[-1].lower() in function_words:
        sequence.pop()
    return sequence

In [23]:
# Build the stopword set once. The original evaluated stopwords.words('english')
# for every token and every candidate word, producing a new list each time and
# scanning it linearly.
english_stopwords = set(stopwords.words('english'))

all_candidates_tidy = []
for one_review in reviews_parsed:
    # A candidate is a run that starts on a title-cased token and continues
    # through title-cased tokens, stopwords and punctuation. Punctuation is
    # not stored in the run, and any other token ends it.
    title_candidates = [[]]
    for token in one_review.cleaned_toks:
        # NOTE: string.punctuation is a str, so this is a substring test
        # (kept as in the original), not a single-character membership test.
        is_punct = token in string.punctuation
        current = title_candidates[-1]
        if token.istitle() or token in english_stopwords or is_punct:
            if current:
                if not is_punct:
                    current.append(token)
            elif token.istitle():
                # A run may only be opened by a title-cased token.
                current.append(token)
        elif current:
            title_candidates.append([])

    candidates_tidy = []
    for sequence in title_candidates:
        # rule out if all function words (this also drops an empty trailing run)
        if all(word.lower() in english_stopwords for word in sequence):
            continue
        # remove function word tails
        candidates_tidy.append(remove_function_tail(sequence))
    all_candidates_tidy.append(candidates_tidy)

In [35]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize

In [40]:
# Rule tables applied, in this order, to every token list below.
# NOTE(review): fuse_rules is keyed by tuples of words, while
# substitute_by_dict looks up single string tokens, so the fuse rules can
# never match in this pipeline.
RULESETS = [correction_rules, hyphen_rules, fuse_rules, syncope_rules, variant_rules]


def _apply_rulesets(tokens):
    """Run a token list through every rule table in RULESETS, in order."""
    for ruleset in RULESETS:
        tokens = substitute_by_dict(tokens, ruleset)
    return tokens


# The normalised titles do not depend on the review being scored, so compute
# them once instead of re-tokenising every title for every review.
titles_normed = [" ".join(_apply_rulesets(word_tokenize(title))) for title in titles]

all_scores = []
# loop all_candidates_tidy
for review_idx, candidates in enumerate(all_candidates_tidy):
    # true label is titles[review_idx]
    # get headline text
    headline_tokens = word_tokenize(aps_details_single[review_idx].record_title)
    block_one = " ".join(_apply_rulesets(headline_tokens))

    # Flatten all normalised candidate sequences of this review into one token list.
    text_blocks = []
    for candidate in candidates:
        text_blocks.extend(_apply_rulesets(candidate))

    text_merged = [block_one + " " + " ".join(text_blocks)]

    # the review's own text is comparison_set[-1]
    comparison_set = titles_normed + text_merged

    # compare
    # The vectorizer is refit per review on purpose: its vocabulary includes
    # the words of this review's text, which affects the query vector's norm.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(comparison_set)
    vectors = X.toarray()
    # loop all, get similarity, last one is always 1.0
    # Each score is kept as the 1x1 array returned by cosine_similarity,
    # because downstream code reads it as score[0][0].
    scores = []
    for v in vectors:
        score = cosine_similarity([v], [vectors[-1]])
        scores.append(score)
    all_scores.append(scores)



In [41]:
# how often is the right answer the top answer? how often is the correct match in the top 5?
output = []
for e, score_grid in enumerate(all_scores):
    # Unwrap each 1x1 similarity array; the final entry is the review text
    # compared with itself, so it is dropped.
    title_scores = [cell[0][0] for cell in score_grid][:-1]
    ranked = pd.DataFrame({
        'score': title_scores,
        'title': titles,
        # The correct title for review number e sits at position e in titles.
        'match': ['yes' if e == idx else 'no' for idx in range(len(titles))],
    })
    # reset_index(drop=False) keeps the title's original position in an
    # "index" column; the new index is the rank by descending score.
    ranked = ranked.sort_values(by="score", ascending=False)
    output.append(ranked.reset_index(drop=False))

In [42]:
def _correct_within(ranked, k):
    """Return True when a row flagged 'yes' is among the first k rows of ranked."""
    return bool((ranked.iloc[:k]['match'] == 'yes').any())


# One boolean per review: was the correct title ranked first / in the top 5,
# top 10 or top 25?
top_match = []
top_five = []
top_ten = []
top_25 = []
for ranked in output:
    top_match.append(ranked.iloc[0]['match'] == 'yes')
    top_five.append(_correct_within(ranked, 5))
    top_ten.append(_correct_within(ranked, 10))
    top_25.append(_correct_within(ranked, 25))

In [48]:
# Share of reviews with a hit at each cutoff. Only the last expression of the
# cell is displayed by the notebook.
# 50% correct without headline; 55.25% with; 55.43 with correction rules
sum(1 for hit in top_match if hit) / len(top_match)
# 62%; 65.59%; 65.59
sum(1 for hit in top_five if hit) / len(top_five)
# 65%; 68.98; 69.16
sum(1 for hit in top_ten if hit) / len(top_ten)
#69%; 73.44; 73.08
sum(1 for hit in top_25 if hit) / len(top_25)

0.7308377896613191

In [44]:
# For every review keep only the row of its correct title; that row's index
# is the rank the correct title reached.
results_audit = [ranked.loc[ranked['match'] == 'yes'] for ranked in output]
# results_audit[10]
# this title was 214th out of len(titles)
df = (
    pd.concat(results_audit)
    # An "index" column already exists, so the rank index comes out as "level_0".
    .reset_index(drop=False)
    .rename(columns={"index": "original_index", "level_0": "rank"})
)
# Reviews whose correct title was ranked first.
df.loc[df['rank'] == 0]

Unnamed: 0,rank,original_index,score,title,match
0,0,0,0.674200,Missouri,yes
1,0,1,0.636107,The Confessions of Lord Byron,yes
2,0,2,0.373718,Carnival,yes
4,0,4,0.678571,Life and Letters of Dante Gabriel Rossetti,yes
5,0,5,0.305888,Master of Ballantrae,yes
...,...,...,...,...,...
553,0,553,0.345285,Une Nuit au Luxembourg,yes
554,0,554,0.187500,Chapters on Greek Metric,yes
557,0,557,0.349927,"Moses Brown, Captain, U.S.N.",yes
559,0,559,0.583383,Creative Chemistry,yes


## This kind of normalization doesn't seem to improve accuracy in a meaningful way