# Comparison Pipeline

Input:
    For each target, a comparison candidate from Annoy
    
Output:
    For each target-candidate pair, export a series of content-based stats.

In [1]:
%load_ext autoreload
%autoreload 2
from compare_tools.hathimeta import HathiMeta, get_json_meta
from compare_tools.configuration import config, init_htid_args
from compare_tools.comparison import Comparison, HTIDComparison
from compare_tools.utils import HTID
# Shared keyword arguments, built from the project config, that are unpacked
# into every HTID(...) constructor call in this notebook.
htid_args = init_htid_args(config)

In [192]:
# Example pair used by the stat cells below: two volumes wrapped as HTID
# objects and combined into a single HTIDComparison.
left = HTID('hvd.ah4wfm', **htid_args) # College Sermons + A Set of Parish Sermons
right = HTID('hvd.ah4wfn', **htid_args) # just College Sermons
comp = HTIDComparison(left, right)

# Stats

In [3]:
# Page counts of the left and right volumes plus their difference stats.
comp.stat_pagecounts()

{'leftpagecount': 650,
 'rightpagecount': 372,
 'pageDiff': 278,
 'pagePropDiff': 0.4276923076923077}

In [4]:
# Size, similarity and threshold-proportion stats for the pair
# ("simmat" is presumably "similarity matrix" - confirm in compare_tools).
comp.stat_simmat()

{'LeftSize': 35,
 'RightSize': 21,
 'minSize': 21,
 'LeftMeanMinSim': 0.02976813654306945,
 'RightMeanMinSim': 0.002212450008465155,
 'MeanSim': 0.046355374307591714,
 'LeftTruncSim': 0.0022332966767536966,
 'RightTruncSim': 0.002212450008465155,
 'LeftPropThresh001': 0.6,
 'LeftPropThresh003': 0.9428571428571428,
 'LeftPropThresh005': 0.9714285714285714,
 'LeftPropThresh008': 0.9714285714285714,
 'RightPropThresh001': 1.0,
 'RightPropThresh003': 1.0,
 'RightPropThresh005': 1.0,
 'RightPropThresh008': 1.0}

In [5]:
# "SW" length stats for the pair (SW0020Len / SW0005Len keys in the result).
# NOTE(review): SW presumably stands for Smith-Waterman alignment - confirm.
comp.stat_sw()

{'SW0020Len': 22, 'SW0005Len': 22}

In [79]:
# Get all at once: a single dict with the page-count, SW and simmat stats,
# plus the 'left' and 'right' volume ids.
# NOTE(review): the saved output below belongs to a different pair (see its
# 'left'/'right' keys) than the example pair defined near the top of the
# notebook, so `comp` had been rebound before this cell was last run.
comp.all_stats()

{'leftpagecount': 129,
 'rightpagecount': 454,
 'pageDiff': -325,
 'pagePropDiff': -2.5193798449612403,
 'SW0020Len': 0,
 'SW0005Len': 0,
 'LeftSize': 22,
 'RightSize': 43,
 'minSize': 22,
 'LeftMeanMinSim': 0.1665143436005689,
 'RightMeanMinSim': 0.2025905484694421,
 'MeanSim': 0.32586165542979595,
 'RightTruncSim': 0.1721637839941062,
 'LeftTruncSim': 0.1665143436005689,
 'LeftPropThresh001': 0.0,
 'LeftPropThresh003': 0.0,
 'LeftPropThresh005': 0.0,
 'LeftPropThresh008': 0.0,
 'RightPropThresh001': 0.0,
 'RightPropThresh003': 0.0,
 'RightPropThresh005': 0.0,
 'RightPropThresh008': 0.0,
 'left': 'aeu.ark:/13960/t1zc8d06f',
 'right': 'hvd.32044106449234'}

Other potential stats:
- Jaccard similarity
- How many unique words are there?
- sims by page, by SRP
- ...

## Example crunching

In [81]:
import glob
import time
import pandas as pd
from htrc_features import utils
# Paths of the candidate-match files; each one is read with pd.read_parquet
# in the crunching loop below.
annmatches = glob.glob('/projects/saddl-main/ann-ef/matches/*')

In [140]:
# To raise on warnings while debugging: inside this context manager the
# 'error' filter turns every warning into an exception; the previous warning
# filters are restored when the `with` block exits.
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('error')
    #do_something  (placeholder: put the code being debugged here)

In [None]:
# Compute comparison stats for every (target, match) candidate pair found in
# the match files and write them out as parquet batches.
#
# Each batch file is named after the target that was being processed when the
# batch was flushed ('4<clean htid>andmore.parquet').
stats_collector = []
i = 0  # number of targets processed so far
start = time.time()

for matchfile in annmatches:
    df = pd.read_parquet(matchfile)
    # Keep rows where either proportion column exceeds 0.5, the count is
    # above 1, and the match is not the target itself.
    df = df[(df.prop_target > 0.5) | (df.prop_match > 0.5)]
    df = df[df['count'] > 1]
    df = df[df.match != df.target]

    for target, matches in df.groupby('target'):
        if i % 1000 == 0:
            # Progress: targets done, rows buffered, current target, elapsed s.
            print(i, len(stats_collector), target, time.time()-start)
        left = HTID(target, **htid_args)

        for match in matches['match']:
            right = HTID(match, **htid_args)
            comp = HTIDComparison(left, right)
            try:
                stats_collector.append(comp.all_stats())
            except Exception as err:
                # Best effort: report the failing pair and keep going.
                # KeyboardInterrupt/SystemExit are not Exception subclasses,
                # so they still propagate and stop the run.
                print("Issue with ", left.htid, right.htid, repr(err))

        # Flush once the buffer grows past 10,000 rows.
        if len(stats_collector) > 10000:
            pd.DataFrame(stats_collector).to_parquet('/data/saddl/tmp-comp-output/4' + utils.clean_htid(target) + 'andmore.parquet', compression='snappy')
            stats_collector = []

        i += 1

# Final flush of whatever is left over. The guard matters: without it, a batch
# flushed on the very last target would be overwritten by an empty frame under
# the same filename, and an empty run would fail on the undefined `target`.
if stats_collector:
    pd.DataFrame(stats_collector).to_parquet('/data/saddl/tmp-comp-output/4' + utils.clean_htid(target) + 'andmore.parquet', compression='snappy')
end = time.time()

0 0 aeu.ark:/13960/t0000vt1c 0.05006051063537598


Get a rough ground truth: compare the catalog metadata of each pair and encode the agreement as a short code.

In [29]:
def judge_meta(l,r):
    """Summarise how two volumes' metadata records agree as a short code.

    `l` and `r` are mappings with the keys 'title', 'author',
    'rights_date_used', 'oclc_num', 'page_count' and 'description'.
    The returned string concatenates six letter+digit parts, in this order:

    - T: title. 1 = identical, 2 = same first 20 characters, 0 = otherwise.
    - A, P, O: author, rights_date_used, oclc_num. 1 = equal, 0 = not equal.
    - C: page count. 1 = equal, 2 = differ by less than 10,
      3 = differ by less than 20, 0 = otherwise.
    - D: description. 2 = both empty/falsy, 3 = exactly one empty/falsy,
      1 = both present and equal, 0 = both present and different.
    """
    parts = []

    # Title: exact match first, then a 20-character prefix match.
    if l['title'] == r['title']:
        parts.append("T1")
    elif l['title'][:20] == r['title'][:20]:
        parts.append("T2")
    else:
        parts.append("T0")

    # Plain equality fields, each contributing <letter>1 or <letter>0.
    equality_fields = (('A', 'author'), ('P', 'rights_date_used'), ('O', 'oclc_num'))
    for letter, key in equality_fields:
        parts.append(letter + ("1" if l[key] == r[key] else "0"))

    # Page count: the difference is only computed when the counts are unequal.
    if l['page_count'] == r['page_count']:
        parts.append("C1")
    else:
        page_gap = abs(l['page_count'] - r['page_count'])
        if page_gap < 10:
            parts.append("C2")
        elif page_gap < 20:
            parts.append("C3")
        else:
            parts.append("C0")

    # Description: distinguish missing-on-both, missing-on-one and comparable.
    left_missing = not l['description']
    right_missing = not r['description']
    if left_missing and right_missing:
        parts.append("D2")
    elif left_missing or right_missing:
        parts.append("D3")
    elif l['description'] == r['description']:
        parts.append("D1")
    else:
        parts.append("D0")

    return "".join(parts)

# Example: judge the metadata of the current `left` / `right` volumes.
judge_meta(left.meta().to_dict(), right.meta().to_dict())

'T0A1P0O0C0D2'

In [None]:
# For every comparison output file, look up the metadata of each left/right
# pair, score it with judge_meta, and write (left, right, meta_code) to the
# matching file under 'tmp-ground-truth'.
start = time.time()
fields = ['title', 'author', 'rights_date_used', 'oclc_num', 'page_count', 'description']
meta = htid_args['hathimeta']
paths= glob.glob('/data/saddl/tmp-comp-output/*')

i = 0  # pairs processed across all files, for progress reporting

for path in paths:
    collector = []
    df = pd.read_parquet(path, columns=['left', 'right'])
    for left_id, right_id in zip(df['left'], df['right']):
        lmeta = meta.get_volume(left_id, fields)
        rmeta = meta.get_volume(right_id, fields)
        try:
            meta_code = judge_meta(lmeta, rmeta)
        except Exception as err:
            # Record the pair as unjudged. Previously the code left over from
            # the preceding row was silently stored for the failing pair.
            meta_code = None
            print('err', left_id, right_id, repr(err))
        collector.append((left_id, right_id, meta_code))
        i += 1
        if i % 1000 == 0:
            print(i, time.time()-start)
    # Mirror the input filename into the ground-truth directory.
    outpath = path.replace('tmp-comp-output', 'tmp-ground-truth')
    pd.DataFrame(collector, columns=['left', 'right', 'meta_code']).to_parquet(outpath)
end = time.time()

1000 63.76490330696106
2000 127.59842824935913
3000 191.3848659992218
4000 254.9635157585144
5000 318.5099902153015
6000 382.25969433784485
7000 446.05241680145264
8000 509.6741111278534
9000 573.5247383117676
10000 636.906721830368
11000 700.0996561050415
12000 763.8884723186493
13000 827.3452496528625
14000 890.7076849937439
15000 954.1151504516602


In [75]:
# Total number of rows across all comparison output files.
i = 0
for path in paths:
    frame = pd.read_parquet(path)
    i = i + len(frame)
i

700577