# Comparison Pipeline

Input:
    For each target volume, one or more comparison candidates from Annoy
    
Output:
    For each target-candidate pair, export a series of content-based stats.

In [1]:
%load_ext autoreload
%autoreload 2
from compare_tools.hathimeta import HathiMeta, get_json_meta
from compare_tools.configuration import config, init_htid_args
from compare_tools.comparison import Comparison, HTIDComparison
from compare_tools.utils import HTID
# Keyword arguments derived from the project config; they are splatted into
# every HTID(...) constructor call in this notebook.
htid_args = init_htid_args(config)

In [2]:
# Worked example pair. Per the inline notes, the left volume holds two works
# while the right volume holds only the first of them.
left = HTID('hvd.ah4wfm', **htid_args) # College Sermons + A Set of Parish Sermons
right = HTID('uc2.ark:/13960/t4fn13p7f', **htid_args) # just College Sermons
# `comp` is the object the stat_* cells below are called on.
comp = HTIDComparison(left, right)

# Stats

In [3]:
# Page-count stats for the pair: a count per side, their difference, and the
# difference expressed as a proportion (see the keys in the output).
comp.stat_pagecounts()

{'leftpagecount': 650,
 'rightpagecount': 372,
 'pageDiff': 278,
 'pagePropDiff': 0.4276923076923077}

In [4]:
# Similarity quantiles for the 'srp' representation; the output keys run from
# quantile 0.0 to 0.9 in steps of 0.1 under separate L and R prefixes.
# NOTE(review): L/R presumably mean left/right volume — confirm in compare_tools.
comp.stat_quantiles('srp')

{'srpLSimQuantile0.0': 0.002980890604722397,
 'srpLSimQuantile0.1': 0.004821162643012888,
 'srpLSimQuantile0.2': 0.007044619193718238,
 'srpLSimQuantile0.3': 0.012501821534029059,
 'srpLSimQuantile0.4': 0.014625682792797034,
 'srpLSimQuantile0.5': 0.017877532621540015,
 'srpLSimQuantile0.6': 0.22987318474478613,
 'srpLSimQuantile0.7': 0.3727059124935101,
 'srpLSimQuantile0.8': 0.3874301896602962,
 'srpLSimQuantile0.9': 0.39520333368089494,
 'srpRSimQuantile0.0': 0.002980890604722397,
 'srpRSimQuantile0.1': 0.0038080200756848703,
 'srpRSimQuantile0.2': 0.005128242407271766,
 'srpRSimQuantile0.3': 0.006125291821976387,
 'srpRSimQuantile0.4': 0.008291871379677018,
 'srpRSimQuantile0.5': 0.011390809795881585,
 'srpRSimQuantile0.6': 0.013640361102202615,
 'srpRSimQuantile0.7': 0.014528551773676646,
 'srpRSimQuantile0.8': 0.015742281650323497,
 'srpRSimQuantile0.9': 0.01923474446064178}

In [5]:
# Similarity-matrix summary stats for 'srp'. Each requested threshold shows up
# in the output as a PropDist key with a zero-padded suffix
# (0.02 -> ...PropDist0020, 0.03 -> ...PropDist0030).
comp.stat_simmat('srp', thresholds=[0.02, 0.03])

{'LSize': 17,
 'RSize': 10,
 'minSize': 10,
 'srpMeanSim': 0.3733786594829696,
 'srpLMeanMinSim': 0.1691424254623114,
 'srpLTruncSim': 0.012029156007692388,
 'srpLPropDist0020': 0.5294117647058824,
 'srpLPropDist0030': 0.5294117647058824,
 'srpRMeanMinSim': 0.012029156007692388,
 'srpRTruncSim': 0.012029156007692388,
 'srpRPropDist0020': 0.9,
 'srpRPropDist0030': 0.9}

In [6]:
# Returns SW####Len entries (see output).
# NOTE(review): "SW" presumably refers to Smith-Waterman alignment — confirm
# against compare_tools.comparison before relying on that reading.
comp.stat_sw()

{'SW0010Len': 10, 'SW0005Len': 10, 'SW0001Len': 10}

In [12]:
# Gather every stat for the current pair into a single dict. The list of
# quantile models is hardcoded here and may be better collected elsewhere.
stats = comp.all_stats()
for model in ('srp', 'glove'):
    stats.update(comp.stat_quantiles(model))
stats

{'SW0010Len': 0,
 'SW0005Len': 0,
 'SW0004Len': 0,
 'SW0001Len': 0,
 'LSize': 10,
 'RSize': 3,
 'minSize': 3,
 'gloveMeanSim': 0.07262974579533292,
 'gloveLMeanMinSim': 0.06976596369485798,
 'gloveLTruncSim': 0.05608907686437,
 'gloveLPropDist0002': 0.0,
 'gloveLPropDist0005': 0.0,
 'gloveLPropDist0010': 0.0,
 'gloveLPropDist0020': 0.0,
 'gloveLPropDist0030': 0.0,
 'gloveRMeanMinSim': 0.052582923681141515,
 'gloveRTruncSim': 0.052582923681141515,
 'gloveRPropDist0002': 0.0,
 'gloveRPropDist0005': 0.0,
 'gloveRPropDist0010': 0.0,
 'gloveRPropDist0020': 0.0,
 'gloveRPropDist0030': 0.0,
 'left': 'yale.39002014323993',
 'right': 'mdp.39015010204520',
 'srpLSimQuantile0.0': 0.5203898469857141,
 'srpLSimQuantile0.1': 0.5336641846909338,
 'srpLSimQuantile0.2': 0.5423369559684419,
 'srpLSimQuantile0.3': 0.5510559132354244,
 'srpLSimQuantile0.4': 0.5610596141190138,
 'srpLSimQuantile0.5': 0.568627292708389,
 'srpLSimQuantile0.6': 0.5730703179610559,
 'srpLSimQuantile0.7': 0.5788606321085752,
 '

Other potential stats:
- Jaccard similarity
- Number of unique words
- Similarities by page, by SRP
- ...

## Raw unrolled similarity matrix

In [37]:
import pandas as pd
# Wrap the unrolled similarity values in a one-element list so that they are
# displayed as a single wide row (one column per value).
pd.DataFrame([comp.unrolled_sim()])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0.99995,0.994012,0.992946,0.992918,0.988524,0.990168,0.986968,0.994553,0.99158,0.994741,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Example crunching

In [8]:
import glob
import time
import pandas as pd
from htrc_features import utils
# Every path inside the matches directory; the batch-comparison cell further
# down reads each of these with pd.read_parquet.
annmatches = glob.glob('/projects/saddl-main/ann-ef/matches/*')

In [9]:
# Debugging template: inside this context manager every warning is raised as
# an exception, so the traceback shows where it originated. The filter change
# is undone when the `with` block exits. As written the block wraps no work.
import warnings
with warnings.catch_warnings():
    warnings.simplefilter('error')
    # do_something  <- place the code under investigation here

In [10]:
# Load the hand-coded judgments table (columns shown in the output include
# rater, target, candidate, judgment, notes and timestamp).
# NOTE(review): this is fetched over plain HTTP from a hardcoded IP address —
# consider moving the URL into configuration and caching a local copy.
handcoded = pd.read_csv('http://35.239.220.133/download')
handcoded.head()

Unnamed: 0.1,Unnamed: 0,rater,target,candidate,judgment,notes,timestamp
0,0,Peter,hvd.32044024501652,pst.000059706786,SWDE,"Mostly the same, but rebranded?",1580929000.0
1,1,Peter,hvd.32044024501652,uc2.ark:/13960/fk0tq5rc3v,SWDE,,1580929000.0
2,2,Peter,hvd.32044024501652,uiuo.ark:/13960/t0cv4jb3w,SWSM,,1580929000.0
3,3,Peter,hvd.32044024501652,loc.ark:/13960/t5s75px7c,AUTHOR,,1580929000.0
4,4,Peter,hvd.32044024501652,uva.x001053494,SWSM,,1580929000.0


In [None]:
%%time
from htrc_features import utils
stats_collector = []
errs = []
for leftid, candidates in handcoded[['target', 'candidate']].groupby('target'):
    try:
        leftid = utils.extract_htid(leftid)
        left = HTID(leftid, **htid_args)
    except:
        errs.append(('left', leftid))
    for rightid in candidates.candidate:
        try:
            rightid = utils.extract_htid(rightid)
            right = HTID(rightid, **htid_args)
        except:
            errs.append(('right', rightid))
        try:
            comp = HTIDComparison(left, right)
            stats = comp.all_stats()
            stats.update(comp.stat_quantiles('srp'))
            stats.update(comp.stat_quantiles('glove'))
            stats_collector.append(stats)
        except KeyboardInterrupt:
            raise
        except:
            errs.append(('comparison', leftid+'-'+rightid))
pd.DataFrame(stats_collector).to_parquet('/data/saddl/handcoded.parquet', compression='snappy')

In [None]:
# Batch-compute comparison stats for the ANN match files, writing the results
# out in parquet chunks so the in-memory buffer stays bounded.
stats_collector = []
# (target, match, error repr) for every pair whose stats could not be computed,
# so failures can be inspected afterwards instead of vanishing silently.
failed_pairs = []
i = 0
start = time.time()


def _flush_stats(records, last_target):
    """Write `records` to a parquet chunk whose name is derived from `last_target`."""
    outpath = '/data/saddl/tmp-comp-output/8' + utils.clean_htid(last_target) + 'andmore.parquet'
    pd.DataFrame(records).to_parquet(outpath, compression='snappy')


for matchfile in annmatches:
    df = pd.read_parquet(matchfile)
    # Keep rows where either proportion column exceeds 0.5, the count is above
    # one, and the match is not the target itself.
    df = df[(df.prop_target > 0.5) | (df.prop_match > 0.5)]
    df = df[df['count'] > 1]
    df = df[df.match != df.target]

    for target, matches in df.groupby('target'):
        # Progress line every 1000 targets: counter, buffered rows, id, elapsed seconds.
        if i % 1000 == 0:
            print(i, len(stats_collector), target, time.time()-start)
        left = HTID(target, **htid_args)

        for match in matches['match']:
            right = HTID(match, **htid_args)
            comp = HTIDComparison(left, right)
            try:
                stats = comp.all_stats()
                stats.update(comp.stat_quantiles('srp'))
                stats.update(comp.stat_quantiles('glove'))
                stats_collector.append(stats)
            except Exception as err:
                # KeyboardInterrupt is not an Exception subclass, so it still
                # propagates and stops the run.
                failed_pairs.append((target, match, repr(err)))

        if len(stats_collector) > 10000:
            _flush_stats(stats_collector, target)
            stats_collector = []

        i += 1

# Only write the remainder when there is one. An unconditional write would
# reuse the path of the most recent flush and overwrite that chunk with an
# empty frame, and would hit an undefined `target` if no group was iterated.
if stats_collector:
    _flush_stats(stats_collector, target)
end = time.time()

0 0 aeu.ark:/13960/t0000vt1c 0.07594108581542969
1000 4769 hvd.32044080919459 92.68209195137024
2000 632 hvd.32044107274516 194.62113428115845
3000 6302 coo.31924001127004 328.62200832366943
4000 1224 coo.31924018276497 417.61743569374084
5000 6969 hvd.32044097021786 527.1369841098785
6000 3861 hvd.hc2cih 649.3356335163116
