In [11]:
%load_ext autoreload
%autoreload 2

In [53]:
import pandas as pd
from compare_tools.hathimeta import clean_description, clean_title, HathiMeta
from compare_tools.configuration import config
import itertools

# Run configuration. `test` chooses which named section of the project config
# is merged into the top level; `statdir` is the directory every stats file
# produced by this notebook is written to.
test = True
statdir = '/data/saddl/stats/testset_stats_07_07/'
if test:
    config.update(config['test'])
else:
    config.update(config['full'])

# Metadata store opened from the path in the active configuration.
meta = HathiMeta(config['metadb_path'])

# Full metadata table, with the description column replaced by its cleaned form.
df = meta.full_table()
df['description'] = clean_description(df['description'])

Deprecated: `full_table` is just a wrapper for HathiMeta.get_fields.


In [54]:
# Only volumes that have a description take part in description-based matching.
with_desc = df[df.description.notnull()]

# Volumes per (oclc_num, description) pair; pairs shared by more than one
# volume are the oclc+desc matches.
swsm_counts = (
    with_desc
    .groupby(['oclc_num', 'description'])['htid']
    .count()
    .sort_index()
)
multiple_swsm = swsm_counts.loc[swsm_counts > 1].reset_index()[['oclc_num', 'description']]

# Among those shared pairs, count the descriptions per oclc_num and keep the
# oclc_nums that have more than one.
wp_dv_counts = multiple_swsm.groupby('oclc_num')['description'].count()
multiple_wp_dv = wp_dv_counts.loc[wp_dv_counts > 1].reset_index()[['oclc_num']]

In [55]:
# Empty accumulator for the ground-truth judgments; later cells concatenate onto it.
judgment_columns = ['left', 'right', 'judgment', 'notes']
all_meta = pd.DataFrame([], columns=judgment_columns)

In [56]:
# Add ground truth for SWSM: every ordered pair of distinct volumes that share
# both an oclc_num and a description.
judgments = []
for _, key in multiple_swsm.iterrows():
    # Select with boolean masks instead of a formatted query string, so that
    # values containing quote characters cannot break the selection.
    matches = with_desc[(with_desc.oclc_num == key['oclc_num']) &
                        (with_desc.description == key['description'])]
    for left, right in itertools.product(matches.htid, matches.htid):
        if left == right:
            continue
        judgments.append({'left': left, 'right': right, 'judgment': 'SWSM', 'notes': 'oclc+desc'})
all_meta = pd.concat([all_meta, pd.DataFrame(judgments)])
all_meta.shape

(21586, 5)

In [57]:
# Add ground truth for WP_DV: every ordered pair of distinct described volumes
# under an oclc_num that has more than one shared description. Some of these
# pairs were already judged SWSM; the sort/deduplication below keeps SWSM.
judgments = []
for oclc_num in multiple_wp_dv.oclc_num:
    # Boolean-mask selection instead of a formatted query string, so that
    # values containing quote characters cannot break the selection.
    matches = with_desc[with_desc.oclc_num == oclc_num]
    for left, right in itertools.product(matches.htid, matches.htid):
        if left == right:
            continue
        judgments.append({'left': left, 'right': right, 'judgment': 'WP_DV', 'notes': 'oclc+desc'})
all_meta = pd.concat([all_meta, pd.DataFrame(judgments)]).drop_duplicates()
# This will sort where 'SWSM' is before 'WP_DV' - then
# drop left/right duplicates while keeping the first ('SWSM') row
all_meta = all_meta.sort_values(['left', 'right', 'judgment']).drop_duplicates(['left', 'right'], keep='first')
all_meta.shape

(164066, 5)

Collect 'PARTOF' and 'CONTAINS' by expanding hyphenated descriptions like 'v.1-2'

In [58]:
def expand_multivol(desc):
    """Expand a ranged description into one description per number.

    `desc` has the form '<prefix>.<start>-<end>' (for example 'v.1-3'); the
    result lists '<prefix>.<n>' for every n from start to end inclusive,
    e.g. ['v.1', 'v.2', 'v.3']. A start greater than end gives an empty list.
    """
    label, span = desc.split('.')
    first, last = map(int, span.split('-'))
    expanded = []
    for number in range(first, last + 1):
        expanded.append("{}.{}".format(label, number))
    return expanded

# Descriptions that denote a numeric range, e.g. 'v.1-2'. The pattern is a raw
# string (so `\.` and `\d` are not treated as string escapes) and uses a
# non-capturing group, which avoids the "pattern has match groups" warning
# that str.contains emits for capturing groups.
multivol_pattern = r'^(?:v|c|no|pt)\.\d+-\d+$'
multivols = df[df.description.fillna('').str.contains(multivol_pattern)].description.value_counts().index.values
judgments = []
for x in multivols:
    # Individual descriptions covered by the range, e.g. ['v.1', 'v.2']
    contains = expand_multivol(x)
    subset = df[df.description == x][['oclc_num', 'htid']]

    for i, row in subset.iterrows():
        # Volumes under the same oclc_num whose description is one of the parts
        contains_htids = df[(df.oclc_num == row.oclc_num) & df.description.isin(contains)].htid
        for right in contains_htids:
            # Record the relationship in both directions
            judgments.append({'left': row.htid, 'right': right, 'judgment': 'CONTAINS', 'notes': 'desc split'})
            judgments.append({'left': right, 'right': row.htid, 'judgment': 'PARTOF', 'notes': 'desc split'})

all_meta = pd.concat([all_meta, pd.DataFrame(judgments)]).drop_duplicates()
# if the v1-2 relationships were coded previously as WP_DV, drop those judgments
# in favour of PARTOF or CONTAINS
all_meta = all_meta.drop_duplicates(['left', 'right'], keep='last')
all_meta.shape

  return func(self, *args, **kwargs)


(164416, 5)

## Additional judgments

- Collect a sample of SWDE, where the title and author are the same, while the page count and date are different
- Collect a sample of AUTHOR judgments, where the author is the same but the title *seems* to be different (even with fuzzy matching).
- Collect a random sample of DIFF judgments (labelled `RANDDIFF` in the output), where the author is different.

Title comparisons are done with a fuzzy matching, using byte-pair encoding embeddings (BPEmb).

In [59]:
from bpemb import BPEmb
import numpy as np
from scipy.spatial.distance import cosine, pdist, cdist, squareform
# English BPEmb model; its encode_ids() and vectors are used below to embed titles.
bpemb_en = BPEmb(lang="en")

In [60]:
# One row per title in df, in df order: clean the title, encode it to BPE ids,
# and sum the embedding vectors of those ids into a single title vector.
title_vecs = np.vstack([
    bpemb_en.vectors[bpemb_en.encode_ids(clean_title(title))].sum(0)
    for title in df.title
])

In [None]:
# Build SWDE, AUTHOR and RANDDIFF judgments one author at a time, then merge
# them into all_meta.
print("Processing ground truth for {} authors".format(len(df.author.unique())))
i = 0
judgments = []
# groupby() sorts the groups by author, so the shuffle from sample(frac=1)
# only randomises the row order *within* each author's subset.
for author, author_subset in df.sample(frac=1).groupby('author'):
    i += 1
    # Progress report every 300 authors: running count of each judgment type.
    if i % 300 == 0:
        print(i, author)
        print(pd.DataFrame(judgments).judgment.value_counts())

    # Get pairwise cosine distances between all titles for this author.
    # Despite the name, `selection_sims` holds distances (small = similar).
    # NOTE(review): the index labels of author_subset are used as row
    # positions in title_vecs; this assumes df has a 0..n-1 index - confirm.
    selection_vecs = title_vecs[author_subset.index]
    selection_sims = squareform(pdist(selection_vecs, metric='cosine'))
    #np.fill_diagonal(selection_sims, np.nan)

    # Compare each volume (the target) against every volume by the same author.
    for target_i, target in enumerate(author_subset.itertuples()):
        page_diff = author_subset.page_count.sub(target.page_count).abs()
        date_diff = author_subset.rights_date_used.sub(target.rights_date_used).abs()
        # similar_pages and different_oclc are computed but not used below.
        similar_pages = page_diff < 10
        different_pages_lo = (page_diff > 20) & (page_diff <= 30)
        different_pages_hi = page_diff > 30
        different_oclc = author_subset.oclc_num != target.oclc_num
        different_date_lo = (date_diff > 0) & (date_diff <= 10)
        different_date_hi = (date_diff > 10)
        # Titles within cosine distance 0.1 count as the same title; the
        # target's own entry is cleared so it is never paired with itself.
        same_titles = selection_sims[target_i] <= 0.1
        same_titles[target_i] = False
        # Titles at cosine distance 0.3 or more count as different titles.
        different_titles = selection_sims[target_i] >= 0.3

        # Empty if the target's description is None, which is preferred since
        # we're building ground truth and desc=None is unpredictable -
        # sometimes it's the same single volume, sometimes it's not.
        same_desc = author_subset.description == target.description

        # SWDE: same title and description, different page count and date.
        # Each lo/hi combination of the page and date conditions is recorded
        # in the notes so the strictness can be filtered on later.
        #swde = author_subset[cols][different_pages & same_titles & (different_oclc|different_date) & same_desc] # diff oclc is not reliable enough
        for p_cond_name, page_cond in [('pages(lo)', different_pages_lo), ('pages(hi)', different_pages_hi)]:
            for d_cond_name, date_cond in [('date(lo)', different_date_lo), ('date(hi)', different_date_hi)]:
                swde = author_subset[page_cond & same_titles & date_cond & same_desc]
                for htid in swde.htid:
                    judgments.append({'left':target.htid, 'right':htid, 'judgment':'SWDE', 'notes':'fuzztitle+desc/diff:{}+{}'.format(p_cond_name, d_cond_name)})
            
        # AUTHOR: same author, different title; at most max_author per target,
        # taken from the front of the (shuffled) subset.
        max_author = 2
        for htid in author_subset[different_titles].htid.iloc[:max_author]:
            judgments.append({'left':target.htid, 'right':htid, 'judgment':'AUTHOR', 'notes':'diff:fuzztitle'})

    # DIFF (labelled 'RANDDIFF'): pair each of this author's volumes with
    # randomly sampled volumes whose author differs.
    n = 2 # number of random diffs per volume of this author
    non_author_sample = df[df.author != author_subset.iloc[0].author].sample(author_subset.shape[0]*n)
    judgments += [{'left':left, 'right':right, 'judgment':'RANDDIFF', 'notes':'diff:author'} for left, right in zip(author_subset.htid.tolist() * n, non_author_sample.htid)]

# Judgments already in all_meta come first in the concat, so for a repeated
# left/right pair the earlier judgment is the one kept.
all_meta = pd.concat([all_meta, pd.DataFrame(judgments)]).drop_duplicates(['left','right'])
del judgments
all_meta.judgment.value_counts()

Processing ground truth for 6129 authors
300 Balfour, Clara Lucas, 1808-1878.
RANDDIFF    13690
AUTHOR      12761
SWDE          242
Name: judgment, dtype: int64
600 Botting, Douglas.
RANDDIFF    28010
AUTHOR      26666
SWDE          790
Name: judgment, dtype: int64
900 California. Commissioners of Transportation.
RANDDIFF    43570
AUTHOR      41758
SWDE         1288
Name: judgment, dtype: int64
1200 Committee for Economic Development.
RANDDIFF    56506
AUTHOR      54067
SWDE         1726
Name: judgment, dtype: int64
1500 Dick, John, 1764-1833.
RANDDIFF    70702
AUTHOR      67686
SWDE         1968
Name: judgment, dtype: int64
1800 Federal Writers' Project. New York (City)
RANDDIFF    84794
AUTHOR      80764
SWDE         2224
Name: judgment, dtype: int64
2100 Gill, Harjeet Singh, 1935-
RANDDIFF    100260
AUTHOR       95554
SWDE          2548
Name: judgment, dtype: int64
2400 Harmon, Robert B. 1932-
RANDDIFF    115032
AUTHOR      109371
SWDE          3980
Name: judgment, dtype: int64
2700

In [68]:
# Persist every judgment collected so far, ordered by the 'left' volume id.
judgments_path = statdir + 'meta_gt_judgments.parquet'
all_meta.sort_values('left').to_parquet(judgments_path, index=False)

In [69]:
# Number of collected pairs per judgment label.
all_meta['judgment'].value_counts()

RANDDIFF    287728
AUTHOR      271388
WP_DV       141904
SWSM         21586
SWDE          9930
CONTAINS       463
PARTOF         463
Name: judgment, dtype: int64

In [79]:
# Write left/right records to json as needed by crunch_stats.py
import json
import pandas as pd
# Reload the saved judgments and emit them as JSON Lines: one record per line.
all_meta = pd.read_parquet(statdir + 'meta_gt_judgments.parquet')
with open(statdir + 'to_crunch_stats.json', mode='w') as f:
    f.writelines(json.dumps(record) + '\n'
                 for record in all_meta.to_dict(orient='records'))

In [71]:
# Also process handcoded stats
from htrc_features import utils
# Download the hand-coded judgments and rename their columns to left/right.
handcoded = (
    pd.read_csv('http://35.239.220.133/download')
    .rename(columns={'target': 'left', 'candidate': 'right'})
)
# Reduce both id columns to htids with the htrc_features helper.
handcoded['left'] = handcoded['left'].apply(utils.extract_htid)
handcoded['right'] = handcoded['right'].apply(utils.extract_htid)
# Missing notes become empty strings.
handcoded['notes'] = handcoded['notes'].fillna('')

# Write the four judgment columns as JSON Lines: one record per line.
handcoded_records = handcoded[['left', 'right', 'judgment', 'notes']].to_dict(orient='records')
with open(statdir + 'handcoded_stats.json', mode='w') as f:
    f.writelines(json.dumps(record) + '\n' for record in handcoded_records)

## Generating SIMDIFF judgments

For this class, we look for books that are unrelated (as per author) but similar (as evidenced by a suggestion from Annoy). The only 'metadata' inference is whether the book is by the same author. 

In [5]:
from compare_tools.MTAnnoy import MTAnnoy
# Index opened from the configured path; used below to find similar volumes.
# NOTE(review): dims=300 is presumably the vector dimensionality - confirm.
ann = MTAnnoy(config['ann_path'], dims=300)

In [6]:
# htid and author for each volume; the author is what decides below whether
# a suggested match counts as unrelated.
vols = meta.get_fields(['htid', 'author'])
vols.shape

(143864, 2)

In [None]:
import gzip, json
max_targets = 20000  # upper bound on the number of volumes to query
keep_n = 10          # maximum number of SIMDIFF pairs kept per queried volume

# Never ask for a larger sample than there are volumes.
max_targets = min(max_targets, len(vols))

with gzip.open(statdir + 'simdiff_stats.json.gz', mode='w') as f:
    for i, (ind, (htid, author)) in enumerate(vols.sample(max_targets).iterrows()):
        # Select a volume, find the IDs of all the books by the same author,
        # then do an ANN search and filter out same-author works
        try:
            # NOTE(review): the author is interpolated into the query string;
            # a name containing a double quote will presumably fail here and
            # be reported by the handler below - confirm against get_where.
            same_author = meta.get_where('author == "{}"'.format(author),
                                         fields=['htid'])['htid']
            results = ann.doc_match_stats(htid, min_count=2)
            gt = results.loc[~results.match.isin(same_author), ['target', 'match']]
            if gt.shape[0] > keep_n:
                gt = gt.sample(n=keep_n)
            # Build a new frame rather than assigning columns on a filtered
            # slice, which is chained assignment.
            gt = gt.rename(columns={'target': 'left', 'match': 'right'}).assign(judgment='SIMDIFF')
            for record in gt.to_dict(orient='records'):
                f.write((json.dumps(record)+'\n').encode('utf-8'))
        except KeyboardInterrupt:
            raise
        except Exception as err:
            # Best effort: report the volume and the cause, then move on.
            print("Error with ", htid, repr(err))
            continue
        if i % 250 == 0:
            print(i, end=', ')

0, 

## Using OCLC for ground truth

Rather than loading from online, you can download beforehand with `wget -O /data/saddl/oclc_classify/{}.xml -q "http://classify.oclc.org/classify2/Classify?oclc={}&summary=false"` (quote the URL so the shell does not interpret the `&`).

In [5]:
# Save a listing of OCLC numbers, for wgetting
# NOTE(review): this rebinds `df` (previously the full metadata table) to a
# two-column htid/oclc_num table; the OCLC alignment loop further down passes
# this `df` to align_oclc, and earlier cells will not re-run against it.
df = meta.get_fields(['htid', 'oclc_num'])
# One distinct oclc_num per line, without header or index, in the working directory.
df['oclc_num'].drop_duplicates().to_csv('unique_oclc.csv', index=False, header=False)

Parse OCLC files

In [51]:
import glob, json, xmltodict, gzip
# Every file in the OCLC Classify download directory (presumably one XML
# response per OCLC number, as produced by the wget command above).
oclc_paths = glob.glob('/data/saddl/oclc_classify/*')

In [17]:
def align_oclc(oclc_path, meta):
    """Pair up volumes in `meta` whose OCLC numbers appear as editions in one Classify file.

    Parameters
    ----------
    oclc_path : path of a saved OCLC Classify XML response.
    meta : DataFrame with at least `oclc_num` and `htid` columns.

    Returns
    -------
    list of (htid1, htid2) tuples: every ordered pair of differing htids drawn
    from one randomly chosen volume per selected OCLC number. The list is
    empty when the response code is not '2', when the response does not hold
    a list of editions, or when fewer than two selected OCLC numbers have a
    volume in `meta`.
    """
    with open(oclc_path) as handle:
        classify = xmltodict.parse(handle.read())['classify']
    if classify['response']['@code'] != '2':
        return []

    edition_list = classify['editions']['edition']
    if type(edition_list) is not list:
        return []

    # Rank editions from most to least held.
    ranked = sorted(
        (int(ed['@holdings']), ed['@oclc'], ed['@itemtype']) for ed in edition_list
    )
    ranked.reverse()

    # Keep the five most-held editions, or only those with at least 20% of the
    # top edition's holdings, whichever is fewer. The aim is widely held
    # different manifestations or expressions, in the hope of reducing errors
    # where a new OCLC number was assigned incorrectly.
    threshold = ranked[0][0] * .2
    widely_held = [oclc for held, oclc, _ in ranked if held >= threshold]
    oclc_nums = widely_held[:5]

    # Shuffle, then keep a single volume per oclc_num.
    candidates = meta[meta.oclc_num.isin(oclc_nums)]
    diff_editions = candidates.sample(frac=1).drop_duplicates('oclc_num')
    if len(diff_editions) < 2:
        return []

    pairs = []
    for htid1 in diff_editions.htid:
        for htid2 in diff_editions.htid:
            if htid1 != htid2:
                pairs.append((htid1, htid2))
    return pairs

In [44]:
# Collect SWDE pairs from every downloaded OCLC Classify file.
# The accumulator must be named `all_oclc_swde`: that is the name appended to
# in the loop and read by the cells that follow.
all_oclc_swde = []
for i, oclc_path in enumerate(oclc_paths):
    try:
        all_oclc_swde += align_oclc(oclc_path, df)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Best effort: report the file that could not be processed and move on.
        print("Error with", oclc_path)
    if i % 250 == 0:
        print(i, end=',')
print('Done')
len(all_oclc_swde)

86392

In [52]:
# Label every OCLC-derived pair as SWDE and note where it came from.
oclc_gt = (
    pd.DataFrame(all_oclc_swde, columns=['left', 'right'])
    .assign(judgment='SWDE', notes='oclc_classify')
)
# Write gzip-compressed JSON Lines: one UTF-8 encoded record per line.
with gzip.open(statdir+'oclc_sim_stats.json.gz', mode='w') as f:
    for record in oclc_gt.to_dict(orient='records'):
        line = json.dumps(record) + '\n'
        f.write(line.encode('utf-8'))

Remember to deduplicate with the judgments from the other ground truth!

## Subsampling

The stats files saved to `statdir` can be large, so they may need to be subsampled. Here's how I did it during a training run:


```
sort -R pairwise_gr_stats.json | head -n 200000 > pairwise_gr_stats_200k-rand.json 
grep "RANDDIFF" to_crunch_stats.json | sort -R | head -n 50000  >to_crunch_stats_RANDDIFF_50k-rand.json 
grep "AUTHOR" to_crunch_stats.json | sort -R | head -n 100000  >to_crunch_stats_AUTHOR_100k-rand.json 
grep -vP "AUTHOR|RANDDIFF" to_crunch_stats.json | sort -R >to_crunch_stats_OTHER.json
cat oclc_sim_stats.json pairwise_gr_stats_200k-rand.json simdiff_stats.json testset_fake_stats.json to_crunch_stats_*json | sort -R >final_crunchlist.json
```