In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
1 

1

In [35]:
import pandas as pd
from compare_tools.hathimeta import clean_description, clean_title, HathiMeta
import itertools
meta = HathiMeta('/data/saddl/meta.db')
df = meta.full_table()
df.description = clean_description(df.description)

In [9]:
#oclc_with_multiples = 
with_desc = df[~df.description.isnull()]
# oclc+desc matches
swsm_counts = with_desc.groupby(['oclc_num', 'description']).htid.count().sort_index()
multiple_swsm = swsm_counts[swsm_counts > 1].reset_index()[['oclc_num', 'description']]
wp_dv_counts = multiple_swsm.groupby('oclc_num').description.count()
multiple_wp_dv = wp_dv_counts[wp_dv_counts > 1].reset_index()[['oclc_num']]

In [10]:
all_meta = pd.DataFrame([], columns=['left','right','judgment','note'])

In [11]:
# Add ground truth for SWSM
judgments = []
for i, s in multiple_swsm.iterrows():
    q = " & ".join(["({}=='{}')".format(k,v) for k,v in s.items()])
    matches = with_desc.query(q)
    try:
        for left, right in itertools.product(matches.htid, matches.htid):
            if left == right:
                continue
            judgments.append({'left':left, 'right':right, 'judgment': 'SWSM', 'notes':'oclc+desc'})
    except:
        print("Problem with ", q)
all_meta = pd.concat([all_meta, pd.DataFrame(judgments)])
all_meta.shape

(21586, 5)

In [12]:
# Add ground truth for WP_DV. This will result in *some* SWSM, which will be overwritten later
judgments = []
for i, s in multiple_wp_dv.iterrows():
    q = " & ".join(["({}=='{}')".format(k,v) for k,v in s.items()])
    matches = with_desc.query(q)
    try:
        for left, right in itertools.product(matches.htid, matches.htid):
            if left == right:
                continue
            judgments.append({'left':left, 'right':right, 'judgment': 'WP_DV', 'notes':'oclc+desc'})
    except:
        print("Problem with ", q)
all_meta = pd.concat([all_meta, pd.DataFrame(judgments)]).drop_duplicates()
# This will sort where 'SWSM' is before 'WP_DV' - then
# drop left/right duplicates while keeping the first ('SWSM') row
all_meta = all_meta.sort_values(['left','right', 'judgment']).drop_duplicates(['left','right'], keep='first')
all_meta.shape

(164066, 5)

Collect 'PARTOF' and 'CONTAINS' by expanding hyphenated descriptions like 'v.1-2'

In [13]:
def expand_multivol(desc):
    prefix, rangenum = desc.split('.')
    start, end = [int(s) for s in rangenum.split('-')]
    contains = ["{}.{}".format(prefix,i) for i in range(start, end+1)]
    return contains

multivols = df[df.description.fillna('').str.contains('^(v|c|no|pt)\.\d+-\d+$')].description.value_counts().index.values
judgments = []
for x in multivols:
    contains = expand_multivol(x)
    subset = df[df.description == x][['oclc_num','htid']]
    
    for i, row in subset.iterrows():
        contains_htids = df[(df.oclc_num == row.oclc_num) & df.description.isin(contains)].htid
        for right in contains_htids:
            judgments.append({'left':row.htid, 'right':right, 'judgment':'CONTAINS', 'notes':'desc split'})
            judgments.append({'left':right, 'right':row.htid, 'judgment':'PARTOF', 'notes':'desc split'})

all_meta = pd.concat([all_meta, pd.DataFrame(judgments)]).drop_duplicates()
# if the v1-2 relationships were coded previously as WP_DV, drop those judgments
# in favour of PARTOF or CONTAINS
all_meta = all_meta.drop_duplicates(['left','right'], keep='last')
all_meta.shape

(164416, 5)

## Additional judgments

- Collect a sample of SWDE, where the title and author are the same, while the page count and date are different
- Collect a sample of same AUTHOR information, where the Author is the same but the title *seems* to be different (even with fuzzy matching).
- Collect a random sample of DIFF - where the Author is different.

Title comparisons are done with a fuzzy matching, using byte-pair encoding embeddings (BPEmb).

In [14]:
from bpemb import BPEmb
import numpy as np
from scipy.spatial.distance import cosine, pdist, cdist, squareform
bpemb_en = BPEmb(lang="en")

In [15]:
# Convert cleaned title to BPE encodings and keep those vectors
title_vecs = df.title.apply(clean_title).apply(bpemb_en.encode_ids).apply(lambda x: bpemb_en.vectors[x].sum(0)).values
title_vecs = np.vstack(title_vecs)

In [17]:
print("Processing ground truth for {} authors".format(len(df.author.unique())))
i = 0
judgments = []
for author, author_subset in df.sample(frac=1).groupby('author'):
    i += 1
    if i % 300 == 0:
        print(i, author)
        print(pd.DataFrame(judgments).judgment.value_counts())

    # Get pairwise similarity between all titles for author
    selection_vecs = title_vecs[author_subset.index]
    selection_sims = squareform(pdist(selection_vecs, metric='cosine'))
    #np.fill_diagonal(selection_sims, np.nan)

    for target_i, target in enumerate(author_subset.itertuples()):
        page_diff = author_subset.page_count.sub(target.page_count).abs()
        date_diff = author_subset.rights_date_used.sub(target.rights_date_used).abs()
        similar_pages = page_diff < 10
        different_pages_lo = (page_diff > 20) & (page_diff <= 30)
        different_pages_hi = page_diff > 30
        different_oclc = author_subset.oclc_num != target.oclc_num
        different_date_lo = (date_diff > 0) & (date_diff <= 10)
        different_date_hi = (date_diff > 10)
        same_titles = selection_sims[target_i] <= 0.1
        same_titles[target_i] = False
        different_titles = selection_sims[target_i] >= 0.3

        # Empty if the target_desc == None, which is preferred since we're building ground truth and desc=None is unpredictable - sometimes it's the same single volume, 
        # sometimes it's not
        same_desc = author_subset.description == target.description

        # SWDE different conditions with different strictness
        #swde = author_subset[cols][different_pages & same_titles & (different_oclc|different_date) & same_desc] # diff oclc is not reliable enough
        for p_cond_name, page_cond in [('pages(lo)', different_pages_lo), ('pages(hi)', different_pages_hi)]:
            for d_cond_name, date_cond in [('date(lo)', different_date_lo), ('date(hi)', different_date_hi)]:
                swde = author_subset[page_cond & same_titles & date_cond & same_desc]
                for htid in swde.htid:
                    judgments.append({'left':target.htid, 'right':htid, 'judgment':'SWDE', 'notes':'fuzztitle+desc/diff:{}+{}'.format(p_cond_name, d_cond_name)})
            
        # AUTHOR
        max_author = 2
        for htid in author_subset[different_titles].htid.iloc[:max_author]:
            judgments.append({'left':target.htid, 'right':htid, 'judgment':'AUTHOR', 'notes':'diff:fuzztitle'})

    # DIFF
    n = 3 # number of random diffs per value
    non_author_sample = df[df.author != author_subset.iloc[0].author].sample(author_subset.shape[0]*n)
    judgments += [{'left':left, 'right':right, 'judgment':'DIFF', 'note':'diff:author'} for left, right in zip(author_subset.htid.tolist() * n, non_author_sample.htid)]

all_meta = pd.concat([all_meta, pd.DataFrame(judgments)]).drop_duplicates(['left','right'])
del judgments
all_meta.judgment.value_counts()

Processing ground truth for 6129 authors
300 Balfour, Clara Lucas, 1808-1878.
DIFF      20535
AUTHOR    12761
SWDE        242
Name: judgment, dtype: int64
600 Botting, Douglas.
DIFF      42015
AUTHOR    26666
SWDE        790
Name: judgment, dtype: int64
900 California. Commissioners of Transportation.
DIFF      65355
AUTHOR    41758
SWDE       1288
Name: judgment, dtype: int64
1200 Committee for Economic Development.
DIFF      84759
AUTHOR    54067
SWDE       1726
Name: judgment, dtype: int64
1500 Dick, John, 1764-1833.
DIFF      106053
AUTHOR     67686
SWDE        1968
Name: judgment, dtype: int64
1800 Federal Writers' Project. New York (City)
DIFF      127191
AUTHOR     80764
SWDE        2224
Name: judgment, dtype: int64
2100 Gill, Harjeet Singh, 1935-
DIFF      150390
AUTHOR     95554
SWDE        2548
Name: judgment, dtype: int64
2400 Harmon, Robert B. 1932-
DIFF      172548
AUTHOR    109371
SWDE        3980
Name: judgment, dtype: int64
2700 Hutchings, J. M. 1820-1902.
DIFF      192

DIFF        431592
AUTHOR      271381
WP_DV       141904
SWSM         21586
SWDE          9930
PARTOF         463
CONTAINS       463
Name: judgment, dtype: int64

In [26]:
all_meta.sort_values('left').to_parquet('../ground_truth_meta_judgments.parquet', index=False)

In [27]:
all_meta.judgment.value_counts()

DIFF        431592
AUTHOR      271381
WP_DV       141904
SWSM         21586
SWDE          9930
PARTOF         463
CONTAINS       463
Name: judgment, dtype: int64

In [5]:
# Write left/right records to json as needed by crunch_stats.py
import json
import pandas as pd
all_meta = pd.read_parquet('../ground_truth_meta_judgments.parquet')
with open('/tmp/to_crunch_stats.json', mode='w') as f:
    for row in all_meta.to_dict(orient='records'):
        json.dump(row, f)
        f.write('\n')

In [13]:
all_meta[all_meta.left.str.contains('uiuo')].head()

Unnamed: 0,left,right,judgment,notes
776922,uiuo.ark:/13960/t0000211j,hvd.32044004560470,DIFF,diff:author
776923,uiuo.ark:/13960/t0000211j,mdp.39015016923313,DIFF,diff:author
776924,uiuo.ark:/13960/t0000211j,mdp.39015059378672,DIFF,diff:author
776925,uiuo.ark:/13960/t0000211j,wu.89094335031,AUTHOR,diff:fuzztitle
776926,uiuo.ark:/13960/t0000211j,uc1.$b96097,AUTHOR,diff:fuzztitle


In [19]:
# Also process handcoded stats
from htrc_features import utils
handcoded = pd.read_csv('http://35.239.220.133/download')
handcoded = handcoded.rename(columns={'target':'left', 'candidate':'right'})
for col in ['left', 'right']:
    handcoded[col] = handcoded[col].apply(utils.extract_htid)
handcoded['notes'] = handcoded['notes'].fillna('')
with open('/tmp/handcoded_stats.json', mode='w') as f:
    for row in handcoded[['left', 'right', 'judgment', 'notes']].to_dict(orient='records'):
        json.dump(row, f)
        f.write('\n')

## Using OCLC for ground truth

Rather than loading from online, you can download beforehand with `wget -O /data/saddl/oclc_classify/{}.xml -q http://classify.oclc.org/classify2/Classify?oclc={}&summary=false`

In [20]:
from smart_open import open

In [66]:
df = meta.full_table()
df.shape

(143864, 27)

In [74]:
df['oclc_num'].drop_duplicates().to_csv('unique_oclc.csv', index=False, header=False)

In [71]:
import json
with open('http://classify.oclc.org/classify2/Classify?oclc=809019115&summary=false') as f:
    oclc = f.read()
oclc

'<?xml version="1.0" encoding="UTF-8" standalone="no"?>\n<classify xmlns="http://classify.oclc.org">\n  <response code="2"/>\n  <!--Classify is a product of OCLC Online Computer Library Center: http://classify.oclc.org-->\n  <work author="Kürschner, Joseph, 1853-1902 | Kürschner, Joseph, 1853-1902 [Editor]" editions="18" eholdings="12" format="Book" holdings="70" itemtype="itemtype-book" owi="3375803240" title="Deutsche National-Litteratur : Historisch-kritishe Ausgabe">7925241</work>\n  <authors>\n    <author lc="n83054455" viaf="45094588">Kürschner, Joseph, 1853-1902 [Editor]</author>\n  </authors>\n  <orderBy>thold desc</orderBy>\n  <input type="oclc">809019115</input>\n  <start>0</start>\n  <maxRecs>25</maxRecs>\n  <editions>\n    <edition author="Kürschner, Joseph, 1853-1902" eholdings="0" format="Book" holdings="37" itemtype="itemtype-book" language="ger" oclc="7925241" title="Deutsche National-Litteratur : Historisch-kritishe Ausgabe">\n      <classifications>\n        <class in

In [23]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(oclc, 'html.parser')

In [53]:
meta.random()

htid                                                      ien.35556021235395
access                                                                 allow
rights                                                                  pdus
ht_bib_key                                                           1107820
description                                                             None
source                                                                   IEN
source_bib_num                                                       2636944
oclc_num                                                              882921
isbn                                                                    None
issn                                                                    None
lccn                                                         62063405 /L/r80
title                      Essays on some unsettled questions in the econ...
imprint                    Graduate School of Business, Bureau of Busines...

In [56]:
a = xmltodict.parse(oclc)['classify']
assert a['response']['@code'] == '2'
a['work']

OrderedDict([('@author', 'Wilson, George W. (George Wilton), 1928-2004'),
             ('@editions', '5'),
             ('@eholdings', '5'),
             ('@format', 'Book'),
             ('@holdings', '218'),
             ('@itemtype', 'itemtype-book'),
             ('@owi', '9349663054'),
             ('@title',
              'Essays on some unsettled questions in the economics of transportation'),
             ('#text', '882921')])

In [28]:
import xmltodict

In [63]:
meta.get_where('oclc_num == "570650798"')

Unnamed: 0,htid,access,rights,ht_bib_key,description,source,source_bib_num,oclc_num,isbn,issn,...,pub_place,lang,bib_fmt,collection_code,content_provider_code,responsible_entity_code,digitization_agent_code,access_profile_code,author,page_count
