## Generating Evaluation Lists

In [1]:
from SRP import Vector_file
import pandas as pd
import os
import random
import numpy as np
from scipy.spatial.distance import cdist, pdist
# Load the test-set metadata keyed by htid (sorted), tag each row with its
# position in the full sorted table, then keep only the rows whose access is "allow".
meta = (
    pd.read_csv('../../sampling/test_dataset.csv.gz', low_memory=False)
    .set_index('htid')
    .sort_index()
    # Add index num as col, to preserve the htid index
    .assign(n=lambda frame: range(0, len(frame)))
    # TEMP - only look at full view texts
    .query('access == "allow"')
)

In [2]:
# Fast Text Matching
# Read the stored vectors; the result is used below through its 'names' and 'matrix' entries.
with Vector_file('fastTextVecs.bin', mode='r', dims=100) as vfile:
    allvecs = vfile.to_matrix()

# ADJUSTING TO TRIMMED DATASET
# Look up the row position of every id that is still present in `meta`
# (in `meta` order) and keep only those names and matrix rows.
full_name_ref = pd.Series(allvecs['names']).reset_index().set_index(0)['index']
kept_positions = full_name_ref[meta.index].values
allvecs['names'] = pd.Series(allvecs['names'])[kept_positions].tolist()
allvecs['matrix'] = allvecs['matrix'][kept_positions]

# Rebuild the name -> row-position lookup for the trimmed vectors.
name_ref = pd.Series(allvecs['names']).reset_index().set_index(0)['index']

In [3]:
def eval_list(target, meta, sim_cutoff=0.05, sim_metric='cosine', max_by_author=10, max_random=3, exclude_target=True):
    '''
    Return a list of works to hand-code for evaluation.

    Candidates are collected from identifier matches (OCLC, LCCN, ISBN), exact
    title matches, matches on the first 30 characters of the title, and vector
    matches whose distance to the target is below `sim_cutoff`. Up to
    `max_by_author` further works by the same author and `max_random`
    randomly sampled works are appended.

    target: A row representing the target book; `target.name` is its id.
    meta: The dataframe of all metadata, indexed by the same ids.
    sim_cutoff: Vector matches must have a distance strictly below this value.
    sim_metric: Metric name handed to scipy's `cdist`.
    max_by_author: Cap on same-author works that were not already matched.
    max_random: Number of randomly sampled rows to append.
    exclude_target: When True the target never appears in the result.

    The vector lookup reads the notebook-level `name_ref` and `allvecs`.
    NOTE(review): those are presumably aligned with the `meta` passed in -
    confirm before calling with a different dataframe.

    Returns a dataframe of rows taken from `meta`.
    '''
    # Remember not to fillna, else you'll get a lot of matches for a blank field!
    oclc_match = meta[meta.oclc_num == target.oclc_num]
    lccn_match = meta[meta.lccn == target.lccn]
    isbn_match = meta[meta.isbn == target.isbn]
    title_match = meta[meta['title'] == target['title']]

    # `.str[:30]` leaves missing titles as NaN (which match nothing) instead of
    # raising, and a target without a title simply has no short-title matches.
    target_title = target['title']
    if isinstance(target_title, str):
        short_title_match = meta[meta['title'].str[:30] == target_title[:30]]
    else:
        short_title_match = meta.iloc[0:0]

    # Fast text matches
    target_n = name_ref[[target.name]].values
    results = cdist(allvecs['matrix'][target_n], allvecs['matrix'], metric=sim_metric)
    sorted_results = pd.Series(results[0]).sort_values()
    top_results_i = sorted_results[sorted_results < sim_cutoff].index.tolist()
    # `top_results_i` holds row positions, so use .iloc: plain [] with integer
    # keys on a label-indexed Series is a deprecated positional fallback.
    fasttext_match = meta.loc[name_ref.iloc[top_results_i].index.values]

    init_matches = pd.concat([oclc_match, lccn_match, isbn_match, title_match, short_title_match, fasttext_match]).drop_duplicates()
    if exclude_target:
        init_matches = init_matches[init_matches.index != target.name]

    # Cap number of same author results, but excluding above results
    author_match = meta[meta['author'] == target['author']]
    unique_amatches = author_match.index.difference(init_matches.index)
    if exclude_target:
        # The target matches its own author; drop it before the cap is applied
        # so it neither leaks back in nor uses up one of the capped slots.
        unique_amatches = unique_amatches.difference([target.name])
    author_match = author_match.loc[unique_amatches].iloc[:max_by_author]

    rand_match = meta.sample(max_random)

    final_matches = pd.concat([init_matches, author_match, rand_match]).drop_duplicates()
    if exclude_target:
        # The random sample may have drawn the target as well.
        final_matches = final_matches[final_matches.index != target.name]
    return final_matches

In [4]:
from IPython.core.display import display, HTML

def print_meta_row(target):
    '''
    Render one metadata row as an HTML snippet linking to the volume on HathiTrust.

    target: A metadata row whose `.name` is the volume id and which carries the
        title, author, rights_date_used, oclc_num, isbn, issn and lccn fields.

    Returns an IPython `HTML` object. Missing values are shown as a blank, and
    every field is HTML-escaped so characters such as `&` or `<` in a title
    cannot break the markup.
    '''
    from html import escape
    from urllib.parse import quote

    target = target.fillna(' ')

    def esc(value):
        # Fields may be numbers as well as strings, so stringify before escaping.
        return escape(str(value))

    # Percent-encode the id so characters like ':' '/' '$' or '+' stay inside the query value.
    volume_id = quote(str(target.name), safe='')
    html = "<strong><a href='https://babel.hathitrust.org/cgi/pt?id=%s&view=thumb&seq=1' target='_blank'>%s</a></strong> by <em>%s</em> (%s)<br/>" % (volume_id, esc(target.title), esc(target.author), esc(target.rights_date_used))
    html += "&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<strong>%s</strong> OCLC %s / ISBN %s / ISSN %s / LCCN %s" % (esc(target.name), esc(target.oclc_num), esc(target.isbn), esc(target.issn), esc(target.lccn))
    #for i in range(10, 15):
    #    html += "<img src='https://babel.hathitrust.org/cgi/imgsrv/thumbnail?id=%s;seq=%d;width=250;rotation=0'>" % (target.name, i)
    return HTML(html)

# Try the renderer on one randomly drawn metadata row.
sample_row = meta.sample(1).iloc[0]
print_meta_row(sample_row)

In [5]:
def get_page_count(htid):
    '''
    Return the 'page_count' value stored in the volume's meta.json file.

    htid: The volume id. The file location is built from
        `utils.id_to_rsync(htid)` with 'json.bz2' swapped for 'meta.json',
        under /data/extracted-features-parquet/.
    '''
    from htrc_features import utils
    import ujson as json

    meta_rel_path = utils.id_to_rsync(htid).replace('json.bz2', 'meta.json')
    with open('/data/extracted-features-parquet/' + meta_rel_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)['page_count']

In [248]:
# Draw target positions without replacement: a repeated target would ask for a
# second sheet with the same name in the workbook.
random_targets = np.random.choice(meta.shape[0], size=min(40, meta.shape[0]), replace=False)

# Metadata columns written to every sheet.
export_cols = ['title', 'author', 'description', 'rights_date_used', 'oclc_num', 'isbn', 'issn', 'lccn']

# One sheet per target: the target row first, then every candidate to hand-code.
with pd.ExcelWriter('output.xlsx') as writer:
    for target_i in random_targets:
        target = meta.iloc[target_i]
        results = eval_list(target, meta, max_random=0, exclude_target=False)
        df = results[export_cols].reset_index().copy()

        # eval_list returns matches in match-type order, so the target is not
        # guaranteed to be the first row; move it to the top explicitly.
        is_target = df['htid'] == target.name
        df = pd.concat([df[is_target], df[~is_target]], ignore_index=True)

        df['page_count'] = df.htid.apply(get_page_count)
        df['link'] = df.htid.apply(lambda x: '=HYPERLINK("https://babel.hathitrust.org/cgi/pt?id=%s&view=thumb&seq=1", "link")' % x)
        df["relationship"] = ""
        df["notes"] = ""
        # Label by id rather than by position so the wrong row can never be marked.
        df.loc[df['htid'] == target.name, 'relationship'] = "TARGET"
        # ':' and '/' are stripped from the id before it is used as the sheet name.
        df.to_excel(writer, sheet_name=target.name.replace(':', '').replace('/', ''))

# Export for rating interface

In [103]:
# Scratch check: show the OCLC number stored for the target in `data`.
# NOTE(review): `data` is only assigned in the export loop further down, so
# this cell relies on out-of-order execution and fails on a fresh kernel.
data['target']['oclc_num']

'236028094'

In [110]:
import numpy as np
# Scratch check: True only when the target's OCLC number is a float NaN.
# NOTE(review): depends on `data` from the export loop further down (out-of-order state).
isinstance(data['target']['oclc_num'], float) and np.isnan(data['target']['oclc_num'])

False

In [117]:
# Scratch experiment with renaming the target's fields.
# NOTE(review): `columns` is given a set literal, not a mapping, and the output
# recorded below still shows `oclc_num` unchanged - presumably a dict such as
# {'oclc_num': 'oclc'} was intended; confirm or delete this cell.
target.rename(columns={'oclc_num', 'oclc', 'enum'})

title               Bird-killing as a method in ornithology. By Re...
author                                Robbins, Reginald C. 1871-1955.
description                                                       NaN
rights_date_used                                                 1901
oclc_num                                                      1249185
isbn                                                              NaN
issn                                                              NaN
lccn                                                         03008575
Name: hvd.32044107218935, dtype: object

In [116]:
from htrc_features import utils
import simplejson as json

class NpEncoder(json.JSONEncoder):
    '''JSON encoder that turns numpy scalars and arrays into plain Python values.'''

    def default(self, obj):
        # numpy integer scalars become int
        if isinstance(obj, np.integer):
            return int(obj)
        # numpy floating scalars become float
        if isinstance(obj, np.floating):
            return float(obj)
        # arrays become (nested) lists
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # anything else is left to the base encoder, which raises TypeError
        return super().default(obj)
        
# Draw target positions without replacement so no target is exported twice
# (a repeat would just overwrite its own JSON file and shrink the batch).
random_targets = np.random.choice(meta.shape[0], size=min(20, meta.shape[0]), replace=False)

# Metadata fields copied into every exported record (loop-invariant).
rselect = ['title', 'author', 'description', 'rights_date_used',
           'oclc_num', 'isbn', 'issn', 'lccn']

# Write one JSON file per target: {'target': <target record>, 'data': [<candidate records>]}.
for target_i in random_targets:
    target = meta.iloc[target_i][rselect]
    results = eval_list(target, meta, max_random=0, exclude_target=True)

    df = results[rselect].reset_index()
    # The results are the candidates only. eval_list can still hand the target
    # back through its same-author matches, so drop it explicitly here.
    df = df[df['htid'] != target.name].copy()
    df['page_count'] = df.htid.apply(get_page_count)

    data = {}
    # Build the target record from the target row itself; the first row of the
    # results is a candidate, not the target, because the target was excluded.
    data['target'] = {'htid': target.name, **target.to_dict(), 'page_count': get_page_count(target.name)}
    data['data'] = df.to_dict(orient='records')

    with open('/data/saddl/rating-candidates/batch1/%s.json' % utils.clean_htid(data['target']['htid']), mode='w') as f:
        # ignore_nan=True is passed so missing (NaN) fields are handled by the encoder.
        json.dump(data, f, cls=NpEncoder, ignore_nan=True)

In [None]:
# Pick 20 random row positions and, for each, show the target followed by
# every work that eval_list proposes for it.
random_targets = np.random.randint(meta.shape[0], size=20)

for target_i in random_targets:
    target = meta.iloc[target_i]

    # Target first, then a separator line.
    display(print_meta_row(target))
    print('=' * 20)

    # Candidate works for this target (no random rows).
    results = eval_list(target, meta, max_random=0)
    for _, result in results.iterrows():
        display(print_meta_row(result))

    print('\n')