## Generating Evaluation Lists

In [32]:
from SRP import Vector_file
import pandas as pd
import os
import random
import numpy as np
from scipy.spatial.distance import cdist, pdist
meta = pd.read_csv('../../sampling/test_dataset.csv.gz', low_memory=False).set_index('htid').sort_index()
# Add index num as col, to preserve the htid index
meta['n'] = range(1, len(meta)+1)

In [48]:
# Fast Text Matching
with Vector_file('fastTextVecs.bin', mode='r', dims=100) as vfile:
    allvecs = vfile.to_matrix()

In [215]:
def eval_list(target, meta, sim_cutoff=0.05, sim_metric='cosine', max_by_author=10, max_random=3):
    '''
    Return a list of works to hand-code for evaluation.
    
    target: A row representing the target book
    meta: The dataframe of all metadata
    '''
    # Remember not to fillna, else you'll get a lot of matches for a blank field!
    oclc_match = meta[(meta.oclc_num == target.oclc_num)]
    lccn_match = meta[(meta.lccn == target.lccn)]
    isbn_match = meta[meta.isbn == target.isbn]
    title_match = meta[meta['title'] == target['title']]
    short_title_match = meta[meta['title'].apply(lambda s: s[:30]) == target['title'][:30]]

    # Fast text matches
    results = cdist(allvecs['matrix'][target.n:target.n+1], allvecs['matrix'], metric=sim_metric)
    sorted_results = pd.Series(results[0]).sort_values()
    print(sorted_results.index[0])
    top_results_i = sorted_results[sorted_results < sim_cutoff].index.tolist()
    fasttext_match = meta.iloc[top_results_i]

    init_matches = pd.concat([oclc_match, lccn_match, isbn_match, title_match, short_title_match, fasttext_match]).drop_duplicates()
    init_matches = init_matches[init_matches.index != target.name]

    # Cap number of same author results, but excluding above results
    author_match = meta[meta['author'] == target['author']]
    unique_amatches = author_match.index.difference(init_matches.index)
    author_match = author_match.loc[unique_amatches].iloc[:max_by_author]
    
    rand_match = meta.sample(max_random)

    final_matches = pd.concat([init_matches, author_match, rand_match]).drop_duplicates()
    return final_matches

In [212]:
from IPython.core.display import display, HTML

def print_meta_row(target):
    target = target.fillna(' ')
    html = "<strong><a href='%s'>%s</a></strong> by <em>%s</em> (%s)" % (target.name, target.title, target.author, target.rights_date_used)
    html += "&nbsp;&nbsp;&nbsp;OCLC %s / ISBN %s / ISSN %s / LCCN %s" % (target.oclc_num, target.isbn, target.issn, target.lccn)
    return HTML(html)

print_meta_row(target)

In [216]:
target = meta.iloc[i]
target[['title', 'author', 'description', 'oclc_num', 'access', 'rights_date_used']]

random_targets = np.random.randint(meta.shape[0], size=20)

for target_i in random_targets:
        target = meta.iloc[target_i]
        display(print_meta_row(target))
        print('=' * 20)
        results = eval_list(target, meta, max_random=0)

        for htid, result in results.iterrows():
            display(print_meta_row(result))
        print('\n')

26096






39672






113786






15028






28246






6982






17083






105587






141950






91622






139301






78999






80919






117312






79010






128058






19832






61791






134721






39020




