In [41]:
# Re-import edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2

# Repackaging MinHashes

The process of creating MinHashes from Bookworm input files led to some duplicates. Since I'm not sure whether each duplicated volume's records are fingerprints of the entire text or accidental fragments of it, I reprocess those volumes from the original [Extracted Features](https://wiki.htrc.illinois.edu/display/COM/Extracted+Features+Dataset) files. The duplicated volumes are only a small portion of the collection. At the bottom, this notebook repackages the initial MinHashes into more evenly sized data files, without the original versions of the duplicated volumes.

This notebook is just for posterity: it's unlikely to be useful outside of my use case.

## Checking for volumes that need to be reprocessed

Volumes that show up more than once are saved to `duplicates.txt`.

In [2]:
%%time
import pandas as pd
import os
from hash_utils import HashReader
import glob

# Scan every MinHash data file, collect the volume id of each record, and
# write the ids that occur more than once to duplicates.txt (one per line).
datpaths = glob.glob('data/minhashes/*.dat')
all_ids = []
i = 0  # running count of records seen across all files

for path in datpaths:
    with HashReader(path) as hr:
        # Only the ids are needed for counting, so ask the reader not to
        # deserialize each MinHash payload.
        for htid, _ in hr.hashes(deserialize=False):
            all_ids.append(htid)
            i += 1
            # Lightweight progress indicator for a multi-minute scan.
            if i % 500000 == 0:
                print(i, end=',')
print(i)

# value_counts() gives occurrences per id; anything above one is a duplicate.
counts = pd.Series(all_ids).value_counts()
dupes = counts[counts > 1].index.tolist()

with open('duplicates.txt', mode='w') as f:
    f.write("\n".join(dupes))

500000,1000000,1500000,2000000,2500000,3000000,3500000,4000000,4500000,5000000,5500000,6000000,6500000,7000000,7500000,8000000,8500000,9000000,9500000,10000000,10500000,11000000,11500000,12000000,12500000,13000000,13500000,13751529
CPU times: user 8min 34s, sys: 7.77 s, total: 8min 42s
Wall time: 14min 27s


In [23]:
from htrc_features import utils
# For each duplicated volume, check whether its Extracted Features file is
# available locally. Paths that exist go to success-paths.txt; ids whose file
# cannot be stat'ed go to fail-ids.txt.
successes, fails = [], []
for htid in dupes:
    path = '/notebooks/features/' + utils.id_to_rsync(htid)
    try:
        os.stat(path)
    except (OSError, ValueError):
        # OSError covers missing/inaccessible files; ValueError covers paths
        # the OS call rejects outright (e.g. an embedded null byte). Anything
        # else, such as KeyboardInterrupt, is allowed to propagate.
        fails.append(htid)
    else:
        successes.append(path)

with open('success-paths.txt', mode='w') as f:
    f.write("\n".join(successes))

with open('fail-ids.txt', mode='w') as f:
    f.write("\n".join(fails))

## Repackage MinHashes

Rewrite all non-duplicated hashes. A few minutes of extra processing in the service of a cleaner dataset.

In [75]:
import struct

# Copy every record whose id is NOT in duplicates.txt from the original data
# files into evenly sized output files, leaving the raw bytes untouched.
with open('duplicates.txt', mode='r') as dupes_file:
    dupes = set(dupes_file.read().split('\n'))
dupeset = dupes  # already a set, so no second copy is needed

repack_pattern = 'data/minhashes-repack/minhashes-3.%d.dat'
hashes_per_file = 2000000

i, j = 0, 0  # i: records written so far; j: index of the current output file

f = open(repack_pattern % j, mode='wb')
try:
    for path in datpaths:
        with HashReader(path) as hr:
            for htid, part in hr.hashes(deserialize=False):
                if htid in dupeset:
                    continue
                # Roll over just before writing the first record of the next
                # chunk, so no empty trailing file is created when the total
                # is an exact multiple of hashes_per_file.
                if i and i % hashes_per_file == 0:
                    j += 1
                    f.close()
                    f = open(repack_pattern % j, mode='wb')
                f.write(part)
                i += 1
                if i % 500000 == 0:
                    print(i, end=',')
finally:
    # Close the current output file even if reading or writing fails.
    f.close()
print(i, glob.glob('data/minhashes-repack/minhashes-3*'))

292080 ['data/minhashes-repack/minhashes-2.0.dat']
