In [132]:
import numpy as np
import pandas as pd
import random
from itertools import permutations,combinations

In [156]:
# Module-level constant; it is not referenced by any of the visible cells.
# NOTE(review): presumably the shingle length intended for the `shin_len`
# arguments of MinHash — confirm against the callers.
SHINGLE_LEN = 5

def jac_sim(A,B):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Parameters
    ----------
    A, B : set or frozenset
        The two collections to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets are treated as identical and
        yield 1.0 instead of raising ``ZeroDivisionError``.
    """
    union = A.union(B)
    if not union:
        # Both inputs are empty: 0/0 is undefined, so use the common
        # convention that two empty sets are fully similar.
        return 1.0
    return len(A.intersection(B)) / len(union)

def vec_sim(A,B):
    """Return the fraction of positions at which two vectors agree.

    Parameters
    ----------
    A, B : array-like
        Sequences to compare element-wise. Plain lists and tuples are
        accepted as well as NumPy arrays (they are converted with
        ``np.asarray``); without the conversion ``A == B`` on two lists is a
        single Python bool and the element-wise count is impossible.

    Returns
    -------
    numpy.float64
        Number of equal positions divided by ``len(A)``.
    """
    a = np.asarray(A)
    b = np.asarray(B)
    # Summing a boolean mask counts its True entries.
    return np.sum(a == b) / len(a)

class MinHash:
    """MinHash signatures and LSH banding for a collection of shingle sets.

    For each of ``num_hashes`` random permutations of the shingle vocabulary,
    every document's signature entry is the position (within the permutation)
    of the first shingle the document contains. The signature is then cut
    into ``b`` bands; two documents that agree on every entry of at least one
    band become a candidate pair.

    Attributes
    ----------
    num_docs, num_hashes, num_shin : int
        Number of documents, signature length, and vocabulary size.
    shin_to_ind : dict
        Maps each distinct shingle to its column in ``onehots``.
    onehots : numpy.ndarray, shape (num_docs, num_shin)
        1 where the document contains the shingle, 0 otherwise.
    sig_m : numpy.ndarray, shape (num_docs, num_hashes)
        Signature matrix. Documents without any shingle hold ``num_shin``
        (one past the largest valid position) in every entry.
    bands : dict
        Maps ``(band_index, value, value, ...)`` to the list of document
        indices whose signature has those values in that band.
    cand_pairs : set of tuple
        Candidate pairs ``(later_doc, earlier_doc)``.
    """

    @classmethod
    def fromfulltext(cls, doc_list, shin_len, num_hashes, b=1, seed=None):
        """Build a MinHash from raw documents by shingling each one first.

        Parameters
        ----------
        doc_list : iterable of sliceable sequences (e.g. str)
            The raw documents.
        shin_len : int
            Shingle length handed to :meth:`get_shingles`.
        num_hashes : int
            Signature length.
        b : int, optional
            Number of LSH bands. Defaults to 1, i.e. two documents are
            candidates only when their whole signatures match.
        seed : optional
            Forwarded to ``__init__`` to make the permutations reproducible.
        """
        doc_shins = [cls.get_shingles(doc, shin_len) for doc in doc_list]
        return cls(doc_shins, num_hashes, b, seed=seed)

    def __init__(self, doc_shins, num_hashes, b, seed=None):
        """Compute the signature matrix and the candidate pairs.

        Parameters
        ----------
        doc_shins : sequence of iterables of hashable shingles
            One collection of shingles per document; may be empty.
        num_hashes : int
            Number of random permutations (signature length).
        b : int
            Number of LSH bands; see :meth:`create_bands`.
        seed : optional
            When given, permutations are drawn from a private
            ``random.Random(seed)``. When ``None`` the module-level
            ``random`` state is used.

        Raises
        ------
        ValueError
            If ``b`` is an integer outside ``1..num_hashes``.
        """
        doc_shins = [set(shins) for shins in doc_shins]

        self.num_docs = len(doc_shins)
        self.num_hashes = num_hashes
        self._rng = random if seed is None else random.Random(seed)
        self.bands = dict()
        self.cand_pairs = set()

        # set().union(*[]) is simply an empty set, so zero documents is fine.
        all_shingles = set().union(*doc_shins)
        self.num_shin = len(all_shingles)
        self.shin_to_ind = {shin: ind for ind, shin in enumerate(all_shingles)}

        self.onehots = np.zeros((self.num_docs, self.num_shin))
        for doc_i, doc_shin in enumerate(doc_shins):
            for shin in doc_shin:
                self.onehots[doc_i, self.shin_to_ind[shin]] = 1

        # Pre-filled with the sentinel so that no entry is ever left holding
        # uninitialised memory.
        self.sig_m = np.full((self.num_docs, self.num_hashes), float(self.num_shin))

        self.create_sig_m()
        self.create_bands(b)

    @staticmethod
    def get_shingles(doc, shin_len):
        """Return the set of all contiguous length-``shin_len`` slices of ``doc``.

        A document shorter than ``shin_len`` yields an empty set. Declared
        static so that it can be used from :meth:`fromfulltext` before any
        instance exists; ``instance.get_shingles(doc, shin_len)`` still works.
        """
        return {doc[i:i + shin_len] for i in range(len(doc) - shin_len + 1)}

    def create_sig_m(self):
        """Fill ``sig_m`` using ``num_hashes`` random shingle permutations."""
        self.sig_m.fill(float(self.num_shin))
        if self.num_docs == 0 or self.num_shin == 0:
            # Nothing to permute; argmax over an empty axis would raise.
            return

        # Rows without a single 1 must keep the sentinel: argmax would
        # otherwise report position 0 for them.
        has_shingles = self.onehots.any(axis=1)
        perm = list(range(self.num_shin))
        for hash_i in range(self.num_hashes):
            self._rng.shuffle(perm)
            # Reorder the columns by the permutation; argmax then returns the
            # first position holding a 1 for every document at once.
            first_pos = self.onehots[:, perm].argmax(axis=1)
            self.sig_m[has_shingles, hash_i] = first_pos[has_shingles]

    def get_sig_m(self):
        """Return the (num_docs, num_hashes) signature matrix."""
        return self.sig_m

    def create_bands(self, b):
        """Split every signature into ``b`` bands and collect candidate pairs.

        Any previous banding result is discarded first, so the method can be
        called again (for instance with another ``b``) without mixing buckets
        or producing self-pairs.

        Raises
        ------
        ValueError
            If ``b`` is an integer outside ``1..num_hashes``. More bands than
            signature entries would create empty bands, on which all
            documents trivially agree.
        """
        if isinstance(b, (int, np.integer)) and not 1 <= b <= self.num_hashes:
            raise ValueError(
                "b must be between 1 and num_hashes (%d), got %d" % (self.num_hashes, b)
            )

        self.bands = dict()
        self.cand_pairs = set()
        for row_i in range(self.sig_m.shape[0]):
            doc_bands = np.array_split(self.sig_m[row_i, :], b)
            for band_i, band in enumerate(doc_bands):
                # The band index is part of the key so that equal values in
                # different bands never land in the same bucket.
                key = (band_i,) + tuple(band.tolist())
                bucket = self.bands.setdefault(key, [])
                self.cand_pairs.update((row_i, other) for other in bucket)
                bucket.append(row_i)

    def get_cand_pairs(self):
        """Return the set of candidate pairs ``(later_doc, earlier_doc)``."""
        return self.cand_pairs


In [185]:
# --- MinHash evaluation on synthetic documents ------------------------------
# Each "document" is a set of permutations of the letters in 'atone'; the
# slices overlap by a known amount, so the true Jaccard similarity is known.

SEED = 0          # fixes the random permutations so the cell is reproducible
NUM_HASHES = 15   # signature length
NUM_BANDS = 5     # number of LSH bands

# MinHash shuffles with the `random` module, so seed it before constructing.
random.seed(SEED)

all_perms = list(permutations('atone'))
doc_list2 = [
    set(all_perms[lo:hi])
    for lo, hi in [(0, 30), (15, 45), (15, 45), (70, 110), (80, 120)]
]

mh = MinHash(doc_list2, NUM_HASHES, NUM_BANDS)
sm = mh.get_sig_m()

# Every unordered pair of document indices.
all_c = list(combinations(list(range(len(doc_list2))),2))
df_eval = pd.DataFrame(index=all_c)
df_eval['True_Sim'] = [jac_sim(doc_list2[x[0]],doc_list2[x[1]]) for x in all_c]
df_eval['Sig_M_Sim'] = [vec_sim(sm[x[0],:],sm[x[1],:]) for x in all_c]

# A bare `df_eval` in the middle of a cell is evaluated and discarded; only
# the last expression is echoed. display() renders the table explicitly.
display(df_eval)

mh.get_cand_pairs()


{(2, 1)}

In [143]:
# Scratch cell: quick checks of NumPy / set behaviour.
# Only the final expression (`A2`) is echoed by the notebook; the earlier bare
# expressions are evaluated and their results discarded.
A1 = np.array([1,2,3,4,5,6,7])

# Split 7 elements into 3 chunks; array_split allows chunks of unequal size.
np.array_split(A1,3)

# np.append returns a new array; A1 itself is left unchanged.
np.append(A1,8)

A2 = {1,2,3}

# set.update adds the given elements in place and returns None.
A2.update([5,6])

A2

{1, 2, 3, 5, 6}