In [1]:
def k_char_grams(k, filename):
    grams = set()
    with open(filename) as fp:
        text = fp.read()
        
    i = 0
    while i+k != len(text):
        grams.add(text[i:i+k])
        i += 1
        
    return grams


def k_word_grams(k, filename):
    grams = set()
    with open(filename) as fp:
        text = fp.read().split(' ')
        
    i = 0
    while i+k != len(text):
        grams.add(" ".join(text[i:i+k]))
        i += 1
    
    return grams

In [2]:
fnames = ['text/D1.txt', 'text/D2.txt', 'text/D3.txt', 'text/D4.txt']

G1 = {fname.split('/')[1]: k_char_grams(2, fname) for fname in fnames}
G2 = {fname.split('/')[1]: k_char_grams(3, fname) for fname in fnames}
G3 = {fname.split('/')[1]: k_word_grams(2, fname) for fname in fnames}

In [3]:
print('G1')
print('\n'.join([file + ': # of grams:' + str(len(grams)) for file, grams in G1.items()]))
print()
print('G2')
print('\n'.join([file + ': # of grams:' + str(len(grams)) for file, grams in G2.items()]))
print()
print('G3')
print('\n'.join([file + ': # of grams:' + str(len(grams)) for file, grams in G3.items()]))

G1
D1.txt: # of grams:266
D2.txt: # of grams:265
D3.txt: # of grams:258
D4.txt: # of grams:259

G2
D1.txt: # of grams:815
D2.txt: # of grams:804
D3.txt: # of grams:757
D4.txt: # of grams:771

G3
D1.txt: # of grams:308
D2.txt: # of grams:307
D3.txt: # of grams:294
D4.txt: # of grams:260


In [4]:
def jaccard_sim(A, B):
    return len(A.intersection(B))/len(A.union(B))    

In [5]:
import itertools

print('Jaccard similarity for G1')
for combo in itertools.combinations(G1.items(), 2):
    print('Similarity ({}, {}): {}'.format(combo[0][0], combo[1][0], 
                                           jaccard_sim(combo[0][1], combo[1][1])))
    
print()
print('Jaccard similarity for G2')
for combo in itertools.combinations(G2.items(), 2):
    print('Similarity ({}, {}): {}'.format(combo[0][0], combo[1][0], 
                                           jaccard_sim(combo[0][1], combo[1][1])))
    
print()
print('Jaccard similarity for G3')
for combo in itertools.combinations(G3.items(), 2):
    print('Similarity ({}, {}): {}'.format(combo[0][0], combo[1][0], 
                                           jaccard_sim(combo[0][1], combo[1][1])))

Jaccard similarity for G1
Similarity (D1.txt, D2.txt): 0.9962406015037594
Similarity (D1.txt, D3.txt): 0.9124087591240876
Similarity (D1.txt, D4.txt): 0.7326732673267327
Similarity (D2.txt, D3.txt): 0.9157509157509157
Similarity (D2.txt, D4.txt): 0.7293729372937293
Similarity (D3.txt, D4.txt): 0.6950819672131148

Jaccard similarity for G2
Similarity (D1.txt, D2.txt): 0.9648058252427184
Similarity (D1.txt, D3.txt): 0.7312775330396476
Similarity (D1.txt, D4.txt): 0.35555555555555557
Similarity (D2.txt, D3.txt): 0.7402452619843924
Similarity (D2.txt, D4.txt): 0.3496143958868895
Similarity (D3.txt, D4.txt): 0.3510167992926614

Jaccard similarity for G3
Similarity (D1.txt, D2.txt): 0.7672413793103449
Similarity (D1.txt, D3.txt): 0.2754237288135593
Similarity (D1.txt, D4.txt): 0.012477718360071301
Similarity (D2.txt, D3.txt): 0.3008658008658009
Similarity (D2.txt, D4.txt): 0.016129032258064516
Similarity (D3.txt, D4.txt): 0.014652014652014652


In [92]:
_mersenne_prime = (1 << 61) - 1
_max_hash = (1 << 32) - 1

class min_hash:
    def __init__(self, t, seed=1):
        num_perm = t
        self.seed = seed
        self.hashvalues = np.full(num_perm, _max_hash, dtype=np.uint64)
        generator = np.random.RandomState(self.seed)
        self.permutations = np.array([(generator.randint(1, _mersenne_prime, dtype=np.uint64),
                                       generator.randint(0, _mersenne_prime, dtype=np.uint64))
                                       for _ in range(t)], dtype=np.uint64).T
        
    def update(self, gram):
        hv = np.uint64(hash(gram))
        #hv = struct.unpack('<I', hashlib.sha1(gram).digest()[:4])[0]
        a, b = self.permutations
        phv = np.bitwise_and((hv + b) , np.uint64(_max_hash))
        self.hashvalues = np.minimum(phv, self.hashvalues)
        
    def jaccard(self, other):
        return np.float(np.count_nonzero(self.hashvalues==other.hashvalues)) / np.float(len(self.hashvalues))
    
    

In [95]:
ts = [20,60,150,300,600]
rang = 100
for t in ts:
    _sum = 0
    for r in range(rang):
        seed = random.randint(0,2^31)
        m1, m2 = min_hash(t, seed=seed), min_hash(t, seed=seed)
        for gram in G2['D1.txt']:
            m1.update(gram.encode('utf-8'))
        for gram in G2['D2.txt']:
            m2.update(gram.encode('utf-8'))

        _sum += m1.jaccard(m2)
    
    print(t, _sum/rang)

20 0.9770000000000008
60 0.9640000000000011
150 0.9625999999999999
300 0.9650333333333329
600 0.9648666666666668
