### Datasketch
https://pypi.org/project/datasketch/

In [30]:
!pip install datasketch



In [43]:
from datasketch import MinHash, MinHashLSH

from nltk import ngrams
from nltk import word_tokenize

In [44]:
# First sample corpus: four unrelated proverbs, one string per document.
text_array = [
    "A bird in hand is worth two in the bush.",
    "Good things come to those who wait.",
    "There are other fish in the sea.",
    "The ball is in your court.",
]

In [45]:
# Tokenize every sentence of text_array into a list of word/punctuation tokens.
# NOTE(review): word_tokenize relies on NLTK tokenizer data being installed —
# confirm it is available when running on a fresh environment.
word_token_array = list(map(word_tokenize, text_array))

# Bare expression so the notebook displays the tokenized sentences.
word_token_array

[['A', 'bird', 'in', 'hand', 'is', 'worth', 'two', 'in', 'the', 'bush', '.'],
 ['Good', 'things', 'come', 'to', 'those', 'who', 'wait', '.'],
 ['There', 'are', 'other', 'fish', 'in', 'the', 'sea', '.'],
 ['The', 'ball', 'is', 'in', 'your', 'court', '.']]

In [46]:
# Preview the word-level 3-grams of each tokenized sentence, one per line,
# prefixed with the index of the sentence it came from.
for index, word_tokens in enumerate(word_token_array):
    trigrams = ngrams(word_tokens, 3)
    for n_gram in trigrams:
        print(index, n_gram)

0 ('A', 'bird', 'in')
0 ('bird', 'in', 'hand')
0 ('in', 'hand', 'is')
0 ('hand', 'is', 'worth')
0 ('is', 'worth', 'two')
0 ('worth', 'two', 'in')
0 ('two', 'in', 'the')
0 ('in', 'the', 'bush')
0 ('the', 'bush', '.')
1 ('Good', 'things', 'come')
1 ('things', 'come', 'to')
1 ('come', 'to', 'those')
1 ('to', 'those', 'who')
1 ('those', 'who', 'wait')
1 ('who', 'wait', '.')
2 ('There', 'are', 'other')
2 ('are', 'other', 'fish')
2 ('other', 'fish', 'in')
2 ('fish', 'in', 'the')
2 ('in', 'the', 'sea')
2 ('the', 'sea', '.')
3 ('The', 'ball', 'is')
3 ('ball', 'is', 'in')
3 ('is', 'in', 'your')
3 ('in', 'your', 'court')
3 ('your', 'court', '.')


In [47]:
# LSH index used to look up near-duplicate texts. num_perm is the same value
# that is passed to every MinHash inserted into this index.
min_hash_lsh = MinHashLSH(
    num_perm=128,
    threshold=0.5,
)

In [48]:
# Build one MinHash signature per text and register it in the LSH index.
#
# The index is (re)created here, next to the inserts, so this cell can be
# re-run safely: MinHashLSH.insert raises ValueError when a key already exists,
# which is what happens if the inserts are repeated against an index left over
# from a previous run of this cell.
min_hash_lsh = MinHashLSH(threshold=0.5, num_perm=128)
min_hashes = {}

for index, text in enumerate(text_array):
    # num_perm must be the same as the one the LSH index was created with.
    min_hash = MinHash(num_perm=128)

    # `text` is the raw string, so ngrams() slides over characters here:
    # each shingle is a 3-character substring, not a word 3-gram.
    for n_gram in ngrams(text, 3):
        min_hash.update("".join(n_gram).encode('utf-8'))

    # Key each signature by the position of its text in text_array.
    min_hash_lsh.insert(index, min_hash)
    min_hashes[index] = min_hash

In [49]:
# Inspect the key -> MinHash mapping built in the previous cell; the keys are
# the positions of the sentences in text_array.
min_hashes

{0: <datasketch.minhash.MinHash at 0x11bc3a9b0>,
 1: <datasketch.minhash.MinHash at 0x11bc3ab00>,
 2: <datasketch.minhash.MinHash at 0x11bc392e8>,
 3: <datasketch.minhash.MinHash at 0x11bc39320>}

In [50]:
# Query the LSH index with every stored signature and report the keys of the
# texts it returns as candidates (a text is expected to match at least itself).
for i, min_hash in min_hashes.items():
    result = min_hash_lsh.query(min_hash)
    print("Candidate pairs with Jaccard similarity > 0.5 for input", i, ":", result)

Candidate pairs with Jaccard similarity > 0.5 for input 0 : [0]
Candidate pairs with Jaccard similarity > 0.5 for input 1 : [1]
Candidate pairs with Jaccard similarity > 0.5 for input 2 : [2]
Candidate pairs with Jaccard similarity > 0.5 for input 3 : [3]


In [55]:
# Second sample corpus: texts 0/1 and 2/3 are slightly altered variants of the
# same proverb (changed words / typos); texts 4 and 5 have no counterpart.
text_array = [
    "A bird in hand is worth two in the bush.",
    "A bird in hands is worth three in the bushes.",
    "Good things come to those who wait.",
    "Good tpings cxme to those who wait long.",
    "There are other fish in the sea.",
    "The ball is in your court.",
]

In [56]:
# Fresh LSH index for the second corpus. num_perm is the same value that is
# passed to every MinHash inserted into this index.
min_hash_lsh = MinHashLSH(
    num_perm=128,
    threshold=0.5,
)

In [57]:
# Build one MinHash signature per text and register it in the LSH index.
#
# The index is (re)created here, next to the inserts, so this cell can be
# re-run safely: MinHashLSH.insert raises ValueError when a key already exists,
# which is what happens if the inserts are repeated against an index left over
# from a previous run of this cell.
min_hash_lsh = MinHashLSH(threshold=0.5, num_perm=128)
min_hashes = {}

for index, text in enumerate(text_array):
    # num_perm must be the same as the one the LSH index was created with.
    min_hash = MinHash(num_perm=128)

    # `text` is the raw string, so ngrams() slides over characters here:
    # each shingle is a 3-character substring, not a word 3-gram.
    for n_gram in ngrams(text, 3):
        min_hash.update("".join(n_gram).encode('utf-8'))

    # Key each signature by the position of its text in text_array.
    min_hash_lsh.insert(index, min_hash)
    min_hashes[index] = min_hash

In [58]:
# Query the LSH index with every stored signature and report the keys of the
# texts it returns as candidates (a text is expected to match at least itself).
for i, min_hash in min_hashes.items():
    result = min_hash_lsh.query(min_hash)
    print("Candidate pairs with Jaccard similarity > 0.5 for input", i, ":", result)

Candidate pairs with Jaccard similarity > 0.5 for input 0 : [0, 1]
Candidate pairs with Jaccard similarity > 0.5 for input 1 : [0, 1]
Candidate pairs with Jaccard similarity > 0.5 for input 2 : [2, 3]
Candidate pairs with Jaccard similarity > 0.5 for input 3 : [2, 3]
Candidate pairs with Jaccard similarity > 0.5 for input 4 : [4]
Candidate pairs with Jaccard similarity > 0.5 for input 5 : [5]
