In [3664]:
import configparser  # for reading the parameters file
import sys  # for system errors and printouts
from pathlib import Path  # for paths of files
import os  # for reading the input data
import time  # for timing
import numpy as np  # for creating matrices or arrays
import random  # for randomly generating a and b for hash functions
from itertools import combinations  # for creating candidate pairs in lsh
import re
#import nltk
#from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
import random

In [3665]:
parameter_file = 'default_parameters.ini'  # the main parameters file
data_main_directory = Path('data')
parameters_dictionary = dict()
document_list = dict()

In [3666]:
def next_prime(N):
    def is_prime(n):
        if n <= 2:
            return n == 2
        if n % 2 == 0:
            return False
        p = 3
        while p * p <= n:
            if n % p == 0:
                return False
            p += 2
        return True

    prime = N + 1
    while not is_prime(prime):
        prime += 1
    return prime

In [3667]:
def read_parameters():
    config = configparser.ConfigParser()
    config.read(parameter_file)
    for section in config.sections():
        for key in config[section]:
            if key == 'data':
                parameters_dictionary[key] = config[section][key]
            elif key == 'naive':
                parameters_dictionary[key] = bool(config[section][key])
            elif key == 't':
                parameters_dictionary[key] = float(config[section][key])
            else:
                parameters_dictionary[key] = int(config[section][key])


read_parameters()

In [3668]:
def read_data(data_path):
    for (root, dirs, file) in os.walk(data_path):
        for f in file:
            file_path = data_path / f
            doc = open(file_path).read().strip().replace('\n', ' ')
            file_id = int(file_path.stem)
            document_list[file_id] = doc


data_folder = data_main_directory / parameters_dictionary['data']
read_data(data_folder)
document_list = {k: document_list[k] for k in sorted(document_list)}

Use the toy data to test, remove stop words as in Task 1

In [3669]:
Sentence_1 = " Big Data platform  students  Blackboard"
Sentence_2 = "Questions  MinHash project NTNU students Piazza"
Sentence_3 = "NTNU Big Data platform  Blackboard  Piazza"
Sentence_4 = " project data  students   Blackboard  Piazza"

Sentence_1, Sentence_2, Sentence_3, Sentence_4 = [sentence.lower()
                                                  for sentence in [Sentence_1, Sentence_2, Sentence_3, Sentence_4]]

document_list = {1: Sentence_1, 2: Sentence_2, 3: Sentence_3, 4: Sentence_4}

shingles = ['big', 'blackboard', 'data', 'minhash', 'ntnu',
                'piazza', 'platform', 'project', 'questions', 'students']

In [3670]:
for i in range(len(document_list)):
    print(i + 1, ': ', document_list[i + 1])

1 :   big data platform  students  blackboard
2 :  questions  minhash project ntnu students piazza
3 :  ntnu big data platform  blackboard  piazza
4 :   project data  students   blackboard  piazza


In [3671]:
# WORKS!
def k_shingles():
    docs_k_shingles = []
    k = parameters_dictionary['k']

    for _, document in document_list.items():

        cleaned_doc = ''.join(
            c for c in document if c.isalnum() or c.isspace())
        words = cleaned_doc.split()
        k_shingles_set = set(' '.join(words[i:i+k])
                             for i in range(len(words) - k + 1))
        docs_k_shingles.append(k_shingles_set)

    return docs_k_shingles

In [3672]:
all_docs_k_shingles = k_shingles()
for i in range(len(all_docs_k_shingles)):
    print(i + 1, ': ', all_docs_k_shingles[i])


all_docs_k_shingles

1 :  {'big data platform students blackboard'}
2 :  {'minhash project ntnu students piazza', 'questions minhash project ntnu students'}
3 :  {'ntnu big data platform blackboard', 'big data platform blackboard piazza'}
4 :  {'project data students blackboard piazza'}


[{'big data platform students blackboard'},
 {'minhash project ntnu students piazza',
  'questions minhash project ntnu students'},
 {'big data platform blackboard piazza', 'ntnu big data platform blackboard'},
 {'project data students blackboard piazza'}]

In [3673]:
def signature_set(k_shingles):

    all_unique_shingles = set().union(*k_shingles)  # can add *k_shingles instead
    all_unique_shingles_list = list(all_unique_shingles)

    shingle_to_index = {shingle: idx for idx,
                        shingle in enumerate(all_unique_shingles_list)}

    num_docs = len(k_shingles)
    num_shingles = len(all_unique_shingles)
    input_matrix = np.zeros((num_shingles, num_docs), dtype=int)

    for doc_idx, shingles_set in enumerate(k_shingles):
        for shingle in shingles_set:
            shingle_idx = shingle_to_index[shingle]
            input_matrix[shingle_idx, doc_idx] = 1

    return input_matrix

In [3674]:
input_matrix = signature_set(all_docs_k_shingles)
input_matrix

array([[1, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0]])

test (above) = actual (in task 1) 

1 = 1 

2 = 10


3 = 4


4 = 3


5 = 6


6 = 8


7 = 5


8 = 9


9 = 7


10 = 2

I.e all rows (shingles / Unique words) are present, but in a different order (not alphabetially sorted). OK

In [3675]:
input_matrix.shape[0], input_matrix.shape[1]
# Num Shingles X Num Docs

(6, 4)

In [3676]:
def generate_hash_functions(num_perm, N):
    hash_funcs = []
    for i in range(1, num_perm + 1):
        a = random.randint(1, N)
        b = random.randint(0, N)
        p = next_prime(N)
        hash_func = (lambda x, a=a, b=b, p=p: ((a * x + b) %
                     (p)) + 1, {'a': a, 'b': b, 'p': p})
        hash_funcs.append(hash_func)
    return hash_funcs

In [3677]:
def minHash(docs_signature_sets, hash_fn):

    input_matrix = docs_signature_sets  # for simplicity and readability

    num_shingles = input_matrix.shape[0]  # num rows
    num_docs = input_matrix.shape[1]  # num columns
    num_permutation = len(hash_fn)
    min_hash_signatures = np.full((num_permutation, num_docs), np.inf)

    for shingle in range(num_shingles):  # for each shingle, row
        for doc in range(num_docs):  # for each doc, column
            # ensures that hash functions are applied only to shingles that actually appear in the document
            if input_matrix[shingle, doc] == 1:
                for permutation, (hash_func, params) in enumerate(hash_fn):
                    shingle_hash = hash_func(shingle, **params)
                    min_hash_signatures[permutation, doc] = min(  # permutation is the row, doc is the column if Sig_M
                        min_hash_signatures[permutation, doc], shingle_hash)

    return min_hash_signatures

In [3678]:
#hash_fn = generate_hash_functions(parameters_dictionary['permutations'], len(input_matrix))
hash_fn = generate_hash_functions(100, len(input_matrix))
min_hash_signatures = minHash(input_matrix, hash_fn)

In [3679]:
# Permutations x Num_Documents
min_hash_signatures

array([[4., 1., 5., 2.],
       [6., 4., 1., 2.],
       [3., 5., 4., 2.],
       [1., 3., 4., 5.],
       [6., 1., 2., 3.],
       [2., 3., 1., 4.],
       [2., 1., 3., 7.],
       [3., 1., 2., 4.],
       [6., 2., 4., 1.],
       [1., 2., 6., 3.],
       [7., 2., 3., 4.],
       [3., 2., 4., 1.],
       [5., 1., 2., 4.],
       [7., 4., 1., 5.],
       [6., 1., 2., 3.],
       [3., 1., 2., 4.],
       [2., 4., 3., 1.],
       [4., 3., 2., 1.],
       [3., 5., 4., 2.],
       [7., 1., 2., 3.],
       [1., 5., 4., 2.],
       [4., 3., 2., 1.],
       [4., 2., 1., 7.],
       [4., 1., 3., 5.],
       [7., 2., 3., 4.],
       [5., 4., 1., 2.],
       [1., 3., 4., 5.],
       [4., 2., 1., 7.],
       [1., 3., 4., 5.],
       [2., 1., 5., 6.],
       [6., 1., 2., 3.],
       [5., 2., 6., 3.],
       [7., 1., 5., 2.],
       [7., 4., 3., 1.],
       [3., 1., 5., 6.],
       [7., 4., 1., 5.],
       [6., 1., 3., 5.],
       [3., 2., 4., 1.],
       [3., 2., 4., 1.],
       [1., 3., 4., 5.],


In [3680]:
true_jaccard_similarity = [0.1, 0.5714, 0.4285, 0.2, 0.375, 0.375]

In [3681]:
def jaccard_similarity(matrix, i, j):
    return np.sum(matrix[:, i] == matrix[:, j]) / matrix.shape[0]

'''
If we let the number og permutations grow large, the error from true similarity decreases. I.e., it works 
However, its a bit trash, need like 1000 permuatations for the result to be decent. And 1000 rows is more
than the original, and there is no computational gain, but this may be because our data is so small
'''

for i in range(min_hash_signatures.shape[1]):
    for j in range(i + 1, min_hash_signatures.shape[1]):
        print(
            f"Jaccard Similarity between Sentence {i + 1} and Sentence {j + 1} is {jaccard_similarity(min_hash_signatures, i, j)}")

Jaccard Similarity between Sentence 1 and Sentence 2 is 0.0
Jaccard Similarity between Sentence 1 and Sentence 3 is 0.0
Jaccard Similarity between Sentence 1 and Sentence 4 is 0.0
Jaccard Similarity between Sentence 2 and Sentence 3 is 0.0
Jaccard Similarity between Sentence 2 and Sentence 4 is 0.0
Jaccard Similarity between Sentence 3 and Sentence 4 is 0.0


In [3682]:
def lsh(min_hash_signatures):

    b = parameters_dictionary['b']
    num_rows = min_hash_signatures.shape[0]  # num permutations
    num_docs = min_hash_signatures.shape[1]
    rows_per_band = num_rows // b  # divide the rows of the signature matrix into b bands
    candidates = set()

    for band in range(b):
        start_index = band * rows_per_band
        end_index = start_index + rows_per_band
        buckets = {}

        for doc in range(num_docs):
            # extract MinHash values for the current document in the current band
            band_slice = tuple(min_hash_signatures[start_index:end_index, doc])
            print(band_slice)
            band_hash = hash(band_slice)
            print(band_hash)
            if band_hash not in buckets:
                buckets[band_hash] = [doc]
            else:
                for candidate_doc in buckets[band_hash]:
                    candidates.add((candidate_doc, doc))
                buckets[band_hash].append(doc)

    return candidates

In [3683]:
candidates = lsh(min_hash_signatures)
candidates

(4.0, 6.0, 3.0, 1.0, 6.0)
-1404534377251821405
(1.0, 4.0, 5.0, 3.0, 1.0)
-582485975262488327
(5.0, 1.0, 4.0, 4.0, 2.0)
-8906811249165081092
(2.0, 2.0, 2.0, 5.0, 3.0)
-8377305782804328457
(2.0, 2.0, 3.0, 6.0, 1.0)
2113126137666713194
(3.0, 1.0, 1.0, 2.0, 2.0)
6241205306661066447
(1.0, 3.0, 2.0, 4.0, 6.0)
-6305498743342790870
(4.0, 7.0, 4.0, 1.0, 3.0)
-8493525859713514233
(7.0, 3.0, 5.0, 7.0, 6.0)
1960841052811665867
(2.0, 2.0, 1.0, 4.0, 1.0)
-4144346545389656819
(3.0, 4.0, 2.0, 1.0, 2.0)
3112524076205350403
(4.0, 1.0, 4.0, 5.0, 3.0)
-8537250652368104507
(3.0, 2.0, 4.0, 3.0, 7.0)
6780078700826965266
(1.0, 4.0, 3.0, 5.0, 1.0)
-4729945923772302536
(2.0, 3.0, 2.0, 4.0, 2.0)
6220349440742185675
(4.0, 1.0, 1.0, 2.0, 3.0)
-8438153806684176286
(1.0, 4.0, 4.0, 4.0, 7.0)
3059798096485113988
(5.0, 3.0, 2.0, 1.0, 2.0)
-9148755222282865386
(4.0, 2.0, 1.0, 3.0, 3.0)
-2325550999726738870
(2.0, 1.0, 7.0, 5.0, 4.0)
871000895491068085
(5.0, 1.0, 4.0, 1.0, 2.0)
-1166675331747248844
(4.0, 3.0, 2.0, 3.0, 1.

set()

In [3684]:
def candidates_similarities(candidate_docs, min_hash_matrix):
    similarity_dict = {}
    #t = parameters_dictionary['t']
    t = 0.5

    for candidate_pair in candidate_docs:

        doc1, doc2 = list(candidate_pair)

        agreement = np.sum(
            min_hash_matrix[:, doc1] == min_hash_matrix[:, doc2])

        similarity = agreement / min_hash_matrix.shape[0]

        if similarity > t:
            similarity_dict[candidate_pair] = similarity

    return similarity_dict

In [3685]:
similarities = candidates_similarities(candidates, min_hash_signatures)
similarities

{}

In [3686]:
def generate_hash_functions(num_perm, N):
    hash_funcs = []
    for i in range(1, num_perm + 1):
        a = random.randint(1, N)
        b = random.randint(0, N)
        p = next_prime(N)
        hash_func = (lambda x, a=a, b=b, p=p: ((a * x + b) %
                     (p)) + 1, {'a': a, 'b': b, 'p': p})
        hash_funcs.append(hash_func)
    return hash_funcs

In [3687]:
hash_funcs = generate_hash_functions(3, len(input_matrix))
hash_funcs

[(<function __main__.generate_hash_functions.<locals>.<lambda>(x, a=4, b=1, p=7)>,
  {'a': 4, 'b': 1, 'p': 7}),
 (<function __main__.generate_hash_functions.<locals>.<lambda>(x, a=6, b=1, p=7)>,
  {'a': 6, 'b': 1, 'p': 7}),
 (<function __main__.generate_hash_functions.<locals>.<lambda>(x, a=4, b=0, p=7)>,
  {'a': 4, 'b': 0, 'p': 7})]

In [3688]:
for permutation, (hash_func, params) in enumerate(hash_fn):
    print(
        f"Hash function #{permutation}: a={params['a']}, b={params['b']}, p={params['p']}")

Hash function #0: a=4, b=3, p=7
Hash function #1: a=1, b=5, p=7
Hash function #2: a=2, b=2, p=7
Hash function #3: a=6, b=0, p=7
Hash function #4: a=6, b=5, p=7
Hash function #5: a=3, b=1, p=7
Hash function #6: a=4, b=1, p=7
Hash function #7: a=5, b=2, p=7
Hash function #8: a=3, b=5, p=7
Hash function #9: a=3, b=0, p=7
Hash function #10: a=6, b=6, p=7
Hash function #11: a=4, b=2, p=7
Hash function #12: a=2, b=4, p=7
Hash function #13: a=4, b=6, p=7
Hash function #14: a=6, b=5, p=7
Hash function #15: a=5, b=2, p=7
Hash function #16: a=2, b=1, p=7
Hash function #17: a=6, b=3, p=7
Hash function #18: a=2, b=2, p=7
Hash function #19: a=1, b=6, p=7
Hash function #20: a=5, b=0, p=7
Hash function #21: a=6, b=3, p=7
Hash function #22: a=1, b=3, p=7
Hash function #23: a=5, b=3, p=7
Hash function #24: a=6, b=6, p=7
Hash function #25: a=6, b=4, p=7
Hash function #26: a=6, b=0, p=7
Hash function #27: a=1, b=3, p=7
Hash function #28: a=6, b=0, p=7
Hash function #29: a=6, b=1, p=7
Hash function #30: a

In [3689]:
tuple_in = min_hash_signatures[0:10, 0]
tuple_in

array([4., 6., 3., 1., 6., 2., 2., 3., 6., 1.])

In [3690]:
band_slice = tuple(tuple_in)


band_hash = hash(band_slice)
band_hash

4829124873993804287