In [2141]:
import configparser  # for reading the parameters file
import sys  # for system errors and printouts
from pathlib import Path  # for paths of files
import os  # for reading the input data
import time  # for timing
import numpy as np  # for creating matrices or arrays
import random  # for randomly generating a and b for hash functions
from itertools import combinations  # for creating candidate pairs in lsh
import re
#import nltk
#from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize
import random

In [2142]:
parameter_file = 'default_parameters.ini'  # the main parameters file
data_main_directory = Path('data')
parameters_dictionary = dict()
document_list = dict()

In [2143]:
def next_prime(N):
    def is_prime(n):
        if n <= 2:
            return n == 2
        if n % 2 == 0:
            return False
        p = 3
        while p * p <= n:
            if n % p == 0:
                return False
            p += 2
        return True

    prime = N + 1
    while not is_prime(prime):
        prime += 1
    return prime

In [2144]:
def read_parameters():
    config = configparser.ConfigParser()
    config.read(parameter_file)
    for section in config.sections():
        for key in config[section]:
            if key == 'data':
                parameters_dictionary[key] = config[section][key]
            elif key == 'naive':
                parameters_dictionary[key] = bool(config[section][key])
            elif key == 't':
                parameters_dictionary[key] = float(config[section][key])
            else:
                parameters_dictionary[key] = int(config[section][key])


read_parameters()

In [2145]:
def read_data(data_path):
    for (root, dirs, file) in os.walk(data_path):
        for f in file:
            file_path = data_path / f
            doc = open(file_path).read().strip().replace('\n', ' ')
            file_id = int(file_path.stem)
            document_list[file_id] = doc


data_folder = data_main_directory / parameters_dictionary['data']
read_data(data_folder)
document_list = {k: document_list[k] for k in sorted(document_list)}

In [2146]:
Sentence_1 = " Big Data platform  students  Blackboard"
Sentence_2 = "Questions  MinHash project NTNU students Piazza"
Sentence_3 = "NTNU Big Data platform  Blackboard  Piazza"
Sentence_4 = " project data  students   Blackboard  Piazza"

Sentence_1, Sentence_2, Sentence_3, Sentence_4 = [sentence.lower()
                                                  for sentence in [Sentence_1, Sentence_2, Sentence_3, Sentence_4]]

document_list = {1: Sentence_1, 2: Sentence_2, 3: Sentence_3, 4: Sentence_4}

shingles = ['big', 'blackboard', 'data', 'minhash', 'ntnu',
                'piazza', 'platform', 'project', 'questions', 'students']

In [2147]:
for i in range(len(document_list)):
    print(i + 1, ': ', document_list[i + 1])

1 :   big data platform  students  blackboard
2 :  questions  minhash project ntnu students piazza
3 :  ntnu big data platform  blackboard  piazza
4 :   project data  students   blackboard  piazza


In [2148]:
# WORKS!

def k_shingles():
    docs_k_shingles = []
    #k = parameters_dictionary['k']
    k = 1
    non_word_pattern = re.compile(r'[^\w\s]')

    for doc_id, document in document_list.items():
        cleaned_doc = re.sub(non_word_pattern, '', document)
        words = cleaned_doc.split()
        k_shingles_set = set([' '.join(words[i:i+k]) for i in range(len(words) - k + 1)])
        docs_k_shingles.append(k_shingles_set)

    return docs_k_shingles

In [2149]:
all_docs_k_shingles = k_shingles()
for i in range(len(all_docs_k_shingles)):
    print(i + 1, ': ', all_docs_k_shingles[i])


all_docs_k_shingles

1 :  {'platform', 'students', 'data', 'big', 'blackboard'}
2 :  {'students', 'minhash', 'piazza', 'project', 'ntnu', 'questions'}
3 :  {'platform', 'data', 'piazza', 'ntnu', 'big', 'blackboard'}
4 :  {'students', 'piazza', 'data', 'project', 'blackboard'}


[{'big', 'blackboard', 'data', 'platform', 'students'},
 {'minhash', 'ntnu', 'piazza', 'project', 'questions', 'students'},
 {'big', 'blackboard', 'data', 'ntnu', 'piazza', 'platform'},
 {'blackboard', 'data', 'piazza', 'project', 'students'}]

In [2150]:
def signature_set(k_shingles):

    all_unique_shingles = set().union(*k_shingles)  # can add *k_shingles instead
    all_unique_shingles_list = list(all_unique_shingles)

    shingle_to_index = {shingle: idx for idx,
                        shingle in enumerate(all_unique_shingles_list)}

    num_docs = len(k_shingles)
    num_shingles = len(all_unique_shingles)
    input_matrix = np.zeros((num_shingles, num_docs), dtype=int)

    for doc_idx, shingles_set in enumerate(k_shingles):
        for shingle in shingles_set:
            shingle_idx = shingle_to_index[shingle]
            input_matrix[shingle_idx, doc_idx] = 1

    return input_matrix

In [2151]:
input_matrix = signature_set(all_docs_k_shingles)
input_matrix

array([[1, 0, 1, 0],
       [1, 1, 0, 1],
       [0, 1, 0, 0],
       [1, 0, 1, 1],
       [0, 1, 1, 1],
       [0, 1, 0, 1],
       [0, 1, 1, 0],
       [0, 1, 0, 0],
       [1, 0, 1, 0],
       [1, 0, 1, 1]])

test (above) = actual (in task 1) 

1 = 1 

2 = 10


3 = 4


4 = 3


5 = 6


6 = 8


7 = 5


8 = 9


9 = 7


10 = 2

I.e all rows (shingles / Unique words) are present, but in a different order (not alphabetially sorted). OK

In [2152]:
input_matrix.shape[0], input_matrix.shape[1]
# Num Shingles X Num Docs

(10, 4)

In [2153]:
def generate_hash_functions(num_perm, N):
    hash_funcs = []
    for i in range(1, num_perm + 1):
        a = random.randint(1, N)  
        b = random.randint(0, N)  
        p = next_prime(N)     
        hash_func = (lambda x, a=a, b=b, p=p: ((a * x + b) % (p)) + 1, {'a': a, 'b': b, 'p': p})
        hash_funcs.append(hash_func)
    return hash_funcs

In [2154]:
def minHash(docs_signature_sets, hash_fn):
    input_matrix = docs_signature_sets # simplicity
    num_shingles = input_matrix.shape[0]  # num rows
    num_docs = input_matrix.shape[1]  # num columns
    num_permutation = len(hash_fn)
    min_hash_signatures = np.full((num_permutation, num_docs), np.inf)

    for permutation, (hash_func, params) in enumerate(hash_fn):
        print(
            f"Hash function #{permutation}: a={params['a']}, b={params['b']}, p={params['p']}")
        for doc in range(num_docs): # for each doc, column
            for shingle in range(num_shingles): # for each shingle
                if input_matrix[shingle, doc] == 1:                    
                    shingle_hash = hash_func(shingle)
                    min_hash_signatures[permutation, doc] = min(
                        min_hash_signatures[permutation, doc], shingle_hash)

    return min_hash_signatures

In [2155]:
#hash_fn = generate_hash_functions(parameters_dictionary['permutations'], len(input_matrix))
hash_fn = generate_hash_functions(1000, len(input_matrix))
min_hash_signatures = minHash(input_matrix, hash_fn)

Hash function #0: a=1, b=6, p=11
Hash function #1: a=10, b=0, p=11
Hash function #2: a=1, b=10, p=11
Hash function #3: a=3, b=2, p=11
Hash function #4: a=2, b=8, p=11
Hash function #5: a=3, b=10, p=11
Hash function #6: a=2, b=5, p=11
Hash function #7: a=4, b=4, p=11
Hash function #8: a=2, b=7, p=11
Hash function #9: a=10, b=4, p=11
Hash function #10: a=4, b=6, p=11
Hash function #11: a=5, b=6, p=11
Hash function #12: a=3, b=5, p=11
Hash function #13: a=4, b=6, p=11
Hash function #14: a=3, b=0, p=11
Hash function #15: a=5, b=7, p=11
Hash function #16: a=3, b=4, p=11
Hash function #17: a=7, b=6, p=11
Hash function #18: a=6, b=10, p=11
Hash function #19: a=2, b=7, p=11
Hash function #20: a=4, b=0, p=11
Hash function #21: a=2, b=3, p=11
Hash function #22: a=9, b=4, p=11
Hash function #23: a=7, b=10, p=11
Hash function #24: a=9, b=5, p=11
Hash function #25: a=5, b=6, p=11
Hash function #26: a=3, b=0, p=11
Hash function #27: a=5, b=7, p=11
Hash function #28: a=4, b=9, p=11
Hash function #29:

In [2156]:
# Permutations x Num_Documents
min_hash_signatures

array([[4., 1., 2., 1.],
       [1., 5., 1., 3.],
       [1., 1., 3., 1.],
       ...,
       [5., 1., 4., 4.],
       [1., 3., 1., 2.],
       [1., 2., 1., 3.]])

In [2157]:
true_jaccard_similarity = [0.1, 0.5714, 0.4285, 0.2, 0.375, 0.375]

In [2158]:
def jaccard_similarity(matrix, i, j):
    return np.sum(matrix[:, i] == matrix[:, j]) / matrix.shape[0]

'''
If we let the number og permutations grow large, the error from true similarity decreases. I.e., it works 
However, its a bit trash, need like 1000 permuatations for the result to be decent
'''

for i in range(min_hash_signatures.shape[1]):
    for j in range(i + 1, min_hash_signatures.shape[1]):
        print(
            f"Jaccard Similarity between Sentence {i + 1} and Sentence {j + 1} is {jaccard_similarity(min_hash_signatures, i, j)}")

Jaccard Similarity between Sentence 1 and Sentence 2 is 0.097
Jaccard Similarity between Sentence 1 and Sentence 3 is 0.562
Jaccard Similarity between Sentence 1 and Sentence 4 is 0.412
Jaccard Similarity between Sentence 2 and Sentence 3 is 0.22
Jaccard Similarity between Sentence 2 and Sentence 4 is 0.366
Jaccard Similarity between Sentence 3 and Sentence 4 is 0.345
