# Homework 1

You are to implement the stages of finding textually similar documents based on Jaccard similarity using the shingling, minhashing, and locality-sensitive hashing (LSH) techniques and corresponding algorithms. The implementation can be done using any big data processing framework, such as Apache Spark, Apache Flink, or no framework, e.g., in Java, Python, etc. To test and evaluate your implementation, write a program that uses your implementation to find similar documents in a corpus of 5-10 or more documents, such as web pages or emails.

The stages should be implemented as a collection of classes, modules, functions, or procedures depending on the framework and the language of your choice. Below, we describe sample classes implementing different stages of finding textually similar documents. You do not have to develop the exact same classes and data types described below. Feel free to use data structures that suit you best.

In [1]:
# import packages
from pyspark import SparkContext, SparkConf
import hashlib
import random
import numpy as np
import pandas as pd
import findspark

In [2]:
# initializing Spark
findspark.init()
conf = SparkConf().setAppName("SimDoc").setMaster("local[*]")
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/05 20:31:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# collect texts from dataframes
df1 = pd.read_csv("datasets/winemag-data_first150k.csv")

docus = df1['description'].dropna().tolist()

# creating a list of documents where we store all our texts
documents_map = {i: content for i, content in enumerate(docus)}
documents_map = dict(list(documents_map.items())[:10])

# testing text creation
#print(len(documents_map))
#print(documents_map[0])


In [4]:
# Parameters
k = 5  # Shingle length
signature_len = 100  # Length of minhash signatures
similarity_threshold = 0.1  # Similarity threshold for LSH
num_bands = 20  # Number of bands in LSH
rows_per_band = signature_len // num_bands

In [5]:
def create_shingles(id, text, k):
    """Create k-shingles for a document."""
    content = text.lower().replace('.', '').replace(',', '')
    shingles = set(text[i:i + k] for i in range(len(text) - k + 1))
    return id, shingles


def generate_hashed_shingles(text):
    """Generate hashed shingles for each document."""
    id, shingles = text
    hashed_shingles = set(hash(shingle) for shingle in shingles)
    return id, hashed_shingles


### Jaccard Similarity Calculation

In [6]:
def jaccard_similarity(docu1, docu2):
    """Calculate Jaccard similarity between two sets of shingles."""
    id1, shingles1 = docu1
    id2, shingles2 = docu2
    intersection = shingles1.intersection(shingles2)
    union = shingles1.union(shingles2)
    similarity = len(intersection) / len(union) if union else 0.0
    return (id1, id2), similarity


In [7]:
# convert documents_map to an RDD
jaccard_rdd = sc.parallelize(documents_map.items())

# create shinglings
shingles_rdd = jaccard_rdd.map(lambda doc: create_shingles(doc[0], doc[1], k))
hashed_shingles_rdd = shingles_rdd.map(generate_hashed_shingles)

# getting all pairs
jaccard_pairs = hashed_shingles_rdd.cartesian(hashed_shingles_rdd).filter(lambda x: x[0][0] < x[1][0]) #remove duplicated pairs

In [8]:
# computing Jaccard similarities 
jaccard_pairs_with_similarities = jaccard_pairs.map(lambda pair: jaccard_similarity(pair[0], pair[1]))
jaccard_pairs_threshold = jaccard_pairs_with_similarities.filter(lambda x: x[1] >= similarity_threshold)

In [9]:
# print out the pairs which are not below the threshold and similar enough
result = jaccard_pairs_threshold.collect()
for item in result:
    print(item)



((1, 3), 0.10535117056856187)
((1, 5), 0.166015625)
((1, 7), 0.15412844036697249)
((3, 5), 0.11260504201680673)
((5, 6), 0.11070110701107011)
((5, 7), 0.12099644128113879)
((6, 7), 0.11228070175438597)


                                                                                

### MinHash

- source of next_prime: http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
- minHash source: https://github.com/chrisjmccormick/MinHash/blob/master/runMinHashExample.py
- hash function: h(x) = (a*x + b) % max_shingle_id
    - a, b: random coefficients - these are fixed we generate it ones
    - always choose the minimum from the hashed values


In [None]:
#signature_len = 100 
max_shingle_id = 2**32-1

#  h(x) = (a*x + b) % max_shingle_id
# a, b: random coefficients

def get_coefficients():
    #max_shingle_id: it can be any integer number based on the range for the coeffs
    coeffs = []
    while len(coeffs) < signature_len:
        rand_idx = random.randint(1, max_shingle_id) 
        #print('anyad')
        while rand_idx in coeffs:
            rand_idx = random.randint(1, max_shingle_id)
            #print('apad')
        coeffs.append(rand_idx)
    return coeffs

coeffs_a = get_coefficients()
coeffs_b = get_coefficients()
    
def get_minhash_signature(hashed_shingles):
    signature = []
    for i in range(signature_len):
        min_hash_code = min([(coeffs_a[i] * shingle + coeffs_b[i]) % max_shingle_id for shingle in hashed_shingles])
        signature.append(min_hash_code)
    return signature

def minhash_docu(id, hashed_shingles):
    return id, get_minhash_signature(hashed_shingles)

[3864713615, 478001295, 301531630, 3216443062, 4049061597, 41551255, 2736980200, 2942452256, 1287632781, 2314594600, 408633167, 841068115, 773500145, 3923002983, 2220638513, 118087531, 526814677, 8603579, 4211948036, 2722277260, 3495604815, 2113013151, 572458016, 1010439377, 104103746, 2242861726, 3749222777, 366216041, 1867447115, 2841861178, 3872926141, 145562691, 1894556135, 2251729982, 4114990643, 2374578010, 2380349444, 2050083027, 124722449, 3824550241, 2312688337, 953128213, 1540181870, 3365116640, 1015748145, 742748107, 3138377816, 3248356770, 113493578, 3343118406, 1077007330, 554669489, 362227268, 4211159107, 3936370102, 953852285, 1041396583, 1239194279, 1879396790, 1771097148, 733740683, 4211150561, 71733968, 1645338532, 3979405313, 564874982, 2100351697, 227898626, 2711341495, 3798481561, 3303384349, 3696190304, 4124073711, 64718181, 2092744766, 1583367596, 655165754, 2638127525, 2135666295, 4263558313, 3510820552, 1024910116, 2619700470, 2208567758, 2498155952, 2766418922

In [11]:
def signature_similarity(docu1, docu2, signature_len):
    id1, signature1 = docu1
    id2, signature2 = docu2
    
    agree_cnt = sum(1 for i in range(signature_len) if signature1[i] == signature2[i])
    similarity = agree_cnt / signature_len
    
    return (id1, id2), similarity

In [17]:
minhash_rdd = hashed_shingles_rdd.map(lambda doc: minhash_docu(doc[0], doc[1]))
minhash_pairs = minhash_rdd.cartesian(minhash_rdd).filter(lambda x: x[0][0] < x[1][0]) # remove duplicates
minhash_with_similarities = minhash_pairs.map(lambda doc: signature_similarity(doc[0], doc[1], signature_len))
minhash_threshold = minhash_with_similarities.filter(lambda x: x[1] >= similarity_threshold)

In [20]:
result = minhash_threshold.collect()
for item in result:
    print(item)



((1, 3), 0.12)
((1, 5), 0.17)
((1, 6), 0.12)
((1, 7), 0.14)
((1, 8), 0.13)
((3, 5), 0.11)
((3, 6), 0.1)
((3, 8), 0.12)
((4, 8), 0.12)
((5, 6), 0.11)
((5, 7), 0.16)
((6, 7), 0.11)


                                                                                