# Homework 1

You are to implement the stages of finding textually similar documents based on Jaccard similarity using the shingling, minhashing, and locality-sensitive hashing (LSH) techniques and corresponding algorithms. The implementation can be done using any big data processing framework, such as Apache Spark or Apache Flink, or without a framework, e.g., in plain Java or Python. To test and evaluate your implementation, write a program that uses your implementation to find similar documents in a corpus of 5-10 or more documents, such as web pages or emails.

The stages should be implemented as a collection of classes, modules, functions, or procedures depending on the framework and the language of your choice. Below, we describe sample classes implementing different stages of finding textually similar documents. You do not have to develop the exact same classes and data types described below. Feel free to use data structures that suit you best.

In [2]:
# import packages
from pyspark import SparkContext, SparkConf
import hashlib
import random
import numpy as np
import pandas as pd
import findspark

In [3]:
# Initialise findspark, then create the SparkContext for the "SimDoc"
# application with master "local[*]".
findspark.init()
conf = (
    SparkConf()
    .setAppName("SimDoc")
    .setMaster("local[*]")
)
sc = SparkContext(conf=conf)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/04 21:02:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/04 21:02:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Load the CSV and take every non-null entry of its 'description' column
# as one document.
df1 = pd.read_csv("datasets/winemag-data_first150k.csv")
docus = df1["description"].dropna().tolist()

# Map document id -> text for the first 20 descriptions only; the id is the
# position of the description in `docus`.
documents_map = dict(enumerate(docus[:20]))

# testing text creation
#print(len(documents_map))
#print(documents_map[0])


In [5]:
# Tunable parameters shared by the shingling, MinHash and LSH stages.
k: int = 5  # number of characters in each shingle
signature_len: int = 100  # number of entries in a MinHash signature
similarity_threshold: float = 0.8  # pairs scoring below this are filtered out
num_bands: int = 20  # number of bands in LSH
rows_per_band: int = signature_len // num_bands  # signature entries per band

In [6]:
def create_shingles(id, text, k):
    """Create the set of character k-shingles for a document.

    The text is normalised first (lower-cased, with '.' and ',' removed) and
    the shingles are cut from that normalised text, so documents that differ
    only in case or in those punctuation marks produce the same shingles.

    Args:
        id: Identifier of the document; returned unchanged.
        text: Raw document text.
        k: Shingle length in characters.

    Returns:
        A tuple ``(id, shingles)`` where ``shingles`` is the set of all
        length-``k`` substrings of the normalised text. The set is empty when
        the normalised text is shorter than ``k``.
    """
    content = text.lower().replace('.', '').replace(',', '')
    # Slide a window of width k over the *normalised* content (the original
    # code sliced the raw text, silently discarding the normalisation).
    shingles = {content[i:i + k] for i in range(len(content) - k + 1)}
    return id, shingles


def generate_hashed_shingles(text):
    """Replace each shingle of a document by a stable 32-bit integer hash.

    The built-in ``hash()`` is not suitable here: for strings it is salted
    per interpreter process, so values are not reproducible between runs, and
    it yields signed 64-bit values rather than ids in ``[0, 2**32 - 1]``.
    A 4-byte BLAKE2b digest gives a deterministic unsigned 32-bit id instead.

    Args:
        text: A ``(id, shingles)`` tuple as returned by ``create_shingles``,
            where ``shingles`` is an iterable of strings.

    Returns:
        A tuple ``(id, hashed_shingles)`` where ``hashed_shingles`` is a set
        of integers in ``[0, 2**32 - 1]``.
    """
    doc_id, shingles = text
    hashed_shingles = {
        int.from_bytes(
            hashlib.blake2b(shingle.encode("utf-8"), digest_size=4).digest(),
            "big",
        )
        for shingle in shingles
    }
    return doc_id, hashed_shingles


### Jaccard Similarity Calculation

In [7]:
def jaccard_similarity(docu1, docu2):
    """Compute the Jaccard similarity of two documents' shingle sets.

    Args:
        docu1: ``(id, shingles)`` tuple of the first document.
        docu2: ``(id, shingles)`` tuple of the second document.

    Returns:
        ``((id1, id2), similarity)`` where ``similarity`` is
        ``|intersection| / |union|``, or ``0.0`` when both sets are empty.
    """
    (first_id, first_set), (second_id, second_set) = docu1, docu2
    pair_key = (first_id, second_id)

    shared = first_set.intersection(second_set)
    combined = first_set.union(second_set)
    # An empty union means both sets are empty; avoid dividing by zero.
    if not combined:
        return pair_key, 0.0
    return pair_key, len(shared) / len(combined)


In [8]:
# Distribute the (id, text) entries of documents_map as an RDD.
jaccard_rdd = sc.parallelize(documents_map.items())

# Shingle every document, then replace the shingles by their hashes.
shingles_rdd = jaccard_rdd.map(lambda doc: create_shingles(*doc, k))
hashed_shingles_rdd = shingles_rdd.map(generate_hashed_shingles)

# Form all document pairs; keeping only pairs whose first id is smaller than
# the second drops self-pairs and the mirrored copy of each pair.
all_doc_pairs = hashed_shingles_rdd.cartesian(hashed_shingles_rdd)
jaccard_pairs = all_doc_pairs.filter(lambda pair: pair[0][0] < pair[1][0])

In [9]:
# Score every document pair with its exact Jaccard similarity, then keep the
# pairs whose similarity reaches similarity_threshold.
jaccard_pairs_with_similarities = jaccard_pairs.map(
    lambda pair: jaccard_similarity(*pair)
)
jaccard_pairs_threshold = jaccard_pairs_with_similarities.filter(
    lambda scored: scored[1] >= similarity_threshold
)

In [10]:
# Collect the ((id1, id2), similarity) entries that passed the threshold
# filter and print them one per line.
result = jaccard_pairs_threshold.collect()
for item in result:
    print(item)

                                                                                

### MinHash

- source of next_prime: http://compoasso.free.fr/primelistweb/page/prime/liste_online_en.php
- minHash source: https://github.com/chrisjmccormick/MinHash/blob/master/runMinHashExample.py


In [11]:
# signature_len (number of hash functions) is set with the other parameters.
# Largest shingle id the hash functions are designed for (32-bit range).
max_shingle_id = (1 << 32) - 1
# Prime larger than max_shingle_id, used as the modulus of the hash functions.
next_prime = 4_294_967_311

# Hash family: h(x) = (a*x + b) % next_prime
# a, b: random coefficients, one (a, b) pair per signature entry

def get_coefficients():
    """Draw ``signature_len`` distinct random integers for the hash functions.

    Returns:
        A list of ``signature_len`` pairwise-distinct integers, each drawn
        with ``random.randint(1, max_shingle_id)``, in the order drawn.
    """
    coeffs = []
    already_drawn = set()
    while len(coeffs) < signature_len:
        candidate = random.randint(1, max_shingle_id)
        # Reject repeats so every hash function gets its own coefficient.
        if candidate in already_drawn:
            continue
        already_drawn.add(candidate)
        coeffs.append(candidate)
    return coeffs

# Independent draws for the multiplier (a) and offset (b) of each hash function.
coeffs_a, coeffs_b = get_coefficients(), get_coefficients()
    
def get_minhash_signature(hashed_shingles):
    """Compute the MinHash signature of a collection of hashed shingles.

    For each of the ``signature_len`` hash functions
    ``h_i(x) = (coeffs_a[i] * x + coeffs_b[i]) % next_prime`` the signature
    stores the smallest hash value over all shingles.

    Args:
        hashed_shingles: Collection of integer shingle ids; it is iterated
            once per hash function.

    Returns:
        A list of ``signature_len`` integers.

    Raises:
        ValueError: If ``hashed_shingles`` is empty (``min`` of nothing).
    """
    return [
        min(
            (coeffs_a[i] * shingle + coeffs_b[i]) % next_prime
            for shingle in hashed_shingles
        )
        for i in range(signature_len)
    ]

def minhash_docu(id, hashed_shingles):
    """Pair a document id with the MinHash signature of its hashed shingles.

    Args:
        id: Identifier of the document; returned unchanged.
        hashed_shingles: Collection of integer shingle ids of the document.

    Returns:
        A tuple ``(id, signature)``.
    """
    signature = get_minhash_signature(hashed_shingles)
    return id, signature

In [12]:
def signature_similarity(doc_1, doc_2, signature_len):
    """Estimate document similarity from two MinHash signatures.

    Args:
        doc_1: ``(id, signature)`` tuple of the first document.
        doc_2: ``(id, signature)`` tuple of the second document.
        signature_len: Number of leading signature positions to compare.

    Returns:
        ``((id_1, id_2), similarity)`` where ``similarity`` is the fraction
        of the first ``signature_len`` positions at which the two signatures
        hold the same value.
    """
    (id_1, sig_a), (id_2, sig_b) = doc_1, doc_2

    # Summing booleans counts the positions where both signatures agree.
    matches = sum(sig_a[pos] == sig_b[pos] for pos in range(signature_len))
    return (id_1, id_2), matches / signature_len

In [13]:
# Build the signatures from the *hashed* shingles: the MinHash functions do
# integer arithmetic on each shingle, so they need the integer ids from
# hashed_shingles_rdd. (Feeding the string shingles of shingles_rdd makes
# `coeff * shingle` a string repetition by a huge factor instead of a product.)
min_hash_rdd = hashed_shingles_rdd.map(lambda doc: minhash_docu(doc[0], doc[1]))

# Form all document pairs; keeping only pairs whose first id is smaller than
# the second drops self-pairs and the mirrored copy of each pair.
minhash_pairs = min_hash_rdd.cartesian(min_hash_rdd).filter(lambda x: x[0][0] < x[1][0])

# Estimate each pair's similarity from its signatures, then keep the pairs
# whose estimate reaches similarity_threshold.
minhash_with_similarities = minhash_pairs.map(lambda pair: signature_similarity(pair[0], pair[1], signature_len))
minhash_threshold = minhash_with_similarities.filter(lambda x: x[1] >= similarity_threshold)

In [None]:
# Collect and print the estimated similarity of every document pair.
# NOTE(review): this collects the unfiltered minhash_with_similarities; the
# filtered minhash_threshold RDD is never collected in the visible code --
# confirm which of the two was meant to be printed.
result = minhash_with_similarities.collect()
for item in result:
    print(item)

[Stage 1:>                                                         (0 + 4) / 16]