# Finding Similar Items: Textually Similar Documents


## Task
The task of this lab was to implement the stages of finding textually similar documents based on Jaccard similarity using the shingling, minhashing, and locality-sensitive hashing (LSH) techniques and corresponding algorithms. Theory can be found in the docs directory. 

## How to Run 
Run the code by sequentially executing each block of the code. Hyperparameters can be set for part 1, 2 and 3 under each parts respective section *Hyperparameters*. 

### All code by
Philip Claesson and Miguel Maricalva

In [120]:
import pandas as pd
import numpy as np
import hashlib
from OrderedSet import OrderedSet

### Read Documents

In [121]:
import os

path = "../data/OpinRankDataset/hotels/small_sample/"
raw_documents = []

for root, dirs, files in os.walk(path):  
    for filename in files[0:5]:
        raw_doc = ""
        with open(path + filename, encoding='utf-8', errors='replace') as f:
            for line in f.readlines():
                raw_doc += line
        raw_documents.append(raw_doc)

In [122]:
## Number of documents
print(len(raw_documents))

5


In [123]:
class Document: 
    def __init__(this, text, n_buckets, shingle_length, filename):
        this.filename = filename
        this.text = text
        this.length = len(text)
        this.n_buckets = n_buckets
        this.shingle_length = shingle_length
    
    def createShingles(this): 
        this.shingles = OrderedSet()
        
        for i in range (len(this.text) - this.shingle_length):
            this.shingles.add(this.text[slice(i, i + this.shingle_length)])
            
    def hashShingles(this):
        this.hashedShingles = OrderedSet()
        
        for shingle in this.shingles: 
            this.hashedShingles.add(this.hashStr(shingle))
    
    def hashStr(this, string):
        ## Hashes a string to an integer 0 <= i < n_buckets
        return abs(hash(string)) % (this.n_buckets)
    
    def jaccardSimilarity(this, other): 
        ## Compares two docs based on the jaccard similarity of the hashed shingles. 
        intersection = (this.hashedShingles & other.hashedShingles)
        union = (this.hashedShingles | other.hashedShingles)
        return (len(intersection)/len(union))
    
    def createSignature(this, h):
        ## builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).
        this.signature = np.ones(h.n_hashes) * np.inf ## the signature has length k, corresponding to each hash
        for i in range(h.n_hashes):
            for e in this.shingles:
                this.signature[i] = np.minimum(this.signature[i], h.hash(i, e)) # keep the smallest hash value of each hash function only
        
    def signatureSimilarity(this, other):
        ## estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.
        agree = 0
        
        for i in range(this.signature.size): 
            if (this.signature[i] == other.signature[i]): 
                agree += 1
                
        similarity = agree/this.signature.size
        return similarity
        

In [124]:
class Hasher: 
    def __init__(this, n_hashes, n_buckets): 
        ## A class that contains k random hash functions
        this.n_hashes = n_hashes
        this.n_buckets = n_buckets
        
        
    def hash(this, i, string): 
        ## returns hash function i of x: h_i(x)
        a = this.a_values[i]
        b = this.b_values[i]
        x = abs(hash(string))
        return (a*x + b) % this.n_buckets
    
    def generateRandomHashFunctions(this):
        # Each hash function is a function of random parameters a, b and c. 
        # create a and b values: 0 < random_interval_size
        this.a_values = (np.random.rand(this.n_hashes) * this.n_buckets).astype(int)
        this.b_values = (np.random.rand(this.n_hashes) * this.n_buckets).astype(int)
        
    def getHashMatrix(this, shingles):
        # Creates a hash matrix based on an input array of shingles
        # M(i, j) = h_i(e_j)
        this.hash_matrix = np.zeros((this.n_hashes, shingles.size))
        for i in range(this.n_hashes): # for each hash function
            for e in shingles:
                this.hash_matrix[i, j] = this.hash(i, e)
                
    

In [125]:
class LSH: 
    def __init__(this, n_rows, n_bands, t, n_buckets):
        this.n_rows = n_rows # rows per band
        this.n_bands = n_bands
        this.t = t
        this.n_buckets = n_buckets
        this.hash_weights = (np.random.rand(this.n_rows) * n_buckets).astype(int)
        
    def getCandidatePairs(this, documents):
        # takes a list of documents, returns a set of similar pairs based on LSH of their signatures. 
        bucket_doc_mat = np.zeros((this.n_buckets, len(documents)))
        this.candidate_pairs = set()
        for docindex, d in enumerate(documents):
        # for each doc
            for b in range(n_bands):
            # for each band 
                band = d.signature[b * n_rows: (b+1) * n_rows]
                hb = this.hashBand(band)
                # hash band
                # put 1 in bucket-doc matrix
                bucket_doc_mat[hb, docindex] = 1

        for bucket in bucket_doc_mat:
        # for each bucket
            doc_indices = np.nonzero(bucket)[0] # returns indices of all documents in the bucket
            # add each pair to set
            for d1 in doc_indices: # slow and stupid solution but concept works
                for d2 in doc_indices: 
                    if (d1 < d2): ## make sure we just add each pair once..
                        this.candidate_pairs.add((d1, d2))

        return this.candidate_pairs
    
    def getSimilarPairs(this, documents):
        ## Returns all similar pairs, based on candidate pairs and signature similarity >= t
        this.similar_pairs = set()
        for (d1, d2) in this.candidate_pairs: 
            if (documents[d1].signatureSimilarity(documents[d2]) >= this.t):
                this.similar_pairs.add((d1, d2, documents[d1].signatureSimilarity(documents[d2])))

        return this.similar_pairs
    
    def hashBand(this, band): 
        # takes the band (an array of integers) and hashes it to a
        band_weight_sum = 0
        for i, row in enumerate(band):
            band_weight_sum += (this.hash_weights[i] * row) 
            
        hashed_band = int(band_weight_sum % this.n_buckets)
        return int(hashed_band)
    
    

# 1. Jaccard Similarity of sets 

- Create shingle representation
- Create hashed version of the representation. 
- Compute Jaccard Similarity of documents based on sets of hashed shingles 


## Hyperparameters

#### n_buckets
The number of hash buckets (or the number of different possible output values for the hash function). A high n_buckets will yield low similarity measures, because more possible hash values means fewer shingles will be hashed to the same buckets.

#### shingle_length
The number of characters in each shingle. Large value will yield low similarity measures, as there will be more combinations of shingles. 

In [126]:
n_buckets = 2 ** 13 # number of hash buckets. This strongly affects the measure of similarity
shingle_length = 10

### Create documents. add to an array Documents


In [127]:
documents = []
for i, d in enumerate(raw_documents):
    documents.append(Document(d, n_buckets, shingle_length, filename = files[i]))


### Example of document: 

In [128]:
documents[0].text



In [129]:
for d in documents: 
    d.createShingles()
    d.hashShingles() 
    
print("Jaccard Similarity of doc 1 and doc 2 is {}".format(documents[1].jaccardSimilarity(documents[2])))

Jaccard Similarity of doc 1 and doc 2 is 0.814453125


In [130]:
## Example of shingles 
documents[0].shingles



In [131]:
## Example of hashed shingles 
documents[0].hashedShingles

OrderedSet([3660, 2439, 1430, 3390, 4170, 1572, 4553, 6223, 7571, 3846, 3497, 7198, 7220, 1717, 1619, 1457, 8148, 7231, 3655, 3913, 8142, 6858, 6880, 1892, 6483, 520, 8123, 3255, 2524, 4747, 1542, 5831, 2496, 1953, 7006, 7873, 4559, 804, 5210, 2569, 4136, 399, 3327, 839, 7199, 556, 3237, 4989, 3382, 3825, 5147, 3296, 370, 3786, 5204, 6124, 6740, 5189, 4816, 5549, 2206, 3070, 3304, 3629, 3682, 2154, 6935, 6641, 95, 3626, 761, 1059, 4126, 7572, 3536, 2990, 1351, 4073, 3476, 107, 4138, 1532, 4451, 4055, 5374, 2701, 3872, 7407, 1302, 3414, 3616, 2399, 3631, 7875, 6790, 2176, 5217, 5039, 3539, 1168, 7566, 5181, 2257, 4328, 5221, 2600, 1399, 2805, 4624, 5976, 1292, 724, 1380, 4344, 241, 5897, 2659, 613, 6736, 3895, 2923, 3034, 8022, 3344, 2594, 4327, 801, 6270, 6490, 7986, 6875, 1817, 7516, 2535, 6592, 6434, 7846, 5639, 5641, 3843, 270, 4030, 3538, 6508, 4014, 6084, 835, 222, 3578, 5195, 6200, 7298, 2241, 4680, 1964, 3997, 4386, 4887, 2474, 7905, 5183, 931, 6578, 6757, 1029, 6856, 1041, 5099

### Example: compute similarities for 10 documents.

In [132]:
# Compute jaccard of some documents and store in dataframe
n_docs = 5
similarity_threshold = 0.5
similarities = np.zeros((n_docs, n_docs))
for i in range(n_docs): 
    for j in range(n_docs): 
        similarities[i, j] = documents[i].jaccardSimilarity(documents[j])
        
simDF = pd.DataFrame(similarities)
simDF.head(n_docs)

Unnamed: 0,0,1,2,3,4
0,1.0,0.967285,0.79621,0.614594,0.887255
1,0.967285,1.0,0.814453,0.623657,0.912598
2,0.79621,0.814453,1.0,0.552583,0.76519
3,0.614594,0.623657,0.552583,1.0,0.593441
4,0.887255,0.912598,0.76519,0.593441,1.0


## Jaccard Similarity results 
### Find similar docs

In [133]:
similar = []
for i in range(n_docs): 
    for j in range(n_docs): 
        if(similarities[i, j] > similarity_threshold):
            if (i < j): ## This is in order to just include each pair once.. 
                print("Documents {} and {} are similar, similarity {}".format(i, j, round(similarities[i, j], 4)))
                similar.append((i, j))

Documents 0 and 1 are similar, similarity 0.9673
Documents 0 and 2 are similar, similarity 0.7962
Documents 0 and 3 are similar, similarity 0.6146
Documents 0 and 4 are similar, similarity 0.8873
Documents 1 and 2 are similar, similarity 0.8145
Documents 1 and 3 are similar, similarity 0.6237
Documents 1 and 4 are similar, similarity 0.9126
Documents 2 and 3 are similar, similarity 0.5526
Documents 2 and 4 are similar, similarity 0.7652
Documents 3 and 4 are similar, similarity 0.5934


In [134]:
## Show first part of two similar'docs
print(documents[similar[0][0]].text[0:500])
print("\n")
print(documents[similar[0][1]].text[0:500])

Oct 30 2009 	OK value though shabby; good position.	I extended a one night stay to three. The staff were basically helpful when asked. My room was clean, comfortable, and daily serviced. I think the position is good for a budget hotel, near the Golden Gate theatre, right by Asian Arts Museum and Civic Centre. It's an old and architecturally atmospheric hotel.Negatives: it is very badly neglected. Things that might put you off are the need usually to ring the outside doorbell and wait to get some


Oct 28 2009 	Good value	Nice little hotel in a great location. Not 5* but then you're not paying 5* rates. Clean, friendly staff and an easy 5 minute walk to Union Square.	
Oct 25 2009 	Good location but cramped and unclean	We stayed in the Adante for two nights when we arrived in SF and again for one night the following week before we left. The hotel is ideally situated only a short walk from Union Square and the front desk staff were helpful. We were allowed to check-in early and were even 

# 2. Min-Hashing
- Create a set of k random hash function
- Get the signature of each document by: 
    - for each shingle e in doc
        - save the index i of the hash function producing smallest h(e)
        
        
## Hyperparameters

#### n_buckets
The number of hash buckets (or the number of different possible output values for the hash function). A high n_buckets will yield low similarity measures, because more possible hash values means fewer shingles will be hashed to the same buckets.

#### n_hashes 
The number of hash functions used to create the min hashing signature. 



In [135]:
n_hashes = 100
n_buckets = 2 ** 13 # number of hash buckets. This strongly affects the measure of similarity


In [136]:
## Create a set of k random hash functions (permutations). 
h = Hasher(n_hashes, n_buckets)
h.generateRandomHashFunctions()

## Create signatures for each set
for count, d in enumerate(documents):
    d.createSignature(h)
    print("Processed doc {}".format(count))


  del sys.path[0]


Processed doc 0
Processed doc 1
Processed doc 2
Processed doc 3
Processed doc 4


In [137]:
# Example of document signature
documents[0].signature

array([   0.,    1.,    0.,    3.,    0.,    1.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    1.,    0.,    2.,    1.,    0.,
          0.,    0.,    0.,    7.,    6.,    0.,    3.,    0.,    1.,
          0.,    1.,    0.,    0.,    0.,    2.,    1.,    0.,   20.,
          1.,    0.,    3.,    0.,    0.,    0.,    0.,    0.,    0.,
          1.,    0.,    0.,    0.,    1.,    0.,    0.,    0.,    0.,
          1.,    0.,    0.,    0.,    0.,    2.,    1.,    0.,    0.,
          1.,    2.,    1.,    1.,    0.,    0.,    0.,    0.,    1.,
          2.,    1.,    0.,    1.,  162.,    0.,    0.,    1.,    3.,
          0.,    0.,    0.,    0.,    1.,    0.,    0.,    3.,    0.,
          1.,    0.,    0.,    0.,    1.,    0.,    0.,    0.,    0.,    1.])

### Example: compute similarities for 10 documents.

In [139]:
# Compute minhash signature similarity of some documents and store in dataframe
n_docs = 5
similarity_threshold = 0.7
similarities = np.zeros((n_docs, n_docs))
for i in range(n_docs): 
    for j in range(n_docs): 
        similarities[i, j] = documents[i].signatureSimilarity(documents[j])
        
simDF = pd.DataFrame(similarities)
simDF.head(n_docs)

Unnamed: 0,0,1,2,3,4
0,1.0,0.98,0.88,0.75,0.96
1,0.98,1.0,0.9,0.76,0.96
2,0.88,0.9,1.0,0.72,0.88
3,0.75,0.76,0.72,1.0,0.75
4,0.96,0.96,0.88,0.75,1.0


## Min-hashing results
### Find similar docs

In [140]:
similar = []
for i in range(n_docs): 
    for j in range(n_docs): 
        if(similarities[i, j] > similarity_threshold):
            if (i < j): ## This is in order to just include each pair once.. 
                print("Documents {} and {} are similar, similarity {}".format(i, j, round(similarities[i, j], 4)))
                similar.append((i, j))
    
    

Documents 0 and 1 are similar, similarity 0.98
Documents 0 and 2 are similar, similarity 0.88
Documents 0 and 3 are similar, similarity 0.75
Documents 0 and 4 are similar, similarity 0.96
Documents 1 and 2 are similar, similarity 0.9
Documents 1 and 3 are similar, similarity 0.76
Documents 1 and 4 are similar, similarity 0.96
Documents 2 and 3 are similar, similarity 0.72
Documents 2 and 4 are similar, similarity 0.88
Documents 3 and 4 are similar, similarity 0.75


# 3. Locality Sensitive Hashing 
- Hash bands of signatures of each document into buckets 
- For each bucket, add all docs in the same bucket to a a set of Candidate Pairs
- Filter out candidate pairs which are less similar than t (false positives) to get the Similar Pairs

## Hyperparameters

### n_bands and n_rows
The number of bands, and number of rows in each band. b * r should be equal to the number of integers in the signature of a document, n. 

### t
The threshold of similarity to filter out not false positives from candidate pairs. 

### n_lsh_buckets
The number of buckets in the hash function which hashes the bands of the signatures. Should be as large as possible, but increases the runtime! 

In [141]:
## Hyperparameters
n_bands = 20
n_rows = 5 # rows per band
t = 0.55 # (1/n_bands) ** (1/n_rows) = 0.549
n_lsh_buckets = 2 ** 20

In [142]:
lsh = LSH(n_bands = n_bands, n_rows = n_rows, t = t, n_buckets = n_lsh_buckets)

candidate_pairs = lsh.getCandidatePairs(documents)

similar_pairs = lsh.getSimilarPairs(documents) 

In [143]:
print("Out of {} documents, {} candidate pairs were found. Out of these, {} had a similarity of at least {}. "
      .format(len(documents), len(candidate_pairs), len(similar_pairs), t))



Out of 5 documents, 10 candidate pairs were found. Out of these, 10 had a similarity of at least 0.55. 


## LSH Result
### Find similar docs

In [144]:
for d1, d2, sim in (similar_pairs): 
    if (sim > similarity_threshold):
        print("Documents {} and {} are similar, similarity {}".format(d1, d2, round(sim, 4)))

    

Documents 2 and 4 are similar, similarity 0.88
Documents 0 and 2 are similar, similarity 0.88
Documents 1 and 4 are similar, similarity 0.96
Documents 0 and 3 are similar, similarity 0.75
Documents 3 and 4 are similar, similarity 0.75
Documents 0 and 4 are similar, similarity 0.96
Documents 1 and 3 are similar, similarity 0.76
Documents 2 and 3 are similar, similarity 0.72
Documents 0 and 1 are similar, similarity 0.98
Documents 1 and 2 are similar, similarity 0.9


In [145]:
print("Doc 0 and 1 are considered among the most equal, they correspond to the files '{} and '{}' which can be found in ".format(documents[0].filename, documents[1].filename))

Doc 0 and 1 are considered among the most equal, they correspond to the files 'usa_san francisco_abigail_hotel and 'usa_san francisco_adante_hotel' which can be found in 
