In [81]:
## !!! typical mistake: comparing signatures is given as probability. Using jaccard to compute this will result ina division by 200 which does not make sense! 
import pandas as pd
import numpy as np
import hashlib

## Objective
The task is to implement the stages of finding textually similar documents based on Jaccard similarity using the shingling, minhashing, and locality-sensitive hashing (LSH) techniques and corresponding algorithms. 

In [111]:
with open("../data/OpinRankDataset/hotels/small_sample/usa_san francisco_abigail_hotel", encoding='utf-8', errors='replace') as f: 
    raw_documents = f.readlines()

In [112]:
## Number of documents
print(len(raw_documents))

31


## Hyperparameters

#### n_buckets
The number of hash buckets (or the number of different possible output values for the hash function). A high n_buckets will yield low similarity measures, because more possible hash values means fewer shingles will be hashed to the same buckets.

#### shingle_length
The number of characters in each shingle. Large value will yield low similarity measures, as there will be more combinations of shingles. 

#### n_hashes 
The number of hash functions

In [402]:
n_buckets = 2 ** 6 # number of hash buckets. This strongly affects the measure of similarity
shingle_length = 10

In [403]:
class Document: 
    def __init__(this, text, n_buckets, shingle_length):
        this.text = text
        this.length = len(text)
        this.n_buckets = n_buckets
        this.shingle_length = shingle_length
    
    def createShingles(this): 
        this.shingles = set()
        
        ## number of shingles = length/k
        n = int(this.length/this.shingle_length)
        
        ## slice string
        for i in range(n-1):
            this.shingles.add(this.text[slice(this.shingle_length * i, this.shingle_length * (i+1))])
    
    def hashShingles(this):
        this.hashedShingles = set()
        
        for shingle in this.shingles: 
            this.hashedShingles.add(this.hashStr(shingle))
    
    def hashStr(this, string):
        ## Hashes a string to an integer 0 <= i < n_buckets
        return abs(hash(string)) % (this.n_buckets)
    
    def jaccardSimilarity(this, other): 
        ## Compares two docs based on the jaccard similarity of the hashed shingles. 
        intersection = this.hashedShingles.intersection(other.hashedShingles)
        union = this.hashedShingles.union(other.hashedShingles)
        return (len(intersection)/len(union))
    
    def createSignature(this, h):
        ## builds a minHash signature (in the form of a vector or a set) of a given length n from a given set of integers (a set of hashed shingles).
        this.signature = np.ones(h.n_hashes) * np.inf ## the signature has length k, corresponding to each hash
        for i in range(h.n_hashes):
            for e in this.shingles:
                this.signature[i] = np.minimum(this.signature[i], h.hash(i, e)) # keep the smallest hash value of each hash function only
        
    def signatureSimilarity(this, other):
        ## estimates similarity of two integer vectors – minhash signatures – as a fraction of components, in which they agree.
        agree = 0
        
        for i in range(this.signature.size): 
            if (this.signature[i] == other.signature[i]): 
                agree += 1
                
        similarity = agree/this.signature.size
        return similarity
        

In [412]:
class Hasher: 
    def __init__(this, n_hashes, n_buckets): 
        ## A class that contains k random hash functions
        this.n_hashes = n_hashes
        this.n_buckets = n_buckets
        
        
    def hash(this, i, string): 
        ## returns hash function i of x: h_i(x)
        a = this.a_values[i]
        b = this.b_values[i]
        x = abs(hash(string))
        return (a*x + b) % this.n_buckets
    
    def generateRandomHashFunctions(this):
        # Each hash function is a function of random parameters a, b and c. 
        # create a and b values: 0 < random_interval_size
        this.a_values = (np.random.rand(this.n_hashes) * this.n_buckets).astype(int)
        this.b_values = (np.random.rand(this.n_hashes) * this.n_buckets).astype(int)
        
    def getHashMatrix(this, shingles):
        # Creates a hash matrix based on an input array of shingles
        # M(i, j) = h_i(e_j)
        this.hash_matrix = np.zeros((this.n_hashes, shingles.size))
        for i in range(this.n_hashes): # for each hash function
            for e in shingles:
                this.hash_matrix[i, j] = this.hash(i, e)
                
    

### Create documents. add to an array Documents


In [405]:
documents = []
for d in raw_documents:
    documents.append(Document(d, n_buckets, n_shingles))
documents[0].text

"Oct 30 2009 \tOK value though shabby; good position.\tI extended a one night stay to three. The staff were basically helpful when asked. My room was clean, comfortable, and daily serviced. I think the position is good for a budget hotel, near the Golden Gate theatre, right by Asian Arts Museum and Civic Centre. It's an old and architecturally atmospheric hotel.Negatives: it is very badly neglected. Things that might put you off are the need usually to ring the outside doorbell and wait to get someone to the unattended reception, unreliable breakfast availability (nil when I was there but I can see from other reviews that it can appear!), a very unreliable (but rather beautiful) classic Otis elevator, some very dilapidated corridors on the way to your room when the elevator is kaput, shabby though formerly elegant decor, homeless on nearby streets, though the street immediately outside was clear and felt safe and I didn't see any drug taking or pushing and I was never bothered anywhere

# 1. Jaccard Similarity of sets 

- Create shingle representation
- Create hashed version of the representation. 
- Compute Jaccard Similarity of documents based on sets of hashed shingles 


In [406]:
for d in documents: 
    d.createShingles()
    d.hashShingles() 
    
print("Jaccard Similarity of doc 0 and doc 1 is {}".format(documents[0].jaccardSimilarity(documents[1])))

Jaccard Similarity of doc 0 and doc 1 is 0.4642857142857143


### Example: compute similarities for 10 documents.

In [407]:
# Compute jaccard of some documents and store in dataframe
n_docs = 10
similarity_threshold = 0.7
similarities = np.zeros((n_docs, n_docs))
for i in range(n_docs): 
    for j in range(n_docs): 
        similarities[i, j] = documents[i].jaccardSimilarity(documents[j])
        
simDF = pd.DataFrame(similarities)
simDF.head(n_docs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.464286,0.609375,0.639344,0.590164,0.396552,0.766667,0.440678,0.704918,0.746032
1,0.464286,1.0,0.396552,0.444444,0.442308,0.255319,0.4,0.340426,0.490909,0.466667
2,0.609375,0.396552,1.0,0.677966,0.6,0.454545,0.666667,0.448276,0.634921,0.758065
3,0.639344,0.444444,0.677966,1.0,0.55,0.4,0.645161,0.421053,0.639344,0.709677
4,0.590164,0.442308,0.6,0.55,1.0,0.345455,0.571429,0.5,0.644068,0.66129
5,0.396552,0.255319,0.454545,0.4,0.345455,1.0,0.40678,0.347826,0.372881,0.380952
6,0.766667,0.4,0.666667,0.645161,0.571429,0.40678,1.0,0.380952,0.68254,0.75
7,0.440678,0.340426,0.448276,0.421053,0.5,0.347826,0.380952,1.0,0.370968,0.516667
8,0.704918,0.490909,0.634921,0.639344,0.644068,0.372881,0.68254,0.370968,1.0,0.774194
9,0.746032,0.466667,0.758065,0.709677,0.66129,0.380952,0.75,0.516667,0.774194,1.0


### Find similar docs

In [408]:
similar_pairs = []
for i in range(n_docs): 
    for j in range(n_docs): 
        if(similarities[i, j] > similarity_threshold):
            if (i < j): ## This is in order to just include each pair once.. 
                print("Documents {} and {} are similar, similarity {}".format(i, j, round(similarities[i, j], 4)))
                similar_pairs.append((i, j))

Documents 0 and 6 are similar, similarity 0.7667
Documents 0 and 8 are similar, similarity 0.7049
Documents 0 and 9 are similar, similarity 0.746
Documents 2 and 9 are similar, similarity 0.7581
Documents 3 and 9 are similar, similarity 0.7097
Documents 6 and 9 are similar, similarity 0.75
Documents 8 and 9 are similar, similarity 0.7742


In [409]:
print(documents[similar_pairs[0][0]].text)
print(documents[similar_pairs[0][1]].text)

Oct 30 2009 	OK value though shabby; good position.	I extended a one night stay to three. The staff were basically helpful when asked. My room was clean, comfortable, and daily serviced. I think the position is good for a budget hotel, near the Golden Gate theatre, right by Asian Arts Museum and Civic Centre. It's an old and architecturally atmospheric hotel.Negatives: it is very badly neglected. Things that might put you off are the need usually to ring the outside doorbell and wait to get someone to the unattended reception, unreliable breakfast availability (nil when I was there but I can see from other reviews that it can appear!), a very unreliable (but rather beautiful) classic Otis elevator, some very dilapidated corridors on the way to your room when the elevator is kaput, shabby though formerly elegant decor, homeless on nearby streets, though the street immediately outside was clear and felt safe and I didn't see any drug taking or pushing and I was never bothered anywhere. H

# 2. Min-Hashing
- Create a set of k random hash function
- Get the signature of each document by: 
    - for each shingle e in doc
        - save the index i of the hash function producing smallest h(e)
        
        
## Hyperparameters

#### n_buckets
The number of hash buckets (or the number of different possible output values for the hash function). A high n_buckets will yield low similarity measures, because more possible hash values means fewer shingles will be hashed to the same buckets.

#### n_hashes 
The number of hash functions used to create the min hashing signature. 



In [410]:
n_hashes = 100

In [413]:
## Create a set of k random hash functions (permutations). 
h = Hasher(n_hashes, n_buckets)
h.generateRandomHashFunctions()

## Create signatures for each set
for d in documents:
    d.createSignature(h)


  del sys.path[0]


In [414]:
documents[10].signature

array([  0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,   0.,
         0.,  14.,   1.,  10.,   2.,   0.,   0.,   0.,   0.,   0.,   0.,
        20.,   1.,   0.,   0.,   3.,   0.,   0.,   0.,   0.,   0.,   1.,
         1.,   0.,   0.,   0.,   0.,   0.,   0.,   1.,   0.,   1.,   0.,
         3.,   4.,   1.,   0.,   1.,   2.,   0.,   1.,   2.,   0.,   0.,
         0.,   0.,   1.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   1.,
         1.,   1.,   0.,   0.,   0.,   0.,   1.,   1.,   0.,   0.,   0.,
         0.,   0.,   1.,   0.,  30.,   5.,   6.,   1.,   0.,   2.,   0.,
         2.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,
         0.])

### Example: compute similarities for 10 documents.

In [415]:
# Compute minhash signature similarity of some documents and store in dataframe
n_docs = 10
similarity_threshold = 0.7
similarities = np.zeros((n_docs, n_docs))
for i in range(n_docs): 
    for j in range(n_docs): 
        similarities[i, j] = documents[i].signatureSimilarity(documents[j])
        
simDF = pd.DataFrame(similarities)
simDF.head(n_docs)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.0,0.75,0.82,0.86,0.8,0.55,0.89,0.66,0.9,0.86
1,0.75,1.0,0.68,0.72,0.71,0.51,0.73,0.58,0.77,0.72
2,0.82,0.68,1.0,0.84,0.83,0.58,0.89,0.63,0.86,0.88
3,0.86,0.72,0.84,1.0,0.78,0.56,0.85,0.64,0.86,0.85
4,0.8,0.71,0.83,0.78,1.0,0.55,0.82,0.65,0.84,0.81
5,0.55,0.51,0.58,0.56,0.55,1.0,0.58,0.51,0.6,0.54
6,0.89,0.73,0.89,0.85,0.82,0.58,1.0,0.65,0.93,0.91
7,0.66,0.58,0.63,0.64,0.65,0.51,0.65,1.0,0.62,0.64
8,0.9,0.77,0.86,0.86,0.84,0.6,0.93,0.62,1.0,0.9
9,0.86,0.72,0.88,0.85,0.81,0.54,0.91,0.64,0.9,1.0


### Find similar docs

In [416]:
similar_pairs = []
for i in range(n_docs): 
    for j in range(n_docs): 
        if(similarities[i, j] > similarity_threshold):
            if (i < j): ## This is in order to just include each pair once.. 
                print("Documents {} and {} are similar, similarity {}".format(i, j, round(similarities[i, j], 4)))
                similar_pairs.append((i, j))
    
    

Documents 0 and 1 are similar, similarity 0.75
Documents 0 and 2 are similar, similarity 0.82
Documents 0 and 3 are similar, similarity 0.86
Documents 0 and 4 are similar, similarity 0.8
Documents 0 and 6 are similar, similarity 0.89
Documents 0 and 8 are similar, similarity 0.9
Documents 0 and 9 are similar, similarity 0.86
Documents 1 and 3 are similar, similarity 0.72
Documents 1 and 4 are similar, similarity 0.71
Documents 1 and 6 are similar, similarity 0.73
Documents 1 and 8 are similar, similarity 0.77
Documents 1 and 9 are similar, similarity 0.72
Documents 2 and 3 are similar, similarity 0.84
Documents 2 and 4 are similar, similarity 0.83
Documents 2 and 6 are similar, similarity 0.89
Documents 2 and 8 are similar, similarity 0.86
Documents 2 and 9 are similar, similarity 0.88
Documents 3 and 4 are similar, similarity 0.78
Documents 3 and 6 are similar, similarity 0.85
Documents 3 and 8 are similar, similarity 0.86
Documents 3 and 9 are similar, similarity 0.85
Documents 4 and

In [417]:
print(documents[similar_pairs[0][0]].text)
print(documents[similar_pairs[0][1]].text)

Oct 30 2009 	OK value though shabby; good position.	I extended a one night stay to three. The staff were basically helpful when asked. My room was clean, comfortable, and daily serviced. I think the position is good for a budget hotel, near the Golden Gate theatre, right by Asian Arts Museum and Civic Centre. It's an old and architecturally atmospheric hotel.Negatives: it is very badly neglected. Things that might put you off are the need usually to ring the outside doorbell and wait to get someone to the unattended reception, unreliable breakfast availability (nil when I was there but I can see from other reviews that it can appear!), a very unreliable (but rather beautiful) classic Otis elevator, some very dilapidated corridors on the way to your room when the elevator is kaput, shabby though formerly elegant decor, homeless on nearby streets, though the street immediately outside was clear and felt safe and I didn't see any drug taking or pushing and I was never bothered anywhere. H

# 3. Locality Sensitive Hashing 
- Hash bands of signatures of each document
- All that share at least 


![lsh](./docs/lsh.png/)