# Exemplo 02: Similaridade de Textos: LSH

## Locality-Sensitive Hashing (LSH) Algorithms

LSH for Euclidean distance metrics. The input is a dense or sparse vectors, each of which represents a point in the Euclidean distance space. The output will be vectors of configurable dimension. Hash values in the same dimension are calculated by the same hash function.

In [1]:
# Load Spark Library
#from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.types import *

import time, os, string

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import NGram
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import BucketedRandomProjectionLSH

from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import regexp_replace, trim, ltrim, rtrim, col, lower, when, size, lit, avg

In [2]:
# Configuration

cwd = os.getcwd()
book_folder = "/data/"
book1 = 'file://'+cwd+book_folder+"01-Harry_Potter_and_the_Sorcerers_Stone.txt.gz"
book2 = 'file://'+cwd+book_folder+"02-Harry_Potter_and_the_Chamber_of_Secrets.txt.gz"

In [3]:
# Functions to converte to lower case, remove ponctuation and empty lines

def removePunctuation(column):
    return trim(lower(regexp_replace(column,'[!,*)@#%|“”(&$_?.^—]', ''))).alias('text')

class RemoveEmptyLines(Transformer):
    def __init__(self, column: StringType() ):
        super(RemoveEmptyLines, self).__init__()
        self.column = column

    def _transform(self, df: DataFrame) -> DataFrame:
        return df.withColumn(self.column, when(size(col(self.column)) == 0, lit(None)).otherwise(col(self.column))).na.drop()

## Create Spark Session

In [4]:
# Create local Spark session
sc = SparkSession.builder\
     .appName("SparkSimilarityLSH")\
     .master("local[*]") \
     .getOrCreate()

start_time = time.time()

### Reading Data

In [5]:
# Read de file Book 1
text_1 = sc.read.text(book1)
print("Original text ===>")
text_1.show(8,truncate=100)
# Remove empty lines
text_1 = text_1.filter("value != ''")
print("Remove empty lines ===>")
text_1.show(4,truncate=100)
# Remove punctuation and convert to lowercase
text_1 = text_1.select(removePunctuation(col('value')))
# Rename column
text_1 = text_1.withColumnRenamed('value', 'text')
print("Remove punctuation, convert to lowercase, rename column  ===>")
text_1.show(4,truncate=100)

Original text ===>
+----------------------------------------------------------------------------------------------------+
|                                                                                               value|
+----------------------------------------------------------------------------------------------------+
|M r. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly n...|
|                                                                                                    |
|Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy m...|
|                                                                                                    |
|The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was ...|
|                                                                                                    |
|When Mr. and Mrs. Dursley woke up on the dull, gray T

In [6]:
# Read de book file:  Book 2
text_2 = sc.read.text(book2)
# Remove blank lines
text_2 = text_2.filter("value != ''")
# Remove punctuation and rename column
text_2 = text_2.select(removePunctuation(col('value')))
text_2 = text_2.withColumnRenamed('value', 'text')

## Calculate Similarity

### Tokenizer

In [7]:
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokens", minTokenLength=2, toLowercase=True)
tokenData = tokenizer.transform(text_1)
tokenData.show(4,truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                              text|                                            tokens|
+--------------------------------------------------+--------------------------------------------------+
|m r and mrs dursley of number four privet drive...|[and, mrs, dursley, of, number, four, privet, d...|
|mr dursley was the director of a firm called gr...|[mr, dursley, was, the, director, of, firm, cal...|
|the dursleys had everything they wanted but the...|[the, dursleys, had, everything, they, wanted, ...|
|when mr and mrs dursley woke up on the dull gra...|[when, mr, and, mrs, dursley, woke, up, on, the...|
+--------------------------------------------------+--------------------------------------------------+
only showing top 4 rows



### Shingling

In [8]:
ngram = NGram(n=3, inputCol="tokens", outputCol="ngrams")
ngramData = ngram.transform(tokenData)

rememptylines = RemoveEmptyLines(column = "ngrams")
ngramData = rememptylines.transform(ngramData)

ngramData.show(4,truncate=40)

+----------------------------------------+----------------------------------------+----------------------------------------+
|                                    text|                                  tokens|                                  ngrams|
+----------------------------------------+----------------------------------------+----------------------------------------+
|m r and mrs dursley of number four pr...|[and, mrs, dursley, of, number, four,...|[and mrs dursley, mrs dursley of, dur...|
|mr dursley was the director of a firm...|[mr, dursley, was, the, director, of,...|[mr dursley was, dursley was the, was...|
|the dursleys had everything they want...|[the, dursleys, had, everything, they...|[the dursleys had, dursleys had every...|
|when mr and mrs dursley woke up on th...|[when, mr, and, mrs, dursley, woke, u...|[when mr and, mr and mrs, and mrs dur...|
+----------------------------------------+----------------------------------------+----------------------------------------+


### Counting Hash

In [9]:
hash_tf = HashingTF(inputCol="ngrams", outputCol="vectors")

hashtfData = hash_tf.transform(ngramData)
hashtfData.show(5,truncate=30) 

+------------------------------+------------------------------+------------------------------+------------------------------+
|                          text|                        tokens|                        ngrams|                       vectors|
+------------------------------+------------------------------+------------------------------+------------------------------+
|m r and mrs dursley of numb...|[and, mrs, dursley, of, num...|[and mrs dursley, mrs dursl...|(262144,[11709,13571,14226,...|
|mr dursley was the director...|[mr, dursley, was, the, dir...|[mr dursley was, dursley wa...|(262144,[11297,13362,26388,...|
|the dursleys had everything...|[the, dursleys, had, everyt...|[the dursleys had, dursleys...|(262144,[3002,3464,4923,531...|
|when mr and mrs dursley wok...|[when, mr, and, mrs, dursle...|[when mr and, mr and mrs, a...|(262144,[5287,6590,13929,24...|
|none of them noticed a larg...|[none, of, them, noticed, l...|[none of them, of them noti...|(262144,[2188,2984,21644

### Min-Hashing

In [10]:
minhash = MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=3).fit(hashtfData)

minhashData = minhash.transform(hashtfData)
minhashData.show(4,truncate=25)

+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+
|                     text|                   tokens|                   ngrams|                  vectors|                      lsh|
+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+
|m r and mrs dursley of...|[and, mrs, dursley, of...|[and mrs dursley, mrs ...|(262144,[11709,13571,1...|[[7.0281551E7], [47864...|
|mr dursley was the dir...|[mr, dursley, was, the...|[mr dursley was, dursl...|(262144,[11297,13362,2...|[[1.3450292E7], [1.459...|
|the dursleys had every...|[the, dursleys, had, e...|[the dursleys had, dur...|(262144,[3002,3464,492...|[[3.1730732E7], [3.226...|
|when mr and mrs dursle...|[when, mr, and, mrs, d...|[when mr and, mr and m...|(262144,[5287,6590,139...|[[5.7933269E7], [3.445...|
+-------------------------+-------------------------+-----------------------

#### Using Pipeline

In [11]:
pipeline = Pipeline(stages=[
            tokenizer,
            ngram,
            rememptylines,
            hash_tf,
            minhash
        ])

model= pipeline.fit(text_1)

text_A = model.transform(text_1)
text_B = model.transform(text_2)

text_A.show(4,truncate=25)

+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+
|                     text|                   tokens|                   ngrams|                  vectors|                      lsh|
+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+
|m r and mrs dursley of...|[and, mrs, dursley, of...|[and mrs dursley, mrs ...|(262144,[11709,13571,1...|[[7.0281551E7], [47864...|
|mr dursley was the dir...|[mr, dursley, was, the...|[mr dursley was, dursl...|(262144,[11297,13362,2...|[[1.3450292E7], [1.459...|
|the dursleys had every...|[the, dursleys, had, e...|[the dursleys had, dur...|(262144,[3002,3464,492...|[[3.1730732E7], [3.226...|
|when mr and mrs dursle...|[when, mr, and, mrs, d...|[when mr and, mr and m...|(262144,[5287,6590,139...|[[5.7933269E7], [3.445...|
+-------------------------+-------------------------+-----------------------

In [12]:
text_B.show(4,truncate=25)

+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+
|                     text|                   tokens|                   ngrams|                  vectors|                      lsh|
+-------------------------+-------------------------+-------------------------+-------------------------+-------------------------+
|not for the first time...|[not, for, the, first,...|[not for the, for the ...|(262144,[2157,11458,29...|[[2.1736297E7], [1.102...|
|third time this week h...|[third, time, this, we...|[third time this, time...|(262144,[45337,50380,6...|[[1419311.0], [9469319...|
|harry tried yet again ...|[harry, tried, yet, ag...|[harry tried yet, trie...|(262144,[43840,47565,9...|[[3.74485618E8], [1.40...|
|she’s bored he said sh...|[she’s, bored, he, sai...|[she’s bored he, bored...|(262144,[2579,28737,37...|[[4.70882235E8], [6.28...|
+-------------------------+-------------------------+-----------------------

### Locality-Sensitive Hashing (LSH)

In [13]:
rows_text_A = text_A.count()
rows_text_B = text_B.count()

# Show similarity with Jaccard Distance below 0.9 
result_A_B = model.stages[-1].approxSimilarityJoin(text_A, text_B, 0.9, distCol="JaccardDistance")
result_A_B.show(5,truncate=50)

rows_result_A_B = result_A_B.count()
simil_index_AB = rows_result_A_B / rows_text_B * 100
print("Similarity Text 1 x Text 2 = ",simil_index_AB, " %")

+--------------------------------------------------+--------------------------------------------------+------------------+
|                                          datasetA|                                          datasetB|   JaccardDistance|
+--------------------------------------------------+--------------------------------------------------+------------------+
|{shut up said ron, [shut, up, said, ron], [shut...|{shut up said harry frantically it’ll hear you,...|0.8571428571428572|
|{dear mr potter, [dear, mr, potter], [dear mr p...|{dear mr potter, [dear, mr, potter], [dear mr p...|               0.0|
|{bye said harry and ron the twins slid the comp...|{yes said harry and ron together instantly, [ye...|0.8666666666666667|
|{the what said harry and ron, [the, what, said,...|{yes said harry and ron together instantly, [ye...|0.7142857142857143|
|{detention she shouted and twenty points from s...|{there’s something else said harry watching her...|0.8974358974358975|
+---------------

In [14]:
# Show similarity with Jaccard Distance below 0.9
result_A_A = model.stages[-1].approxSimilarityJoin(text_A, text_A, 0.9, distCol="JaccardDistance")
result_A_A .show(5,truncate=50)

simil_index_AA = result_A_A.count() / rows_text_A * 100
print("Similarity Text 1 x Text 1 = ",simil_index_AA, " %")

+--------------------------------------------------+--------------------------------------------------+------------------+
|                                          datasetA|                                          datasetB|   JaccardDistance|
+--------------------------------------------------+--------------------------------------------------+------------------+
|{uncle vernon ripped open the bill snorted in d...|{uncle vernon ripped open the bill snorted in d...|               0.0|
|{if yeh know where to go said hagrid, [if, yeh,...|{if yeh know where to go said hagrid, [if, yeh,...|               0.0|
|{twenty minutes later they left eeylops owl emp...|{twenty minutes later they left eeylops owl emp...|               0.0|
|{whispers followed harry from the moment he lef...|{whispers followed harry from the moment he lef...|               0.0|
|{did you see his face the great lump, [did, you...|{did you see his scar, [did, you, see, his, sca...|0.7142857142857143|
+---------------

## Finishing

In [15]:
print("--- Execution time: %s seconds ---" % (time.time() - start_time))
# Stop Spark
sc.stop()

--- Execution time: 23.980924129486084 seconds ---
