# Exemplo 02: Similaridade de Textos: LSH

## Locality-Sensitive Hashing (LSH) Algorithms

LSH for Euclidean distance metrics. The input is a dense or sparse vectors, each of which represents a point in the Euclidean distance space. The output will be vectors of configurable dimension. Hash values in the same dimension are calculated by the same hash function.

In [1]:
# Load Spark Library
#from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.types import *

import time, os, string

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import NGram
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import BucketedRandomProjectionLSH

from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import regexp_replace, trim, col, lower, when, size, lit, avg

In [2]:
# Configuration

cwd = os.getcwd()
book_folder = "/data/"
book1 = 'file://'+cwd+book_folder+"01-Harry_Potter_and_the_Sorcerers_Stone.txt.gz"
book2 = 'file://'+cwd+book_folder+"02-Harry_Potter_and_the_Chamber_of_Secrets.txt.gz"

In [3]:
# Functions to remove ponctuation and empty lines

def removePunctuation(column):
    return trim(lower(regexp_replace(column,'[!,*)@#%(&$_?.^—]', ''))).alias('text')
#[^a-zA-Z0-9]+
# [^\s+a-zA-Z0-9]


class RemoveEmptyLines(Transformer):
    def __init__(self, column: StringType() ):
        super(RemoveEmptyLines, self).__init__()
        self.column = column

    def _transform(self, df: DataFrame) -> DataFrame:
        return df.withColumn(self.column, when(size(col(self.column)) == 0, lit(None)).otherwise(col(self.column))).na.drop()
        #return df

## Create Spark Session

In [4]:
# Create local Spark session
sc = SparkSession.builder\
     .appName("SparkSimilarityLSH")\
     .master("local[*]") \
     .getOrCreate()

start_time = time.time()

### Reading Data

In [5]:
# Read de book file: Ulysses 
text_1 = sc.read.text(book1)
text_1.show(10, truncate = False)
text_1 = text_1.filter("value != ''")
text_1.show(10, truncate = False)
text_1 = text_1.select(removePunctuation(col('value')))
text_1 = text_1.withColumnRenamed('value', 'text')
text_1.show(10, truncate = False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                                                                   

In [6]:
# Read de book file:  Dubliner
text_2 = sc.read.text(dubliners)
text_2 = text_2.filter("value != ''")
text_2 = text_2.select(removePunctuation(col('value')))
text_2 = text_2.withColumnRenamed('value', 'text')
text_2.show(10, truncate = False)

NameError: name 'dubliners' is not defined

## Calculate Similarity

### Tokenizer

In [7]:
tokenizer = RegexTokenizer(pattern='\s+', inputCol="text", outputCol="tokens", minTokenLength=3, toLowercase=True)

tokenData = tokenizer.transform(text_1)
tokenData.show(truncate = False)

  tokenizer = RegexTokenizer(pattern='\s+', inputCol="text", outputCol="tokens", minTokenLength=3, toLowercase=True)


+------------------------------------------------------------------+----------------------------------------------------------------------+
|text                                                              |tokens                                                                |
+------------------------------------------------------------------+----------------------------------------------------------------------+
|the project gutenberg ebook of ulysses by james joyce             |[the, project, gutenberg, ebook, ulysses, james, joyce]               |
|this ebook is for the use of anyone anywhere at no cost and with  |[this, ebook, for, the, use, anyone, anywhere, cost, and, with]       |
|almost no restrictions whatsoever you may copy it give it away or |[almost, restrictions, whatsoever, you, may, copy, give, away]        |
|reuse it under the terms of the project gutenberg license included|[reuse, under, the, terms, the, project, gutenberg, license, included]|
|with this ebook or 

### Shingling

In [8]:
ngram = NGram(n=3, inputCol="tokens", outputCol="ngrams")
ngramData = ngram.transform(tokenData)

rememptylines = RemoveEmptyLines(column = "ngrams")
ngramData = rememptylines.transform(ngramData)

ngramData.show(truncate=False)

+---------------------------------------------------------------------+------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                 |tokens                                                                        |ngrams                                                                                                                                                                             |
+---------------------------------------------------------------------+------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|the pr

### Counting Hash

In [9]:
hash_tf = HashingTF(inputCol="ngrams", outputCol="vectors")

hashtfData = hash_tf.transform(ngramData)
hashtfData.show() 

+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|
+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[57299,74...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|
|release date augu...|[release, date, a...|[release date aug...|(262144,[63333,72...|
|last updated octo...|[last, updated, o...|[last updated oct...|(262144,[25383,13...|
|character set enc...|[character, set, ...|[character 

### Min-Hashing

In [10]:
minhash = MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=3).fit(hashtfData)

minhashData = minhash.transform(hashtfData)
minhashData.show() #truncate=False)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|                 lsh|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[57299,74...|[[5.36229329E8], ...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|[[5.4838929E7], [...|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|[[6.70133526E8], ...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|[[1.65155346E8], ...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|[[8.36705138E8], ...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|[[3.09315538E8], ...|
|release date augu...|[release, date,

#### Using Pipeline

In [11]:
pipeline = Pipeline(stages=[
            tokenizer,
            ngram,
            rememptylines,
            hash_tf,
            minhash
        ])

model= pipeline.fit(text_1)

text_A = model.transform(text_1)
text_B = model.transform(text_2)

text_A.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|                 lsh|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[57299,74...|[[5.36229329E8], ...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|[[5.4838929E7], [...|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|[[6.70133526E8], ...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|[[1.65155346E8], ...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|[[8.36705138E8], ...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|[[3.09315538E8], ...|
|release date augu...|[release, date,

In [12]:
text_B.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|                 lsh|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[74024,11...|[[2.6934767E7], [...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|[[5.4838929E7], [...|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|[[6.70133526E8], ...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|[[1.65155346E8], ...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|[[8.36705138E8], ...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|[[3.09315538E8], ...|
|release date sept...|[release, date,

### Locality-Sensitive Hashing (LSH)

In [13]:
rows_text_A = text_A.count()
rows_text_B = text_B.count()

# Show similarity with Jaccard Distance below 0.9  minhashData
result_A_B = model.stages[-1].approxSimilarityJoin(text_A, text_B, 0.9, distCol="JaccardDistance")
result_A_B.show()

rows_result_A_B = result_A_B.count()
simil_index_AB = rows_result_A_B / rows_text_B * 100
print("Similarity Ulysses x Dubliners = ",simil_index_AB, " %")

+--------------------+--------------------+------------------+
|            datasetA|            datasetB|   JaccardDistance|
+--------------------+--------------------+------------------+
|{who are you laug...|{who are you char...|             0.875|
|{so he starts tel...|{come in said mr ...|0.8571428571428572|
|{so as to evoke a...|{approach us with...|0.8333333333333334|
|{goodwin in a bow...|{here takes three...|0.8888888888888888|
|{blooms thoughts ...|{for the little s...|0.8888888888888888|
|{1f3 limited righ...|{1f4 except for t...|0.8181818181818181|
|{wobbled a while ...|{childish white t...|              0.75|
|{murphys my name ...|{bottles of stout...|0.8888888888888888|
|{gutenbergtm elec...|{things that you ...|             0.875|
|{this work you we...|{singers today as...|0.8888888888888888|
|{are you going yo...|{eh are you going...|0.8888888888888888|
|{no mr bloom said...|{the religious st...|0.8888888888888888|
|{archbishop was i...|{what other tinke...|0.8888888888

In [14]:
# Show similarity with Jaccard Distance below 0.5
result_A_A = model.stages[-1].approxSimilarityJoin(text_A, text_A, 0.5, distCol="JaccardDistance")
result_A_A .show()

simil_index_AA = result_A_A.count() / rows_text_A * 100
print("Similarity Ulysses x Ulysses = ",simil_index_AA, " %")

+--------------------+--------------------+---------------+
|            datasetA|            datasetB|JaccardDistance|
+--------------------+--------------------+---------------+
|{lead him not int...|{lead him not int...|            0.0|
|{anxiety in his e...|{anxiety in his e...|            0.0|
|{stephen laid the...|{stephen laid the...|            0.0|
|{have you the key...|{have you the key...|            0.0|
|{haines sat down ...|{haines sat down ...|            0.0|
|{i blow him out a...|{i blow him out a...|            0.0|
|{that i make when...|{that i make when...|            0.0|
|{memory fabled it...|{memory fabled it...|            0.0|
|{rouge a blue fre...|{rouge a blue fre...|            0.0|
|{orangeblossoms d...|{orangeblossoms d...|            0.0|
|{something he bur...|{something he bur...|            0.0|
|{theres a smell o...|{theres a smell o...|            0.0|
|{happened of cour...|{happened of cour...|            0.0|
|{plumtrees potted...|{plumtrees potted.

## Finishing

In [15]:
print("--- Execution time: %s seconds ---" % (time.time() - start_time))
# Stop Spark
sc.stop()

--- Execution time: 41.434675216674805 seconds ---
