# Exemplo 02: Similaridade de Textos: LSH

## Locality-Sensitive Hashing (LSH) Algorithms

LSH for Euclidean distance metrics. The input is a dense or sparse vectors, each of which represents a point in the Euclidean distance space. The output will be vectors of configurable dimension. Hash values in the same dimension are calculated by the same hash function.

In [1]:
# Load Spark Library
#from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.types import *

import time, os, string

from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import NGram
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.feature import BucketedRandomProjectionLSH

from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import regexp_replace, trim, col, lower, when, size, lit, avg

In [2]:
# Configuration

cwd = os.getcwd()
book_folder = "/data/"
dubliners = 'file://'+cwd+book_folder+"Dubliners_James_Joyce.txt.gz"
ulysses= 'file://'+cwd+book_folder+"Ulysses_James_Joyce.txt.gz"

In [3]:
# Functions to remove ponctuation and empty lines

def removePunctuation(column):
    return trim(lower(regexp_replace(column,'[^\sa-zA-Z0-9]',''))).alias('text')

class RemoveEmptyLines(Transformer):
    def __init__(self, column: StringType() ):
        super(RemoveEmptyLines, self).__init__()
        self.column = column

    def _transform(self, df: DataFrame) -> DataFrame:
        return df.withColumn(self.column, when(size(col(self.column)) == 0, lit(None)).otherwise(col(self.column))).na.drop()
        #return df

## Create Spark Session

In [4]:
# Create local Spark session
sc = SparkSession.builder\
     .appName("SparkSimilarityLSH")\
     .master("local[*]") \
     .getOrCreate()

start_time = time.time()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/15 09:35:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Reading Data

In [5]:
# Read de book file: Ulysses 
text_1 = sc.read.text(ulysses)
text_1.show(10, truncate = False)
text_1 = text_1.filter("value != ''")
text_1.show(10, truncate = False)
text_1 = text_1.select(removePunctuation(col('value')))
text_1 = text_1.withColumnRenamed('value', 'text')
text_1.show(10, truncate = False)

+-------------------------------------------------------------------+
|value                                                              |
+-------------------------------------------------------------------+
|                                                                   |
|The Project Gutenberg EBook of Ulysses, by James Joyce             |
|                                                                   |
|This eBook is for the use of anyone anywhere at no cost and with   |
|almost no restrictions whatsoever. You may copy it, give it away or|
|re-use it under the terms of the Project Gutenberg License included|
|with this eBook or online at www.gutenberg.org                     |
|                                                                   |
|                                                                   |
|Title: Ulysses                                                     |
+-------------------------------------------------------------------+
only showing top 10 

In [6]:
# Read de book file:  Dubliner
text_2 = sc.read.text(dubliners)
text_2 = text_2.filter("value != ''")
text_2 = text_2.select(removePunctuation(col('value')))
text_2 = text_2.withColumnRenamed('value', 'text')
text_2.show(10, truncate = False)

+------------------------------------------------------------------+
|text                                                              |
+------------------------------------------------------------------+
|the project gutenberg ebook of dubliners by james joyce           |
|this ebook is for the use of anyone anywhere at no cost and with  |
|almost no restrictions whatsoever you may copy it give it away or |
|reuse it under the terms of the project gutenberg license included|
|with this ebook or online at wwwgutenbergorg                      |
|title dubliners                                                   |
|author james joyce                                                |
|release date september 2001 ebook 2814                            |
|last updated january 20 2019                                      |
|language english                                                  |
+------------------------------------------------------------------+
only showing top 10 rows



## Calculate Similarity

### Tokenizer

In [7]:
tokenizer = RegexTokenizer(pattern='\s+', inputCol="text", outputCol="tokens", minTokenLength=3, toLowercase=True)

tokenData = tokenizer.transform(text_1)
tokenData.show(truncate = False)

+------------------------------------------------------------------+----------------------------------------------------------------------+
|text                                                              |tokens                                                                |
+------------------------------------------------------------------+----------------------------------------------------------------------+
|the project gutenberg ebook of ulysses by james joyce             |[the, project, gutenberg, ebook, ulysses, james, joyce]               |
|this ebook is for the use of anyone anywhere at no cost and with  |[this, ebook, for, the, use, anyone, anywhere, cost, and, with]       |
|almost no restrictions whatsoever you may copy it give it away or |[almost, restrictions, whatsoever, you, may, copy, give, away]        |
|reuse it under the terms of the project gutenberg license included|[reuse, under, the, terms, the, project, gutenberg, license, included]|
|with this ebook or 

### Shingling

In [8]:
ngram = NGram(n=3, inputCol="tokens", outputCol="ngrams")
ngramData = ngram.transform(tokenData)

rememptylines = RemoveEmptyLines(column = "ngrams")
ngramData = rememptylines.transform(ngramData)

ngramData.show(truncate=False)

+---------------------------------------------------------------------+------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|text                                                                 |tokens                                                                        |ngrams                                                                                                                                                                             |
+---------------------------------------------------------------------+------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|the pr

### Counting Hash

In [9]:
hash_tf = HashingTF(inputCol="ngrams", outputCol="vectors")

hashtfData = hash_tf.transform(ngramData)
hashtfData.show() 

+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|
+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[57299,74...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|
|release date augu...|[release, date, a...|[release date aug...|(262144,[63333,72...|
|last updated octo...|[last, updated, o...|[last updated oct...|(262144,[25383,13...|
|character set enc...|[character, set, ...|[character 

### Min-Hashing

In [10]:
minhash = MinHashLSH(inputCol="vectors", outputCol="lsh", numHashTables=3).fit(hashtfData)

minhashData = minhash.transform(hashtfData)
minhashData.show() #truncate=False)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|                 lsh|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[57299,74...|[[5.5438488E7], [...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|[[3533150.0], [4....|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|[[5.25455278E8], ...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|[[8.1880145E7], [...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|[[3.92004444E8], ...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|[[1.40204519E9], ...|
|release date augu...|[release, date,

#### Using Pipeline

In [11]:
pipeline = Pipeline(stages=[
            tokenizer,
            ngram,
            rememptylines,
            hash_tf,
            minhash
        ])

model= pipeline.fit(text_1)

text_A = model.transform(text_1)
text_B = model.transform(text_2)

text_A.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|                 lsh|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[57299,74...|[[5.5438488E7], [...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|[[3533150.0], [4....|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|[[5.25455278E8], ...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|[[8.1880145E7], [...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|[[3.92004444E8], ...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|[[1.40204519E9], ...|
|release date augu...|[release, date,

In [12]:
text_B.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|              tokens|              ngrams|             vectors|                 lsh|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|the project guten...|[the, project, gu...|[the project gute...|(262144,[74024,11...|[[2.69028398E8], ...|
|this ebook is for...|[this, ebook, for...|[this ebook for, ...|(262144,[17652,18...|[[3533150.0], [4....|
|almost no restric...|[almost, restrict...|[almost restricti...|(262144,[37352,11...|[[5.25455278E8], ...|
|reuse it under th...|[reuse, under, th...|[reuse under the,...|(262144,[74024,89...|[[8.1880145E7], [...|
|with this ebook o...|[with, this, eboo...|[with this ebook,...|(262144,[61204,17...|[[3.92004444E8], ...|
|  author james joyce|[author, james, j...|[author james joyce]|(262144,[190411],...|[[1.40204519E9], ...|
|release date sept...|[release, date,

### Locality-Sensitive Hashing (LSH)

In [13]:
rows_text_A = text_A.count()
rows_text_B = text_B.count()

# Show similarity with Jaccard Distance below 0.9  minhashData
result_A_B = model.stages[-1].approxSimilarityJoin(text_A, text_B, 0.9, distCol="JaccardDistance")
result_A_B.show()

rows_result_A_B = result_A_B.count()
simil_index_AB = rows_result_A_B / rows_text_B * 100
print("Similarity Ulysses x Dubliners = ",simil_index_AB, " %")

                                                                                

+--------------------+--------------------+------------------+
|            datasetA|            datasetB|   JaccardDistance|
+--------------------+--------------------+------------------+
|{second tankard t...|{i heard that tha...|0.8571428571428572|
|{martin cunningha...|{he told his hear...|             0.875|
|{gutenberg associ...|{gutenberg associ...|               0.0|
|{or corrupt data ...|{or corrupt data ...|               0.0|
|{without further ...|{without further ...|               0.0|
|{gazing far away ...|{anything but joe...|0.8888888888888888|
|{1e1 the followin...|{1e1 the followin...|               0.0|
|{gutenberg litera...|{the project gute...|               0.8|
|{payments should ...|{payments should ...|               0.0|
|{professor michae...|{professor michae...|               0.0|
|{subscribe to our...|{subscribe to our...|               0.0|
|{extr taraxel lig...|{we didnt learn t...|0.8571428571428572|
|{updated editions...|{updated editions...|            

[Stage 21:>                                                         (0 + 1) / 1]

Similarity Ulysses x Dubliners =  15.121467526028756  %


                                                                                

In [14]:
# Show similarity with Jaccard Distance below 0.5
result_A_A = model.stages[-1].approxSimilarityJoin(text_A, text_A, 0.5, distCol="JaccardDistance")
result_A_A .show()

simil_index_AA = result_A_A.count() / rows_text_A * 100
print("Similarity Ulysses x Ulysses = ",simil_index_AA, " %")

                                                                                

+--------------------+--------------------+---------------+
|            datasetA|            datasetB|JaccardDistance|
+--------------------+--------------------+---------------+
|{haines from the ...|{haines from the ...|            0.0|
|{that they never ...|{that they never ...|            0.0|
|{curran ten guine...|{curran ten guine...|            0.0|
|{two the two mari...|{two the two mari...|            0.0|
|{she likes in the...|{she likes in the...|            0.0|
|{l ci darem with ...|{l ci darem with ...|            0.0|
|{remind her of th...|{remind her of th...|            0.0|
|{yes yes mr bloom...|{yes yes mr bloom...|            0.0|
|{so often you hav...|{so often you hav...|            0.0|
|{fingering still ...|{fingering still ...|            0.0|
|{now quite right ...|{now quite right ...|            0.0|
|{cry fishs face b...|{cry fishs face b...|            0.0|
|{my kneecap is hu...|{my kneecap is hu...|            0.0|
|{gone bad well an...|{gone bad well an.

[Stage 32:>                                                         (0 + 1) / 1]

Similarity Ulysses x Ulysses =  100.1751313485114  %


                                                                                

## Finishing

In [15]:
print("--- Execution time: %s seconds ---" % (time.time() - start_time))
# Stop Spark
sc.stop()

--- Execution time: 41.9450900554657 seconds ---
