In [1]:
import os
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp

from pyspark.sql.types import StructType, StructField, StringType, IntegerType
# Configure how the Spark JVM is launched: put the MySQL Connector/J 8.0.19 jar
# on the driver classpath and pass the same jar via --jars.
# NOTE(review): the jar location is an absolute, machine-specific path and will
# need adjusting on any other host.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--driver-class-path /home/marcelo/libs/mysql-connector-java-8.0.19.jar --jars /home/marcelo/libs/mysql-connector-java-8.0.19.jar pyspark-shell'

import findspark

# Request the same connector version by its Maven coordinates.
# NOTE(review): this looks redundant with the local jar configured above --
# confirm whether both mechanisms are really needed.
findspark.add_packages('mysql:mysql-connector-java:8.0.19')

import sys
# Install thefuzz with the pip of the interpreter running this kernel
# (sys.executable). The version is not pinned.
!{sys.executable} -m pip install thefuzz





In [2]:
# Create (or reuse) the Spark session for this notebook, with master "local[*]".
spark = (
    SparkSession.builder
    .appName('POC ETL')
    .master("local[*]")
    .getOrCreate()
)

### Reading Data From JDBC

In [53]:
# Connection settings shared by every JDBC read in this notebook.
# Credentials are taken from the environment when available; the fallbacks keep
# the notebook runnable against the local demo database.
# NOTE(review): the fallback password is still embedded in the notebook -- drop
# the defaults once MYSQL_USER / MYSQL_PASSWORD are provisioned.
JDBC_OPTIONS = {
    "url": "jdbc:mysql://mysql:3306/persondb",
    # Connector/J 8.x registers its driver as com.mysql.cj.jdbc.Driver; the
    # legacy com.mysql.jdbc.Driver name is deprecated and only kept as an alias.
    "driver": "com.mysql.cj.jdbc.Driver",
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "123456"),
}


def read_jdbc_table(table):
    """Load one table of the persondb database as a Spark DataFrame.

    Args:
        table: Name of the table to read (passed as the JDBC ``dbtable`` option).

    Returns:
        A DataFrame backed by the JDBC source described by ``JDBC_OPTIONS``.
    """
    return (
        spark.read.format("jdbc")
        .options(**JDBC_OPTIONS)
        .option("dbtable", table)
        .load()
    )


address_df = read_jdbc_table("address")
phone_df = read_jdbc_table("phone")


address_df.show()

phone_df.show()

+--------------------+--------------------+-----------------+---------+-------+-----------------+--------------------+
|                  id|           person_id|            line1|    line2|zipcode|             city|             country|
+--------------------+--------------------+-----------------+---------+-------+-----------------+--------------------+
|00014071-9237-4c2...|4b33720c-9434-459...|       Olson Ramp|Suite 518|  67839|     Chelseaville|French Southern T...|
|0003dc42-4a96-458...|271f4c7c-0d7e-4b9...|  Kenneth Squares|Suite 589|  56824|       Jeremybury|               Malta|
|00041403-b684-439...|74de06cd-1dd1-4a2...|     Pierce Walks| Apt. 039|  02452| West Deborahport|            Tanzania|
|00044b19-fd6c-467...|b01fac40-84ef-4ba...|    Hunter Bypass|Suite 718|  82920|      South Larry|              Guyana|
|000b85a3-8119-41e...|00c80261-400a-46c...| Hernandez Hollow| Apt. 962|  15283|     Lake Natasha|             Bolivia|
|000eb31b-5824-4a0...|92ef5699-65a9-422...|     

### ETL

In [55]:
# Group and create 2 columns to compare
from pyspark.sql.functions import udf
from thefuzz import fuzz
from thefuzz import process
from pyspark.sql import functions as F
from itertools import combinations
from IPython.display import display
from pyspark.sql.functions import col, size


# Create new column to compare
import hashlib
from pyspark.sql.functions import concat_ws
from pyspark.sql.functions import upper
from pyspark.sql.types import StructType, StructField, IntegerType, StringType


def md5_str(val):
    """Return the hexadecimal MD5 digest of the string ``val``.

    ``val`` is encoded with ``str.encode()``'s default encoding before hashing.
    """
    return hashlib.md5(val.encode()).hexdigest()

@udf(returnType=StringType())
def md5_udf_func(val):
    """Spark UDF returning the MD5 hex digest of a string column value.

    Delegates to ``md5_str``. The previous body called an undefined name
    (``md5``), which would raise NameError as soon as Spark evaluated the column.

    Args:
        val: The string value of the column for the current row, or None.

    Returns:
        The hexadecimal MD5 digest, or None when ``val`` is None.
    """
    # A null column value arrives as None; pass it through instead of failing
    # on ``None.encode()``.
    if val is None:
        return None
    return md5_str(val)


@udf(returnType=StructType([
    StructField("ratio", IntegerType(), False),
    StructField("md5_elem_1", StringType(), False),
    StructField("md5_elem_2", StringType(), False)
    ])
)
def fuzz_udf_func(arr):
    """Spark UDF that finds the most similar pair of strings in ``arr``.

    Every unordered pair of elements is scored with ``fuzz.ratio``; the first
    pair reaching the highest score wins.

    Args:
        arr: List of strings (here: the collected addresses of one person).

    Returns:
        A ``(ratio, md5_elem_1, md5_elem_2)`` tuple holding the best score and
        the MD5 digests of the two strings of the winning pair. When ``arr``
        has fewer than two elements there is no pair to compare and
        ``(-1, "", "")`` is returned.
    """
    best_ratio = -1
    best_pair = None
    # ``arr or []`` also covers a null array, which would arrive as None.
    for first, second in combinations(arr or [], 2):
        ratio = fuzz.ratio(first, second)
        # Strict comparison keeps the first pair among equally good ones.
        if ratio > best_ratio:
            best_ratio = ratio
            best_pair = (first, second)

    # No pair was scored: return the sentinel instead of referencing result
    # variables that were never assigned (the original raised
    # UnboundLocalError here).
    if best_pair is None:
        return (best_ratio, "", "")

    # Hash only the winning pair rather than on every improvement in the loop.
    return (best_ratio, md5_str(best_pair[0]), md5_str(best_pair[1]))


# One upper-cased, comma-joined address string per row, plus its MD5 digest.
address_parts = ["line1", 'line2', 'zipcode', 'city', 'country']
address_df = (
    address_df
    .withColumn("complete_address", upper(concat_ws(",", *address_parts)))
    .withColumn("md5_complete_address", md5_udf_func('complete_address'))
)


# Collect each person's addresses, keep only people with more than one, and
# score the most similar pair of addresses for each of them.
addresses_grouped_df = (
    address_df
    .groupBy('person_id')
    .agg(F.collect_list("complete_address").alias('addresses'))
    .withColumn('addresses_size', size("addresses"))
    .where(col("addresses_size") > 1)
    .withColumn("fuzz_info", fuzz_udf_func('addresses'))
)
addresses_grouped_df.show()


+--------------------+--------------------+--------------+--------------------+
|           person_id|           addresses|addresses_size|           fuzz_info|
+--------------------+--------------------+--------------+--------------------+
|003de947-d36e-401...|[ANTHONY ROAD,APT...|             2|{100, f7095ccf6d2...|
|0043773f-249c-483...|[GIBSON KEY,APT. ...|             2|{100, 7ec22eea166...|
|004e1ea4-f969-4c3...|[CRUZ COMMON,SUIT...|             2|{100, 78c2674500e...|
|0057ea2c-5124-450...|[ANTHONY TRACE,AP...|             2|{100, 151d0398076...|
|005c70ef-a685-498...|[HAYES SPUR,APT. ...|             2|{100, 696df0905be...|
|006a9878-cfff-4f7...|[JACOB FORK,SUITE...|             2|{100, 00cececfc06...|
|00724e3e-c0b9-4ec...|[BRANCH THROUGHWA...|             2|{100, 6b2f0e22b9b...|
|0086d2c6-d5b2-45f...|[SHEILA FLATS,APT...|             2|{100, af04db6ed2c...|
|008cb3f9-27c9-421...|[WANG TRAIL,SUITE...|             2|{100, 5312fe883e0...|
|00a542e5-02c4-4ba...|[GARCIA OVERPASS,.

In [56]:
# Flatten the fuzz_info struct into top-level columns for inspection.
fuzz_columns = ["person_id", "fuzz_info.ratio", "fuzz_info.md5_elem_1", "fuzz_info.md5_elem_2"]
addresses_grouped_df.select(*fuzz_columns).show()

+--------------------+-----+--------------------+--------------------+
|           person_id|ratio|          md5_elem_1|          md5_elem_2|
+--------------------+-----+--------------------+--------------------+
|003de947-d36e-401...|  100|f7095ccf6d282c6e9...|f7095ccf6d282c6e9...|
|0043773f-249c-483...|  100|7ec22eea166a33eb9...|7ec22eea166a33eb9...|
|004e1ea4-f969-4c3...|  100|78c2674500e560c8f...|78c2674500e560c8f...|
|0057ea2c-5124-450...|  100|151d039807625d401...|151d039807625d401...|
|005c70ef-a685-498...|  100|696df0905be0dd8f2...|696df0905be0dd8f2...|
|006a9878-cfff-4f7...|  100|00cececfc06df3e38...|00cececfc06df3e38...|
|00724e3e-c0b9-4ec...|  100|6b2f0e22b9bb07e98...|6b2f0e22b9bb07e98...|
|0086d2c6-d5b2-45f...|  100|af04db6ed2cbee595...|af04db6ed2cbee595...|
|008cb3f9-27c9-421...|  100|5312fe883e0ad2551...|5312fe883e0ad2551...|
|00a542e5-02c4-4ba...|  100|9cf5ea4a68fd5d2af...|9cf5ea4a68fd5d2af...|
|00d07698-45aa-48c...|  100|529381c639ecf4f2c...|529381c639ecf4f2c...|
|00e5c

In [57]:
# Persist the grouped result as Parquet, replacing any previous output.
# NOTE(review): no filter on fuzz_info.ratio is applied before writing, so every
# person with more than one address is saved here, whatever the score --
# confirm this is intended given the "identical_addresses" output name.
(
    addresses_grouped_df.write
    .mode('overwrite')
    .parquet("identical_addresses")
)