# PySpark Notebook
1. Run PostgreSQL ddl script
2. Load CSV Data files
3. Write Data to PostgreSQL source db
4. Analyze Data with Spark SQL
5. Transform data into hash values
6. Write Data to PostgreSQL target db

_Prepared by: [Noam Marianne]

### Run PostgreSQL Script
Run the PostgreSQL sql script

In [1]:
# ! pip install psycopg2-binary --upgrade --quiet

In [2]:
%run -i 'users_ddl.py'

In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import hashlib
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [4]:
spark = SparkSession \
    .builder \
    .appName('pyspark_demo_app') \
    .config('spark.driver.extraClassPath',
            'postgresql-42.2.10.jar') \
    .master("local[*]") \
    .getOrCreate()

### declare PostgreSQL source prop

In [5]:
properties = {
    'driver': 'org.postgresql.Driver',
    'url': 'jdbc:postgresql://postgres:5432/source',
    'target_url': 'jdbc:postgresql://postgres:5432/target',
    'user': 'postgres',
    'password': 'postgres1234',
    'dbtable': 'users',
}

### Create schema
load users csv into a DataFrame

In [6]:
# File location and type
file_location = "input_files/users_de.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
users_df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.option("header", first_row_is_header) \
.option("sep", delimiter) \
.load(file_location)

users_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- user_uid: string (nullable = true)



In [7]:
users_df.show()
users_df.count()

+------------------+--------------------+--------------------+
|              name|             address|            user_uid|
+------------------+--------------------+--------------------+
|     Jacob Lambert|    7992 Amber Trace|                null|
|     Port Clifford|           MI 17651"|62e50cd2-9844-4bf...|
|        Ryan Short|966 Dana Ford Apt...|                null|
|        South Cory|           ME 03533"|081dcf15-a0f3-401...|
|   Brian Henderson|8574 Kathryn Club...|                null|
|         Grantfort|           MA 30163"|67801cda-40e2-4cf...|
|Christopher Wilson|  7008 Hoffman Trail|                null|
|       New Amytown|           AL 76586"|1445efee-26e8-48d...|
|      Patrick Lane| 54302 White Parkway|                null|
|    Adrianachester|           WI 93258"|841bf619-e8d0-4ad...|
|      Bryan Prince|  PSC 0164, Box 5614|                null|
|     APO AE 05076"|a2d178e9-f431-4de...|                null|
|     Taylor Dudley|5941 Gonzales Uni...|              

200

In [None]:
### enrich df with cre_datetime column

In [8]:
users_df = users_df.withColumn("cre_datetime",F.current_timestamp())
users_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- user_uid: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = false)



In [9]:
users_df.show()

+------------------+--------------------+--------------------+--------------------+
|              name|             address|            user_uid|        cre_datetime|
+------------------+--------------------+--------------------+--------------------+
|     Jacob Lambert|    7992 Amber Trace|                null|2022-03-12 19:28:...|
|     Port Clifford|           MI 17651"|62e50cd2-9844-4bf...|2022-03-12 19:28:...|
|        Ryan Short|966 Dana Ford Apt...|                null|2022-03-12 19:28:...|
|        South Cory|           ME 03533"|081dcf15-a0f3-401...|2022-03-12 19:28:...|
|   Brian Henderson|8574 Kathryn Club...|                null|2022-03-12 19:28:...|
|         Grantfort|           MA 30163"|67801cda-40e2-4cf...|2022-03-12 19:28:...|
|Christopher Wilson|  7008 Hoffman Trail|                null|2022-03-12 19:28:...|
|       New Amytown|           AL 76586"|1445efee-26e8-48d...|2022-03-12 19:28:...|
|      Patrick Lane| 54302 White Parkway|                null|2022-03-12 19:

### Write to PostgreSQL source db Table

In [10]:
users_df.write \
    .format('jdbc') \
    .option('driver', properties['driver']) \
    .option('url', properties['url']) \
    .option('user', properties['user']) \
    .option('password', properties['password']) \
    .option('dbtable', properties['dbtable']) \
    .mode('append') \
    .save()

In [None]:
### read data from PostgreSQL source db Table

In [11]:
users_df_from_db = spark.read \
    .format('jdbc') \
    .option('driver', properties['driver']) \
    .option('url', properties['url']) \
    .option('user', properties['user']) \
    .option('password', properties['password']) \
    .option('dbtable', properties['dbtable']) \
    .load()

users_df_from_db.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- user_uid: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = true)



In [12]:
users_df_from_db.show(10)
users_df_from_db.count()

+------------------+--------------------+--------------------+--------------------+
|              name|             address|            user_uid|        cre_datetime|
+------------------+--------------------+--------------------+--------------------+
|     Jacob Lambert|    7992 Amber Trace|                null|2022-03-12 19:28:...|
|     Port Clifford|           MI 17651"|62e50cd2-9844-4bf...|2022-03-12 19:28:...|
|        Ryan Short|966 Dana Ford Apt...|                null|2022-03-12 19:28:...|
|        South Cory|           ME 03533"|081dcf15-a0f3-401...|2022-03-12 19:28:...|
|   Brian Henderson|8574 Kathryn Club...|                null|2022-03-12 19:28:...|
|         Grantfort|           MA 30163"|67801cda-40e2-4cf...|2022-03-12 19:28:...|
|Christopher Wilson|  7008 Hoffman Trail|                null|2022-03-12 19:28:...|
|       New Amytown|           AL 76586"|1445efee-26e8-48d...|2022-03-12 19:28:...|
|      Patrick Lane| 54302 White Parkway|                null|2022-03-12 19:

200

### Analyze Data with Spark SQL

In [13]:
users_df_from_db.createOrReplaceTempView("users")
df_sql = spark.sql("SELECT sum(case when user_uid is null then 0 else 1 end) as cnt_users_with_id, " +
                "sum(case when user_uid is null then 1 else 0 end) as cnt_users_without_id FROM users")
df_sql.show(10)

+-----------------+--------------------+
|cnt_users_with_id|cnt_users_without_id|
+-----------------+--------------------+
|               90|                 110|
+-----------------+--------------------+



In [None]:
### Transform Data

In [14]:
#   Define the UDF function
def algo(input_string):
    if (input_string):
        encoded_string = input_string.encode("utf-8")
    else:
        encoded_string = "none".encode("utf-8")
    return hashlib.sha256(encoded_string).hexdigest()

#   Register the UDF function.
algo_udf = spark.udf.register("algo", algo)

In [15]:
users_df_from_db.createOrReplaceTempView("users")
df_sql = spark.sql("SELECT algo(name) as name, algo(address) as address, algo(user_uid) as user_uid, current_timestamp() as cre_datetime FROM users")
df_sql.show(10)

+--------------------+--------------------+--------------------+--------------------+
|                name|             address|            user_uid|        cre_datetime|
+--------------------+--------------------+--------------------+--------------------+
|fd1aa8d0bd441275a...|bb9e087631b08eda3...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|46185a59c07cc6ab8...|29e02c36d792893e1...|f7ab56e55b0a35a11...|2022-03-12 19:28:...|
|33ef16bc42d0dcd92...|befa6f7fca9b370e5...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|46e623e8f41abc5eb...|2e0e640962814f79a...|cf291219e9be361a0...|2022-03-12 19:28:...|
|eda0eafffd48a7df9...|a32f5b2f9c54a5bd3...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|a2f1ac6b5d7e6104a...|f5cd793677a84529e...|4438d389d7709e1e5...|2022-03-12 19:28:...|
|c7f9f0661c78bab99...|f564a512d15d7ce58...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|2026550fc3e67cc3a...|e7147d9715689a178...|b9c85274f66b0716b...|2022-03-12 19:28:...|
|ae23696d8db28761f...|6a051be4beb4a54eb...|140bedbf9c3

In [8]:
### Write users_df to PostgreSQL target db Table

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- user_uid: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = true)
 |-- name_hashed: string (nullable = true)
 |-- address_hashed: string (nullable = true)
 |-- user_uid_hashed: string (nullable = true)



In [16]:
df_sql.write \
    .format('jdbc') \
    .option('driver', properties['driver']) \
    .option('url', properties['target_url']) \
    .option('user', properties['user']) \
    .option('password', properties['password']) \
    .option('dbtable', properties['dbtable']) \
    .mode('append') \
    .save()

In [None]:
### read data from PostgreSQL target db Table

In [17]:
users_df_from_target_db = spark.read \
    .format('jdbc') \
    .option('driver', properties['driver']) \
    .option('url', properties['target_url']) \
    .option('user', properties['user']) \
    .option('password', properties['password']) \
    .option('dbtable', properties['dbtable']) \
    .load()

users_df_from_target_db.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- user_uid: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = true)



In [18]:
users_df_from_target_db.show(10)

+--------------------+--------------------+--------------------+--------------------+
|                name|             address|            user_uid|        cre_datetime|
+--------------------+--------------------+--------------------+--------------------+
|fd1aa8d0bd441275a...|bb9e087631b08eda3...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|46185a59c07cc6ab8...|29e02c36d792893e1...|f7ab56e55b0a35a11...|2022-03-12 19:28:...|
|33ef16bc42d0dcd92...|befa6f7fca9b370e5...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|46e623e8f41abc5eb...|2e0e640962814f79a...|cf291219e9be361a0...|2022-03-12 19:28:...|
|eda0eafffd48a7df9...|a32f5b2f9c54a5bd3...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|a2f1ac6b5d7e6104a...|f5cd793677a84529e...|4438d389d7709e1e5...|2022-03-12 19:28:...|
|c7f9f0661c78bab99...|f564a512d15d7ce58...|140bedbf9c3f6d56a...|2022-03-12 19:28:...|
|2026550fc3e67cc3a...|e7147d9715689a178...|b9c85274f66b0716b...|2022-03-12 19:28:...|
|ae23696d8db28761f...|6a051be4beb4a54eb...|140bedbf9c3