# PySpark Notebook
1. Run PostgreSQL ddl script
2. Load CSV Data files
3. Write Data to PostgreSQL source db
4. Analyze Data with Spark SQL
5. Transform data into hash values
6. Write Data to PostgreSQL target db

_Prepared by: Noam Marianne_

### Run PostgreSQL Script
Run the PostgreSQL DDL script (`notifications_ddl.py`) before loading any data.

In [1]:
# ! pip install psycopg2-binary --upgrade --quiet

In [11]:
# Execute notifications_ddl.py inside this notebook's namespace (-i).
# NOTE(review): presumably this (re)creates the PostgreSQL tables used below — confirm in the script.
%run -i 'notifications_ddl.py'

In [12]:
import hashlib
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [13]:
# Build (or reuse) a Spark session running locally on all available cores.
# The PostgreSQL JDBC driver jar is added to the driver classpath so the
# 'jdbc' reads and writes further down can load org.postgresql.Driver.
spark = (
    SparkSession.builder
    .appName('pyspark_demo_app')
    .config('spark.driver.extraClassPath', 'postgresql-42.2.10.jar')
    .master("local[*]")
    .getOrCreate()
)

### declare PostgreSQL source prop

In [14]:
# JDBC connection settings shared by every read/write cell below.
# Credentials are read from the environment (POSTGRES_USER / POSTGRES_PASSWORD)
# so real secrets never have to be saved in the notebook. The fallbacks are the
# local demo values only and must not be relied on outside a throwaway setup.
properties = {
    'driver': 'org.postgresql.Driver',
    # Source database: receives the raw CSV rows.
    'url': 'jdbc:postgresql://postgres:5432/source',
    # Target database: receives the hashed rows.
    'target_url': 'jdbc:postgresql://postgres:5432/target',
    'user': os.environ.get('POSTGRES_USER', 'postgres'),
    'password': os.environ.get('POSTGRES_PASSWORD', 'postgres1234'),
    # The same table name is used in both databases.
    'dbtable': 'notifications',
}

### Load CSV data
Load the CSV file into a DataFrame. Schema inference is disabled, so every column is read as a string.

In [15]:
# Where the input lives and which reader to use
file_location = "input_files/notifications_de.csv"
file_type = "csv"

# Reader settings: keep every column as a string, treat the first row as
# the header, split fields on commas.
infer_schema = "false"
first_row_is_header = "true"
delimiter = ","

# These options only apply to CSV; other file types ignore them.
notifications_df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load(file_location)
)

notifications_df.printSchema()

root
 |-- study_uid: string (nullable = true)
 |-- notification_time: string (nullable = true)
 |-- patient: string (nullable = true)
 |-- users: string (nullable = true)



In [16]:
# Preview the loaded rows, then report how many rows the CSV produced.
notifications_df.show()
notifications_df.count()

+----------------+--------------------+----------+--------------------+
|       study_uid|   notification_time|   patient|               users|
+----------------+--------------------+----------+--------------------+
|0015d45c69610929|2020-02-04 00:15:...|5f8756d594|['3a0059f6-d1d5-4...|
|0059463f90978066|2020-02-07 14:54:...|0d6be9f6c1|['0f7f6896-aaf2-4...|
|02751c6845a97eea|2020-01-04 15:45:...|0b65c7ead0|['dbfaecd1-30b8-4...|
|04d5018024393334|2020-01-15 20:15:...|172a90cf07|['7446f7a5-4d53-4...|
|0576cf8cb6808188|2020-03-17 13:52:...|7e8cae205a|['d803830f-b0b9-4...|
|06565643d6a1d401|2020-01-23 07:41:...|91cc6691ba|['7e0dc9cb-d33b-4...|
|06c045b9a4b76e85|2020-02-25 09:10:...|bd5965dac7|['0d9113d0-8ac9-4...|
|08ec378fbfaa58bd|2020-01-24 23:28:...|11b99e8fd1|['17f412c9-6c39-4...|
|0a4b88135a000496|2020-02-10 01:44:...|9f25e9f8b6|['43a9f7ff-397f-4...|
|0b585e959b612cfc|2020-02-17 15:59:...|3209dd5920|['73cdb8ba-590f-4...|
|0d6039faed9c7347|2020-01-18 14:22:...|df8678fe3f|['0d9113d0-8ac

94

In [None]:
### enrich df with cre_datetime column

In [17]:
# Stamp every row with the current timestamp in a new cre_datetime column,
# then print the schema to confirm the column was added.
notifications_df = notifications_df.withColumn(
    "cre_datetime",
    F.current_timestamp(),
)
notifications_df.printSchema()

root
 |-- study_uid: string (nullable = true)
 |-- notification_time: string (nullable = true)
 |-- patient: string (nullable = true)
 |-- users: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = false)



In [18]:
# Preview the rows again, now including the cre_datetime column.
notifications_df.show()

+----------------+--------------------+----------+--------------------+--------------------+
|       study_uid|   notification_time|   patient|               users|        cre_datetime|
+----------------+--------------------+----------+--------------------+--------------------+
|0015d45c69610929|2020-02-04 00:15:...|5f8756d594|['3a0059f6-d1d5-4...|2022-03-12 19:32:...|
|0059463f90978066|2020-02-07 14:54:...|0d6be9f6c1|['0f7f6896-aaf2-4...|2022-03-12 19:32:...|
|02751c6845a97eea|2020-01-04 15:45:...|0b65c7ead0|['dbfaecd1-30b8-4...|2022-03-12 19:32:...|
|04d5018024393334|2020-01-15 20:15:...|172a90cf07|['7446f7a5-4d53-4...|2022-03-12 19:32:...|
|0576cf8cb6808188|2020-03-17 13:52:...|7e8cae205a|['d803830f-b0b9-4...|2022-03-12 19:32:...|
|06565643d6a1d401|2020-01-23 07:41:...|91cc6691ba|['7e0dc9cb-d33b-4...|2022-03-12 19:32:...|
|06c045b9a4b76e85|2020-02-25 09:10:...|bd5965dac7|['0d9113d0-8ac9-4...|2022-03-12 19:32:...|
|08ec378fbfaa58bd|2020-01-24 23:28:...|11b99e8fd1|['17f412c9-6c39-4...

### Write to PostgreSQL source db Table

In [19]:
# Persist the enriched rows to the source database over JDBC.
# 'append' adds to the existing table, so re-running this cell inserts the
# same rows again unless the table is reset first.
(
    notifications_df.write
    .format('jdbc')
    .option('driver', properties['driver'])
    .option('url', properties['url'])
    .option('user', properties['user'])
    .option('password', properties['password'])
    .option('dbtable', properties['dbtable'])
    .mode('append')
    .save()
)

In [None]:
### read data from PostgreSQL source db Table

In [20]:
# Read the notifications table back from the source database to verify the
# write and to use it as the input for the analysis and hashing steps.
notifications_df_from_db = (
    spark.read
    .format('jdbc')
    .option('driver', properties['driver'])
    .option('url', properties['url'])
    .option('user', properties['user'])
    .option('password', properties['password'])
    .option('dbtable', properties['dbtable'])
    .load()
)

notifications_df_from_db.printSchema()

root
 |-- study_uid: string (nullable = true)
 |-- notification_time: string (nullable = true)
 |-- patient: string (nullable = true)
 |-- users: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = true)



In [21]:
# Show the first 10 rows read from the source db and count all rows.
notifications_df_from_db.show(10)
notifications_df_from_db.count()

+----------------+--------------------+----------+--------------------+--------------------+
|       study_uid|   notification_time|   patient|               users|        cre_datetime|
+----------------+--------------------+----------+--------------------+--------------------+
|0015d45c69610929|2020-02-04 00:15:...|5f8756d594|['3a0059f6-d1d5-4...|2022-03-12 19:32:...|
|0059463f90978066|2020-02-07 14:54:...|0d6be9f6c1|['0f7f6896-aaf2-4...|2022-03-12 19:32:...|
|02751c6845a97eea|2020-01-04 15:45:...|0b65c7ead0|['dbfaecd1-30b8-4...|2022-03-12 19:32:...|
|04d5018024393334|2020-01-15 20:15:...|172a90cf07|['7446f7a5-4d53-4...|2022-03-12 19:32:...|
|0576cf8cb6808188|2020-03-17 13:52:...|7e8cae205a|['d803830f-b0b9-4...|2022-03-12 19:32:...|
|06565643d6a1d401|2020-01-23 07:41:...|91cc6691ba|['7e0dc9cb-d33b-4...|2022-03-12 19:32:...|
|06c045b9a4b76e85|2020-02-25 09:10:...|bd5965dac7|['0d9113d0-8ac9-4...|2022-03-12 19:32:...|
|08ec378fbfaa58bd|2020-01-24 23:28:...|11b99e8fd1|['17f412c9-6c39-4...

94

### Analyze Data with Spark SQL
Use Spark SQL to count how many notifications have a `study_uid` and how many do not.

In [22]:
# Expose the source-db DataFrame to Spark SQL under the name "notifications".
notifications_df_from_db.createOrReplaceTempView("notifications")

# One result row: number of rows with a study_uid and number of rows without.
df_sql = spark.sql(
    "SELECT sum(case when study_uid is null then 0 else 1 end) as cnt_study_with_id, "
    "sum(case when study_uid is null then 1 else 0 end) as cnt_study_without_id FROM notifications"
)
df_sql.show(10)

+-----------------+--------------------+
|cnt_study_with_id|cnt_study_without_id|
+-----------------+--------------------+
|               94|                   0|
+-----------------+--------------------+



In [None]:
### Transform Data

In [23]:
#   Define the UDF function
def algo(input_string):
    """Return the SHA-256 hex digest of ``input_string`` (UTF-8 encoded).

    Falsy input (``None`` or the empty string) is hashed as the literal
    text "none", so the function always returns a 64-character hex string.
    """
    text = input_string if input_string else "none"
    return hashlib.sha256(text.encode("utf-8")).hexdigest()

#   Register the UDF function under the SQL name "algo" so that spark.sql()
#   queries can call algo(<column>); the returned handle is kept in algo_udf.
algo_udf = spark.udf.register("algo", algo)

In [24]:
# (Re)register the view so this cell can run on its own.
notifications_df_from_db.createOrReplaceTempView("notifications")

# Replace study_uid, patient and users with their algo() hashes, keep
# notification_time as-is, and stamp the rows with a fresh cre_datetime.
df_sql = spark.sql(
    "SELECT algo(study_uid) as study_uid, "
    "notification_time, "
    "algo(patient) as patient, "
    "algo(users) as users, "
    "current_timestamp() as cre_datetime "
    "FROM notifications"
)
df_sql.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|           study_uid|   notification_time|             patient|               users|        cre_datetime|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|6ba45a43203d0abdd...|2020-02-04 00:15:...|62f8be4752ca8bd36...|86f5cf619f2752606...|2022-03-12 19:32:...|
|8955ac073e0d939eb...|2020-02-07 14:54:...|f6d830d544c22d41d...|e1e333f683471a50f...|2022-03-12 19:32:...|
|58cff85daeb2f2caf...|2020-01-04 15:45:...|a1689d3c646bdcf41...|53837b0c12daff4df...|2022-03-12 19:32:...|
|074239cc33a4f8cae...|2020-01-15 20:15:...|f9801e7ed64829a63...|3e15d21ed730ad5ac...|2022-03-12 19:32:...|
|159fdad72e8102c0f...|2020-03-17 13:52:...|fb2c9564b1a28cb67...|eba2dcf456842b16e...|2022-03-12 19:32:...|
|1fa426fd5145e49f8...|2020-01-23 07:41:...|0dec190452af43270...|3e4bcef112b96e76f...|2022-03-12 19:32:...|
|0495ce13f998a49d3...|2020-02-25 09:1

In [8]:
### Write df_sql (hashed notifications) to PostgreSQL target db Table

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- user_uid: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = true)
 |-- name_hashed: string (nullable = true)
 |-- address_hashed: string (nullable = true)
 |-- user_uid_hashed: string (nullable = true)



In [25]:
# Persist the hashed rows to the target database over JDBC.
# Same settings as the source write except for the URL; 'append' means a
# re-run adds the rows again unless the table is reset first.
(
    df_sql.write
    .format('jdbc')
    .option('driver', properties['driver'])
    .option('url', properties['target_url'])
    .option('user', properties['user'])
    .option('password', properties['password'])
    .option('dbtable', properties['dbtable'])
    .mode('append')
    .save()
)

In [None]:
### read data from PostgreSQL target db Table

In [26]:
# Read the notifications table back from the target database to confirm
# that the hashed rows were written.
notifications_df_from_target_db = (
    spark.read
    .format('jdbc')
    .option('driver', properties['driver'])
    .option('url', properties['target_url'])
    .option('user', properties['user'])
    .option('password', properties['password'])
    .option('dbtable', properties['dbtable'])
    .load()
)

notifications_df_from_target_db.printSchema()

root
 |-- study_uid: string (nullable = true)
 |-- notification_time: string (nullable = true)
 |-- patient: string (nullable = true)
 |-- users: string (nullable = true)
 |-- cre_datetime: timestamp (nullable = true)



In [27]:
# Show the first 10 hashed rows as stored in the target db.
notifications_df_from_target_db.show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|           study_uid|   notification_time|             patient|               users|        cre_datetime|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|6ba45a43203d0abdd...|2020-02-04 00:15:...|62f8be4752ca8bd36...|86f5cf619f2752606...|2022-03-12 19:33:...|
|8955ac073e0d939eb...|2020-02-07 14:54:...|f6d830d544c22d41d...|e1e333f683471a50f...|2022-03-12 19:33:...|
|58cff85daeb2f2caf...|2020-01-04 15:45:...|a1689d3c646bdcf41...|53837b0c12daff4df...|2022-03-12 19:33:...|
|074239cc33a4f8cae...|2020-01-15 20:15:...|f9801e7ed64829a63...|3e15d21ed730ad5ac...|2022-03-12 19:33:...|
|159fdad72e8102c0f...|2020-03-17 13:52:...|fb2c9564b1a28cb67...|eba2dcf456842b16e...|2022-03-12 19:33:...|
|1fa426fd5145e49f8...|2020-01-23 07:41:...|0dec190452af43270...|3e4bcef112b96e76f...|2022-03-12 19:33:...|
|0495ce13f998a49d3...|2020-02-25 09:1