# ETL Ingestion

## Setup environment and import libraries

In [1]:
%mount_workspace_dir . --params umask=222,allow_other
import sys
sys.path.insert(0,"/home/emr-notebook/e-BZQPM2KW62NDCT2KPVNNE8UV6/")

The mount directory is already mounted. Skipping mounting.


### Upload the JDBC driver to S3 and declare its path in `spark.jars`

In [2]:
%%configure -f

{
    "conf": {
        "spark.jars": "s3://aws-glue-assets-999999999999-ap-southeast-1/jars/postgresql-42.3.3.jar,s3://aws-glue-assets-999999999999-ap-southeast-1/jars/emr_redshift_spark/minimal-json.jar,s3://aws-glue-assets-999999999999-ap-southeast-1/jars/emr_redshift_spark/RedshiftJDBC.jar,s3://aws-glue-assets-999999999999-ap-southeast-1/jars/emr_redshift_spark/spark-avro.jar,s3://aws-glue-assets-999999999999-ap-southeast-1/jars/emr_redshift_spark/spark-redshift.jar"
    }
}



In [3]:

# Imports used by the notebook. The `spark` session object is not created
# here; the kernel reports it as available once the session has started.
import os
import sys
from datetime import datetime
from pyspark.sql import SparkSession
# NOTE(review): the wildcard import copies the public names of
# pyspark.sql.functions into the notebook namespace, which can shadow Python
# builtins of the same name (for example sum, min, max). Only `col` is used by
# the visible cells -- confirm before relying on any builtin afterwards.
from pyspark.sql.functions import *
# Explicit import of the column helpers; this must stay after the wildcard
# import so that these bindings are the ones that win.
from pyspark.sql.functions import to_date, col, udf

# NOTE(review): this asks for the Scala 2.11 builds of spark-avro and
# spark-redshift via --packages. The Spark application appears to be started
# by the kernel independently of this assignment, and the same connector jars
# are already listed in spark.jars in the %%configure cell -- presumably this
# line has no effect on the running session; confirm before depending on it.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.2,io.github.spark-redshift-community:spark-redshift_2.11:4.0.1 pyspark-shell'


VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
95,application_1667433809375_0094,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Install boto3 on the cluster first:
`sudo pip3 install -U boto3`

In [4]:

# boto3 is expected to be installed on the cluster already (pip command in
# the note preceding this cell).
# Alternative left disabled: install it through the Spark context instead.
#   sc.install_pypi_package("boto3")
import boto3
import botocore

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
import sys

# Make the directory holding ETLUtility.py importable.
# NOTE(review): this directory differs from the one inserted on sys.path in
# the first cell (that one ends in ...UV6/) -- confirm which is correct.
sys.path.append("/home/emr-notebook/e-BZQPM2KW62NDCT2KPVNNE8UFF")

# Project helpers: bookmark persistence, the uppercase UDF and the
# latest-event-time lookup used by the cells that follow.
import ETLUtility
from ETLUtility import (
    get_bookmark_value,
    my_uppercase,
    find_latest_event_time,
    update_bookmark_value,
)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Get bookmark value from previous run

In [6]:
# Look up the bookmark stored under this job's key; the value is printed so
# the run records which starting point was used.
bookmark_name = "ecommerce_customer_activity_ts"
previous_bookmark_val = get_bookmark_value(bookmark_name)
print(previous_bookmark_val)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2022-11-04 06:20:23.441813

### Read from the PostgreSQL database, keeping only rows newer than the bookmark

In [7]:

# JDBC endpoint of the PostgreSQL source database.
db_url = "jdbc:postgresql://lakehouse-source-db.cluster-aaaaaaaaaa.ap-southeast-1.rds.amazonaws.com:5432/lakehouse_source_db"

# Incremental extract: only rows whose event_time is later than the bookmark
# from the previous run. The bookmark text is spliced directly into the SQL,
# so it has to match the 'YYYY-MM-DD HH24:MI:SS.US' pattern given to
# to_timestamp().
db_query = (
    "select item_id,user_id,event_type,event_time,discount "
    "from ecommerce.customer_activity_ts "
    "where event_time>to_timestamp('" + previous_bookmark_val + "', 'YYYY-MM-DD HH24:MI:SS.US')"
)

# Print the statement before touching the database so it is still visible
# if the read below fails.
print(db_query)

# Credentials are taken from the environment when provided so they do not
# have to live in the notebook. The literals are only fallbacks that keep the
# previous behaviour; drop them once the variables are configured for the
# Spark driver.
source_db_user = os.environ.get("SOURCE_DB_USER", "awsuser")
source_db_password = os.environ.get("SOURCE_DB_PASSWORD", "password")

jdbcDF = (
    spark.read.format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", db_url)
    .option("query", db_query)
    .option("user", source_db_user)
    .option("password", source_db_password)
    .option("fetchsize", 10000)
    .load()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

select item_id,user_id,event_type,event_time,discount from ecommerce.customer_activity_ts where event_time>to_timestamp('2022-11-04 06:20:23.441813', 'YYYY-MM-DD HH24:MI:SS.US')

In [8]:
# Preview the first ten rows read from the source table.
jdbcDF.show(n=10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------+-------------+--------------------+--------+
|             item_id|user_id|   event_type|          event_time|discount|
+--------------------+-------+-------------+--------------------+--------+
|1def0093-96b2-4cc...|   3156|ProductViewed|2022-11-04 07:46:...|      No|
|1def0093-96b2-4cc...|   3156|ProductViewed|2022-11-04 07:46:...|      No|
|4df77d59-732e-419...|    332|ProductViewed|2022-11-04 07:46:...|     Yes|
|4df77d59-732e-419...|    332|ProductViewed|2022-11-04 07:46:...|     Yes|
|31b83eb4-bd8a-4b5...|   3981|ProductViewed|2022-11-04 07:46:...|     Yes|
|31b83eb4-bd8a-4b5...|   3981|ProductViewed|2022-11-04 07:46:...|     Yes|
|89fbf7f1-0656-44e...|   3905|ProductViewed|2022-11-04 07:46:...|      No|
|89fbf7f1-0656-44e...|   3905|ProductViewed|2022-11-04 07:46:...|      No|
|f9c470b0-152b-477...|   4135|ProductViewed|2022-11-04 07:46:...|     Yes|
|f9c470b0-152b-477...|   4135|ProductViewed|2022-11-04 07:46:...|     Yes|
+--------------------+---

### Transform `event_type` to uppercase using a UDF

In [9]:
# Replace event_type with the result of the my_uppercase helper applied to
# the original column, then preview ten rows of the transformed frame.
event_type_upper = my_uppercase(col("event_type"))
outputDF = jdbcDF.withColumn("event_type", event_type_upper)
outputDF.show(n=10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-------+-------------+--------------------+--------+
|             item_id|user_id|   event_type|          event_time|discount|
+--------------------+-------+-------------+--------------------+--------+
|1def0093-96b2-4cc...|   3156|PRODUCTVIEWED|2022-11-04 07:46:...|      No|
|1def0093-96b2-4cc...|   3156|PRODUCTVIEWED|2022-11-04 07:46:...|      No|
|4df77d59-732e-419...|    332|PRODUCTVIEWED|2022-11-04 07:46:...|     Yes|
|4df77d59-732e-419...|    332|PRODUCTVIEWED|2022-11-04 07:46:...|     Yes|
|31b83eb4-bd8a-4b5...|   3981|PRODUCTVIEWED|2022-11-04 07:46:...|     Yes|
|31b83eb4-bd8a-4b5...|   3981|PRODUCTVIEWED|2022-11-04 07:46:...|     Yes|
|89fbf7f1-0656-44e...|   3905|PRODUCTVIEWED|2022-11-04 07:46:...|      No|
|89fbf7f1-0656-44e...|   3905|PRODUCTVIEWED|2022-11-04 07:46:...|      No|
|f9c470b0-152b-477...|   4135|PRODUCTVIEWED|2022-11-04 07:46:...|     Yes|
|f9c470b0-152b-477...|   4135|PRODUCTVIEWED|2022-11-04 07:46:...|     Yes|
+--------------------+---

### Use a helper function to find the latest `event_time` in the DataFrame

In [10]:
# Ask the helper for the latest value of the bookmark column in the
# transformed frame; it is stored as the new bookmark by the final cell.
bookmark_column = "event_time"
new_bookmark_val = find_latest_event_time(outputDF, bookmark_column, spark)
print(new_bookmark_val)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

2022-11-04 07:46:38.937047

## Save to Redshift

Reference — Amazon EMR guide to the Spark–Redshift connector: https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-redshift.html

In [11]:


# JDBC endpoint of the target Redshift database.
# Note: this rebinds db_url, which previously held the PostgreSQL source URL.
db_url = "jdbc:redshift://lakehouse-redshift-cluster.aaaaaaaaaaa.ap-southeast-1.redshift.amazonaws.com:5439/dev"

# Credentials are taken from the environment when provided so they do not
# have to live in the notebook. The literals are only fallbacks that keep the
# previous behaviour; drop them once the variables are configured for the
# Spark driver.
redshift_user = os.environ.get("REDSHIFT_USER", "awsuser")
redshift_password = os.environ.get("REDSHIFT_PASSWORD", "password")

# Append the transformed rows to ecommerce.customer_activity_ts through the
# community spark-redshift connector, passing it the S3 temp location and the
# IAM role configured below.
(
    outputDF.write
    .format("io.github.spark_redshift_community.spark.redshift")
    .mode("append")
    .option("url", db_url)
    .option("user", redshift_user)
    .option("password", redshift_password)
    .option("dbtable", "ecommerce.customer_activity_ts")
    .option("aws_iam_role", "arn:aws:iam::999999999999:role/LakeHouseRedshiftGlueAccessRole")
    .option("tempdir", "s3://aws-emr-resources-999999999999-ap-southeast-1/tempfolder/")
    .save()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Update bookmark value with latest value

In [12]:
# Persist the new bookmark only when one was actually computed. Without this
# guard str(None) would store the literal text "None", and the next run would
# splice that text into its to_timestamp(...) filter.
# NOTE(review): presumably None can occur when no new rows were read --
# confirm against find_latest_event_time.
if new_bookmark_val is None:
    print("new_bookmark_val is None; bookmark left unchanged")
else:
    update_bookmark_value("ecommerce_customer_activity_ts", str(new_bookmark_val))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…