In [2]:
import os
from pyspark.sql import SparkSession, DataFrame
from datetime import datetime, date
import pytz

# Source database connection: every credential is read from the environment,
# so a missing variable fails fast with KeyError when this cell runs.
DB_SERVER = os.environ['PRODUCT1_SERVER']
DB_PORT = os.environ['PRODUCT1_PORT']
DB_USER = os.environ['PRODUCT1_USER']
DB_PASSWORD = os.environ['PRODUCT1_PASSWORD']
DATABASE = "product1"

# JDBC driver class and connection URL assembled from the values above.
JDBC_DRIVER = "org.postgresql.Driver"
JDBC_TYPE = "jdbc:postgresql"
JDBC_URL = f"{JDBC_TYPE}://{DB_SERVER}:{DB_PORT}/{DATABASE}"

# Timezone name and strftime patterns used when formatting timestamps.
TIMEZONE = 'Asia/Ho_Chi_Minh'
DATE_FORMAT = '%Y-%m-%d'
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

def current_systime() -> str:
    """Return the current time in ``TIMEZONE`` as a formatted string.

    Returns:
        str: "now" in the ``TIMEZONE`` zone, rendered with
        ``DATETIME_FORMAT``. The result is ``strftime`` output (text without
        a UTC offset), not a ``datetime`` object.
    """
    now = datetime.now(pytz.timezone(TIMEZONE))
    return now.strftime(DATETIME_FORMAT)

def spark_read_jdbc(
        spark: SparkSession,
        query: str,
        driver: str = JDBC_DRIVER,
        url: str = JDBC_URL,
        user: str = DB_USER,
        password: str = DB_PASSWORD
    ) -> DataFrame:
    """Load the result of a SQL query over JDBC as a Spark DataFrame.

    Args:
        spark: session whose reader is used for the load.
        query: SQL text passed to the JDBC source as its ``query`` option.
        driver: JDBC driver class name.
        url: JDBC connection URL.
        user: database user name.
        password: database password.

    Returns:
        The DataFrame produced by the JDBC reader's ``load()``.
    """
    # Options are applied in the same order as they are listed here.
    jdbc_options = (
        ("driver", driver),
        ("url", url),
        ("user", user),
        ("password", password),
        ("query", query),
    )
    reader = spark.read.format("jdbc")
    for option_name, option_value in jdbc_options:
        reader = reader.option(option_name, option_value)
    return reader.load()

# init spark session

In [3]:
# Settings applied to the session builder, in this order.
spark_settings = [
    ("spark.driver.memory", "1g"),
    ("hive.metastore.uris", "thrift://hive-metastore:9083"),
    ("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse/default"),
    ("spark.jars", "hdfs://namenode:9000/user/hive/spark_jars/postgresql-42.7.3.jar"),
    ("spark.executor.cores", 3),
    ("spark.executor.memory", "6g"),
    ("spark.executor.instances", 2),
]

# Point the builder at the cluster master and name the application.
builder = SparkSession.builder.master("spark://spark-master:7077")
builder = builder.appName("product1-capture")
for setting_key, setting_value in spark_settings:
    builder = builder.config(setting_key, setting_value)

# Hive support is enabled before the session is created (or reused).
spark = builder.enableHiveSupport().getOrCreate()

24/08/18 12:23:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/18 12:23:30 WARN TransportClientFactory: DNS resolution succeed for spark-master/172.18.0.17:7077 took 5006 ms


In [4]:
# Bare expression: as the last line of the cell, the notebook displays the session object.
spark

# create table

In [11]:
# List every table name that information_schema reports for the `public` schema.
tables_df = spark_read_jdbc(spark, """
        SELECT table_name FROM information_schema.tables
        WHERE table_schema='public'
    """)
# The query returns a single column, so take the first field of each collected row.
table_list = [row[0] for row in tables_df.collect()]
print("table_list", table_list)

[Stage 1:>                                                          (0 + 1) / 1]

table_list ['customer', 'service', 'period', 'trans', 'review']


                                                                                

In [12]:
# Capture cut-off: a formatted timestamp string taken once, so every per-table
# query that filters on `updated_datetime <= checkpoint_datetime` uses the same bound.
checkpoint_datetime = current_systime()
print("checkpoint_datetime", checkpoint_datetime)

checkpoint_datetime 2024-08-17 16:58:25


In [13]:
def _quote_identifier(name: str) -> str:
    """Backtick-quote a column name for Spark SQL, doubling embedded backticks."""
    return "`" + name.replace("`", "``") + "`"


# (Re)create an external staging table for each source table, using the
# column names and types Spark infers from the JDBC source.
for table in table_list:
    # Only `df.dtypes` (column name / Spark type pairs) is used from this read.
    df = spark_read_jdbc(spark, f"""
        SELECT * FROM {table}
        WHERE updated_datetime <= '{checkpoint_datetime}'
    """)
    schema = df.dtypes
    print("schema", schema)

    staging_table = f"staging.{DATABASE}_{table}"
    # Quote every column name so reserved words or unusual characters in the
    # source schema cannot break the generated DDL.
    columns_ddl = ', '.join(
        f"{_quote_identifier(column)} {column_type}" for column, column_type in schema
    )

    # NOTE(review): the Parquet files read from this same location elsewhere in
    # the notebook expose nested `before` / `after` / `source` structs rather
    # than these flat JDBC columns -- confirm the DDL matches the files on disk.
    spark.sql(f"DROP TABLE IF EXISTS {staging_table}")
    create_table_query = f"""
        CREATE EXTERNAL TABLE {staging_table} (
            {columns_ddl}
        )
        STORED AS PARQUET
        LOCATION 'hdfs://namenode:9000/user/hive/warehouse/staging/topics/{DATABASE}.public.{table}'
    """
    print("create_table_query", create_table_query)
    spark.sql(create_table_query)

    # Disabled initial load, kept for reference.
    # NOTE(review): it writes ORC to a different path than the table LOCATION above.
    # (
    #     df.write
    #     .format("orc")
    #     .mode("append")
    #     .save(f"hdfs://namenode:9000/user/hive/warehouse/staging/{DATABASE}_{table}")
    # )

    # NOTE(review): stops after the first table; presumably a debugging limit --
    # remove this `break` to process every entry in `table_list`.
    break

schema [('customer_id', 'bigint'), ('first_name', 'string'), ('last_name', 'string'), ('birth_date', 'date'), ('address', 'string'), ('phone_number', 'string'), ('email', 'string'), ('job_title', 'string'), ('updated_datetime', 'timestamp')]


24/08/17 09:58:26 WARN DropTableCommand: org.apache.spark.sql.AnalysisException: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/user/hive/warehouse/staging/product1_customer.
org.apache.spark.sql.AnalysisException: [PATH_NOT_FOUND] Path does not exist: hdfs://namenode:9000/user/hive/warehouse/staging/product1_customer.
	at org.apache.spark.sql.errors.QueryCompilationErrors$.dataPathNotExistError(QueryCompilationErrors.scala:1419)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$4(DataSource.scala:757)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$4$adapted(DataSource.scala:754)
	at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:393)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at sc

create_table_query 
        CREATE EXTERNAL TABLE staging.product1_customer (
            customer_id bigint, first_name string, last_name string, birth_date date, address string, phone_number string, email string, job_title string, updated_datetime timestamp
        )
        STORED AS ORC
        LOCATION 'hdfs://namenode:9000/user/hive/warehouse/staging/topics/product1.public.customer'
    


In [8]:
# Preview the fields nested under the `after` struct of the customer topic files.
customer_topic_path = 'hdfs://namenode:9000/user/hive/warehouse/staging/topics/product1.public.customer'
customer_topic_df = spark.read.parquet(customer_topic_path)
customer_topic_df.select("after.*").show(truncate=False)

+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+----------------+
|customer_id|first_name|last_name|birth_date|address          |phone_number|email             |job_title        |updated_datetime|
+-----------+----------+---------+----------+-----------------+------------+------------------+-----------------+----------------+
|4          |Cuong     |Vo       |11510     |12 ABC, MNO, XYZ |01234       |cuongvo@gmail.com |Data Engineer    |1723911737000000|
|5          |Cuong     |Vo       |11510     |12 ABC, MNO, XYZ |01235       |cuongvo1@gmail.com|Data Engineer    |1723911824000000|
|6          |Cuong     |Vo       |11510     |12 ABC, MNO, XYZ |012356      |cuongvo2@gmail.com|Data Engineer    |1723911838000000|
|1          |Vu        |Tran     |11378     |123 ABC, MNO, XYZ|0865937123  |nitsvutt@gmail.com|Data Engineer    |1723898668000000|
|2          |The       |Duong    |11510     |132 ABC, MNO, XYZ|0865937124  |theduon

In [7]:
# Print the full schema of the Parquet files stored for the customer topic.
customer_topic_path = 'hdfs://namenode:9000/user/hive/warehouse/staging/topics/product1.public.customer'
spark.read.parquet(customer_topic_path).printSchema()

root
 |-- before: struct (nullable = true)
 |    |-- customer_id: long (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |    |-- birth_date: integer (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- phone_number: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- job_title: string (nullable = true)
 |    |-- updated_datetime: long (nullable = true)
 |-- after: struct (nullable = true)
 |    |-- customer_id: long (nullable = true)
 |    |-- first_name: string (nullable = true)
 |    |-- last_name: string (nullable = true)
 |    |-- birth_date: integer (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- phone_number: string (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- job_title: string (nullable = true)
 |    |-- updated_datetime: long (nullable = true)
 |-- source: struct (nullable = true)
 |    |-- version: string (nullable = true)
