In [29]:
import os
from pyspark.sql import SparkSession, DataFrame
from datetime import datetime, date
import pytz

# database connection
DB_SERVER=os.environ['PRODUCT1_SERVER']
DB_PORT=os.environ['PRODUCT1_PORT']
DB_USER=os.environ['PRODUCT1_USER']
DB_PASSWORD=os.environ['PRODUCT1_PASSWORD']
DATABASE = "product1"

# jdbc
JDBC_DRIVER = "org.postgresql.Driver"
JDBC_TYPE = "jdbc:postgresql"
JDBC_URL = f"{JDBC_TYPE}://{DB_SERVER}:{DB_PORT}/{DATABASE}"

# datetime format
TIMEZONE = 'Asia/Ho_Chi_Minh'
DATE_FORMAT = '%Y-%m-%d'
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

def current_systime() -> datetime:
    return datetime.now(pytz.timezone(TIMEZONE)).strftime(DATETIME_FORMAT)

def spark_read_jdbc(
        spark: SparkSession,
        query: str,
        driver: str = JDBC_DRIVER,
        url: str = JDBC_URL,
        user: str = DB_USER,
        password: str = DB_PASSWORD
    ) -> DataFrame:
    return (
        spark.read
        .format("jdbc")
        .option("driver", driver)
        .option("url", url)
        .option("user", user)
        .option("password", password)
        .option("query", query)
        .load()
    )

# init spark session

In [1]:
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("product1-capture")
    .config("spark.driver.memory", "1g")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse/default")
    .config("spark.jars", "hdfs://namenode:9000/user/hive/spark_jars/postgresql-42.7.3.jar")
    .config("spark.executor.cores", 3)
    .config("spark.executor.memory", "6g")
    .config("spark.executor.instances", 2)
    .enableHiveSupport()
    .getOrCreate()
)

24/08/17 06:11:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
spark

# create table

In [31]:
table_list = (
    spark_read_jdbc(spark, f"""
        SELECT table_name FROM information_schema.tables
        WHERE table_schema='public'
    """)
    .rdd.flatMap(lambda x: x)
    .collect()
)
print("table_list", table_list)

[Stage 5:>                                                          (0 + 1) / 1]

table_list ['customer', 'service', 'period', 'trans', 'review']


                                                                                

In [32]:
checkpoint_datetime = current_systime()
print("checkpoint_datetime", checkpoint_datetime)

checkpoint_datetime 2024-08-17 14:08:31


In [45]:
for table in table_list:
    df = spark_read_jdbc(spark, f"""
        SELECT * FROM {table}
        WHERE updated_datetime <= '{checkpoint_datetime}'
    """)
    schema = df.dtypes
    print("schema", schema)
    
    spark.sql(f"DROP TABLE IF EXISTS staging.{DATABASE}_{table}")
    create_table_query = f"""
        CREATE EXTERNAL TABLE staging.{DATABASE}_{table} (
            {', '.join([column + ' ' + type for column, type in schema])}
        )
        STORED AS ORC
        LOCATION 'hdfs://namenode:9000/user/hive/warehouse/staging/{DATABASE}_{table}'
    """
    print("create_table_query", create_table_query)
    spark.sql(create_table_query)

    (
        df.write
        .format("orc")
        .mode("append")
        .save(f"hdfs://namenode:9000/user/hive/warehouse/staging/{DATABASE}_{table}")
    )
    
    break

schema [('customer_id', 'bigint'), ('first_name', 'string'), ('last_name', 'string'), ('birth_date', 'date'), ('address', 'string'), ('phone_number', 'string'), ('email', 'string'), ('job_title', 'string'), ('updated_datetime', 'timestamp')]
create_table_query 
        CREATE EXTERNAL TABLE staging.product1_customer (
            customer_id bigint, first_name string, last_name string, birth_date date, address string, phone_number string, email string, job_title string, updated_datetime timestamp
        )
        STORED AS ORC
        LOCATION 'hdfs://namenode:9000/user/hive/warehouse/staging/product1_customer'
    


                                                                                