In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("ETL")\
        .config('spark.jars.packages', 'org.postgresql:postgresql:42.6.2') \
        .getOrCreate()
sqlContext = SparkSession(spark)
#Dont Show warning only error
spark.sparkContext.setLogLevel("ERROR")

In [3]:
time_df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://103.245.236.247:5432/e-commerce") \
    .option("driver", "org.postgresql.Driver")\
    .option("user", "traianthai") \
    .option("password", "123123") \
    .option("dbtable", "public.\"order_product\"") \
    .load()


In [4]:
time_df.show()

+---+--------------------+--------------------+---------+--------+----------+-------+
| id|           createdAt|           updatedAt|deletedAt|quantity|productsId|orderId|
+---+--------------------+--------------------+---------+--------+----------+-------+
|108|2024-05-23 14:11:...|2024-05-23 14:11:...|     NULL|       1|        38|   1412|
|109|2024-05-23 14:11:...|2024-05-23 14:11:...|     NULL|       1|        37|   1413|
|110|2024-05-23 14:31:...|2024-05-23 14:31:...|     NULL|       5|        37|   1414|
| 22|2024-04-29 20:47:...|2024-04-29 20:47:...|     NULL|       1|        24|     13|
| 23|2024-04-29 20:47:...|2024-04-29 20:47:...|     NULL|       4|        21|     13|
| 24|2024-04-29 20:52:...|2024-04-29 20:52:...|     NULL|       1|        21|     14|
| 25|2024-04-29 20:52:...|2024-04-29 20:52:...|     NULL|       1|        24|     14|
| 26|2024-04-29 21:02:...|2024-04-29 21:02:...|     NULL|       1|        21|     15|
| 27|2024-04-29 21:02:...|2024-04-29 21:02:...|     NU

In [5]:
from pyspark.sql.functions import col, to_date
# Thực hiện transform để chuyển đổi trường CreateAt sang định dạng yyyy-mm-dd
time_df = time_df.select(to_date(col("createdAt")).alias("date"))

In [6]:
time_df.show()

+----------+
|      date|
+----------+
|2024-05-23|
|2024-05-23|
|2024-05-23|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-29|
|2024-04-30|
|2024-04-30|
|2024-04-30|
|2024-04-30|
|2024-04-30|
|2024-04-30|
|2024-04-30|
|2024-04-30|
+----------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import col, dayofmonth, month, year,when

# Tạo DataFrame mới dim_time từ trường CreateAt
dim_time_df = time_df.withColumn("month", month(col("date"))) \
                   .withColumn("quarter",  when(col("month").between(1, 3), 1)
                                          .when(col("month").between(4, 6), 2)
                                          .when(col("month").between(7, 9), 3)
                                          .otherwise(4)) \
                   .withColumn("year", year(col("date"))
).distinct()

In [8]:
dim_time_df.show()

+----------+-----+-------+----+
|      date|month|quarter|year|
+----------+-----+-------+----+
|2024-05-13|    5|      2|2024|
|2024-05-08|    5|      2|2024|
|2024-05-14|    5|      2|2024|
|2024-04-30|    4|      2|2024|
|2024-06-12|    6|      2|2024|
|2024-05-23|    5|      2|2024|
|2024-04-29|    4|      2|2024|
|2024-05-21|    5|      2|2024|
|2024-05-24|    5|      2|2024|
|2024-05-15|    5|      2|2024|
|2024-05-22|    5|      2|2024|
|2024-05-10|    5|      2|2024|
+----------+-----+-------+----+



In [9]:
from pyspark.sql.functions import monotonically_increasing_id

# Thêm trường time_id bằng cách sử dụng hàm monotonically_increasing_id()
dim_time_df = dim_time_df.withColumn("time_id", monotonically_increasing_id())


In [10]:
dim_time_df.show()

+----------+-----+-------+----+-------+
|      date|month|quarter|year|time_id|
+----------+-----+-------+----+-------+
|2024-05-13|    5|      2|2024|      0|
|2024-05-08|    5|      2|2024|      1|
|2024-05-14|    5|      2|2024|      2|
|2024-04-30|    4|      2|2024|      3|
|2024-06-12|    6|      2|2024|      4|
|2024-05-23|    5|      2|2024|      5|
|2024-04-29|    4|      2|2024|      6|
|2024-05-21|    5|      2|2024|      7|
|2024-05-24|    5|      2|2024|      8|
|2024-05-15|    5|      2|2024|      9|
|2024-05-22|    5|      2|2024|     10|
|2024-05-10|    5|      2|2024|     11|
+----------+-----+-------+----+-------+



In [11]:
# Ghi DataFrame dim_time vào bảng "dim_time" trong schema "public"
dim_time_df.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://103.245.236.247:5432/data_warehouse") \
    .option("dbtable", "public.\"Dim_Time\"") \
    .option("user", "traianthai") \
    .option("password", "123123") \
    .mode("overwrite") \
    .save()


In [12]:
# Đóng SparkSession
spark.stop()