In [0]:
%run ./01-config

In [0]:
class Bronze():
    """Streams the raw CSV landing-zone folders into the bronze Delta tables.

    Each public ``insert_*`` / ``load_*`` method starts one Structured
    Streaming query that reads a landing-zone sub-folder through the
    ``cloudFiles`` source and appends it to a table named
    ``<catalog>.<db_name>.<table>``. ``load`` starts all eight queries and,
    in one-shot mode, waits for them to finish.

    The names ``spark``, ``dbutils`` and ``Config`` are not defined in this
    class; they are expected to exist in the notebook session (``Config``
    presumably comes from the ``01-config`` notebook -- confirm there).
    """

    def __init__(self, env):
        """Resolve the storage paths and select the bronze schema.

        Args:
            env: Name used as the catalog for every table this class writes
                to (for example ``"dev"``).
        """
        self.Conf = Config()
        self.landing_zone = self.Conf.base_data_path + "/raw"
        self.checkpoint_base = self.Conf.base_dir_checkpoint + "/checkpoints"
        self.catalog = env
        self.db_name = "ecommerce_db_bz"
        spark.sql(f"USE {self.catalog}.{self.db_name}")

    # ------------------------------ INSERT DATA ------------------------------
    def _start_csv_stream(self, *, source_dir, schema, checkpoint_name,
                          query_name, table_name, pool, once, processing_time):
        """Start one landing-zone -> bronze-table streaming query.

        Shared implementation behind every public ingestion method.

        Args:
            source_dir: Folder name under ``self.landing_zone`` to read.
            schema: DDL schema string applied to the CSV files.
            checkpoint_name: Folder name under ``self.checkpoint_base`` used
                as the query's checkpoint location.
            query_name: Name given to the streaming query.
            table_name: Target table name inside ``catalog.db_name``.
            pool: Value assigned to ``spark.scheduler.pool`` before the
                query is started.
            once: Truthy -> ``availableNow`` trigger; falsy -> a
                ``processingTime`` trigger.
            processing_time: Interval used when ``once`` is falsy.

        Returns:
            The object returned by ``DataStreamWriter.toTable`` (the started
            streaming query). The query runs asynchronously; this method
            does not wait for it.
        """
        # NOTE(review): Auto Loader documents this rate limit under the key
        # "cloudFiles.maxFilesPerTrigger"; confirm the unprefixed key below is
        # honoured by the cloudFiles source. Left unchanged to avoid altering
        # batch sizes.
        reader = (
            spark.readStream
            .format("cloudFiles")
            .schema(schema)
            .option("maxFilesPerTrigger", 1)
            .option("cloudFiles.format", "csv")
            .option("header", "true")
        )
        source_df = reader.load(f"{self.landing_zone}/{source_dir}")

        writer = (
            source_df.writeStream
            .format("delta")
            .option("checkpointLocation", f"{self.checkpoint_base}/{checkpoint_name}")
            .outputMode("append")
            .queryName(query_name)
        )

        # The pool must be set before toTable(), because toTable() is the call
        # that starts the query.
        spark.sparkContext.setLocalProperty("spark.scheduler.pool", pool)

        if once:
            writer = writer.trigger(availableNow=True)
        else:
            writer = writer.trigger(processingTime=processing_time)
        return writer.toTable(f"{self.catalog}.{self.db_name}.{table_name}")

    def insert_order_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/orders`` -> ``orders_bz`` query and return it."""
        schema = "order_id STRING, customer_id STRING, order_status STRING, order_purchase_timestamp TIMESTAMP, order_approved_at TIMESTAMP, order_delivered_carrier_date TIMESTAMP, order_delivered_customer_date TIMESTAMP, order_estimated_delivery_date TIMESTAMP"
        return self._start_csv_stream(
            source_dir="orders",
            schema=schema,
            checkpoint_name="order_bz",
            query_name="order_bz_insert_stream",
            table_name="orders_bz",
            pool="bronze_p1",
            once=once,
            processing_time=processing_time,
        )

    def insert_order_item_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/order_items`` -> ``order_items_bz`` query and return it."""
        schema = "order_id STRING, order_item_id INT, product_id STRING, seller_id STRING, shipping_limit_date TIMESTAMP, price FLOAT, freight_value FLOAT"
        return self._start_csv_stream(
            source_dir="order_items",
            schema=schema,
            checkpoint_name="order_items_bz",
            query_name="order_items_bz_insert_stream",
            table_name="order_items_bz",
            pool="bronze_p2",
            once=once,
            processing_time=processing_time,
        )

    def load_customer_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/customers`` -> ``customer_bz`` query and return it."""
        schema = """
            customer_id string,
            customer_unique_id string,
            customer_zip_code_prefix int,
            customer_city string,
            customer_state string
        """
        return self._start_csv_stream(
            source_dir="customers",
            schema=schema,
            checkpoint_name="customer_bz",
            query_name="customer_bz_insert_stream",
            table_name="customer_bz",
            pool="bronze_p3",
            once=once,
            processing_time=processing_time,
        )

    def insert_category_translation_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/product_category_name_translation`` ->
        ``category_translation_bz`` query and return it."""
        schema = """
            product_category_name STRING, product_category_name_english STRING
        """
        return self._start_csv_stream(
            source_dir="product_category_name_translation",
            schema=schema,
            checkpoint_name="category_translation_bz",
            query_name="category_translation_bz_insert_stream",
            table_name="category_translation_bz",
            pool="bronze_p4",
            once=once,
            processing_time=processing_time,
        )

    def load_sellers_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/sellers`` -> ``sellers_bz`` query and return it."""
        schema = """
            seller_id string,
            seller_zip_code_prefix int,
            seller_city string,
            seller_state string
        """
        return self._start_csv_stream(
            source_dir="sellers",
            schema=schema,
            checkpoint_name="sellers_bz",
            query_name="sellers_bz_insert_stream",
            table_name="sellers_bz",
            pool="bronze_p5",
            once=once,
            processing_time=processing_time,
        )

    def load_order_payments_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/order_payments`` -> ``order_payment_bz`` query and return it.

        The target table name is singular (``order_payment_bz``) while the
        checkpoint and query names are plural; both are kept as they were so
        the existing table and checkpoint keep being used.
        """
        schema = """
            order_id string,
            payment_sequential tinyint,
            payment_type string,
            payment_installments tinyint,
            payment_value float
        """
        return self._start_csv_stream(
            source_dir="order_payments",
            schema=schema,
            checkpoint_name="order_payments_bz",
            query_name="order_payments_bz_insert_stream",
            table_name="order_payment_bz",
            # Own pool: this previously reused "bronze_p5", the sellers pool.
            pool="bronze_p6",
            once=once,
            processing_time=processing_time,
        )

    def load_order_reviews_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/order_reviews`` -> ``order_reviews_bz`` query and return it."""
        schema = """
            review_id string,
            order_id string,
            review_score tinyint,
            review_comment_title string,
            review_comment_message string,
            review_creation_date timestamp,
            review_answer_timestamp timestamp
        """
        return self._start_csv_stream(
            source_dir="order_reviews",
            schema=schema,
            checkpoint_name="order_reviews_bz",
            query_name="order_reviews_bz_insert_stream",
            table_name="order_reviews_bz",
            # Explicit pool: without it the query ran in whichever pool the
            # previous call had left on the thread.
            pool="bronze_p7",
            once=once,
            processing_time=processing_time,
        )

    def insert_product_bz(self, once=True, processing_time="5 seconds"):
        """Start the ``/products`` -> ``products_bz`` query and return it.

        The ``lenght`` spelling in two column names is kept exactly as it was.
        """
        schema = """
            product_id STRING, product_category_name STRING, product_name_lenght TINYINT, product_description_lenght INT, product_photos_qty TINYINT, product_weight_g INT, product_length_cm TINYINT, product_height_cm TINYINT, product_width_cm TINYINT
        """
        return self._start_csv_stream(
            source_dir="products",
            schema=schema,
            checkpoint_name="product_bz",
            query_name="product_bz_insert_stream",
            table_name="products_bz",
            pool="bronze_p8",
            once=once,
            processing_time=processing_time,
        )

    def load(self, once=True, processing_time="5 seconds"):
        """Start all eight bronze ingestion queries.

        Args:
            once: Truthy -> every query uses the ``availableNow`` trigger and
                this method blocks until each of them terminates. Falsy ->
                the queries keep running and this method returns right after
                starting them.
            processing_time: Trigger interval passed to every query when
                ``once`` is falsy.
        """
        print("\nTiến hành đưa dữ liệu vào Bronze Layer...")
        starters = (
            self.insert_order_bz,
            self.insert_order_item_bz,
            self.load_customer_bz,
            self.insert_category_translation_bz,
            self.load_sellers_bz,
            self.load_order_payments_bz,
            self.insert_product_bz,
            self.load_order_reviews_bz,
        )
        queries = [start(once, processing_time) for start in starters]

        if once:
            # Wait only on the queries started here, not on every active
            # stream in the session.
            for query in queries:
                query.awaitTermination()

        print("Hoàn thành đưa dữ liệu vào Bronze Layer")

    def cleanup(self):
        """Recursively delete the checkpoint directory ``self.checkpoint_base``."""
        print(f"Đang xóa {self.checkpoint_base}...", end='')
        dbutils.fs.rm(self.checkpoint_base, True)
        print("Hoàn thành!")

In [0]:
# Build the bronze loader for the 'dev' environment (the value is used as the
# catalog name) and run every ingestion stream as a one-shot load.
BZ = Bronze('dev')
BZ.load(once=True)