In [1]:
import datetime
import yaml
from typing import Optional
from delta import *
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T
import logging
from pathlib import Path
import functools

In [22]:
from config import Config
from decorator_factory import DecoratorFactory

ImportError: cannot import name 'Config' from 'config' (unknown location)

In [7]:
from extract.raw_data_factory import *

NameError: name 'DecoratorFactory' is not defined

# Config

In [6]:
from typing import Optional
import pendulum
import logging
from pyspark.sql import SparkSession
import yaml
from delta import *
from pathlib import Path

class Config:
    """Central configuration for the logistics pipeline.

    Loads a YAML file and exposes its ``data_lake.raw``, ``data_lake.enriched``,
    ``spark`` and ``logging`` sections as attributes, together with the
    current / previous processing dates and on-demand Spark session and
    logger objects.

    Args:
        config_path: Path of the YAML configuration file. When omitted,
            ``DEFAULT_CONFIG_PATH`` is used (and stored back on
            ``self.config_path``).

    Raises:
        OSError: If the configuration file cannot be opened.
        KeyError / TypeError: If a required section is missing from the file.
    """

    # Fallback location used when the caller does not pass ``config_path``.
    DEFAULT_CONFIG_PATH = '/home/jovyan/work/logistics_project_v2/config/config.yaml'
    # Time zone in which "today" and the look-back date are evaluated.
    TIMEZONE = 'Asia/Ho_Chi_Minh'

    def __init__(self, config_path: Optional[str] = None):
        self.config_path = config_path

        # Take a single timestamp so that date/year/month/day always describe
        # the same instant (separate now() calls could straddle midnight).
        now = pendulum.now(tz=self.TIMEZONE)
        self.current_date_details = self._date_details(now)

        # NOTE(review): despite its name this look-back is 10 days, not 1;
        # confirm whether that is intentional before relying on it.
        self.one_day_delta = 10
        self.previous_date_details = self._date_details(
            now.subtract(days=self.one_day_delta)
        )

        self.configs = self._get_configs()

        self.data_lake_configs = self.configs.get('data_lake')
        self.raw_configs = self.data_lake_configs['raw']
        self.enriched_configs = self.data_lake_configs['enriched']
        self.spark_configs = self.configs['spark']
        self.logging_configs = self.configs['logging']

    @staticmethod
    def _date_details(moment) -> dict:
        """Split ``moment`` into the date/year/month/day mapping used by the pipeline."""
        return {
            'date': moment
            , 'year': moment.year
            , 'month': moment.month
            , 'day': moment.day
        }

    @property
    def spark(self):
        """Return a Delta-enabled SparkSession built from the ``spark`` config section.

        ``getOrCreate`` is used, so an already running session is reused.
        """
        spark_config = self.spark_configs.get('config', {})
        spark_packages = self.spark_configs.get('packages')
        spark_app_name = self.spark_configs.get('app_name', 'temp_name')

        builder = (
            SparkSession
            .builder
            .config(map = spark_config)
            .appName(spark_app_name)
        )
        spark = configure_spark_with_delta_pip(
            spark_session_builder = builder
            , extra_packages = spark_packages
        ).getOrCreate()

        return spark

    @property
    def logger(self):
        """Return this module's logger, configuring root logging on first use.

        Log records go to ``<base_path>/<year>/<month>/<yyyymmdd>.log``
        (resolved against the current working directory) and to the console.
        """
        # logging.basicConfig() is a no-op once the root logger has handlers,
        # so only build handlers when they will actually be installed;
        # otherwise every access would open (and leak) another log file handle.
        if not logging.getLogger().handlers:
            logging_level = self.logging_configs.get('level', 'INFO')
            logging_format = self.logging_configs.get('format')
            logging_base_path = self.logging_configs.get('base_path')

            current_year = self.current_date_details['year']
            current_month = self.current_date_details['month']
            current_day = self.current_date_details['day']

            logging_path = f'{logging_base_path}/{current_year}/{current_month:0>2}'
            logging_file_name = f'{current_year}{current_month:0>2}{current_day:0>2}.log'

            logging_path = Path.cwd() / Path(logging_path)
            logging_path.mkdir(exist_ok=True, parents=True)
            logging_file_path = logging_path / logging_file_name

            logging.basicConfig(
                level = logging_level
                , format = logging_format
                , handlers = [
                    logging.FileHandler(logging_file_path)
                    , logging.StreamHandler()
                ]
            )
        return logging.getLogger(__name__)

    def _get_configs(self) -> dict:
        """Read and parse the YAML configuration file.

        Returns:
            The parsed configuration mapping.
        """
        if not self.config_path:
            self.config_path = self.DEFAULT_CONFIG_PATH
        # Errors from open()/safe_load() propagate unchanged to the caller.
        with open(self.config_path) as file:
            return yaml.safe_load(file)


# Decorator

In [8]:
import functools
from pyspark.sql import functions as F
from pyspark.sql import types as T

class DecoratorFactory:
    """Logging decorators for the pipeline's extract / transform / load methods.

    Every decorator wraps a method of an object that exposes a ``logger``
    attribute. Failures are logged and then re-raised unchanged.
    """

    @staticmethod
    def decorate_get_raw_data(func):
        """Log start, success and failure of a ``(self, raw_name, raw_data_name)`` reader."""
        @functools.wraps(func)
        def wrapper(self, raw_name, raw_data_name):
            self.logger.info(f'Starting to fetch raw {raw_name} data from HDFS.')
            self.logger.info(f"DataFrame: {raw_name}")

            try:
                self.logger.info(f'Executing function: {func.__name__}')
                fetched = func(self, raw_name, raw_data_name)
                self.logger.info(f'Successfully fetched raw {raw_name} data.')
            except Exception as exc:
                for message in (
                    f"Error while fetching {raw_name} from raw layer.",
                    f"DataFrame: {raw_name}",
                    f"Error: {exc!s}",
                ):
                    self.logger.error(message)
                raise
            return fetched

        return wrapper

    @staticmethod
    def decorate_select_columns(func):
        """Log the column selection step of a ``(self, raw_df)`` method and any failure."""
        @functools.wraps(func)
        def wrapper(self, raw_df):
            try:
                self.logger.info("Selecting columns: ['after.*', 'op', 'source.ts_ms', 'event_timestamp', 'year', 'month', 'day']")
                selected = func(self, raw_df)
            except Exception as exc:
                self.logger.error("Error while selecting columns.")
                self.logger.error(f"Error: {exc!s}")
                raise
            return selected

        return wrapper

    @staticmethod
    def decorate_get_enriched_df(func):
        """Log (and re-raise) any exception raised by a ``(self)`` method."""
        @functools.wraps(func)
        def wrapper(self):
            try:
                enriched = func(self)
            except Exception as exc:
                # The exception object itself is handed to the logger.
                self.logger.error(exc)
                raise
            return enriched

        return wrapper

    @staticmethod
    def decorate_load_enriched_data(func):
        """Log start, success and failure of a ``(self, df, df_name)`` loader.

        The wrapped method's return value is discarded; the wrapper returns ``None``.
        """
        @functools.wraps(func)
        def wrapper(self, df, df_name):
            self.logger.info(f"Starting to load {df_name} to enriched layer.")
            self.logger.info(f"DataFrame: {df_name}")
            try:
                self.logger.info(f'Executing function: {func.__name__}')
                func(self, df, df_name)
            except Exception as exc:
                for message in (
                    f"Error while loading {df_name} to enriched layer.",
                    f"DataFrame: {df_name}",
                    f"Error: {exc!s}",
                ):
                    self.logger.error(message)
                raise
            self.logger.info(f"Successfully loaded {df_name} to enriched layer.")

        return wrapper

# Raw Data Factory

In [157]:
class RawDataFactory:
    """Reads the previous processing date's partitions of each raw source table.

    Args:
        config: A ``Config`` instance. When omitted, a new ``Config()`` is
            created. Passing the ``Config`` class itself is also accepted and
            is instantiated, because the former default was the class object,
            whose ``spark`` / ``logger`` attributes are bare ``property``
            objects rather than usable values.
    """

    def __init__(self, config: Optional[Config] = None):
        if config is None:
            config = Config()
        elif isinstance(config, type):
            # A class (not an instance) was supplied; instantiate it.
            config = config()

        self.app_config = config
        self.spark = self.app_config.spark
        self.logger = self.app_config.logger

        self.raw_configs = self.app_config.raw_configs
        self.raw_base_path = self.raw_configs.get('base_path')
        self.raw_format =  self.raw_configs.get('format')

        self.previous_date_details = self.app_config.previous_date_details
        self.previous_year = self.previous_date_details['year']
        self.previous_month = self.previous_date_details['month']
        self.previous_day = self.previous_date_details['day']

    @DecoratorFactory.decorate_get_raw_data
    def _get_raw_data(self, raw_name: str, raw_data_name: str) -> DataFrame:
        """Load one raw table, restricted to the previous processing date.

        Args:
            raw_name: Short name used in log messages.
            raw_data_name: Directory name of the table under the raw base path.

        Returns:
            Rows whose ``year`` / ``month`` / ``day`` columns equal the
            previous date held by the config.
        """
        raw_df_path = f'{self.raw_base_path}/{raw_data_name}'
        self.logger.info(f"HDFS Path: {raw_df_path}")
        raw_df = (
            self.spark.read
            .format(self.raw_format)
            .option('path', raw_df_path)
            .load()
            .where(
                (F.col('year') == self.previous_year)
                & (F.col('month') == self.previous_month)
                & (F.col('day') == self.previous_day)
            )
        )
        return raw_df

    @property
    def raw_users_df(self):
        """Raw ``Users`` rows for the previous processing date."""
        return self._get_raw_data('users', 'logistics_src.logistics.Users')

    @property
    def raw_drivers_df(self):
        """Raw ``Drivers`` rows for the previous processing date."""
        return self._get_raw_data('drivers', 'logistics_src.logistics.Drivers')

    @property
    def raw_orders_df(self):
        """Raw ``Orders`` rows for the previous processing date."""
        return self._get_raw_data('orders', 'logistics_src.logistics.Orders')

    @property
    def raw_payments_df(self):
        """Raw ``Payments`` rows for the previous processing date."""
        return self._get_raw_data('payments', 'logistics_src.logistics.Payments')

    @property
    def raw_shipments_df(self):
        """Raw ``Shipments`` rows for the previous processing date."""
        return self._get_raw_data('shipments', 'logistics_src.logistics.Shipments')


# Enriched Data Factory

# Dim Date

In [125]:
from config.config import Config
from pyspark.sql import functions as F

class DimDate:
    """Builds the calendar dimension and writes it once to the curated layer."""

    def __init__(self):
        app_config = Config()
        self.app_config = app_config
        self.spark = app_config.spark

        self.curated_configs = app_config.curated_configs
        self.dimensions_base_path = self.curated_configs.get('dimensions_base_path')
        self.dim_date_path = self.dimensions_base_path + '/dim_date'

    @property
    def dim_date(self):
        """One row per day from 2024-01-01 to 2044-01-01 with calendar attributes."""
        calendar_sql = "select explode(sequence(to_date('2024-01-01'), to_date('2044-01-01'), interval 1 days)) as date"
        calendar_df = self.app_config.spark.sql(calendar_sql)

        day = F.col('date')
        attributes = [
            F.date_format(day, 'yyyyMMdd').cast(T.IntegerType()).alias('date_key'),
            day,
            F.year(day).alias('year'),
            F.quarter(day).alias('quarter'),
            F.month(day).alias('month'),
            F.day(day).alias('day'),
            F.dayofweek(day).alias('day_of_week'),
            F.dayofmonth(day).alias('day_of_month'),
            F.dayofyear(day).alias('day_of_year'),
            F.weekofyear(day).alias('week_of_year'),
            # dayofweek values 1 and 7 are flagged as weekend days.
            F.when(F.dayofweek(day).isin(1, 7), F.lit(True)).otherwise(F.lit(False)).alias('is_weekend'),
            F.date_format(day, 'EEEE').alias('week_name'),
            F.date_format(day, 'MMMM').alias('month_name'),
            F.concat(F.lit('Q'), F.quarter(day)).alias('quarter_name'),
        ]
        return calendar_df.select(*attributes)

    def load(self):
        """Write the dimension as a Delta table unless one already exists at the path."""
        if DeltaTable.isDeltaTable(self.spark, self.dim_date_path):
            return
        (
            self.dim_date.write
            .mode('overwrite')
            .format('delta')
            .option('path', self.dim_date_path)
            .save()
        )

dim_date_temp = DimDate()
dim_date_temp.load()

# Dim Locations

In [33]:
from config.config import Config
from pyspark.sql import functions as F
from pyspark.sql.window import Window

class DimLocations:
    """Builds and incrementally loads the locations dimension from enriched orders."""

    def __init__(self):
        self.app_config = Config()
        self.spark = self.app_config.spark
        self.source_df = EnrichedDataFactory(self.app_config).get_source('orders')

        self.curated_configs = self.app_config.curated_configs
        self.dimensions_base_path = self.curated_configs.get('dimensions_base_path')
        self.dim_locations_path = self.dimensions_base_path + '/dim_locations'

    @property
    def dim_locations(self):
        """Distinct pickup and delivery addresses with a 1-based ``location_key``."""
        dim_locations = (
            self.source_df.select(F.col('pickup_address').alias('location'))
            .union(self.source_df.select(F.col('delivery_address').alias('location')))
            .dropDuplicates()
            .select(
                F.row_number().over(Window.orderBy(F.col('location'))).alias('location_key')
                , F.col('location')
            )
        )

        return dim_locations

    def load(self):
        """Create the Delta table, or insert only locations it does not contain yet.

        On an existing table the freshly numbered keys are shifted by the
        current maximum key so inserted rows never collide with stored ones
        (keys stay unique but are not necessarily contiguous).
        """
        is_exists = DeltaTable.isDeltaTable(self.spark, self.dim_locations_path)
        if not is_exists:
            self.dim_locations.write.mode('overwrite').format('delta').option('path', self.dim_locations_path).save()
        else:
            delta_table = DeltaTable.forPath(self.spark, self.dim_locations_path)
            # max() is NULL on an existing-but-empty table; fall back to 0 so
            # the shifted keys do not all become NULL.
            max_location_key = (
                delta_table.toDF()
                .selectExpr('max(location_key) as max_location_key')
                .first()
                .max_location_key
            ) or 0
            new_dim_locations = self.dim_locations.withColumn('location_key', F.col('location_key') + F.lit(max_location_key))

            (
                delta_table.alias('target')
                .merge(
                    source = new_dim_locations.alias('source')
                    , condition = (F.col('source.location') == F.col('target.location'))
                )
                .whenNotMatchedInsertAll()
                .execute()
            )

dim_location_temp = DimLocations()
dim_location_temp.load()

2025-02-02 09:58:13,470 - config.config - INFO - HDFS Path: /enriched/transactional/mysql/logistics/orders


# Dim Users

In [75]:
from config.config import Config
from delta import *
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window

class DimUsers:
    """Builds and loads the users dimension with validity ranges (``is_current``,
    ``effective_from_timestamp``, ``effective_to_timestamp``)."""

    def __init__(self):
        self.app_config = Config()
        self.spark = self.app_config.spark
        self.source_df = EnrichedDataFactory(self.app_config).get_source('users')

        self.curated_configs = self.app_config.curated_configs
        self.dimensions_base_path = self.curated_configs.get('dimensions_base_path')
        self.dim_users_path = self.dimensions_base_path + '/dim_users'

    @property
    def dim_users(self):
        """Source users shaped as current dimension rows.

        Every row is marked current, valid from its ``event_timestamp`` until
        9999-12-31. The ``op`` column is kept so ``load`` can pick out updates.
        """
        dim_users = self.source_df.select(
            F.row_number().over(Window.orderBy(F.col('user_id'))).cast(T.LongType()).alias('user_key')
            , F.col('user_id').cast(T.LongType())
            , F.col('full_name')
            , F.col('email')
            , F.col('password_hash')
            , F.col('phone_number')
            , F.col('address')
            , F.col('role')
            , F.col('created_at')
            , F.lit(True).alias('is_current')
            , F.col('event_timestamp').alias('effective_from_timestamp')
            , F.make_timestamp(F.lit(9999), F.lit(12), F.lit(31), F.lit(0), F.lit(0), F.lit(0)).alias('effective_to_timestamp')
            , F.col('op')
        )
        return dim_users

    def load(self):
        """Create the Delta table, or close out updated users and append the new rows.

        On an existing table, rows with ``op == 'u'`` first expire the matching
        current row; then all source rows are appended with keys shifted past
        the table's current maximum ``user_key``.
        """
        is_exists =  DeltaTable.isDeltaTable(self.spark, self.dim_users_path)
        if not is_exists:
            self.dim_users.drop(F.col('op')).write.mode('overwrite').format('delta').option('path', self.dim_users_path).save()
        else:
            delta_table = DeltaTable.forPath(self.spark, self.dim_users_path)

            update_rows = self.dim_users.where(F.col('op') == 'u').drop(F.col('op'))

            # NOTE(review): if the source holds several 'u' rows for one
            # user_id, more than one source row matches the same target row;
            # confirm the source is deduplicated upstream.
            (
                delta_table.alias('target')
                .merge(
                    source = update_rows.alias('source')
                    , condition = (F.col('source.user_id') == F.col('target.user_id'))
                )
                .whenMatchedUpdate(
                    condition = (F.col('target.is_current'))
                    , set = {
                        'is_current': F.lit(False)
                        , 'effective_to_timestamp': F.col('source.effective_from_timestamp')
                    }
                )
                .execute()
            )

            # max() is NULL on an existing-but-empty table; fall back to 0 so
            # the shifted keys do not all become NULL.
            max_user_key = (
                delta_table.toDF()
                .selectExpr('max(user_key) as max_user_key')
                .first()
                .max_user_key
            ) or 0
            new_dim_users = self.dim_users.withColumn('user_key', F.col('user_key') + F.lit(max_user_key)).drop(F.col('op'))
            new_dim_users.write.mode('append').format('delta').option('path', self.dim_users_path).save()

users_temp = DimUsers()

2025-02-02 06:40:27,822 - config.config - INFO - HDFS Path: /enriched/transactional/mysql/logistics/users


In [78]:
# Run the users dimension load on the DimUsers instance created in the previous cell.
users_temp.load()

# Dim Drivers

In [89]:
from config.config import Config
from delta import *
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window

class DimDrivers:
    """Builds and loads the drivers dimension, enriched with the current user attributes."""

    def __init__(self):
        self.app_config = Config()
        self.spark = self.app_config.spark
        self.source_df = EnrichedDataFactory(self.app_config).get_source('drivers')

        self.curated_configs = self.app_config.curated_configs
        self.dimensions_base_path = self.curated_configs.get('dimensions_base_path')
        self.dim_drivers_path = self.dimensions_base_path + '/dim_drivers'
        self.dim_users_path = self.dimensions_base_path + '/dim_users'

    @property
    def dim_drivers(self):
        """Source drivers joined to their current ``dim_users`` row, shaped as dimension rows.

        Every row is marked current, valid from its ``event_timestamp`` until
        9999-12-31. The ``op`` column is kept so ``load`` can pick out updates.
        """
        dim_users = (
            self.spark.read
            .format('delta')
            .option('path', self.dim_users_path)
            .load()
            .where((F.col('is_current')))
        )

        dim_drivers = (
            self.source_df.alias('drivers')
            .join(dim_users.alias('users'), F.col('drivers.user_id') == F.col('users.user_id'))
            .select(
                F.row_number().over(Window.orderBy(F.col('driver_id'))).cast(T.LongType()).alias('driver_key')
                , F.col('driver_id').cast(T.LongType())
                , F.col('full_name')
                , F.col('email')
                , F.col('password_hash')
                , F.col('phone_number')
                , F.col('address')
                , F.col('vehicle_license_plate')
                , F.col('vehicle_type')
                , F.col('vehicle_year')
                , F.lit(True).alias('is_current')
                , F.col('event_timestamp').alias('effective_from_timestamp')
                , F.make_timestamp(F.lit(9999), F.lit(12), F.lit(31), F.lit(0), F.lit(0), F.lit(0)).alias('effective_to_timestamp')
                , F.col('op')
            )
        )
        return dim_drivers

    def load(self):
        """Create the Delta table, or close out updated drivers and append the new rows.

        On an existing table, rows with ``op == 'u'`` first expire the matching
        current row; then all source rows are appended with keys shifted past
        the table's current maximum ``driver_key``.
        """
        is_exists =  DeltaTable.isDeltaTable(self.spark, self.dim_drivers_path)
        if not is_exists:
            self.dim_drivers.drop(F.col('op')).write.mode('overwrite').format('delta').option('path', self.dim_drivers_path).save()
        else:
            delta_table = DeltaTable.forPath(self.spark, self.dim_drivers_path)

            # 'op' is not part of the stored table, so drop it from the merge source.
            update_rows = self.dim_drivers.where(F.col('op') == 'u').drop(F.col('op'))

            (
                delta_table.alias('target')
                .merge(
                    source = update_rows.alias('source')
                    , condition = (F.col('source.driver_id') == F.col('target.driver_id'))
                )
                .whenMatchedUpdate(
                    condition = (F.col('target.is_current'))
                    , set = {
                        'is_current': F.lit(False)
                        , 'effective_to_timestamp': F.col('source.effective_from_timestamp')
                    }
                )
                .execute()
            )

            # max() is NULL on an existing-but-empty table; fall back to 0 so
            # the shifted keys do not all become NULL.
            max_driver_key = (
                delta_table.toDF()
                .selectExpr('max(driver_key) as max_driver_key')
                .first()
                .max_driver_key
            ) or 0
            # The key shift must be applied to the dimension DataFrame;
            # the loader object itself has no withColumn().
            new_dim_drivers = self.dim_drivers.withColumn('driver_key', F.col('driver_key') + F.lit(max_driver_key)).drop(F.col('op'))
            new_dim_drivers.write.mode('append').format('delta').option('path', self.dim_drivers_path).save()

drivers_temp = DimDrivers()

2025-02-02 07:12:10,908 - config.config - INFO - HDFS Path: /enriched/transactional/mysql/logistics/drivers


# Fact Orders

In [33]:
from config.config import Config
from delta import *
from pyspark.sql import functions as F
from pyspark.sql import types as T

class FactOrders:
    """Loads the orders fact table, one method per order status.

    ``write_processing_orders`` inserts new orders with placeholder values for
    the later stages; the ``write_accepted_orders`` / ``write_in_transit_orders``
    / ``write_delivered_orders`` methods merge each later status into the
    existing row and fill that stage's date/time keys and lag columns.

    Args:
        status: Accepted for backward compatibility; currently unused.
    """

    def __init__(self, status = None):
        self.app_config = Config()
        self.spark = self.app_config.spark
        self.source_df = EnrichedDataFactory(self.app_config).get_source('orders')

        self.curated_configs = self.app_config.curated_configs
        self.facts_base_path = self.curated_configs.get('facts_base_path')
        self.fact_orders_path = self.facts_base_path + '/fact_orders'

        self.dimensions_base_path = self.curated_configs.get('dimensions_base_path')
        self.dim_users_path = self.dimensions_base_path + '/dim_users'
        self.dim_locations_path = self.dimensions_base_path + '/dim_locations'

    @property
    def dim_users(self):
        """``user_id`` -> ``user_key`` lookup restricted to current dimension rows."""
        dim_users = (
            self.spark.read.format('delta').option('path', self.dim_users_path).load()
            .where((F.col('is_current')))
            .select(F.col('user_id'), F.col('user_key'))
        )
        return dim_users

    @property
    def dim_locations(self):
        """The full locations dimension."""
        dim_locations = self.spark.read.format('delta').option('path', self.dim_locations_path).load()
        return dim_locations

    @property
    def joined_orders(self):
        """Source orders left-joined to users and to pickup / delivery locations.

        The result keeps every column of every input, so ``user_id``,
        ``location_key`` and ``location`` appear more than once; address them
        through the ``orders`` / ``dim_users`` / ``pickup_location`` /
        ``delivery_location`` aliases.
        """
        joined_orders = (
            self.source_df.alias('orders')
            .join(self.dim_users.alias('dim_users'), F.col('orders.user_id') == F.col('dim_users.user_id'), 'left')
            .join(self.dim_locations.alias('pickup_location'), F.col('orders.pickup_address') == F.col('pickup_location.location'), 'left')
            .join(self.dim_locations.alias('delivery_location'), F.col('orders.delivery_address') == F.col('delivery_location.location'), 'left')
        )
        return joined_orders

    def _orders_for_status(self, status):
        """Return orders in ``status`` flattened to unambiguous fact column names.

        The merge source must expose ``pick_up_location_key`` and
        ``delivery_location_key``; ``joined_orders`` only has two columns that
        are both called ``location_key``, so they are renamed here.
        """
        return (
            self.joined_orders
            .where(F.col('orders.status') == status)
            .select(
                F.col('orders.order_id').alias('order_id')
                , F.col('dim_users.user_key').alias('user_key')
                , F.col('pickup_location.location_key').alias('pick_up_location_key')
                , F.col('delivery_location.location_key').alias('delivery_location_key')
                , F.col('orders.package_description').alias('package_description')
                , F.col('orders.package_weight').alias('package_weight')
                , F.col('orders.status').alias('status')
                , F.col('orders.event_timestamp').alias('event_timestamp')
            )
        )

    def make_interval(self, date_key, time_key, end_timestamp):
        """Return, as a string column, the interval from (date_key, time_key) to ``end_timestamp``.

        Args:
            date_key: Column (or column name) holding a ``yyyyMMdd`` date key.
            time_key: Column (or column name) holding an ``HH:mm:ss`` time key.
            end_timestamp: Timestamp column marking the end of the interval.
        """
        if isinstance(date_key, str):
            date_key = F.col(date_key)
        if isinstance(time_key, str):
            time_key = F.col(time_key)

        # The separator must be a literal: a bare ' ' passed to concat() is
        # interpreted as a column name. The date key is numeric, so cast it
        # explicitly before concatenating.
        start_timestamp = F.to_timestamp(
            F.concat(date_key.cast(T.StringType()), F.lit(' '), time_key)
            , 'yyyyMMdd HH:mm:ss'
        )
        interval = end_timestamp - start_timestamp

        interval_str = F.make_interval(
            days = F.extract(F.lit('D'), interval)
            , hours = F.extract(F.lit('H'), interval)
            , mins = F.extract(F.lit('m'), interval)
            , secs = F.extract(F.lit('s'), interval)
        ).cast(T.StringType())

        return interval_str

    def _zero_interval(self):
        """Zero-length interval rendered as a string; placeholder for lags not known yet."""
        return F.make_interval(
            days = F.lit(0), hours = F.lit(0), mins = F.lit(0), secs = F.lit(0)
        ).cast(T.StringType())

    def write_processing_orders(self):
        """Insert orders in status ``processing`` with placeholders for later stages.

        The table is created (overwrite) when it does not exist yet, otherwise
        the rows are appended.
        """
        # NOTE(review): the placeholder date keys are integer literals while the
        # merge methods write LongType values into the same columns; confirm the
        # implicit cast is acceptable for the stored schema.
        processing_orders_df = (
            self.joined_orders
            .where(F.col('status') == 'processing')
            .select(
                F.col('order_id')
                , F.col('user_key')
                , F.col('pickup_location.location_key').alias('pick_up_location_key')
                , F.col('delivery_location.location_key').alias('delivery_location_key')
                , F.col('package_description')
                , F.date_format(F.col('created_at'), 'yyyyMMdd').cast(T.LongType()).alias('created_order_date_key')
                , F.date_format(F.col('created_at'), 'HH:mm:ss').alias('created_order_time_key')
                , F.lit(99991231).alias('accepted_date_key')
                , F.lit('00:00:00').alias('accepted_time_key')
                , F.lit(99991231).alias('in_transit_date_key')
                , F.lit('00:00:00').alias('in_transit_time_key')
                , F.lit(99991231).alias('delivered_date_key')
                , F.lit('00:00:00').alias('delivered_time_key')
                , F.date_format(F.col('delivery_time'), 'yyyyMMdd').cast(T.LongType()).alias('delivery_date_key')
                , F.date_format(F.col('delivery_time'), 'HH:mm:ss').alias('delivery_time_key')
                , F.col('package_weight')
                , self._zero_interval().alias('created_to_accepted_lag')
                , self._zero_interval().alias('accepted_to_in_transit_lag')
                , self._zero_interval().alias('in_transit_to_delivered_lag')
                , self._zero_interval().alias('delivered_and_delivery_difference')
                , F.col('status')
            )
        )
        writer = processing_orders_df.write.format('delta').option('path', self.fact_orders_path)

        is_exists = DeltaTable.isDeltaTable(self.spark, self.fact_orders_path)

        if not is_exists:
            writer = writer.mode('overwrite')
        else:
            writer = writer.mode('append')

        writer.save()

    def _merge_status(self, status, stage, lag_updates):
        """Merge orders in ``status`` into the fact table.

        Args:
            status: Order status to select from the source.
            stage: Prefix of the ``<stage>_date_key`` / ``<stage>_time_key``
                columns stamped with the source event timestamp.
            lag_updates: Extra ``column -> expression`` assignments (the lag
                columns of this stage).
        """
        delta_table = DeltaTable.forPath(self.spark, self.fact_orders_path)
        status_orders_df = self._orders_for_status(status)

        updates = {
            'user_key': F.col('source.user_key')
            , 'pick_up_location_key': F.col('source.pick_up_location_key')
            , 'delivery_location_key': F.col('source.delivery_location_key')
            , 'package_description': F.col('source.package_description')
            , f'{stage}_date_key': F.date_format(F.col('source.event_timestamp'), 'yyyyMMdd').cast(T.LongType())
            , f'{stage}_time_key': F.date_format(F.col('source.event_timestamp'), 'HH:mm:ss')
            , 'package_weight': F.col('source.package_weight')
            , 'status': F.col('source.status')
        }
        updates.update(lag_updates)

        # NOTE(review): several source rows with the same order_id and status
        # would match one target row; confirm the source is deduplicated.
        (
            delta_table.alias('target')
            .merge(
                source = status_orders_df.alias('source')
                , condition = (F.col('source.order_id') == F.col('target.order_id'))
            )
            .whenMatchedUpdate(set = updates)
            .execute()
        )

    def write_accepted_orders(self):
        """Merge ``accepted`` orders and compute the created -> accepted lag."""
        self._merge_status(
            'accepted'
            , 'accepted'
            , {
                # The lag starts at the order's creation date AND creation time.
                'created_to_accepted_lag': self.make_interval(
                    F.col('target.created_order_date_key')
                    , F.col('target.created_order_time_key')
                    , F.col('source.event_timestamp')
                )
            }
        )

    def write_in_transit_orders(self):
        """Merge ``in_transit`` orders and compute the accepted -> in-transit lag."""
        self._merge_status(
            'in_transit'
            , 'in_transit'
            , {
                'accepted_to_in_transit_lag': self.make_interval(
                    F.col('target.accepted_date_key')
                    , F.col('target.accepted_time_key')
                    , F.col('source.event_timestamp')
                )
            }
        )

    def write_delivered_orders(self):
        """Merge ``delivered`` orders; compute the in-transit -> delivered lag and the
        difference between the stored delivery date/time keys and the delivered event."""
        self._merge_status(
            'delivered'
            , 'delivered'
            , {
                'in_transit_to_delivered_lag': self.make_interval(
                    F.col('target.in_transit_date_key')
                    , F.col('target.in_transit_time_key')
                    , F.col('source.event_timestamp')
                )
                # Measured from the delivery date AND delivery time keys.
                , 'delivered_and_delivery_difference': self.make_interval(
                    F.col('target.delivery_date_key')
                    , F.col('target.delivery_time_key')
                    , F.col('source.event_timestamp')
                )
            }
        )


fact_orders_temp = FactOrders()
fact_orders_temp.joined_orders.show()

2025-02-03 06:03:55,911 - config.config - INFO - HDFS Path: /enriched/transactional/mysql/logistics/orders


+--------+-------+---------------+----------------+-------------------+--------------+-------------------+----------+-------------------+---+-------------------+-------+--------+------------+---------------+------------+---------------+
|order_id|user_id| pickup_address|delivery_address|package_description|package_weight|      delivery_time|    status|         created_at| op|    event_timestamp|user_id|user_key|location_key|       location|location_key|       location|
+--------+-------+---------------+----------------+-------------------+--------------+-------------------+----------+-------------------+---+-------------------+-------+--------+------------+---------------+------------+---------------+
|     321|     40|  1 Pickup Lane| 624 Delivery Rd|          Furniture|         35.37|2024-12-31 18:56:52|processing|2024-12-23 18:56:52|  r|2025-01-22 12:55:39|     40|      40|           2|  1 Pickup Lane|         617|624 Delivery Rd|
|     322|     41|201 Pickup Lane| 823 Delivery Rd| 

In [42]:
from pathlib import Path
import sys

# Make modules located in the current working directory importable.
sys.path.append(str(Path.cwd()))

In [40]:
# Display the path of a `config` directory that sits next to the current working directory.
Path.cwd().parent / 'config'

PosixPath('/home/jovyan/work/config')

# Test

In [4]:
# Obtain a Spark session through the config held by the FactOrders instance created above.
spark = fact_orders_temp.app_config.spark
from datetime import datetime

In [23]:

# Explicit schema for the sample rows.
# `id` holds plain Python ints, so it is a LongType column (the same type Spark
# inferred before); it was previously declared as an empty StructType, and the
# schema object was never handed to createDataFrame.
schema = T.StructType([
    T.StructField("id", T.LongType(), True),
    T.StructField("start_date", T.TimestampType(), True),
    T.StructField("end_date", T.TimestampType(), True)
])

# Sample data: (id, start, end) with end >= start so the difference is a positive interval.
data = [
    (1, datetime(2024, 2, 1, 10, 0, 0), datetime(2024, 2, 3, 12, 0, 0)),
    (2, datetime(2024, 2, 2, 9, 30, 0), datetime(2024, 2, 2, 11, 35, 0)),
    (3, datetime(2024, 2, 3, 14, 0, 0), datetime(2024, 2, 3, 16, 45, 0))
]

# Create the DataFrame with the declared schema instead of bare column names.
df = spark.createDataFrame(data, schema=schema)

# Show the DataFrame without truncating the timestamp columns.
df.show(truncate=False)

+---+-------------------+-------------------+
|id |start_date         |end_date           |
+---+-------------------+-------------------+
|1  |2024-02-01 10:00:00|2024-02-03 12:00:00|
|2  |2024-02-02 09:30:00|2024-02-02 11:35:00|
|3  |2024-02-03 14:00:00|2024-02-03 16:45:00|
+---+-------------------+-------------------+



In [30]:
# Append the sample DataFrame to a Delta table stored at /test_table.
(
    df.write
    .mode('append')
    .format('delta')
    .option('path', '/test_table')
    .save()
)

In [25]:
# Compute end_date - start_date as `interval`, then rebuild it with make_interval
# from its extracted day / hour / minute / second parts. The `interval` column is
# referenced inside the same select (resolved as a lateral alias, see the output header).
df.select(
    F.col('*'),
    (F.col('end_date') - F.col('start_date')).alias('interval'),
    F.make_interval(
        days=F.extract(F.lit('D'), F.col('interval')),
        hours=F.extract(F.lit('H'), F.col('interval')),
        mins=F.extract(F.lit('m'), F.col('interval')),
        secs=F.extract(F.lit('s'), F.col('interval')),
    ),
).show(n=20, truncate=False)

+---+-------------------+-------------------+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |start_date         |end_date           |interval                           |make_interval(0, 0, 0, extract(D FROM lateralAliasReference(interval)), extract(H FROM lateralAliasReference(interval)), extract(m FROM lateralAliasReference(interval)), extract(s FROM lateralAliasReference(interval)))|
+---+-------------------+-------------------+-----------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1  |2024-02-01 10:00:00|2024-02-03 12:00:00|INTERVAL '2 02:00:00' DAY TO SECOND|2 days 2 hours  

In [75]:
# One-row frame of interval components, used to see how make_interval renders as a string.
df = fact_orders_temp.app_config.spark.createDataFrame(
    [[100, 11, 1, 1, 12, 30, 1.001001]],
    ["year", "month", "week", "day", "hour", "min", "sec"],
)
df.select(
    F.make_interval(days=df['day'], hours=df['hour'], mins=df['min'], secs=df['sec'])
    .alias('r')
    .cast(T.StringType())
    # F.make_dt_interval(df.day, df.hour, df.min, df.sec).alias('datetime')
    # F.make_ym_interval(df.year, df.month).alias('yearmonth')
).show(n=20, truncate=False)
# .write.mode('overwrite').format('delta').option('path', 'temp').save()

+-------------------------------------------+
|r                                          |
+-------------------------------------------+
|1 days 12 hours 30 minutes 1.001001 seconds|
+-------------------------------------------+



In [83]:
# Sample order-creation times, still as plain strings.
data = [("2024-02-01 12:30:01",), ("2024-02-02 14:45:30",)]

# Single column that holds the raw string value.
columns = ["created_order_time"]

# Build the frame and parse the string column into a timestamp in one chain.
df = (
    fact_orders_temp.app_config.spark.createDataFrame(data, columns)
    .withColumn("created_order_time", F.to_timestamp("created_order_time", "yyyy-MM-dd HH:mm:ss"))
)

In [1]:
# Split the timestamp into a yyyyMMdd date key and an HH:mm:ss time key, then
# re-assemble both keys into a timestamp, add 15h 30m 1s, subtract the original
# timestamp and extract the hour part of the resulting difference as `temp`.
df.select(
    F.col('created_order_time'),
    F.date_format(F.col('created_order_time'), 'yyyyMMdd').cast(T.LongType()).alias('created_order_date_key'),
    F.date_format(F.col('created_order_time'), "HH:mm:ss").alias('created_order_time_key'),
    F.extract(
        F.lit('H'),
        F.to_timestamp(
            F.concat(F.col('created_order_date_key'), F.lit(' '), F.col('created_order_time_key')),
            'yyyyMMdd HH:mm:ss',
        )
        + F.make_interval(hours=F.lit(15), mins=F.lit(30), secs=F.lit(1))
        - F.col('created_order_time'),
    ).alias('temp'),
).show(n=20, truncate=False)

NameError: name 'df' is not defined

In [103]:
# Probe: pyspark.sql.types has no `IntervalType` attribute, so this raises AttributeError.
T.IntervalType()

AttributeError: module 'pyspark.sql.types' has no attribute 'IntervalType'

In [97]:
# Build the column expression only (no DataFrame needed); parts not passed show up as 0.
F.make_interval(days = F.lit(20))

Column<'make_interval(0, 0, 0, 20, 0, 0, 0)'>

In [53]:
# Append the current scratch DataFrame to a Delta table stored at /temp.
(
    df.write
    .format('delta')
    .mode('append')
    .option('path', '/temp')
    .save()
)

In [20]:
# Display the repr of the current `enriched_data` object.
# NOTE(review): no module-level assignment of `enriched_data` is visible in this part of the notebook — confirm where it comes from.
enriched_data

<__main__.EnrichedDataFactory at 0x7f7148a108d0>

In [55]:
# Read the `Age` field of the first row.
df.first()['Age']

34

In [43]:
def test(var1, var2):
    print(var1, var2)
test(**{'var1': '123', 'var2': 456})

123 456


In [149]:
class EnrichedDataFactory:
    """Turns the raw tables exposed by ``RawDataFactory`` into enriched DataFrames.

    Each ``enriched_*_df`` property selects the flattened ``after`` payload plus
    change metadata from the matching raw DataFrame and, where the table has
    ISO-8601 string columns, casts them to timestamps.
    """

    def __init__(self, config = None):
        """Wire the factory to an application config.

        Args:
            config: Application configuration instance. When omitted a new
                ``Config()`` is created. The former default was the ``Config``
                class itself, whose instance attributes (e.g. ``enriched_configs``)
                only exist after construction.
        """
        self.app_config = Config() if config is None else config
        self.raw_data = RawDataFactory(self.app_config)
        
        self.logger = self.app_config.logger
        
        # Writer settings for the enriched layer, read from the config mapping.
        self.configs = self.app_config.enriched_configs
        self.base_path = self.configs.get('base_path')
        self.mode = self.configs.get('mode', 'append')
        self.format = self.configs.get('format', 'parquet')
        self.partition_columns = self.configs.get('partition_columns')
        self.compression = self.configs.get('compression')
        self.extra_configs = self.configs.get('configs')
        
    @DecoratorFactory.decorate_select_columns
    def _select_columns(self, raw_df):
        """Project a raw change-event DataFrame onto the columns kept downstream.

        Keeps every field of ``after``, the ``op`` flag, ``source.ts_ms``, an
        ``event_timestamp`` derived from ``source.ts_ms`` (milliseconds divided by
        1000, shifted from UTC to Asia/Ho_Chi_Minh) and the ``year``/``month``/``day``
        columns.
        """
        new_raw_df = raw_df.select(
            F.col('after.*')
            , F.col('op')
            , F.col('source.ts_ms')
            , F.from_utc_timestamp(F.to_timestamp(F.col('source.ts_ms') / 1000), 'Asia/Ho_Chi_Minh').alias('event_timestamp')
            , F.col('year')
            , F.col('month')
            , F.col('day')
        )
        return new_raw_df

    
    def _convert_iso_string_to_timestamp(self, iso_string):
        """Return a column expression parsing ``yyyy-MM-dd'T'HH:mm:ss'Z'`` strings.

        The parsed value is treated as UTC and shifted to Asia/Ho_Chi_Minh.

        Args:
            iso_string: Column (or column name) holding the ISO formatted text.

        Raises:
            Exception: Re-raised after logging if building the expression fails.
        """
        try:
            # Only expression construction happens here; Spark evaluates it later,
            # so malformed values are not detected by this try/except.
            timestamp = F.from_utc_timestamp(F.to_timestamp(iso_string, "yyyy-MM-dd'T'HH:mm:ss'Z'"), 'Asia/Ho_Chi_Minh')
        except Exception as e:
            self.logger.error(f'Error parsing date: {e}')
            raise
        return timestamp
        
    
    @property
    @DecoratorFactory.decorate_get_enriched_df
    def enriched_users_df(self):
        """Enriched users: selected columns with ``created_at`` cast to a timestamp."""
        
        self.logger.info('Starting transformation of raw users data.')
        
        new_users_df = self._select_columns(self.raw_data.raw_users_df)
        cast_users_df = (
            new_users_df
            .withColumn(
                'created_at'
                , self._convert_iso_string_to_timestamp(F.col('created_at'))
            )
        )
        self.logger.info('Completed transformation of raw users data.')
        
        return cast_users_df
        
    @property
    @DecoratorFactory.decorate_get_enriched_df
    def enriched_drivers_df(self):
        """Enriched drivers: selected columns only, no timestamp casts."""
        
        self.logger.info('Starting transformation of raw drivers data.')
        
        new_drivers_df = self._select_columns(self.raw_data.raw_drivers_df)
        
        self.logger.info('Completed transformation of raw drivers data.')
        
        return new_drivers_df
    
    @property
    @DecoratorFactory.decorate_get_enriched_df
    def enriched_orders_df(self):
        """Enriched orders: ``delivery_time`` and ``created_at`` cast to timestamps."""
        
        # Message previously said "drivers" (copy-paste), which mislabelled the orders run in the logs.
        self.logger.info('Starting transformation of raw orders data.')
        
        new_orders_df = self._select_columns(self.raw_data.raw_orders_df)
        cast_orders_df = (
            new_orders_df
            .withColumn(
                'delivery_time'
                , self._convert_iso_string_to_timestamp(F.col('delivery_time'))
            )
            .withColumn(
               'created_at'
                , self._convert_iso_string_to_timestamp(F.col('created_at'))
            )
        )
        self.logger.info('Completed transformation of raw orders data.')
        return cast_orders_df
        
    @property
    @DecoratorFactory.decorate_get_enriched_df
    def enriched_payments_df(self):
        """Enriched payments: ``payment_date`` cast to a timestamp."""
        self.logger.info('Starting transformation of raw payments data.')
        
        new_payments_df = self._select_columns(self.raw_data.raw_payments_df)
        
        cast_payments_df = (
            new_payments_df
            .withColumn(
                'payment_date'
                , self._convert_iso_string_to_timestamp(F.col('payment_date'))
            )
        )
        self.logger.info('Completed transformation of raw payments data.')
        
        return cast_payments_df
        
    @property
    @DecoratorFactory.decorate_get_enriched_df
    def enriched_shipments_df(self):
        """Enriched shipments: ``estimated_delivery_time`` cast to a timestamp."""
        self.logger.info("Starting transformation of raw shipments data.")
        
        new_shipments_df = self._select_columns(self.raw_data.raw_shipments_df)
        cast_shipments_df = (
            new_shipments_df
            .withColumn(
                'estimated_delivery_time'
                , self._convert_iso_string_to_timestamp(F.col('estimated_delivery_time'))
            )
        )
        self.logger.info('Completed transformation of raw shipments data.')
        
        return cast_shipments_df
        
    @property
    def data_lake_loader(self):
        """A new ``DataLakeLoader`` bound to this factory (created on every access)."""
        return DataLakeLoader(self)

In [14]:
from pyspark.sql import DataFrame

from pathlib import Path
import sys

# sys.path entries must be plain strings: the import system ignores Path objects,
# so the directory has to be converted before it is appended.
sys.path.append(str(Path.cwd() / 'decorator'))

from decorator_factory import DecoratorFactory

class EnrichedDataLoader:
    """Writes enriched DataFrames below a base path using configurable writer settings."""

    def __init__(self, base_path, mode = 'append', format = 'parquet', partition_columns = None, compression = None, extra_configs = None, logger = None):
        """Store the writer settings.

        Args:
            base_path: Directory under which each DataFrame gets its own sub-path.
            mode: Save mode passed to ``DataFrameWriter.mode``.
            format: Output format passed to ``DataFrameWriter.format``.
            partition_columns: Column name(s) for ``partitionBy``; skipped when falsy.
            compression: Value for the ``compression`` writer option; skipped when falsy.
            extra_configs: Mapping of additional writer options; skipped when falsy.
            logger: Logger used to report the output path. Defaults to the
                module logger so ``load_enriched_data`` can always log.
        """
        self.base_path = base_path
        self.mode = mode
        self.format = format
        self.partition_columns = partition_columns
        self.compression = compression
        # Was stored as `extra_cnfigs`, so load_enriched_data failed on `self.extra_configs`.
        self.extra_configs = extra_configs
        # load_enriched_data logs through self.logger, which was never assigned before.
        self.logger = logger if logger is not None else logging.getLogger(__name__)

    @DecoratorFactory.decorate_load_enriched_data
    def load_enriched_data(self, df: DataFrame, df_name: str) -> None:
        """Write ``df`` to ``<base_path>/<df_name>`` with the configured settings.

        Args:
            df: DataFrame to persist.
            df_name: Sub-path appended to ``base_path``.
        """
        
        output_path = f'{self.base_path}/{df_name}'
        writer = df.write.mode(self.mode)

        # Each optional setting is applied only when it was configured.
        if self.compression:
            writer = writer.option("compression", self.compression)
        if self.partition_columns:
            writer = writer.partitionBy(self.partition_columns)
        if self.extra_configs:
            writer = writer.options(**self.extra_configs)
        
        writer.format(self.format).save(output_path)
        self.logger.info(f"Path: {output_path}")

    
            

ModuleNotFoundError: No module named 'decorator_factory'

In [156]:
def main() -> None:
    """Build every enriched DataFrame, then write each one through the data lake loader."""
    factory = EnrichedDataFactory(Config())

    # All five DataFrames are resolved (in this order) before the first write starts.
    named_frames = [
        (factory.enriched_users_df, 'users'),
        (factory.enriched_drivers_df, 'drivers'),
        (factory.enriched_orders_df, 'orders'),
        (factory.enriched_payments_df, 'payments'),
        (factory.enriched_shipments_df, 'shipments'),
    ]

    for frame, table_name in named_frames:
        factory.data_lake_loader.load_enriched_data(frame, table_name)


if __name__ == '__main__':
    main()

2025-01-24 09:41:02,412 - __main__ - INFO - Starting transformation of raw users data.
2025-01-24 09:41:02,413 - __main__ - INFO - Starting to fetch raw users data from HDFS.
2025-01-24 09:41:02,414 - __main__ - INFO - DataFrame: users
2025-01-24 09:41:02,414 - __main__ - INFO - Executing function: _get_raw_data
2025-01-24 09:41:02,416 - __main__ - INFO - HDFS Path: /raw/transactional/mysql/logistics/topics/logistics_src.logistics.Users
2025-01-24 09:41:02,626 - __main__ - INFO - Successfully fetched raw users data.
2025-01-24 09:41:02,627 - __main__ - INFO - Selecting columns: ['after.*', 'op', 'source.ts_ms', 'event_timestamp', 'year', 'month', 'day']
2025-01-24 09:41:02,698 - __main__ - INFO - Completed transformation of raw users data.
2025-01-24 09:41:02,699 - __main__ - INFO - Starting transformation of raw drivers data.
2025-01-24 09:41:02,700 - __main__ - INFO - Starting to fetch raw drivers data from HDFS.
2025-01-24 09:41:02,701 - __main__ - INFO - DataFrame: drivers
2025-01-

In [None]:
# Emit a recognisable marker line to check that the logger writes output.
# NOTE(review): assumes a module-level `config` object exists — none is assigned in the visible cells.
config.logger.info('8888888888')

In [None]:
# Load the raw Payments topic directory as parquet for inspection.
temp = (
    config.spark.read
    .format('parquet')
    .option('path', '/raw/transactional/mysql/logistics/topics/logistics_src.logistics.Payments/')
    .load()
)

In [None]:
# Print the schema of the Payments frame loaded into `temp`.
temp.printSchema()

In [None]:
# Display a DecimalType built with no arguments to see its repr.
T.DecimalType()

In [None]:

# Show `after.amount` cast to a string column.
(
    temp
    .select(F.col('after.amount').cast(T.StringType()))
    .show()
)

In [None]:
# One-day timedelta.
# NOTE(review): needs `datetime` to be the module (`import datetime`); the cell that runs
# `from datetime import datetime` rebinds the name to the class, which would break this — confirm run order.
datetime.timedelta(days=1)

In [None]:
# Current local time minus one day.
# NOTE(review): like the cell above, this only works while `datetime` refers to the module, not the class.
datetime.datetime.now() - datetime.timedelta(days=1)

In [None]:
def test(var: Optional[str] = None) -> None:
    """Print the runtime type of ``var``.

    Type hints are not enforced at runtime, so any value is accepted. The
    function only prints and returns ``None``; the return annotation used to
    claim ``str``.

    Args:
        var: Value whose type is printed.
    """
    print(type(var))


# An int is passed on purpose: the Optional[str] hint does not reject it.
test(123)

In [None]:
# Zero-pad month and day to two digits when building a /year/month/day path.
year, month, day = 2024, 1, 1
print(f'/{year}/{month:0>2}/{day:0>2}')

In [None]:
# Load the avro file referenced by `path`.
# NOTE(review): `path` is assigned in the cell that follows this one, so this fails on a top-to-bottom run.
config.spark.read.format('avro').option('path', path).load()

In [None]:
# Absolute location of a single Drivers avro file used by the avro read cell.
# NOTE(review): hard-coded to this machine's home directory; not portable.
path = '/home/jovyan/work/logistics_project_v2/test/test/logistics_src.logistics.Drivers+0+0000000000+0000000019.avro'

In [None]:
# Look up the 'avro.codec' entry of `metadata`.
# NOTE(review): `metadata` is not defined in any visible cell — confirm where it was created.
metadata['avro.codec']

In [None]:
def test1(func):
    def wrappler(*args, **kwargs):
        t = func(*args, **kwargs)
        return t
    return wrappler

@test1
def cai(n):
    return n
cai(123)

In [None]:
# NOTE(review): `n` only appears as the parameter of `cai` in the visible cells; at module
# level this name is undefined unless it was assigned elsewhere — confirm.
id(n)

In [None]:
def check_authorization(f):
    """Method decorator that prints the instance's ``url`` and the argument before calling ``f``."""
    def wrapper(self, var1):
        # One print call, newline-separated: same two output lines as before.
        print(self.url, var1, sep='\n')
        return f(self, var1)
    return wrapper


class Client:
    """Minimal class used to show that a decorator can reach ``self``."""

    def __init__(self, url):
        self.url = url

    @check_authorization
    def get(self, var1):
        print('get')


Client('url 123').get(123)

In [None]:
import functools

In [None]:
# Display the imported functools module object.
functools

In [None]:
def my_decorator(func):
    # Deliberately written without functools.wraps to show that metadata is lost.
    def wrapper(*args, **kwargs):
        """Wrapper function."""
        print("Before the function call")
        outcome = func(*args, **kwargs)
        print("After the function call")
        return outcome
    return wrapper


@my_decorator
def greet(name):
    """Greet a person by name."""
    print(f"Hello, {name}!")


# Prints 'wrapper' and 'Wrapper function.': the original name and docstring are lost.
for attribute in ('__name__', '__doc__'):
    print(getattr(greet, attribute))


In [None]:
from functools import wraps

def my_decorator(func):
    # functools.wraps copies the wrapped function's name and docstring onto the wrapper.
    @wraps(func)
    def wrapper(*args, **kwargs):
        """Wrapper function."""
        print("Before the function call")
        outcome = func(*args, **kwargs)
        print("After the function call")
        return outcome
    return wrapper


@my_decorator
def greet(name):
    """Greet a person by name."""
    print(f"Hello, {name}!")


# Prints 'greet' and 'Greet a person by name.': metadata is preserved.
for attribute in ('__name__', '__doc__'):
    print(getattr(greet, attribute))


In [None]:
# Set spark.sql.sources.partitionOverwriteMode to "dynamic" on the current session.
config.spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

In [None]:
# Three sample people: (id, name, age).
data = [(1, "Alice", 29), (2, "Bob", 35), (3, "Charlie", 23)]

# Column names for the sample rows.
columns = ["id", "name", "age"]

# Build the frame and stamp every row with the same year / month / day values.
df = (
    config.spark.createDataFrame(data, schema=columns)
    .withColumn('year', F.lit(2025))
    .withColumn('month', F.lit(1))
    .withColumn('day', F.lit(24))
)

In [None]:
# Same frame, but the row with id 3 gets day 23 while the others keep their day.
df1 = df.withColumn('day', F.when(F.col('id') == 3, F.lit(23)).otherwise(F.col('day')))

In [None]:
# Same frame with day set to 23 on every row.
df2 = df.withColumn('day', F.lit(23))

In [None]:
# Print df2 to check the overwritten `day` column.
df2.show()

In [None]:
# Project directory on the notebook host: /home/jovyan/work/logistics_project_v2/
# (kept as a comment; a bare path in a code cell is a SyntaxError when run)

In [None]:
# Extra writer options: write a header row into every csv part file.
temp_config = dict(header=True)

# Overwrite the local csv output, partitioned by year / month / day.
(
    df.write
    .format('csv')
    .mode('overwrite')
    .options(**temp_config)
    .partitionBy(['year', 'month', 'day'])
    .save('file:////home/jovyan/work/logistics_project_v2/df')
)

In [None]:
# NOTE(review): `o` is not assigned in any visible cell; this looks like a stray keystroke — confirm and remove.
o

In [None]:
# Display the repr of the current `df`.
df

In [None]:
# Read the local csv output back and keep only rows where year is 2025.
(
    config.spark.read
    .format('csv')
    .option('path', 'file:///home/jovyan/work/logistics_project_v2/df')
    .load()
    .where(F.col('year') == 2025)
)

In [None]:
# Show the raw Users avro records where year is 2025.
(
    config.spark.read
    .format('avro')
    .option('path', '/raw/transactional/mysql/logistics/topics/logistics_src.logistics.Users')
    .load()
    .where(F.col('year') == 2025)
    .show()
)

In [2]:
from pathlib import Path
import sys

In [6]:
# Joining a multi-segment relative string onto the working directory.
Path.cwd().joinpath('123/123')

PosixPath('/home/jovyan/work/logistics_project_v2/123/123')

In [4]:
# Put the working directory on the import path so the project packages can be imported.
sys.path += [str(Path.cwd())]

In [8]:
from extract.raw_data_factory import RawDataFactory
from transform.enriched_data_factory import EnrichedDataFactory
from load.data_lake_loader import DataLakeLoader

In [8]:
from config.config import *

In [1]:
import raw_to_enriched_pipeline

In [4]:
# Run the imported raw-to-enriched pipeline entry point; the log lines below are its output.
raw_to_enriched_pipeline.main()

2025-02-01 06:54:07,618 - config - INFO - Starting transformation of raw users data.
2025-02-01 06:54:07,619 - config - INFO - Starting to fetch raw users data from HDFS.
2025-02-01 06:54:07,620 - config - INFO - DataFrame: users
2025-02-01 06:54:07,621 - config - INFO - Executing function: _get_raw_data
2025-02-01 06:54:07,621 - config - INFO - HDFS Path: /raw/transactional/mysql/logistics/topics/logistics_src.logistics.Users
2025-02-01 06:54:07,686 - config - INFO - Successfully fetched raw users data.
2025-02-01 06:54:07,687 - config - INFO - Selecting columns: ['after.*', 'op', 'source.ts_ms', 'event_timestamp', 'year', 'month', 'day']
2025-02-01 06:54:07,729 - config - INFO - Completed transformation of raw users data.
2025-02-01 06:54:07,730 - config - INFO - Starting transformation of raw drivers data.
2025-02-01 06:54:07,731 - config - INFO - Starting to fetch raw drivers data from HDFS.
2025-02-01 06:54:07,732 - config - INFO - DataFrame: drivers
2025-02-01 06:54:07,733 - conf

In [1]:
from main import main

In [5]:
import pendulum

In [8]:
# Current Asia/Ho_Chi_Minh time shifted back by ten days.
pendulum.now(tz='Asia/Ho_Chi_Minh').subtract(days=10)

DateTime(2025, 1, 22, 13, 34, 8, 974981, tzinfo=Timezone('Asia/Ho_Chi_Minh'))

In [9]:
# Called without a tz argument: the result shown below is midnight in Etc/UTC,
# not the Asia/Ho_Chi_Minh zone used in the other pendulum calls.
pendulum.yesterday()

DateTime(2025, 1, 31, 0, 0, 0, tzinfo=Timezone('Etc/UTC'))

In [4]:
from delta import *
import pendulum

In [1]:
from transform.enriched_data_factory import EnrichedDataFactory

In [5]:
# Current time in the Asia/Ho_Chi_Minh zone.
pendulum.now(tz='Asia/Ho_Chi_Minh')

DateTime(2025, 2, 1, 13, 42, 53, 834324, tzinfo=Timezone('Asia/Ho_Chi_Minh'))

In [2]:
# Build a factory with no arguments and inspect the previous-date details held by its config.
EnrichedDataFactory().app_config.previous_date_details

{'date': DateTime(2025, 1, 22, 13, 43, 46, 869480, tzinfo=Timezone('Asia/Ho_Chi_Minh')),
 'year': 2025,
 'month': 1,
 'day': 22}

In [6]:
# Build a factory with no arguments and print the raw users DataFrame it reads.
EnrichedDataFactory().raw_data.raw_users_df.show()

2025-02-01 06:46:46,614 - config - INFO - Starting to fetch raw users data from HDFS.
2025-02-01 06:46:46,616 - config - INFO - DataFrame: users
2025-02-01 06:46:46,618 - config - INFO - Executing function: _get_raw_data
2025-02-01 06:46:46,620 - config - INFO - HDFS Path: /raw/transactional/mysql/logistics/topics/logistics_src.logistics.Users
2025-02-01 06:46:46,858 - config - INFO - Successfully fetched raw users data.


+------+--------------------+--------------------+-----------+---+-------------+----------------+-------------------+----+-----+---+
|before|               after|              source|transaction| op|        ts_ms|           ts_us|              ts_ns|year|month|day|
+------+--------------------+--------------------+-----------+---+-------------+----------------+-------------------+----+-----+---+
|  NULL|{121, User_121, u...|{3.0.5.Final, mys...|       NULL|  r|1737525339947|1737525339947311|1737525339947311698|2025|    1| 22|
|  NULL|{122, User_122, u...|{3.0.5.Final, mys...|       NULL|  r|1737525339947|1737525339947497|1737525339947497410|2025|    1| 22|
|  NULL|{123, User_123, u...|{3.0.5.Final, mys...|       NULL|  r|1737525339947|1737525339947660|1737525339947660791|2025|    1| 22|
|  NULL|{124, User_124, u...|{3.0.5.Final, mys...|       NULL|  r|1737525339947|1737525339947822|1737525339947822509|2025|    1| 22|
|  NULL|{125, User_125, u...|{3.0.5.Final, mys...|       NULL|  r|173

In [2]:

def main() -> None:
    """Resolve each enriched DataFrame from a default factory, then load them one by one."""
    factory = EnrichedDataFactory()

    # Every enriched_<name>_df property is read first, in this order; writes start afterwards.
    table_names = ('users', 'drivers', 'orders', 'payments', 'shipments')
    resolved = [(getattr(factory, f'enriched_{name}_df'), name) for name in table_names]

    for frame, name in resolved:
        factory.data_lake_loader.load_enriched_data(frame, name)


if __name__ == '__main__':
    main()

2025-01-26 09:34:00,370 - config - INFO - Starting transformation of raw users data.
2025-01-26 09:34:00,371 - config - INFO - Starting to fetch raw users data from HDFS.
2025-01-26 09:34:00,371 - config - INFO - DataFrame: users
2025-01-26 09:34:00,372 - config - INFO - Executing function: _get_raw_data
2025-01-26 09:34:00,373 - config - INFO - HDFS Path: /raw/transactional/mysql/logistics/topics/logistics_src.logistics.Users
2025-01-26 09:34:04,164 - config - ERROR - Error while fetching users from raw layer.
2025-01-26 09:34:04,166 - config - ERROR - DataFrame: users
2025-01-26 09:34:04,167 - config - ERROR - Error: name 'F' is not defined
2025-01-26 09:34:04,169 - config - ERROR - name 'F' is not defined


NameError: name 'F' is not defined

In [5]:
# Display the function object (and its signature) that `from delta import *` brought into scope.
configure_spark_with_delta_pip

<function delta.pip_utils.configure_spark_with_delta_pip(spark_session_builder: pyspark.sql.session.SparkSession.Builder, extra_packages: Optional[List[str]] = None) -> pyspark.sql.session.SparkSession.Builder>

In [7]:
from workflows import process_enriched_shipments

In [8]:
# Run the shipments workflow entry point; the log lines below are its output.
process_enriched_shipments.main()

2025-02-05 02:09:23,344 - logistics - INFO - Starting transformation of raw shipments data.
2025-02-05 02:09:23,345 - logistics - INFO - Starting to fetch raw shipments data from HDFS.
2025-02-05 02:09:23,346 - logistics - INFO - DataFrame: shipments
2025-02-05 02:09:23,347 - logistics - INFO - Executing function: _extract_raw_data
2025-02-05 02:09:23,348 - logistics - INFO - HDFS Path: /raw/transactional/mysql/logistics/topics/logistics_src.logistics.Shipments
2025-02-05 02:09:23,418 - logistics - INFO - Successfully fetched raw shipments data.
2025-02-05 02:09:23,419 - logistics - INFO - Selecting columns: ['after.*', 'op', 'source.ts_ms', 'event_timestamp', 'year', 'month', 'day']
2025-02-05 02:09:23,473 - logistics - INFO - Completed transformation of raw shipments data.
2025-02-05 02:09:24,005 - logistics - INFO - Path: /enriched/transactional/mysql/logistics/shipments


In [4]:
# Run the orders workflow entry point; the log lines below are its output.
# NOTE(review): no import of `process_enriched_orders` is visible in the surrounding cells — confirm it is imported before this runs.
process_enriched_orders.main()

2025-02-05 02:08:52,326 - logistics - INFO - Starting transformation of raw drivers data.
2025-02-05 02:08:52,327 - logistics - INFO - Starting to fetch raw orders data from HDFS.
2025-02-05 02:08:52,328 - logistics - INFO - DataFrame: orders
2025-02-05 02:08:52,329 - logistics - INFO - Executing function: _extract_raw_data
2025-02-05 02:08:52,330 - logistics - INFO - HDFS Path: /raw/transactional/mysql/logistics/topics/logistics_src.logistics.Orders
2025-02-05 02:08:52,434 - logistics - INFO - Successfully fetched raw orders data.
2025-02-05 02:08:52,435 - logistics - INFO - Selecting columns: ['after.*', 'op', 'source.ts_ms', 'event_timestamp', 'year', 'month', 'day']
2025-02-05 02:08:52,508 - logistics - INFO - Completed transformation of raw orders data.
2025-02-05 02:08:54,100 - logistics - INFO - Path: /enriched/transactional/mysql/logistics/orders
