In [None]:
from extract import test

In [None]:
test.temp.parent

In [None]:
from airflow import DAG
from airflow.operators.empty import EmptyOperator
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
import pendulum
import yaml
from pathlib import Path

# Task-level defaults applied to every operator in the DAG:
# owner, alert e-mail address and the retry policy.
default_args = {
    'owner': 'vinh'
    , 'email': 'ltvinh1101@gmail.com'
    , 'email_on_failure': True
    , 'email_on_retry': True
    , 'retries': 2
    , 'retry_delay': pendulum.duration(seconds = 1)
}

# Directory that contains this DAG file. Path(__file__) is the file itself, so
# joining 'config/...' or 'workflows/...' directly onto it produces a path that
# cannot exist; resources have to be resolved against the parent directory.
project_dir = Path(__file__).parent

with DAG(
    dag_id = 'logistics_dag_v1'
    , description = 'This is a dag'
    # Fixed start date: pendulum.now() is re-evaluated on every parse of the file,
    # so the start date keeps moving forward and the '@daily' schedule never sees
    # a completed interval to run.
    , start_date = pendulum.datetime(2025, 1, 1, tz = 'Asia/Ho_Chi_Minh')
    , default_args = default_args
    , catchup = False
    , schedule = '@daily'
    , tags = ['logistics', 'mysql']
) as dag:
    #START
    start = EmptyOperator(task_id = 'start')

    # The Spark package coordinates are read from the project configuration
    # (spark.packages is joined with ',' below, so it is expected to be a list).
    config_path = project_dir / 'config/config.yaml'
    with open(config_path) as file:
        config = yaml.safe_load(file)
    packages = config['spark']['packages']

    enriched_apps = [
        'process_enriched_drivers'
        # , 'process_enriched_drivers'
        # , 'process_enriched_orders'
        # , 'process_enriched_paymentss'
        # , 'process_enriched_shipments'
    ]

    # One spark-submit task per enriched application. The operators are collected
    # so they can be wired between `start` and `end` below.
    enriched_operators = [
        SparkSubmitOperator(
            task_id = app_name
            # `application` is handed to spark-submit as a string, not a Path object.
            , application = str(project_dir / f'workflows/{app_name}.py')
            , packages = ','.join(packages)
        )
        for app_name in enriched_apps
    ]

    #END
    end = EmptyOperator(task_id = 'end')

    # start -> every enriched job -> end
    start >> enriched_operators
    enriched_operators >> end

In [None]:
from workflows import process_enriched_users

In [None]:
from airflow.operators.empty import EmptyOperator

In [None]:
process_enriched_users.main()

In [None]:
from workflows import *

In [None]:
import yaml

In [None]:
    # Load the project configuration from config/config.yaml, resolved against
    # the notebook's current working directory.
    config_path = Path.cwd() / 'config/config.yaml'
    with config_path.open() as file:
        config = yaml.safe_load(file)

In [None]:
config['spark']['packages']

In [None]:
process_enriched_drivers.main()

In [None]:
from pathlib import Path

In [None]:
Path() / '123/123'

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [None]:
spark = SparkSession.builder.appName('test').getOrCreate()

In [None]:
# Preview the enriched users Parquet data on HDFS, ordered by user_id.
(
    spark.read
    .format('parquet')
    .option('path', 'hdfs://hdfs-namenode:9000/enriched/transactional/mysql/logistics/users')
    .load()
    .orderBy(F.col('user_id'))
    .show()
)

In [None]:
spark.stop()

In [None]:
from workflows import process_enriched_users

In [None]:
process_enriched_users.main()

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
# Preview a single day partition (year=2025/month=2/day=3) of the enriched users data.
# No format is set, so the session's default data source is used.
(
    spark.read
    .option('path', '/enriched/transactional/mysql/logistics/users/year=2025/month=2/day=3')
    .load()
    .show()
)

In [None]:
from airflow import DAG
from airflow.utils.task_group import TaskGroup
from airflow.models.baseoperator import chain
from airflow.operators.empty import EmptyOperator
from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
from pathlib import Path
import pendulum

# Locations of the Spark application scripts, one directory per layer.
enriched_path = Path('/opt/spark-apps/logistics_project_v2/workflows/enriched')
curated_path = Path('/opt/spark-apps/logistics_project_v2/workflows/curated')

# Package coordinates passed to every spark-submit task (Delta Lake + Avro).
spark_packages = 'io.delta:delta-spark_2.12:3.3.0,org.apache.spark:spark-avro_2.12:3.5.3'

# Task-level defaults: owner, alert e-mail address and the retry policy.
default_args = {
    'owner': 'vinh'
    , 'email': 'ltvinh1101@gmail.com'
    , 'email_on_failure': True
    , 'email_on_retry': True
    , 'retries': 2
    , 'retry_delay': pendulum.duration(seconds = 1)
}

# Application (script) names per stage; each becomes one spark-submit task
# whose task_id is the script name.
process_enirched_apps = [
    'process_enriched_users'
    , 'process_enriched_drivers'
    , 'process_enriched_orders'
    # NOTE(review): 'paymentss' looks like a typo - confirm the script file name
    # before changing it, since the task_id and the application path derive from it.
    , 'process_enriched_paymentss'
    , 'process_enriched_shipments'
]
process_dim_apps = [
    'process_dim_users'
    , 'process_dim_drivers'
    , 'process_dim_locations'
    , 'process_dim_date'
]
process_fact_apps = [
    'process_fact_processing_orders'
    , 'process_fact_in_transit_orders'
    , 'process_fact_accepted_orders'
    , 'process_fact_delivered_orders'
]


def _spark_task_group(group_id, app_dir, app_names):
    """Create a TaskGroup holding one SparkSubmitOperator per application.

    Must be called inside an active ``with DAG(...)`` block so the group and
    its tasks attach to that DAG.

    Args:
        group_id: id of the TaskGroup.
        app_dir: ``Path`` of the directory containing the ``<app_name>.py`` scripts.
        app_names: iterable of script names (without ``.py``); each is used as task_id.

    Returns:
        The populated TaskGroup.
    """
    with TaskGroup(group_id = group_id) as group:
        for app_name in app_names:
            SparkSubmitOperator(
                task_id = app_name
                , conn_id = 'spark_conn'
                , application = str(app_dir / f'{app_name}.py')
                , packages = spark_packages
            )
    return group


with DAG(
    dag_id = 'logistics_dag_v1'
    , description = 'this is my first dag'
    # Fixed start date: pendulum.now() is re-evaluated on every parse of the file,
    # which gives the DAG a start date that never stops moving.
    , start_date = pendulum.datetime(2025, 1, 1, tz = 'Asia/Ho_Chi_Minh')
    # default_args was defined above but never handed to the DAG, so the retry
    # and e-mail settings were silently ignored.
    , default_args = default_args
    # , schedule = '@daily'
    , schedule = None
    , tags = ['logistics', 'mysql']
) as dag:
    # START
    start = EmptyOperator(task_id = 'start')

    # ENRICHED
    enriched_group = _spark_task_group('enriched_operators_group', enriched_path, process_enirched_apps)

    # CURATED
    # NOTE(review): 'dim_operatprs_group' and 'fact_operators_groups' look misspelled;
    # the ids are kept unchanged because they are part of the existing task ids.
    dim_group = _spark_task_group('dim_operatprs_group', curated_path, process_dim_apps)
    fact_group = _spark_task_group('fact_operators_groups', curated_path, process_fact_apps)

    # END
    end = EmptyOperator(task_id = 'end')

    # start -> enriched -> (dimensions and facts in parallel) -> end
    start >> enriched_group
    enriched_group >> [dim_group, fact_group]
    [dim_group, fact_group] >> end

In [None]:
from airflow.utils.task_group import TaskGroup

In [None]:
from airflow.models.baseoperator import chain

In [None]:
spark-submit --master spark://spark-master:7077 /home/jovyan/work/src/spark_dataframe_app.py

In [None]:
from workflows.enriched import process_enriched_users

In [None]:
process_enriched_users.main()

In [None]:
# Value for spark-submit --packages (not executable Python, kept as a note):
# io.delta:delta-spark_2.12:3.3.0,org.apache.spark:spark-avro_2.12:3.5.3

In [None]:
spark-submit --packages io.delta:delta-spark_2.12:3.3.0,org.apache.spark:spark-avro_2.12:3.5.3 workflows/enriched/process_enriched_users.py

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.getOrCreate()

In [None]:
from workflows.enriched import process_enriched_orders

In [None]:
process_enriched_orders.main()

In [None]:
from workflows.enriched import process_enriched_shipments

In [None]:
process_enriched_shipments.main()

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [None]:
from workflows.curated import process_dim_date

In [None]:
process_dim_date.main()

In [1]:
from workflows.curated import process_dim_users

In [2]:
process_dim_users.main()

2025-02-12 06:53:32,405 - logistics - INFO - HDFS Path: /enriched/transactional/mysql/logistics/users


In [None]:
# Preview the curated dim_users Delta table.
(
    spark.read
    .format('delta')
    .option('path', '/curated/transactional/mysql/logistics/dimensions/dim_users')
    .load()
    .show()
)

In [1]:
from workflows.curated import process_dim_drivers

In [9]:
from workflows.curated import process_dim_locations

In [1]:
from workflows.curated import process_fact_processing_orders

In [2]:
process_fact_processing_orders.main()

2025-02-12 07:11:45,507 - logistics - INFO - HDFS Path: /enriched/transactional/mysql/logistics/orders


In [11]:
# Row count of the curated dim_drivers Delta table.
# The previous orderBy() was dropped: sorting does not change a count, and it
# made the cell depend on `F` being imported.
(
    spark.read
    .format('delta')
    .option('path', '/curated/transactional/mysql/logistics/dimensions/dim_drivers')
    .load()
    .count()
)

80

In [14]:
# Row count of the curated dim_locations Delta table.
(
    spark.read
    .format('delta')
    .option('path', '/curated/transactional/mysql/logistics/dimensions/dim_locations')
    .load()
    .count()
)

1057

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
spark = SparkSession.builder.appName("SimpleDataFrame").getOrCreate()

In [5]:
# Row count of the curated dim_users Delta table.
(
    spark.read
    .format('delta')
    .option('path', '/curated/transactional/mysql/logistics/dimensions/dim_users')
    .load()
    .count()
)

380

In [8]:
# Row count of the curated fact_orders Delta table.
(
    spark.read
    .format('delta')
    .option('path', '/curated/transactional/mysql/logistics/facts/fact_orders')
    .load()
    .count()
)

760

In [10]:
from workflows.curated import process_fact_delivered_orders

In [11]:
process_fact_delivered_orders.main()

2025-02-12 07:17:55,476 - logistics - INFO - HDFS Path: /enriched/transactional/mysql/logistics/orders


In [None]:
url = 'http://livy:8998/batches'

data = {
    'file': '/home/jovyan/work/src/logistics_project_v2/spark_dataframe_app.py'
}
a