In [2]:
!pip install apache-airflow

Collecting apache-airflow
  Downloading apache_airflow-3.0.0-py3-none-any.whl.metadata (31 kB)
Collecting apache-airflow-core==3.0.0 (from apache-airflow)
  Downloading apache_airflow_core-3.0.0-py3-none-any.whl.metadata (9.0 kB)
Collecting apache-airflow-task-sdk<1.1.0,>=1.0.0 (from apache-airflow)
  Downloading apache_airflow_task_sdk-1.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting a2wsgi>=1.10.8 (from apache-airflow-core==3.0.0->apache-airflow)
  Downloading a2wsgi-1.10.8-py3-none-any.whl.metadata (3.9 kB)
Collecting aiosqlite>=0.20.0 (from apache-airflow-core==3.0.0->apache-airflow)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting alembic<2.0,>=1.13.1 (from apache-airflow-core==3.0.0->apache-airflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting apache-airflow-providers-common-compat>=1.6.0 (from apache-airflow-core==3.0.0->apache-airflow)
  Downloading apache_airflow_providers_common_compat-1.6.1-py3-none-any.whl.metadat

In [5]:
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator

# Default arguments applied to every task created inside the DAG below.
default_args = {
    'owner': 'petkao',
    # A fixed start date is used on purpose: a value such as datetime.today()
    # is re-evaluated every time the file is parsed, so the start of the first
    # daily interval keeps moving forward and the interval never completes.
    'start_date': datetime(2025, 5, 6),
    'email': ['petkao@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    # Retry a failed task once, five minutes after the failure.
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Define the DAG: a single task that runs the ETL shell script once a day.
with DAG(
    dag_id='ETL_toll_data',
    default_args=default_args,
    description='Apache Airflow Final Assignment',
    schedule='@daily',
    # Do not back-fill the intervals between start_date and now.
    catchup=False,
    tags=['assignment', 'etl'],
) as dag:

    etl_bash_task = BashOperator(
        task_id='run_etl_script',
        # The trailing space is required: when bash_command ends in ".sh",
        # Airflow treats the whole string as the path of a Jinja template file
        # and fails with TemplateNotFound instead of running the command.
        bash_command='bash /path/to/etl_toll_data.sh ',
    )

In [17]:
from datetime import datetime, timedelta

from airflow import DAG
from airflow.operators.bash import BashOperator

# Directory that holds the downloaded archive and every intermediate file.
DATA_DIR = '/home/project/airflow/dags/finalassignment'
# Directory that receives the final, transformed file.
STAGING_DIR = f'{DATA_DIR}/staging'

# Default arguments applied to every task created inside the DAG below.
default_args = {
    'owner': 'petkao',
    # A fixed start date is used on purpose: a value such as datetime.today()
    # is re-evaluated every time the file is parsed, so the start of the first
    # daily interval keeps moving forward and the interval never completes.
    'start_date': datetime(2025, 5, 6),
    'email': ['petkao@gmail.com'],
    'email_on_failure': True,
    'email_on_retry': True,
    # Retry a failed task once, five minutes after the failure.
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# Define the DAG: unzip -> three extractions -> consolidate -> transform.
with DAG(
    dag_id='ETL_toll_data',
    default_args=default_args,
    description='Apache Airflow Final Assignment',
    schedule='@daily',
    # Do not back-fill the intervals between start_date and now.
    catchup=False,
    tags=['assignment', 'etl'],
) as dag:

    # Unpack the toll data archive next to itself.
    unzip_data = BashOperator(
        task_id='unzip_data',
        bash_command=f'tar -xvzf {DATA_DIR}/tolldata.tgz -C {DATA_DIR}/',
    )

    # Keep the first four comma-separated fields of the vehicle CSV.
    extract_data_from_csv = BashOperator(
        task_id='extract_data_from_csv',
        bash_command=(
            f"cut -d',' -f1,2,3,4 {DATA_DIR}/vehicle-data.csv "
            f"> {DATA_DIR}/csv_data.csv"
        ),
    )

    # Keep tab-separated fields 5-7 of the toll plaza file, written out with commas.
    # NOTE(review): if tollplaza-data.tsv uses CRLF line endings, the carriage
    # return would survive into tsv_data.csv and end up mid-row after `paste`;
    # confirm against the real file and strip it if needed.
    extract_data_from_tsv = BashOperator(
        task_id='extract_data_from_tsv',
        bash_command=(
            f"cut -f5,6,7 --output-delimiter=',' {DATA_DIR}/tollplaza-data.tsv "
            f"> {DATA_DIR}/tsv_data.csv"
        ),
    )

    # Take character columns 40-59 of the fixed-width payment file and turn
    # each run of spaces into a single comma.
    extract_data_from_fixed_width = BashOperator(
        task_id='extract_data_from_fixed_width',
        bash_command=(
            f"cut -c40-49,50-59 {DATA_DIR}/payment-data.txt | "
            # Raw string so the backslashes reach sed unchanged and Python does
            # not warn about an invalid escape sequence.
            r"sed 's/ \{1,\}/,/g'"
            f" > {DATA_DIR}/fixed_width_data.csv"
        ),
    )

    # Join the three extracts line by line into one comma-separated file.
    consolidate_data = BashOperator(
        task_id='consolidate_data',
        bash_command=(
            f"paste -d',' {DATA_DIR}/csv_data.csv {DATA_DIR}/tsv_data.csv "
            f"{DATA_DIR}/fixed_width_data.csv > {DATA_DIR}/extracted_data.csv"
        ),
    )

    # Upper-case the fourth field of every row and write the result to staging.
    transform_data = BashOperator(
        task_id='transform_data',
        bash_command=(
            # Create the staging directory first; the output redirect fails if
            # it does not exist yet.
            f"mkdir -p {STAGING_DIR} && "
            """awk -F',' 'BEGIN {OFS=","} { $4=toupper($4); print }' """
            f"{DATA_DIR}/extracted_data.csv > {STAGING_DIR}/transformed_data.csv"
        ),
    )

    # Single declaration of the task order (the extractions run one after another).
    (
        unzip_data
        >> extract_data_from_csv
        >> extract_data_from_tsv
        >> extract_data_from_fixed_width
        >> consolidate_data
        >> transform_data
    )

