# Task C

We will use Apache Airflow to orchestrate our workflow so that task_a.ipynb runs before task_b.ipynb on a daily schedule. The rationale is as follows: we are designing a workflow to build a prediction model for Idealista house prices, where new data is ingested each day. The first task (task_a.ipynb) will be responsible for cleaning the incoming data and removing duplicates, while the second task (task_b.ipynb) will handle training and testing the predictive model.

In [1]:
import os
from airflow import DAG
from airflow.operators.python import PythonOperator
from datetime import datetime, timedelta
import papermill as pm

# Root folder that holds task_a.ipynb, task_b.ipynb and the artifacts/ folder.
# Under Airflow this module is imported by the scheduler/worker process, whose
# working directory may not be the project folder, so an explicit location can
# be supplied through the NOTEBOOK_PROJECT_PATH environment variable. When the
# variable is unset or empty, fall back to the current working directory.
PROJECT_PATH = os.environ.get("NOTEBOOK_PROJECT_PATH") or os.getcwd()

def run_notebook_a():
    """Execute task_a.ipynb with papermill and keep the executed copy.

    The input notebook is read from ``PROJECT_PATH`` and the executed notebook
    is written to ``artifacts/task_a_out.ipynb`` under the same root.
    Any exception raised by papermill propagates to the caller.
    """
    output_dir = f"{PROJECT_PATH}/artifacts"
    # papermill does not create a missing local output folder, so make sure
    # it exists before the run instead of failing on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    pm.execute_notebook(
        f"{PROJECT_PATH}/task_a.ipynb",
        f"{output_dir}/task_a_out.ipynb",  # executed output notebook
    )

def run_notebook_b():
    """Execute task_b.ipynb with papermill and keep the executed copy.

    The input notebook is read from ``PROJECT_PATH`` and the executed notebook
    is written to ``artifacts/task_b_out.ipynb`` under the same root.
    Any exception raised by papermill propagates to the caller.
    """
    output_dir = f"{PROJECT_PATH}/artifacts"
    # papermill does not create a missing local output folder, so make sure
    # it exists before the run instead of failing on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    pm.execute_notebook(
        f"{PROJECT_PATH}/task_b.ipynb",
        f"{output_dir}/task_b_out.ipynb",  # executed output notebook
    )

# Settings handed to the DAG as ``default_args``: one retry per task after a
# five-minute wait, with no dependency on the previous run.
default_args = dict(
    owner='airflow',
    depends_on_past=False,
    retries=1,
    retry_delay=timedelta(minutes=5),
)

# Daily two-step pipeline: run_task_a executes first, then run_task_b.
# NOTE(review): newer Airflow releases spell `schedule_interval` as
# `schedule` -- confirm against the Airflow version this is deployed on.
with DAG(
    dag_id='notebook_workflow',
    description='Run task_a then task_b notebooks daily',
    tags=['notebook', 'papermill'],
    default_args=default_args,
    start_date=datetime(2024, 1, 1),
    schedule_interval='@daily',
    catchup=False,
) as dag:
    # One PythonOperator per notebook; each wraps the matching runner function.
    task_a = PythonOperator(task_id='run_task_a', python_callable=run_notebook_a)
    task_b = PythonOperator(task_id='run_task_b', python_callable=run_notebook_b)

    # Ordering: task_b is downstream of task_a (same as `task_a >> task_b`).
    task_a.set_downstream(task_b)


ModuleNotFoundError: No module named 'airflow'