# **1. Airflow Introduction** 

In [7]:
import pandas as pd

# Load the December 2022 green-taxi trip file from the local data directory.
PARQUET_PATH = 'data/green_tripdata_2022-12.parquet'
df = pd.read_parquet(PARQUET_PATH)

In [8]:
df

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2022-12-01 00:32:58,2022-12-01 00:37:38,N,1.0,166,24,2.0,0.77,5.50,0.5,0.5,1.00,0.0,,0.3,7.80,1.0,1.0,0.0
1,1,2022-12-01 00:26:49,2022-12-01 00:31:07,N,1.0,74,41,1.0,0.60,5.00,0.5,0.5,0.00,0.0,,0.3,6.30,2.0,1.0,0.0
2,1,2022-12-01 00:20:23,2022-12-01 00:49:36,N,1.0,260,17,1.0,0.00,22.20,0.0,0.5,0.00,0.0,,0.3,23.00,1.0,1.0,0.0
3,2,2022-12-01 00:20:23,2022-12-01 00:28:46,N,1.0,80,256,1.0,1.71,8.00,0.5,0.5,0.00,0.0,,0.3,9.30,2.0,1.0,0.0
4,2,2022-12-01 00:09:12,2022-12-01 00:13:39,N,1.0,179,179,1.0,0.62,5.00,0.5,0.5,4.00,0.0,,0.3,10.30,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72434,2,2022-12-31 23:33:00,2022-12-31 23:49:00,,,33,79,,3.53,20.72,0.0,0.0,4.89,0.0,,1.0,29.36,,,
72435,2,2022-12-31 23:29:00,2023-01-01 00:01:00,,,49,197,,8.94,39.01,0.0,0.0,8.00,0.0,,1.0,48.01,,,
72436,2,2022-12-31 23:05:00,2022-12-31 23:12:00,,,54,181,,1.27,12.11,0.0,0.0,2.62,0.0,,1.0,15.73,,,
72437,2,2022-12-31 23:03:00,2022-12-31 23:18:00,,,7,129,,2.23,15.51,0.0,0.0,3.30,0.0,,1.0,19.81,,,


df

from airflow import DAG
from datetime import datetime
import os

from airflow.operators.bash import BashOperator

# Build the DAG object.
# The cron expression "0 6 2 * *" fires at 06:00 on the 2nd day of each month;
# catchup=True asks the scheduler to back-fill runs between start_date and end_date.
# NOTE(review): end_date is midnight on 2022-12-02 while runs are stamped 06:00,
# so the December interval may fall outside the window - confirm this is intended.
dag_settings = dict(
    dag_id="data_ingest_gcs",
    schedule_interval="0 6 2 * *",
    start_date=datetime(2022, 1, 2),
    end_date=datetime(2022, 12, 2),
    catchup=True,
)
workflow = DAG(**dag_settings)

# Variables
# Base directory for the downloaded file; falls back to /opt/airflow/ when the
# AIRFLOW_HOME environment variable is not set.
AIRFLOW_HOME = os.environ.get("AIRFLOW_HOME", "/opt/airflow/")
URL_PREFIX = 'https://d37ci6vzurychx.cloudfront.net/trip-data/'
# Jinja template: the {{ ... }} part is rendered per run by Airflow, giving one
# file name per year-month (e.g. green_tripdata_2022-01.parquet).
FILE = 'green_tripdata_{{ execution_date.strftime(\'%Y-%m\') }}.parquet'
URL = URL_PREFIX + FILE
# Join with os.path.join instead of "+": when AIRFLOW_HOME is exported without a
# trailing slash (e.g. /opt/airflow) plain concatenation would yield
# "/opt/airflowgreen_tripdata_..." and write outside the Airflow home directory.
OUTPUT_FILE = os.path.join(AIRFLOW_HOME, FILE)
# Templated table name for the same year-month (underscore separated).
TABLE_NAME = 'green_taxi_{{ execution_date.strftime(\'%Y_%m\') }}'


# Register the download task on the DAG.
# curl fetches the templated URL and shell redirection writes the payload to OUTPUT_FILE.
download_command = f'curl -sSL {URL} > {OUTPUT_FILE}'
with workflow:
    download_task = BashOperator(task_id="wget", bash_command=download_command)

In [None]:
# [START import modules] 
from airflow import DAG
from datetime import datetime
from google.cloud import storage
from os import getenv

from airflow.operators.bash import BashOperator
from airflow.providers.google.cloud.operators.gcs import GCSCreateBucketOperator, GCSListObjectsOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
# [END import modules] 

# [START Env Variables] 
# Every setting can be overridden through an environment variable of the same
# name unless noted; the second argument of getenv is the fallback value.
AIRFLOW_HOME = getenv("AIRFLOW_HOME", "/opt/airflow/")
URL_PREFIX = 'https://d37ci6vzurychx.cloudfront.net/trip-data'
# Jinja templates: the {{ ... }} part is rendered by Airflow for each run.
FILE_NAME = 'green_tripdata_{{ execution_date.strftime(\'%Y-%m\') }}.parquet'
TABLE_NAME = 'green_taxi_{{ execution_date.strftime(\'%Y_%m\') }}'
# Fixed: a stray "ss" suffix made the URL point at "...parquetss".
URL = f'{URL_PREFIX}/{FILE_NAME}'
OUTPUT_FILE_PATH = getenv('OUTPUT_FILE_PATH', f'{AIRFLOW_HOME}/{FILE_NAME}')

PROJECT_ID = getenv("PROJECT_ID", "de-bootcamp-414215")
# Fixed: read the REGION variable; the previously used name "REGIONAL" is still
# honoured as a fallback so existing environments keep working.
REGION = getenv("REGION", getenv("REGIONAL", "us-east1"))
LOCATION = getenv("LOCATION", "us-east1")

BUCKET_NAME = getenv("BUCKET_NAME", 'taxi-data-414215')
# Destination prefix used as `dst` of the upload task. `dst` is an object path
# *inside* the bucket (the bucket is passed separately), so it must not carry a
# "gs://<bucket>/" prefix. The trailing slash marks it as a folder-like prefix.
GCS_BUCKET_PATH = getenv("GCS_BUCKET", 'taxi_data_2022/')
_BUCKET_URI_PREFIX = f'gs://{BUCKET_NAME}/'
if GCS_BUCKET_PATH.startswith(_BUCKET_URI_PREFIX):
    # Tolerate an override that still uses the full gs:// URI form.
    GCS_BUCKET_PATH = GCS_BUCKET_PATH[len(_BUCKET_URI_PREFIX):]
# [END Env Variables] 

# [START default args] 
# Settings handed to the DAG as `default_args`: owner metadata, e-mail
# notifications switched off, and a single retry.
default_args = dict(
    owner="marcos benicio",
    email=['marcosbenicio@id.uff.br'],
    email_on_failure=False,
    email_on_retry=False,
    retries=1,
)
# [END default args]


# [START DAG] 
# Monthly ingestion DAG: the cron expression "0 6 2 * *" fires at 06:00 on the
# 2nd day of each month, bounded by start_date and end_date.
# NOTE(review): end_date is midnight on 2022-12-02 while runs are stamped 06:00,
# so the December interval may fall outside the window - confirm this is intended.
workflow = DAG(
    dag_id="data_ingest_gcs",
    default_args=default_args,
    tags=['gcs', 'data_ingest'],
    schedule_interval="0 6 2 * *",
    start_date=datetime(2022, 1, 2),
    end_date=datetime(2022, 12, 2),
)
# [END DAG]



# [START Workflow] 
# Pipeline: download the monthly parquet file -> make sure the bucket exists ->
# upload the file to the bucket -> list the bucket contents.
with workflow:

    # Fixed: the command was assembled without spaces ("curl -sSLhttps://...>/path"),
    # so curl never received a valid URL. -f makes curl exit non-zero on HTTP
    # errors, so a failed download fails the task instead of saving an error page.
    download_task = BashOperator(
        task_id="wget_url_data",
        bash_command=f'curl -fsSL {URL} > {OUTPUT_FILE_PATH}',
    )
    create_bucket = GCSCreateBucketOperator(
        task_id="create_bucket",
        bucket_name=BUCKET_NAME,
        storage_class="REGIONAL",
        location=LOCATION,
        project_id=PROJECT_ID,
        labels={"env": "dev", "team": "airflow"},
        gcp_conn_id="gcp",
    )
    # src is the local file written by download_task; dst is the destination
    # inside BUCKET_NAME.
    ingest_data_gcs = LocalFilesystemToGCSOperator(
        task_id="ingest_data_gcs",
        src=OUTPUT_FILE_PATH,
        dst=GCS_BUCKET_PATH,
        bucket=BUCKET_NAME,
        gcp_conn_id="gcp",
    )
    list_bucket_obj = GCSListObjectsOperator(
        task_id="list_bucket_data",
        bucket=BUCKET_NAME,
        gcp_conn_id="gcp",
    )

    # Run the four tasks strictly in sequence.
    download_task >> create_bucket >> ingest_data_gcs >> list_bucket_obj
# [END Workflow]

# **2. Airflow in Docker**

On Linux, the quick-start needs to know your host user id and needs to have group id set to 0. Otherwise the files created in `dags`, `logs` and `plugins` will be created with `root` user ownership. To configure them for the docker-compose:

```bash
    mkdir -p ./dags ./logs ./plugins ./config
    echo -e "AIRFLOW_UID=$(id -u)" > .env
```
We then need to create the `docker-compose.yaml` file. For this we can check the [official documentation](https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html) and download the `docker-compose.yaml` file for the Airflow version we want (2.8.1 here) using the following command:


```bash
    curl -LfO 'https://airflow.apache.org/docs/apache-airflow/2.8.1/docker-compose.yaml'
```

The file downloaded is the following:

```yaml
  x-airflow-common:
    &airflow-common
    image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.8.1}
    # build: .
    environment:
      &airflow-common-env
      AIRFLOW__CORE__EXECUTOR: CeleryExecutor
      AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow
      AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow
      AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
      AIRFLOW__CORE__FERNET_KEY: ''
      AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true'
      AIRFLOW__CORE__LOAD_EXAMPLES: 'true'
      AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session'

      AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'

      _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-}
    volumes:
      - ${AIRFLOW_PROJ_DIR:-.}/dags:/opt/airflow/dags
      - ${AIRFLOW_PROJ_DIR:-.}/logs:/opt/airflow/logs
      - ${AIRFLOW_PROJ_DIR:-.}/config:/opt/airflow/config
      - ${AIRFLOW_PROJ_DIR:-.}/plugins:/opt/airflow/plugins
    user: "${AIRFLOW_UID:-50000}:0"
    depends_on:
      &airflow-common-depends-on
      redis:
        condition: service_healthy
      postgres:
        condition: service_healthy

  services:
    postgres:
      image: postgres:13
      environment:
        POSTGRES_USER: airflow
        POSTGRES_PASSWORD: airflow
        POSTGRES_DB: airflow
      volumes:
        - postgres-db-volume:/var/lib/postgresql/data
      healthcheck:
        test: ["CMD", "pg_isready", "-U", "airflow"]
        interval: 10s
        retries: 5
        start_period: 5s
      restart: always

    redis:
      image: redis:latest
      expose:
        - 6379
      healthcheck:
        test: ["CMD", "redis-cli", "ping"]
        interval: 10s
        timeout: 30s
        retries: 50
        start_period: 30s
      restart: always

    airflow-webserver:
      <<: *airflow-common
      command: webserver
      ports:
        - "8080:8080"
      healthcheck:
        test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
        interval: 30s
        timeout: 10s
        retries: 5
        start_period: 30s
      restart: always
      depends_on:
        <<: *airflow-common-depends-on
        airflow-init:
          condition: service_completed_successfully

    airflow-scheduler:
      <<: *airflow-common
      command: scheduler
      healthcheck:
        test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
        interval: 30s
        timeout: 10s
        retries: 5
        start_period: 30s
      restart: always
      depends_on:
        <<: *airflow-common-depends-on
        airflow-init:
          condition: service_completed_successfully

    airflow-worker:
      <<: *airflow-common
      command: celery worker
      healthcheck:
        test:
          - "CMD-SHELL"
          - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
        interval: 30s
        timeout: 10s
        retries: 5
        start_period: 30s
      environment:
        <<: *airflow-common-env
        DUMB_INIT_SETSID: "0"
      restart: always
      depends_on:
        <<: *airflow-common-depends-on
        airflow-init:
          condition: service_completed_successfully

    airflow-triggerer:
      <<: *airflow-common
      command: triggerer
      healthcheck:
        test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
        interval: 30s
        timeout: 10s
        retries: 5
        start_period: 30s
      restart: always
      depends_on:
        <<: *airflow-common-depends-on
        airflow-init:
          condition: service_completed_successfully

    airflow-init:
      <<: *airflow-common
      entrypoint: /bin/bash
      command:
        - -c
        - |
          if [[ -z "${AIRFLOW_UID}" ]]; then
            echo
            echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
            echo "If you are on Linux, you SHOULD follow the instructions below to set "
            echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
            echo "For other operating systems you can get rid of the warning with manually created .env file:"
            echo "    See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
            echo
          fi
          one_meg=1048576
          mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
          cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
          disk_available=$$(df / | tail -1 | awk '{print $$4}')
          warning_resources="false"
          if (( mem_available < 4000 )) ; then
            echo
            echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
            echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
            echo
            warning_resources="true"
          fi
          if (( cpus_available < 2 )); then
            echo
            echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
            echo "At least 2 CPUs recommended. You have $${cpus_available}"
            echo
            warning_resources="true"
          fi
          if (( disk_available < one_meg * 10 )); then
            echo
            echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
            echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
            echo
            warning_resources="true"
          fi
          if [[ $${warning_resources} == "true" ]]; then
            echo
            echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
            echo "Please follow the instructions to increase amount of resources available:"
            echo "   https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
            echo
          fi
          mkdir -p /sources/logs /sources/dags /sources/plugins
          chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
          exec /entrypoint airflow version
      environment:
        <<: *airflow-common-env
        _AIRFLOW_DB_MIGRATE: 'true'
        _AIRFLOW_WWW_USER_CREATE: 'true'
        _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow}
        _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow}
        _PIP_ADDITIONAL_REQUIREMENTS: ''
      user: "0:0"
      volumes:
        - ${AIRFLOW_PROJ_DIR:-.}:/sources

    airflow-cli:
      <<: *airflow-common
      profiles:
        - debug
      environment:
        <<: *airflow-common-env
        CONNECTION_CHECK_MAX_COUNT: "0"
      command:
        - bash
        - -c
        - airflow

    flower:
      <<: *airflow-common
      command: celery flower
      profiles:
        - flower
      ports:
        - "5555:5555"
      healthcheck:
        test: ["CMD", "curl", "--fail", "http://localhost:5555/"]
        interval: 30s
        timeout: 10s
        retries: 5
        start_period: 30s
      restart: always
      depends_on:
        <<: *airflow-common-depends-on
        airflow-init:
          condition: service_completed_successfully

  volumes:
    postgres-db-volume:
```



```bash
    docker-compose up airflow-init
```

```bash
    docker-compose up
```


