<a href="https://colab.research.google.com/github/orokgospel/monthly_weekend_trip_metrics_dag/blob/main/python_ETL_orchestrator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
pip install requests pandas yagmail

Collecting yagmail
  Downloading yagmail-0.15.293-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting premailer (from yagmail)
  Downloading premailer-3.10.0-py2.py3-none-any.whl.metadata (15 kB)
Collecting cssselect (from premailer->yagmail)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting cssutils (from premailer->yagmail)
  Downloading cssutils-2.11.1-py3-none-any.whl.metadata (8.7 kB)
Downloading yagmail-0.15.293-py2.py3-none-any.whl (17 kB)
Downloading premailer-3.10.0-py2.py3-none-any.whl (19 kB)
Downloading cssselect-1.3.0-py3-none-any.whl (18 kB)
Downloading cssutils-2.11.1-py3-none-any.whl (385 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 385.7/385.7 kB 5.8 MB/s eta 0:00:00
Installing collected packages: cssutils, cssselect, premailer, yagmail
Successfully installed cssselect-1.3.0 cssutils-2.11.1 premailer-3.10.0 yagmail-0.15.293

In [None]:
# @title
import os
import sys
import traceback

import pandas as pd
import requests
import yagmail

# --------------------------------------------------
# CLICKHOUSE CONFIG (Using your HTTP cloud endpoint)
# --------------------------------------------------
# SECURITY(review): the literal fallbacks below are credentials committed to
# source control. Every setting can now be supplied through an environment
# variable of the same name; the literals are kept only so existing runs keep
# working. Rotate the ClickHouse password and drop the fallbacks once the
# environment variables are provisioned.
CLICKHOUSE_URL = os.environ.get(
    "CLICKHOUSE_URL",
    "https://pe40u3vv9u.us-west-2.aws.clickhouse.cloud:8443",
)
CLICKHOUSE_USER = os.environ.get("CLICKHOUSE_USER", "default")
CLICKHOUSE_PASSWORD = os.environ.get("CLICKHOUSE_PASSWORD", "V.eXe7eL6GM_y")

# Table that receives the aggregated report rows.
DESTINATION_TABLE = "weekend_monthly_report"

# --------------------------------------------------
# EMAIL CONFIG
# --------------------------------------------------
EMAIL_USER = os.environ.get("EMAIL_USER", "orokgospel@gmail.com")
# Should be a Gmail App Password; the fallback is only a placeholder.
EMAIL_PASSWORD = os.environ.get("EMAIL_PASSWORD", "my_password")
EMAIL_TO = os.environ.get("EMAIL_TO", "intertswitch@ng.com")


# --------------------------------------------------
# SQL QUERY
# --------------------------------------------------
# Aggregates Saturday (toDayOfWeek = 6) and Sunday (toDayOfWeek = 7) trips per
# calendar month number. Grouping is by toMonth() only, so the same month of
# different years in the date window is pooled into one row.
#
# The upper bound is the half-open `< '2017-01-01'`: for a DateTime column the
# previous `<= '2016-12-31'` compares against midnight and dropped nearly all
# of 2016-12-31, which is a Saturday.
#
# NOTE(review): the *_mean_trip_count columns are plain countIf() totals, not
# means — the names are kept because the destination table schema uses them.
SQL_QUERY = """
SELECT
  toMonth(tpep_pickup_datetime) AS month_num,
  countIf(toDayOfWeek(tpep_pickup_datetime) = 6) AS sat_mean_trip_count,
  avgIf(fare_amount, toDayOfWeek(tpep_pickup_datetime) = 6) AS sat_mean_fare_per_trip,
  avgIf(trip_duration, toDayOfWeek(tpep_pickup_datetime) = 6) AS sat_mean_duration_per_trip,

  countIf(toDayOfWeek(tpep_pickup_datetime) = 7) AS sun_mean_trip_count,
  avgIf(fare_amount, toDayOfWeek(tpep_pickup_datetime) = 7) AS sun_mean_fare_per_trip,
  avgIf(trip_duration, toDayOfWeek(tpep_pickup_datetime) = 7) AS sun_mean_duration_per_trip

FROM tripdata
WHERE tpep_pickup_datetime >= '2014-01-01'
  AND tpep_pickup_datetime < '2017-01-01'
GROUP BY toMonth(tpep_pickup_datetime)
ORDER BY toMonth(tpep_pickup_datetime)
"""


# --------------------------------------------------
# CLICKHOUSE HELPERS
# --------------------------------------------------
def clickhouse_query(sql: str, timeout: float = 300.0) -> str:
    """Send one SQL statement to ClickHouse over HTTP and return the raw body.

    Args:
        sql: The statement to execute; it is sent as the UTF-8 POST body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the job
            on a stalled connection.

    Returns:
        The response body as text.

    Raises:
        requests.exceptions.HTTPError: On a 4xx/5xx response. The message
            includes the response body, since that is where the server's own
            error text is returned.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    response = requests.post(
        CLICKHOUSE_URL,
        data=sql.encode("utf-8"),
        auth=(CLICKHOUSE_USER, CLICKHOUSE_PASSWORD),
        headers={"Content-Type": "text/plain"},
        timeout=timeout,
    )
    try:
        response.raise_for_status()
    except requests.exceptions.HTTPError as err:
        # raise_for_status() reports only the status line; append the body so
        # the failure email/log shows what the server actually complained about.
        raise requests.exceptions.HTTPError(
            f"{err}: {response.text.strip()}", response=response
        ) from err
    return response.text


def clickhouse_query_dataframe(sql: str) -> pd.DataFrame:
    """Run a SELECT and return its result as a DataFrame.

    The query is wrapped as a sub-query and requested in ``CSVWithNames``
    format, then parsed with ``pandas.read_csv``.

    Args:
        sql: A single SELECT statement. Surrounding whitespace and a trailing
            semicolon are tolerated.

    Returns:
        The result set as a DataFrame; an empty DataFrame when the server
        returns an empty body.
    """
    from io import StringIO

    # A trailing ';' inside the parentheses would be a syntax error, so drop it.
    inner = sql.strip().rstrip(";").rstrip()
    # Newlines around the sub-query keep a trailing `-- comment` in `sql` from
    # swallowing the closing parenthesis.
    csv_sql = f"SELECT * FROM (\n{inner}\n) FORMAT CSVWithNames"
    response = clickhouse_query(csv_sql)

    # read_csv raises EmptyDataError on an empty body; return an empty frame so
    # callers can test `df.empty` and report "no data" themselves.
    if not response.strip():
        return pd.DataFrame()
    return pd.read_csv(StringIO(response))


# --------------------------------------------------
# SAFE EMAIL SENDER (never breaks ETL)
# --------------------------------------------------
def safe_send_email(subject: str, body: str) -> None:
    """Send a notification email, never letting a mail failure propagate.

    Any exception raised while connecting or sending is caught and printed as
    a warning so that the calling ETL step is not interrupted by mail issues.

    Args:
        subject: Subject line of the message.
        body: Plain-text message content.
    """
    try:
        # The context manager closes the SMTP connection as soon as the
        # message is sent instead of leaving that to garbage collection.
        with yagmail.SMTP(EMAIL_USER, EMAIL_PASSWORD) as yag:
            yag.send(EMAIL_TO, subject, body)
        print("Email sent successfully.")
    except Exception as e:
        # Deliberate best-effort: email is a side channel, not part of the load.
        print(f"WARNING: Email failed: {e}")
        print("Continuing ETL without email...")


# --------------------------------------------------
# MAIN ETL FUNCTION (CALLABLE)
# --------------------------------------------------
def _validate_report(df: pd.DataFrame) -> None:
    """Raise ValueError if any month_num in the report is outside 1..12."""
    if not df["month_num"].between(1, 12).all():
        raise ValueError("Invalid month numbers detected!")


def _ensure_destination_table() -> None:
    """Create the destination table if it does not exist yet."""
    create_sql = f"""
        CREATE TABLE IF NOT EXISTS {DESTINATION_TABLE} (
            month_num UInt8,
            sat_mean_trip_count UInt32,
            sat_mean_fare_per_trip Float64,
            sat_mean_duration_per_trip Float64,
            sun_mean_trip_count UInt32,
            sun_mean_fare_per_trip Float64,
            sun_mean_duration_per_trip Float64
        ) ENGINE = MergeTree()
        ORDER BY month_num;
        """
    clickhouse_query(create_sql)


def _replace_report_rows(df: pd.DataFrame) -> None:
    """Replace the destination table's contents with the rows of *df*."""
    # The report is a full recomputation over a fixed date window, so each run
    # produces the complete row set. Clearing the table first keeps re-runs
    # from appending duplicate rows for the same months.
    clickhouse_query(f"TRUNCATE TABLE IF EXISTS {DESTINATION_TABLE}")
    insert_sql = (
        f"INSERT INTO {DESTINATION_TABLE} FORMAT CSVWithNames\n"
        + df.to_csv(index=False)
    )
    clickhouse_query(insert_sql)


def run_etl_job():
    """Run the weekend-trip report ETL end to end.

    Steps: run ``SQL_QUERY``, validate the result, make sure the destination
    table exists, replace its contents with the fresh report, and send a
    success email. Progress is printed at every step.

    On any failure the error and traceback are printed and emailed (best
    effort) and the process exits with status 1.

    Raises:
        SystemExit: With code 1 when any step fails.
    """
    try:
        print("\nüîµ Starting ETL job...")

        # ----------------- RUN QUERY -----------------
        print("‚û° Running SQL query...")
        df = clickhouse_query_dataframe(SQL_QUERY)

        if df.empty:
            raise ValueError("Query returned NO DATA!")

        print("‚úî Data fetched successfully.")

        # ----------------- VALIDATION -----------------
        print("‚û° Validating data...")
        _validate_report(df)
        print("‚úî Validation passed.")

        # ------------- CREATE DEST TABLE -------------
        print("‚û° Ensuring destination table exists...")
        _ensure_destination_table()
        print("‚úî Destination table ready.")

        # ----------------- INSERT DATA ----------------
        # Runs only after fetch and validation succeeded, so the table is not
        # cleared when there is nothing valid to put back.
        print("‚û° Inserting report data...")
        _replace_report_rows(df)
        print("‚úî Data inserted successfully.")

        # ----------------- SUCCESS EMAIL --------------
        safe_send_email(
            subject="ETL SUCCESS: Weekend Trip Report",
            body="ETL completed successfully. Data inserted into ClickHouse."
        )

        print("üü¢ ETL JOB COMPLETED SUCCESSFULLY.\n")

    except Exception as e:
        # Top-level boundary: report everything, then fail the process.
        error_details = f"ETL FAILED:\n\n{str(e)}\n\n{traceback.format_exc()}"
        print(error_details)

        safe_send_email(
            subject="‚ùå ETL FAILED",
            body=error_details
        )

        print("üî¥ ETL TERMINATED WITH ERRORS.\n")
        sys.exit(1)


# --------------------------------------------------
# EXECUTE
# --------------------------------------------------
# Run the job only when this file is executed directly, not when it is
# imported for its run_etl_job() function.
if __name__ == "__main__":
    run_etl_job()




üîµ Starting ETL job...
‚û° Running SQL query...
‚úî Data fetched successfully.
‚û° Validating data...
‚úî Validation passed.
‚û° Ensuring destination table exists...
‚úî Destination table ready.
‚û° Inserting report data...
‚úî Data inserted successfully.
Continuing ETL without email...
üü¢ ETL JOB COMPLETED SUCCESSFULLY.

