In [1]:
# Shared BigQuery settings; create_external_table uses them as default argument values.
project_id = "amplified-brook-454012-i1"  # Google Cloud project that owns the dataset and the connection
dataset = "nyc_taxi"  # BigQuery dataset in which the external tables are created
delta_lake_conn = "nyc-biglake-connection"  # connection name placed in the WITH CONNECTION clause



In [2]:
import pandas as pd
import time
import json
from google.cloud import bigquery
from google.cloud.exceptions import GoogleCloudError
from google.oauth2 import service_account
from enum import Enum

class QueryState(Enum):
    """Enum to represent BigQuery job states as tracked by the polling loop.

    PENDING, RUNNING and DONE carry the same strings that are compared against
    ``query_job.state``; FAILED and CANCELED are outcome labels defined here.
    """
    # Strings compared directly against query_job.state
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    DONE = "DONE"
    # Outcome labels assigned by this notebook's own logic
    FAILED = "FAILED"
    CANCELED = "CANCELED"

def get_bigquery_client(**kwargs):
    """
    Create an authenticated BigQuery client for use in Databricks.

    The service-account JSON is read from a Databricks secret, turned into
    credentials, and used to build the client.

    Args:
        **kwargs: Additional arguments:
            - secret_scope (str): Databricks secret scope name (required)
            - secret_key (str): Databricks secret key name (required)
            - project_id (str, optional): Google Cloud project ID. When omitted,
              the ``project_id`` field of the service-account JSON is used, and
              if that is absent too the client picks its own default.

    Returns:
        google.cloud.bigquery.Client: Authenticated BigQuery client

    Raises:
        ValueError: If secret_scope or secret_key is missing or empty
        ImportError: If the Databricks-only pyspark modules cannot be imported
    """
    required = ("secret_scope", "secret_key")
    if any(not kwargs.get(name) for name in required):
        raise ValueError("secret_scope and secret_key are required for databricks_secret authentication")

    # Only the imports are guarded: an ImportError raised later (for example
    # while building credentials) must not be reported as "not in Databricks".
    try:
        from pyspark.dbutils import DBUtils
        from pyspark.sql import SparkSession
    except ImportError as err:
        raise ImportError("This authentication method only works in Databricks") from err

    spark = SparkSession.builder.getOrCreate()
    dbutils = DBUtils(spark)

    # Get service account JSON from Databricks secrets
    json_credentials = dbutils.secrets.get(scope=kwargs["secret_scope"], key=kwargs["secret_key"])
    service_account_info = json.loads(json_credentials)

    credentials = service_account.Credentials.from_service_account_info(
        service_account_info
    )

    # An explicit project wins; otherwise fall back to the one in the key file.
    # None lets the client resolve its default project, exactly as omitting the argument does.
    project_id = kwargs.get("project_id") or service_account_info.get("project_id") or None

    return bigquery.Client(credentials=credentials, project=project_id)

def execute_sql_query(
    client,
    query: str,
    project_id: str = None,
    dataset_id: str = None,
    poll_interval_seconds: int = 5,
    timeout_seconds: int = 300,  # Default timeout of 5 minutes
    location: str = "asia-southeast1",  # Default location
    dry_run: bool = False,  # Option to perform a dry run
    use_legacy_sql: bool = False,  # Use standard SQL by default
    query_parameters: list = None,  # For parameterized queries
    maximum_bytes_billed: int = 10**10,  # Cost control
    labels: dict[str, str] = None,  # Labels attached to the job
):
    """
    Execute a SQL query in BigQuery and return formatted results
    This function will poll until the query is complete or timeout is reached

    Errors are reported through the return value (a string), not by raising.

    Args:
        client: BigQuery client instance
        query: SQL query to execute
        project_id: Optional Google Cloud project ID (defaults to client's project)
        dataset_id: Optional default dataset ID for the query
        poll_interval_seconds: Seconds to wait between status checks (default: 5)
        timeout_seconds: Maximum seconds to wait for query completion (default: 300, set to None for no timeout)
        location: BigQuery location/region (default: "asia-southeast1")
        dry_run: If True, don't actually run the query (just validate)
        use_legacy_sql: If True, use legacy SQL instead of standard SQL
        query_parameters: List of query parameters for parameterized queries (default: none)
        maximum_bytes_billed: Maximum bytes that will be billed for this job
        labels: Labels to attach to the query job (default: none)

    Returns:
        pandas DataFrame with the query results on success; otherwise a string
        describing the dry-run estimate, the timeout, or the error.
    """
    try:
        # Configure the job
        job_config = bigquery.QueryJobConfig(
            use_legacy_sql=use_legacy_sql,
            dry_run=dry_run,
            # Fresh containers per call: nothing is shared between calls or with the caller
            query_parameters=list(query_parameters) if query_parameters else [],
            maximum_bytes_billed=maximum_bytes_billed,
            labels=dict(labels) if labels else {},
        )

        # Set default dataset if provided; it is always qualified with a project,
        # falling back to the client's own project when none was given
        if dataset_id:
            job_config.default_dataset = f"{project_id or client.project}.{dataset_id}"

        # Start the query job
        query_job = client.query(
            query,
            job_config=job_config,
            location=location,
            project=project_id,
        )

        # A dry run completes immediately, so report the estimate without polling
        if dry_run:
            bytes_processed = query_job.total_bytes_processed
            return f"Dry run completed. Query will process {bytes_processed} bytes."

        # Track elapsed time with a clock that is unaffected by wall-clock adjustments
        start_time = time.monotonic()

        # Poll until the query is complete or timeout is reached
        while True:
            # Refresh first so a job that finished during the last sleep is
            # reported as finished rather than as timed out
            query_job.reload()
            elapsed_time = time.monotonic() - start_time

            # Map the job's state string onto QueryState
            job_state = query_job.state
            if job_state == "DONE":
                state = QueryState.FAILED if query_job.error_result else QueryState.DONE
            elif job_state in ("PENDING", "RUNNING"):
                state = QueryState(job_state)
            else:
                # Unrecognised state string: treat as a failure
                state = QueryState.FAILED

            if state is QueryState.DONE:
                print(f"Query execution completed successfully in {elapsed_time:.2f} seconds.")
                break

            if state is QueryState.FAILED:
                # error_result is None when the failure comes from an unrecognised state
                # NOTE(review): a cancelled job is expected to arrive here as DONE
                # with an error_result — confirm against the BigQuery job docs.
                error_result = query_job.error_result or {}
                error_message = error_result.get('message', 'Unknown error')
                error_reason = error_result.get('reason', 'Unknown reason')
                print(f"Query execution failed after {elapsed_time:.2f} seconds: {error_reason} - {error_message}")
                return f"Query execution failed: {error_reason} - {error_message}"

            # Still pending/running: give up once the timeout has passed
            if timeout_seconds is not None and elapsed_time > timeout_seconds:
                timeout_message = f"Query execution timed out after {timeout_seconds} seconds."
                print(timeout_message)
                query_job.cancel()
                return timeout_message

            # Query is still running, wait and check again
            print(f"Query is running... Current state: {state.value}, elapsed time: {elapsed_time:.2f} seconds")
            time.sleep(poll_interval_seconds)

        # Get query statistics
        bytes_processed = query_job.total_bytes_processed
        bytes_billed = query_job.total_bytes_billed
        slot_ms = query_job.slot_millis

        print(f"Query processed {bytes_processed} bytes and billed for {bytes_billed} bytes.")
        print(f"Query used {slot_ms} slot milliseconds.")

        # Get the results
        results = query_job.result()

        # NOTE(review): kept from the original; confirm the row iterator is ever falsy
        if not results:
            return "Query executed successfully, but no results were returned."

        # Convert to DataFrame and return it
        return results.to_dataframe()

    except GoogleCloudError as e:
        return f"Google Cloud Error: {str(e)}"
    except Exception as e:
        # Top-level boundary: callers of this notebook helper expect a message, not a raise
        return f"Error executing SQL query: {str(e)}"



In [3]:
# Build the BigQuery client shared by the cells below; the service-account JSON
# is read from the Databricks secret identified by this scope/key pair.
client = get_bigquery_client(
    secret_scope="databricks-bigquery",  # Your secret scope name
    secret_key="databricks-bq-sa"         # Your secret key name
)



In [4]:
def create_external_table(
    client,
    format: str,
    table_name: str,
    gcs_path: str,
    delta_lake_conn: str = delta_lake_conn,
    project_id: str = project_id,
    dataset: str = dataset,
    location: str = "asia-southeast1",
):
    """
    Create a BigQuery external table over files in Cloud Storage (if it does not exist).

    Args:
        client: BigQuery client; when falsy a new one is created from the
            "databricks-bigquery" / "databricks-bq-sa" Databricks secret
        format: Source format written into OPTIONS (upper-cased), e.g. "PARQUET" or "DELTA_LAKE"
        table_name: Name of the table to create inside `dataset`
        gcs_path: URI placed in the `uris` option
        delta_lake_conn: Connection name used in the WITH CONNECTION clause
        project_id: Project that owns both the dataset and the connection
        dataset: Dataset in which the table is created
        location: Region of the connection; also the location the job is run in

    Returns:
        Whatever execute_sql_query returns: a pandas DataFrame on success or a
        string describing the failure, so callers can detect errors.
    """
    sql = f"""
        CREATE EXTERNAL TABLE IF NOT EXISTS `{project_id}.{dataset}.{table_name}`
        WITH CONNECTION `{project_id}`.`{location}`.`{delta_lake_conn}`
        OPTIONS (
        format = "{format.upper()}",
        uris=['{gcs_path}']
        );
    """

    # Echo the statement so the notebook output records exactly what was run
    print(sql)

    if not client:
        client = get_bigquery_client(
            secret_scope="databricks-bigquery",  # Your secret scope name
            secret_key="databricks-bq-sa"         # Your secret key name
        )

    # Run the job in the same region the connection lives in, and hand the
    # outcome back instead of discarding it
    return execute_sql_query(
        client=client,
        project_id=project_id,
        query=sql,
        poll_interval_seconds=5,
        timeout_seconds=300,
        location=location,
    )



In [98]:
# Register every Parquet reference (lookup) dataset as an external BigQuery table.
# Each entry pairs the table name with the GCS wildcard URI of its files.
reference_tables = [
    ("vendor_lookup", 'gs://nyctaxi-silver/nyctaxi/reference/vendor-lookup/*.parquet'),
    ("trip_type_lookup", 'gs://nyctaxi-silver/nyctaxi/reference/trip-type-lookup/*.parquet'),
    ("payment_type_lookup", 'gs://nyctaxi-silver/nyctaxi/reference/payment-type-lookup/*.parquet'),
    ("rate_code_lookup", 'gs://nyctaxi-silver/nyctaxi/reference/rate-code-lookup/*.parquet'),
    ("trip_month_lookup", 'gs://nyctaxi-silver/nyctaxi/reference/trip-month-lookup/*.parquet'),
    ("taxi_zone_lookup", 'gs://nyctaxi-silver/nyctaxi/reference/taxi-zone-lookup/*.parquet'),
]
for lookup_table, lookup_uri in reference_tables:
    create_external_table(client = client, format = "PARQUET", table_name = lookup_table, gcs_path = lookup_uri)

CREATE EXTERNAL TABLE IF NOT EXISTS `amplified-brook-454012-i1.nyc_taxi.vendor_lookup`
        WITH CONNECTION `amplified-brook-454012-i1`.`asia-southeast1`.`nyc-biglake-connection`
        OPTIONS (
        format = "PARQUET",
        uris=['gs://nyctaxi-silver/nyctaxi/reference/vendor-lookup/*.parquet']
        );
    
Query is running... Current state: RUNNING, elapsed time: 0.00 seconds
Query execution completed successfully in 5.07 seconds.
Query processed 0 bytes and billed for 0 bytes.
Query used 0 slot milliseconds.

        CREATE EXTERNAL TABLE IF NOT EXISTS `amplified-brook-454012-i1.nyc_taxi.trip_type_lookup`
        WITH CONNECTION `amplified-brook-454012-i1`.`asia-southeast1`.`nyc-biglake-connection`
        OPTIONS (
        format = "PARQUET",
        uris=['gs://nyctaxi-silver/nyctaxi/reference/trip-type-lookup/*.parquet']
        );
    
Query is running... Current state: RUNNING, elapsed time: 0.00 seconds
Query execution completed successfully in 5.04 seconds.
Query

In [99]:
# Expose the yellow-taxi Delta Lake directory as an external BigQuery table.
create_external_table(client = client, format = "DELTA_LAKE", table_name = "yellow_taxi_trips_raw", gcs_path = 'gs://nyctaxi-silver/nyctaxi/transactions/yellow-taxi')

CREATE EXTERNAL TABLE IF NOT EXISTS `amplified-brook-454012-i1.nyc_taxi.yellow_taxi_trips_raw`
        WITH CONNECTION `amplified-brook-454012-i1`.`asia-southeast1`.`nyc-biglake-connection`
        OPTIONS (
        format = "DELTA_LAKE",
        uris=['gs://nyctaxi-silver/nyctaxi/transactions/yellow-taxi']
        );
    
Query is running... Current state: RUNNING, elapsed time: 0.00 seconds
Query execution completed successfully in 5.06 seconds.
Query processed 0 bytes and billed for 0 bytes.
Query used 0 slot milliseconds.

In [119]:
# Expose the green-taxi Delta Lake directory as an external BigQuery table.
create_external_table(client = client, format = "DELTA_LAKE", table_name = "green_taxi_trips_raw", gcs_path = 'gs://nyctaxi-silver/nyctaxi/transactions/green-taxi')

CREATE EXTERNAL TABLE IF NOT EXISTS `amplified-brook-454012-i1.nyc_taxi.green_taxi_trips_raw`
        WITH CONNECTION `amplified-brook-454012-i1`.`asia-southeast1`.`nyc-biglake-connection`
        OPTIONS (
        format = "DELTA_LAKE",
        uris=['gs://nyctaxi-silver/nyctaxi/transactions/green-taxi']
        );
    
Query is running... Current state: RUNNING, elapsed time: 0.00 seconds
Query execution completed successfully in 5.07 seconds.
Query processed 0 bytes and billed for 0 bytes.
Query used 0 slot milliseconds.

In [5]:
def create_table(schema: str, table_name: str, parquet_dir: str, location: str):
    """
    Re-register `table_name` in `schema` as a Delta table over an existing path.

    Switches to `schema`, drops any existing table of that name, then creates
    the table with its LOCATION set to `<location>/<parquet_dir>`.

    Args:
        schema: Schema to switch to before touching the table
        table_name: Table to drop and recreate
        parquet_dir: Sub-directory appended to `location`
        location: Base storage path of the table data
    """
    statements = (
        f"use {schema};",
        f"DROP TABLE IF EXISTS {table_name};",
        f"CREATE TABLE IF NOT EXISTS {table_name} USING delta LOCATION '{location}/{parquet_dir}';",
    )
    # Order matters: the schema must be selected before the drop/create pair runs
    for statement in statements:
        spark.sql(statement)



In [7]:
# Recreate the Spark table yellow_taxi_trips_raw as a Delta table over the yellow-taxi GCS path.
create_table(schema="synapse_nyc_reference.nyctaxi", table_name="yellow_taxi_trips_raw", parquet_dir="", location='gs://nyctaxi-silver/nyctaxi/transactions/yellow-taxi')



In [None]:
# NOTE(review): the import below is commented out, so nothing visible in this
# notebook defines `DeltaTable`; restore an import (e.g.
# `from delta.tables import DeltaTable`) before running this cell.
# from delta.tables import *
# Convert the yellow-taxi Parquet directory to Delta format; the last argument
# is presumably the partition schema (trip_year, trip_month) — confirm against the Delta API.
deltaTable = DeltaTable.convertToDelta(spark, f"parquet.`gs://nyctaxi-silver/nyctaxi/transactions/yellow-taxi`", "trip_year STRING, trip_month STRING")



In [None]:
import os
# Workspace directory containing the BigQuery transform SQL files that the following cells read.
sql_transform_dir = "/Workspace/NYCTaxi/sql/transform/bigquery"



In [102]:
# Run the yellow-taxi transform: read the SQL file and submit it to BigQuery.
sql_file = "1-bq-transform-yellow-taxi.sql"

sql_full_path = f"file://{os.path.join(sql_transform_dir, sql_file)}"
print(f"Executing SQL file: {sql_full_path}")

sql_content = ""
result = None
try:
    # Read the SQL file content
    # NOTE(review): dbutils.fs.head returns only the first 64 KB by default
    # (per the Databricks docs) — confirm the SQL file is smaller than that.
    sql_content = dbutils.fs.head(sql_full_path)
except Exception as e:
    # Best-effort notebook cell: report the problem and skip the query below
    print(f"Error reading file {sql_full_path}: {str(e)}")
else:
    print(sql_content)
    if sql_content.strip():
        result = execute_sql_query(
            client=client,
            project_id='amplified-brook-454012-i1',
            query=sql_content,
            maximum_bytes_billed=600000000000,
            labels={"type": "bq_transform", "name": "transform_yellow_taxi"},
        )
        # execute_sql_query reports failures as a returned string instead of raising
        if isinstance(result, str):
            print(result)
    else:
        # Nothing to run: do not submit an empty statement to BigQuery
        print(f"SQL file {sql_full_path} is empty; nothing was executed.")


Executing SQL file: file:///Workspace/CarsProject/sql/transform/bigquery/1-bq-transform-yellow-taxi.sql
-- Databricks notebook source
CREATE OR REPLACE TABLE `amplified-brook-454012-i1`.`nyc_taxi`.`yellow_taxi_trips_transform`
PARTITION BY DATETIME_TRUNC(partition_id, MONTH)
CLUSTER BY trip_year, trip_month, taxi_type, vendor_id
AS
SELECT
  taxi_type,
  CAST(vendor_id AS INT64) AS vendor_id,
  pickup_datetime,
  dropoff_datetime,
  store_and_fwd_flag,
  rate_code_id,
  pickup_location_id,
  dropoff_location_id,
  pickup_longitude,
  pickup_latitude,
  dropoff_longitude,
  dropoff_latitude,
  passenger_count,
  trip_distance,
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  improvement_surcharge,
  total_amount,
  CAST(payment_type AS INT64) AS payment_type,
  CAST(trip_year as INT64) as trip_year,
  CAST(trip_month as INT64) as trip_month,
  vendor_abbreviation,
  vendor_description,
  month_name_short,
  month_name_full,
  payment_type_description,
  rate_code_descri

In [121]:
# Run the green-taxi transform: read the SQL file and submit it to BigQuery.
sql_file = "2-bq-transform-green-taxi.sql"

sql_full_path = f"file://{os.path.join(sql_transform_dir, sql_file)}"
print(f"Executing SQL file: {sql_full_path}")

sql_content = ""
result = None
try:
    # Read the SQL file content
    # NOTE(review): dbutils.fs.head returns only the first 64 KB by default
    # (per the Databricks docs) — confirm the SQL file is smaller than that.
    sql_content = dbutils.fs.head(sql_full_path)
except Exception as e:
    # Best-effort notebook cell: report the problem and skip the query below
    print(f"Error reading file {sql_full_path}: {str(e)}")
else:
    print(sql_content)
    if sql_content.strip():
        result = execute_sql_query(
            client=client,
            project_id='amplified-brook-454012-i1',
            query=sql_content,
            maximum_bytes_billed=20000000000,
            labels={"type": "bq_transform", "name": "transform_green_taxi"},
        )
        # execute_sql_query reports failures as a returned string instead of raising
        if isinstance(result, str):
            print(result)
    else:
        # Nothing to run: do not submit an empty statement to BigQuery
        print(f"SQL file {sql_full_path} is empty; nothing was executed.")


Executing SQL file: file:///Workspace/CarsProject/sql/transform/bigquery/2-bq-transform-green-taxi.sql
-- Databricks notebook source
CREATE OR REPLACE TABLE `amplified-brook-454012-i1`.`nyc_taxi`.`green_taxi_trips_transform`
PARTITION BY DATETIME_TRUNC(partition_id, MONTH)
CLUSTER BY trip_year, trip_month, taxi_type, vendor_id
AS
  SELECT
      t.taxi_type,
      t.vendor_id,
      t.pickup_datetime,
      t.dropoff_datetime,
      t.store_and_fwd_flag,
      t.rate_code_id,
      t.pickup_location_id,
      t.dropoff_location_id,
      t.pickup_longitude,
      t.pickup_latitude,
      t.dropoff_longitude,
      t.dropoff_latitude,
      t.passenger_count,
      t.trip_distance,
      t.fare_amount,
      t.extra,
      t.mta_tax,
      t.tip_amount,
      t.tolls_amount,
      t.ehail_fee,
      t.improvement_surcharge,
      t.total_amount,
      t.payment_type,
      t.trip_type,
      CAST(t.trip_year AS INT64) as trip_year,
      CAST(t.trip_month AS INT64) as trip_month,
      v

In [122]:
# Build the combined taxi_trips_mat_view table: read the SQL file and submit it to BigQuery.
sql_file = "3-bq-transform-create-materialize-view.sql"

sql_full_path = f"file://{os.path.join(sql_transform_dir, sql_file)}"
print(f"Executing SQL file: {sql_full_path}")

sql_content = ""
result = None
try:
    # Read the SQL file content
    # NOTE(review): dbutils.fs.head returns only the first 64 KB by default
    # (per the Databricks docs) — confirm the SQL file is smaller than that.
    sql_content = dbutils.fs.head(sql_full_path)
except Exception as e:
    # Best-effort notebook cell: report the problem and skip the query below
    print(f"Error reading file {sql_full_path}: {str(e)}")
else:
    print(sql_content)
    if sql_content.strip():
        result = execute_sql_query(
            client=client,
            project_id='amplified-brook-454012-i1',
            query=sql_content,
            maximum_bytes_billed=800000000000,
            labels={"type": "bq_transform", "name": "transform_taxi_mat_view"},
        )
        # execute_sql_query reports failures as a returned string instead of raising
        if isinstance(result, str):
            print(result)
    else:
        # Nothing to run: do not submit an empty statement to BigQuery
        print(f"SQL file {sql_full_path} is empty; nothing was executed.")


Executing SQL file: file:///Workspace/CarsProject/sql/transform/bigquery/3-bq-transform-create-materialize-view.sql
-- Databricks notebook source
CREATE OR REPLACE TABLE `amplified-brook-454012-i1`.`nyc_taxi`.`taxi_trips_mat_view`
PARTITION BY DATETIME_TRUNC(partition_id, MONTH)
CLUSTER BY trip_year, trip_month, taxi_type, vendor_id
AS
SELECT *, PARSE_DATETIME('%Y%m', FORMAT('%06d', CAST(trip_year as INT64) * 100 + CAST(trip_month as INT64))) as partition_id FROM (
SELECT DISTINCT  
  taxi_type,
  vendor_id,
  pickup_datetime,
  dropoff_datetime,
  store_and_fwd_flag,
  rate_code_id,
  pickup_location_id,
  dropoff_location_id,
  pickup_longitude,
  pickup_latitude,
  dropoff_longitude,
  dropoff_latitude,
  passenger_count,
  trip_distance,
  fare_amount,
  extra,
  mta_tax,
  tip_amount,
  tolls_amount,
  0.0 AS ehail_fee, -- Added inline
  improvement_surcharge,
  total_amount,
  payment_type,
  0 AS trip_type, -- Added inline
  vendor_abbreviation,
  vendor_description,
  '' AS tri