# What's in this exercise?
We will run various reports against the NYC taxi trip data and visualize the results.

In [29]:
# Notebook-level configuration values.
# presumably the Google Cloud project that hosts the BigQuery data — TODO confirm
project_id = "amplified-brook-454012-i1"
# presumably the BigQuery dataset containing the taxi tables/views — TODO confirm
dataset = "nyc_taxi"
# NOTE(review): looks like a BigLake connection name; verify where it is consumed
delta_lake_conn = "nyc-biglake-connection"



In [None]:
import pandas as pd
import time
import json
from google.cloud import bigquery
from google.cloud.exceptions import GoogleCloudError
from google.oauth2 import service_account
from enum import Enum

class QueryState(Enum):
    """States used by this notebook to classify a BigQuery query job.

    Every member's value is the upper-case string of its own name.
    """
    # PENDING, RUNNING and DONE match the raw ``query_job.state`` strings
    # that execute_sql_query compares against while polling.
    PENDING = "PENDING"
    RUNNING = "RUNNING"
    DONE = "DONE"
    # FAILED and CANCELED are terminal outcomes that execute_sql_query
    # reports to its caller as a message string instead of a DataFrame.
    FAILED = "FAILED"
    CANCELED = "CANCELED"

def get_bigquery_client(**kwargs):
    """
    Create an authenticated BigQuery client for use in Databricks.

    The service-account key is read as a JSON string from a Databricks
    secret, parsed, and turned into Google credentials for the client.

    Args:
        **kwargs: Additional arguments:
            - secret_scope (str): Databricks secret scope name (required)
            - secret_key (str): Databricks secret key name (required)
            - project_id (str, optional): Google Cloud project ID. When
              omitted or empty, the "project_id" field of the
              service-account JSON is used if present.

    Returns:
        google.cloud.bigquery.Client: Authenticated BigQuery client

    Raises:
        ValueError: If secret_scope or secret_key is missing
        ImportError: If pyspark / DBUtils cannot be imported, i.e. the code
            is not running inside Databricks
    """
    if "secret_scope" not in kwargs or "secret_key" not in kwargs:
        raise ValueError("secret_scope and secret_key are required for databricks_secret authentication")

    # Only the imports are guarded: a failure here means "not on Databricks".
    # Keeping the try narrow stops unrelated ImportErrors raised further down
    # from being relabelled with this message.
    try:
        from pyspark.dbutils import DBUtils
        from pyspark.sql import SparkSession
    except ImportError as err:
        raise ImportError("This authentication method only works in Databricks") from err

    spark = SparkSession.builder.getOrCreate()
    dbutils = DBUtils(spark)

    # Get service account JSON from Databricks secrets
    json_credentials = dbutils.secrets.get(
        scope=kwargs["secret_scope"],
        key=kwargs["secret_key"],
    )
    service_account_info = json.loads(json_credentials)

    credentials = service_account.Credentials.from_service_account_info(
        service_account_info
    )

    # An explicit project_id wins; otherwise fall back to the key file's own.
    project_id = kwargs.get("project_id") or service_account_info.get("project_id")

    # project=None is the constructor default, so this single call covers
    # both the "project known" and "project unknown" cases.
    return bigquery.Client(credentials=credentials, project=project_id or None)

def _classify_job_state(query_job):
    """Map a BigQuery job's raw ``state`` string onto a QueryState member."""
    raw_state = query_job.state
    if raw_state == "PENDING":
        return QueryState.PENDING
    if raw_state == "RUNNING":
        return QueryState.RUNNING
    if raw_state == "DONE":
        error_result = query_job.error_result
        if not error_result:
            return QueryState.DONE
        # A canceled job finishes as DONE with the error reason "stopped".
        if error_result.get("reason") == "stopped":
            return QueryState.CANCELED
        return QueryState.FAILED
    # Any unrecognised state is treated as a failure.
    return QueryState.FAILED


def execute_sql_query(
    client,
    query: str,
    project_id: str = None,
    dataset_id: str = None,
    poll_interval_seconds: int = 5,
    timeout_seconds: int = 300,  # Default timeout of 5 minutes
    location: str = "asia-southeast1",  # Default location
    dry_run: bool = False,  # Option to perform a dry run
    use_legacy_sql: bool = False,  # Use standard SQL by default
    query_parameters: list = None,  # For parameterized queries
    maximum_bytes_billed: int = 10**10,  # Cost control
    labels: dict[str, str] = None,
):
    """
    Execute a SQL query in BigQuery and return formatted results
    This function will poll until the query is complete or timeout is reached

    Errors are not raised to the caller: every failure path returns a
    human-readable message string instead of a DataFrame.

    Args:
        client: BigQuery client instance
        query: SQL query to execute
        project_id: Optional Google Cloud project ID (defaults to client's project)
        dataset_id: Optional default dataset ID for the query. A bare dataset
            name is qualified with project_id, or with the client's project
            when project_id is not given.
        poll_interval_seconds: Seconds to wait between status checks (default: 5)
        timeout_seconds: Maximum seconds to wait for query completion
            (default: 300, set to None for no timeout). On timeout the job
            is canceled.
        location: BigQuery location/region (default: "asia-southeast1")
        dry_run: If True, don't actually run the query (just validate)
        use_legacy_sql: If True, use legacy SQL instead of standard SQL
        query_parameters: List of query parameters for parameterized queries
            (default: no parameters)
        maximum_bytes_billed: Maximum bytes that will be billed for this job
        labels: Optional labels to attach to the job (default: no labels)

    Returns:
        Formatted query results as pandas DataFrame or error message
    """
    try:
        # Configure the job. Fresh containers are built per call so that no
        # state is shared between invocations through default arguments.
        job_config = bigquery.QueryJobConfig(
            use_legacy_sql=use_legacy_sql,
            dry_run=dry_run,
            query_parameters=list(query_parameters) if query_parameters else [],
            maximum_bytes_billed=maximum_bytes_billed,
            labels=dict(labels) if labels else {},
        )

        # Set default dataset if provided
        if dataset_id:
            if project_id:
                job_config.default_dataset = f"{project_id}.{dataset_id}"
            elif "." in dataset_id:
                # Caller already supplied a "project.dataset" string.
                job_config.default_dataset = dataset_id
            else:
                # A string default_dataset must be fully qualified, so fall
                # back to the client's own project.
                job_config.default_dataset = f"{client.project}.{dataset_id}"

        # Start the query job
        query_job = client.query(
            query,
            job_config=job_config,
            location=location,
            project=project_id,
        )

        # A dry run completes immediately and has nothing to poll; calling
        # reload() on it is unnecessary and may fail, so report it up front.
        if dry_run:
            bytes_processed = query_job.total_bytes_processed
            return f"Dry run completed. Query will process {bytes_processed} bytes."

        # Track elapsed time with a monotonic clock so wall-clock adjustments
        # cannot distort the timeout.
        started_at = time.monotonic()

        # Poll until the query is complete or timeout is reached
        while True:
            elapsed_time = time.monotonic() - started_at
            if timeout_seconds is not None and elapsed_time > timeout_seconds:
                timeout_message = f"Query execution timed out after {timeout_seconds} seconds."
                print(timeout_message)
                query_job.cancel()
                return timeout_message

            # Reload the job to get the latest status
            query_job.reload()
            state = _classify_job_state(query_job)

            if state is QueryState.DONE:
                print(f"Query execution completed successfully in {elapsed_time:.2f} seconds.")
                break

            if state is QueryState.FAILED:
                # error_result is None when the failure comes from an
                # unrecognised job state rather than a reported error.
                error_result = query_job.error_result or {}
                error_message = error_result.get('message', 'Unknown error')
                error_reason = error_result.get('reason', 'Unknown reason')
                print(f"Query execution failed after {elapsed_time:.2f} seconds: {error_reason} - {error_message}")
                return f"Query execution failed: {error_reason} - {error_message}"

            if state is QueryState.CANCELED:
                print(f"Query execution was canceled after {elapsed_time:.2f} seconds.")
                return "Query execution was canceled."

            # Query is still pending/running, wait and check again
            print(f"Query is running... Current state: {state.value}, elapsed time: {elapsed_time:.2f} seconds")
            time.sleep(poll_interval_seconds)

        # Get query statistics
        bytes_processed = query_job.total_bytes_processed
        bytes_billed = query_job.total_bytes_billed
        slot_ms = query_job.slot_millis

        print(f"Query processed {bytes_processed} bytes and billed for {bytes_billed} bytes.")
        print(f"Query used {slot_ms} slot milliseconds.")

        # Get the results
        results = query_job.result()

        # NOTE(review): this relies on the truthiness of the object returned
        # by result(); confirm whether it is ever falsy for an empty result
        # set, otherwise an empty DataFrame is returned below instead.
        if not results:
            return "Query executed successfully, but no results were returned."

        # Convert to DataFrame and return it
        return results.to_dataframe()

    except GoogleCloudError as e:
        return f"Google Cloud Error: {str(e)}"
    except Exception as e:
        # Notebook-level boundary: surface the problem as a message rather
        # than a traceback, consistent with the other failure paths.
        return f"Error executing SQL query: {str(e)}"

In [None]:
# Build the BigQuery client from the service-account JSON stored in
# Databricks secrets. No project_id is passed, so get_bigquery_client falls
# back to the "project_id" field of the service-account JSON when present.
client = get_bigquery_client(
    secret_scope="databricks-bigquery",  # Your secret scope name
    secret_key="databricks-bq-sa"         # Your secret key name
)

### 1.  Trip count by taxi type

In [334]:
# Report 1: number of trips per taxi type.
sql = """
select 
  taxi_type,
  count(1) as trip_count
from 
  taxi_trips_mat_view
group by taxi_type
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

taxi_type,trip_count
yellow,1367926283
green,59035872


### 2.  Revenue including tips by taxi type

In [335]:
# Report 2: total revenue (sum of total_amount) per taxi type.
sql = """
select 
  taxi_type, sum(total_amount) revenue
from 
  taxi_trips_mat_view
group by taxi_type
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

taxi_type,revenue
yellow,18801428115.072758
green,862192848.0288665


### 5.  Trip count trend by month, by taxi type, for 2016

In [336]:
# Report 5: monthly trip counts per taxi type, restricted to 2016.
sql = """
select 
  taxi_type,
  trip_month as month,
  count(1) as trip_count
from 
  taxi_trips_mat_view
where 
  trip_year=2016
group by taxi_type,trip_month
order by trip_month
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)


taxi_type,month,trip_count
yellow,1,10906857
green,1,1445285
yellow,2,11382048
green,2,1510722
green,3,1576393
yellow,3,12210951
green,4,1543925
yellow,4,11934332
green,5,1536979
yellow,5,11836853


### 6.  Average trip distance by taxi type

In [337]:
# Report 6: average trip distance per taxi type, rounded to 2 decimals.
sql = """
select 
  taxi_type, round(avg(trip_distance),2) as trip_distance_miles
from 
  taxi_trips_mat_view
group by taxi_type
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

taxi_type,trip_distance_miles
yellow,5.22
green,2.86


### 7.  Average trip amount by taxi type

In [340]:
# Report 7: average total_amount per taxi type, rounded to 2 decimals.
sql = """
select 
  taxi_type, round(avg(total_amount),2) as avg_total_amount
from 
  taxi_trips_mat_view
group by taxi_type
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

taxi_type,avg_total_amount
yellow,13.74
green,14.6


### 8.  Trips with no tip, by taxi type

In [341]:
# Report 8: number of trips with a zero tip, per taxi type.
# The table is referenced unqualified, like the earlier reports: the previous
# "synapse_nyc_reference.nyctaxi." prefix is not a valid BigQuery
# project.dataset path (project IDs cannot contain underscores).
sql = """
select 
  taxi_type, count(1) tipless_count
from 
  taxi_trips_mat_view
where tip_amount=0
group by taxi_type
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)


taxi_type,tipless_count
yellow,714255854
green,35839928


### 9.  Trips with no charge, by taxi type

In [342]:
# Report 9: trips flagged 'No charge' whose total_amount is zero, per taxi type.
# The table is referenced unqualified, like the earlier reports: the previous
# "synapse_nyc_reference.nyctaxi." prefix is not a valid BigQuery
# project.dataset path (project IDs cannot contain underscores).
sql = """
select 
  taxi_type, count(*) as transactions
from 
  taxi_trips_mat_view
where
  payment_type_description='No charge'
  and total_amount=0.0
group by taxi_type
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

taxi_type,transactions
yellow,2093
green,8578


### 10.  Trips by payment type

In [343]:
# Report 10: number of trips per payment type description.
# The table is referenced unqualified, like the earlier reports: the previous
# "synapse_nyc_reference.nyctaxi." prefix is not a valid BigQuery
# project.dataset path (project IDs cannot contain underscores).
sql = """
select 
  payment_type_description as Payment_type, count(*) as transactions
from 
  taxi_trips_mat_view
group by payment_type_description
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

Payment_type,transactions
Unknown,1112469
Dispute,1111397
Cash,719487053
Credit card,701765941
No charge,3478486
,6809


### 11.  Trip trend by pickup hour for yellow taxi in 2016

In [344]:
# Report 11: yellow-taxi trip counts per pickup hour for 2016.
# The table is referenced unqualified, like report 12: the previous
# "synapse_nyc_reference.nyctaxi." prefix is not a valid BigQuery
# project.dataset path (project IDs cannot contain underscores).
sql = """
select pickup_hour,count(*) as trip_count
from yellow_taxi_trips_transform
where trip_year=2016
group by pickup_hour
order by pickup_hour
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)

pickup_hour,trip_count
0,4801263
1,3510554
2,2576474
3,1888184
4,1451388
5,1393381
6,3001894
7,4925227
8,5967221
9,6044046


### 12.  Top 3 yellow taxi pickup-dropoff zones for 2016

In [345]:
# Report 12: the three most frequent yellow-taxi pickup/dropoff zone pairs
# in 2016, ignoring null zones and the 'NV' placeholder.
# ORDER BY and LIMIT sit on the same query level: an ORDER BY inside a
# subquery does not guarantee the order seen by an outer LIMIT, so the
# earlier form could return three arbitrary rows.
sql = """
select 
  pickup_zone,dropoff_zone,count(*) as trip_count
from 
  yellow_taxi_trips_transform
where 
  trip_year=2016
and
  pickup_zone is not null and pickup_zone<>'NV'
and 
  dropoff_zone is not null and dropoff_zone<>'NV'
group by pickup_zone,dropoff_zone
order by trip_count desc
limit 3
"""

# The unqualified table name resolves against the default dataset passed
# here; the keyword arguments match execute_sql_query's BigQuery signature.
df_result = execute_sql_query(
    client=client,
    query=sql,
    project_id=project_id,
    dataset_id=dataset,
)
display(df_result)


pickup_zone,dropoff_zone,trip_count
Upper East Side South,Upper East Side North,310035
Upper East Side North,Upper East Side South,267509
Upper East Side North,Upper East Side North,249141
