# What's in this exercise?
We will run various reports and visualize

In [None]:
from databricks.sdk import WorkspaceClient

host = dbutils.secrets.get(scope="databricks-warehouse", key="warehouse-sql-host")
token = dbutils.secrets.get(scope="databricks-warehouse", key="warehouse-sql-token")
warehouse_id = dbutils.secrets.get(scope="databricks-warehouse", key="warehouse-sql-warehouseid")
catalog = dbutils.secrets.get(scope="databricks-warehouse", key="warehouse-sql-catalog-nyc")
schema = dbutils.secrets.get(scope="databricks-warehouse", key="warehouse-sql-schema-nyc")

w = WorkspaceClient(
    host = host,
    token = token
)



In [None]:
# import sys
# sys.path.append("../../utils")
# from utils.databricks_utils import execute_sql_query

import pandas as pd
from databricks.sdk.service.sql import StatementState

def execute_sql_query(
    w,
    query: str,
    warehouse_id: str,
    catalog: str = None,
    schema: str = None,
):
    """
    Execute a SQL query and return formatted results

    Args:
        w: WorkspaceClient instance
        query: SQL query to execute
        warehouse_id: SQL warehouse ID
        catalog: Optional catalog name
        schema: Optional schema name

    Returns:
        Formatted query results
    """
    try:
        # Execute the SQL statement
        exe_stt = w.statement_execution.execute_statement(
            statement=query, warehouse_id=warehouse_id, catalog=catalog, schema=schema
        )

        # Get the statement execution result
        get_statement = w.statement_execution.get_statement(
            statement_id=exe_stt.statement_id
        )

        # Check if the query failed
        if hasattr(get_statement, "status") and get_statement.status:
            if (
                hasattr(get_statement.status, "state")
                and get_statement.status.state == StatementState.FAILED
            ):
                
                if (
                    hasattr(get_statement.status, "error")
                    and get_statement.status.error
                ):
                    error_message = get_statement.status.error.message
                    error_code = get_statement.status.error.error_code
                    return f"Query execution failed: {error_code} - {error_message}"
                return "Query execution failed with unknown error."

        # Check if we have results
        if not hasattr(get_statement, "result") or not get_statement.result:
            return "Query executed successfully, but no results were returned."

        # Format the results
        if (
            hasattr(get_statement.result, "data_array")
            and get_statement.result.data_array
        ):
            # Get column names from the manifest
            columns = []
            if hasattr(get_statement, "manifest") and get_statement.manifest:
                if (
                    hasattr(get_statement.manifest, "schema")
                    and get_statement.manifest.schema
                ):
                    # Fixed: Properly handle ResultSchema object
                    schema = get_statement.manifest.schema
                    if hasattr(schema, "columns") and schema.columns:
                        # If schema has columns attribute
                        columns = [col.name for col in schema.columns]
                    elif hasattr(schema, "fields") and schema.fields:
                        # If schema has fields attribute
                        columns = [field.name for field in schema.fields]
                    elif isinstance(schema, list):
                        # If schema is already a list
                        columns = [col.name for col in schema]

            # If we couldn't get column names, use generic ones
            if not columns and get_statement.result.data_array:
                columns = [
                    f"Column_{i}"
                    for i in range(len(get_statement.result.data_array[0]))
                ]

            # Create a pandas DataFrame for better display
            df = pd.DataFrame(get_statement.result.data_array, columns=columns)

            # Return formatted table
            return df
        else:
            # Double-check if the query failed but we didn't catch it earlier
            if hasattr(get_statement, "status") and get_statement.status:
                if (
                    hasattr(get_statement.status, "state")
                    and get_statement.status.state == "FAILED"
                ):
                    if (
                        hasattr(get_statement.status, "error")
                        and get_statement.status.error
                    ):
                        error_message = get_statement.status.error.message
                        error_code = get_statement.status.error.error_code
                        return f"Query execution failed: {error_code} - {error_message}"
                    return "Query execution failed with unknown error."
            return "Query executed successfully, but no data was returned."

    except Exception as e:
        return f"Error executing SQL query: {str(e)}"

### 1.  Trip count by taxi type

In [334]:
sql = """
select 
  taxi_type,
  count(1) as trip_count
from 
  taxi_trips_mat_view
group by taxi_type
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

taxi_type,trip_count
yellow,1367926283
green,59035872


### 2.  Revenue including tips by taxi type

In [335]:
sql = """
select 
  taxi_type, sum(total_amount) revenue
from 
  taxi_trips_mat_view
group by taxi_type
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

taxi_type,revenue
yellow,18801428115.072758
green,862192848.0288665


### 5.  Trip count trend by month, by taxi type, for 2016

In [336]:
sql = """
select 
  taxi_type,
  trip_month as month,
  count(1) as trip_count
from 
  taxi_trips_mat_view
where 
  trip_year=2016
group by taxi_type,trip_month
order by trip_month
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)


taxi_type,month,trip_count
yellow,1,10906857
green,1,1445285
yellow,2,11382048
green,2,1510722
green,3,1576393
yellow,3,12210951
green,4,1543925
yellow,4,11934332
green,5,1536979
yellow,5,11836853


### 6.  Average trip distance by taxi type

In [337]:
sql = """
select 
  taxi_type, round(avg(trip_distance),2) as trip_distance_miles
from 
  taxi_trips_mat_view
group by taxi_type
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

taxi_type,trip_distance_miles
yellow,5.22
green,2.86


### 7.  Average trip amount by taxi type

In [340]:
sql = """
select 
  taxi_type, round(avg(total_amount),2) as avg_total_amount
from 
  taxi_trips_mat_view
group by taxi_type
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

taxi_type,avg_total_amount
yellow,13.74
green,14.6


### 8.  Trips with no tip, by taxi type

In [341]:
sql = """
select 
  taxi_type, count(1) tipless_count
from 
  synapse_nyc_reference.nyctaxi.taxi_trips_mat_view
where tip_amount=0
group by taxi_type
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)


taxi_type,tipless_count
yellow,714255854
green,35839928


### 9.  Trips with no charge, by taxi type

In [342]:
sql = """
select 
  taxi_type, count(*) as transactions
from 
  synapse_nyc_reference.nyctaxi.taxi_trips_mat_view
where
  payment_type_description='No charge'
  and total_amount=0.0
group by taxi_type
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

taxi_type,transactions
yellow,2093
green,8578


### 10.  Trips by payment type

In [343]:
sql = """
select 
  payment_type_description as Payment_type, count(*) as transactions
from 
  synapse_nyc_reference.nyctaxi.taxi_trips_mat_view
group by payment_type_description
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

Payment_type,transactions
Unknown,1112469
Dispute,1111397
Cash,719487053
Credit card,701765941
No charge,3478486
,6809


### 11.  Trip trend by pickup hour for yellow taxi in 2016

In [344]:
sql = """
select pickup_hour,count(*) as trip_count
from synapse_nyc_reference.nyctaxi.yellow_taxi_trips_transform
where trip_year=2016
group by pickup_hour
order by pickup_hour
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)

pickup_hour,trip_count
0,4801263
1,3510554
2,2576474
3,1888184
4,1451388
5,1393381
6,3001894
7,4925227
8,5967221
9,6044046


### 12.  Top 3 yellow taxi pickup-dropoff zones for 2016

In [345]:
sql = """
select * from 
  (
  select 
    pickup_zone,dropoff_zone,count(*) as trip_count
  from 
    yellow_taxi_trips_transform
  where 
    trip_year=2016
  and
    pickup_zone is not null and pickup_zone<>'NV'
  and 
    dropoff_zone is not null and dropoff_zone<>'NV'
  group by pickup_zone,dropoff_zone
  order by trip_count desc
  ) x
limit 3
"""

df_result = execute_sql_query(w=w,query=sql, warehouse_id=warehouse_id, catalog=catalog, schema=schema)
display(df_result)


pickup_zone,dropoff_zone,trip_count
Upper East Side South,Upper East Side North,310035
Upper East Side North,Upper East Side South,267509
Upper East Side North,Upper East Side North,249141
