# What's in this exercise?
We run the common functions notebook so we can reuse the capabilities defined there, and then...<BR>
1) Load reference data from the staging directory into the reference data directory<BR>
2) Create external (unmanaged) Hive tables<BR>
3) Create statistics for the tables

### 0.  Declare necessary variables

In [79]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType, FloatType, DoubleType, TimestampType



In [80]:
%sql
-- List the available catalogs before any are created by this notebook
SHOW CATALOGS;

catalog
databricks_workspace
hive_metastore
samples
synapse_nyc_reference
system


In [81]:
# Storage roots used throughout this notebook. Both values end with "/" so
# that file and directory names can be appended by plain concatenation.
srcDataDirRoot = "gs://nyctaxi-raw/reference-data/"  # root dir for source data
destDataDirRoot = "gs://nyctaxi-silver/nyctaxi/reference/"  # root dir for consumable data



### 1.  Execute notebook with common/reusable functions

In [34]:
%run "../../../../01-General/2-CommonFunctions.ipynb"

Code from file 'file:///home/dinhnn/git/personal/Databricks-NYC-Taxi/Workspace/01-General/2-CommonFunctions.ipynb':
 import os
import math
import glob
import re
prqShrinkageFactor = 0.19 #We found a saving in space of 81% with Parquet
def analyzeTables(databaseAndTable):
  try:
    print("Table: " + databaseAndTable)
    print("....refresh table")
    sql("REFRESH TABLE " + databaseAndTable)
    print("....analyze table")
    sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS")
    print("....done")
  except Exception as e:
    return e
def calcOutputFileCountTxtToPrq(srcDataFile, targetedFileSizeMB):
  try:
    estFileCount = int(math.floor((os.path.getsize(srcDataFile) * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)))
    if(estFileCount == 0):
      return 1 
    else:
      return estFileCount
  except Exception as e:
    return e
#Delete residual files from job operation (_SUCCESS, _start*, _committed*)
#Should be called with '/dbfs/mnt/...'
def recursivelyD



### 2. List reference datasets

In [None]:
# Show the entries found directly under the source data root
display(dbutils.fs.ls(srcDataDirRoot))

### 3. Define schema for raw reference data

In [36]:
# Explicit schemas for each raw reference file. Every column is declared
# nullable; the schemas are applied when the CSV files are read.

# Taxi zone lookup
taxiZoneSchema = (
    StructType()
    .add("location_id", StringType(), True)
    .add("borough", StringType(), True)
    .add("zone", StringType(), True)
    .add("service_zone", StringType(), True)
)

# Months of the year
tripMonthNameSchema = (
    StructType()
    .add("trip_month", StringType(), True)
    .add("month_name_short", StringType(), True)
    .add("month_name_full", StringType(), True)
)

# Rate code id lookup
rateCodeSchema = (
    StructType()
    .add("rate_code_id", IntegerType(), True)
    .add("description", StringType(), True)
)

# Payment type lookup
paymentTypeSchema = (
    StructType()
    .add("payment_type", IntegerType(), True)
    .add("abbreviation", StringType(), True)
    .add("description", StringType(), True)
)

# Trip type
tripTypeSchema = (
    StructType()
    .add("trip_type", IntegerType(), True)
    .add("description", StringType(), True)
)

# Vendor ID
vendorSchema = (
    StructType()
    .add("vendor_id", IntegerType(), True)
    .add("abbreviation", StringType(), True)
    .add("description", StringType(), True)
)



### 4. Load reference data

##### 4.1. Create function to load data

In [38]:
def loadReferenceData(srcDatasetName, srcDataFile, destDataDir, srcSchema, delimiter):
  """
  Read one delimited reference file and rewrite it as Parquet.

  Parameters:
  - srcDatasetName: label printed in the progress output
  - srcDataFile: path of the delimited source file (read with a header row)
  - destDataDir: output directory; removed recursively before writing
  - srcSchema: schema applied when reading the source file
  - delimiter: field separator used by the source file

  The result is coalesced to a single partition and written with zstd
  compression in overwrite mode. Progress is reported via print().
  """
  print("Dataset:  " + srcDatasetName)
  print(".......................................................")

  # Remove any earlier output so the load can be re-run safely
  removalResult = dbutils.fs.rm(destDataDir, recurse=True)
  print("....deleting destination directory - " + str(removalResult))

  # Configure the CSV reader, then point it at the source file
  csvReader = (spark.read
                    .option("header", True)
                    .schema(srcSchema)
                    .option("delimiter", delimiter))
  sourceFrame = csvReader.csv(srcDataFile)

  # Persist as parquet
  print("....reading source and saving as parquet")
  print(sourceFrame)
  parquetWriter = (sourceFrame.coalesce(1)
                              .write
                              .option("compression", "zstd")
                              .mode("overwrite"))
  parquetWriter.parquet(destDataDir)

  print("....done")



##### 4.2. Load data

In [13]:
# One entry per reference dataset:
# (display name, source file name, destination directory name, schema, delimiter)
referenceDatasets = [
    ("taxi zone", "taxi_zone_lookup.csv", "taxi-zone-lookup", taxiZoneSchema, ","),
    ("trip month", "trip_month_lookup.csv", "trip-month-lookup", tripMonthNameSchema, ","),
    ("rate code", "rate_code_lookup.csv", "rate-code-lookup", rateCodeSchema, "|"),
    ("payment type", "payment_type_lookup.csv", "payment-type-lookup", paymentTypeSchema, "|"),
    ("trip type", "trip_type_lookup.csv", "trip-type-lookup", tripTypeSchema, "|"),
    ("vendor", "vendor_lookup.csv", "vendor-lookup", vendorSchema, "|"),
]

# Load each dataset in turn, in the order listed above
for datasetName, srcFileName, destDirName, datasetSchema, fieldDelimiter in referenceDatasets:
    loadReferenceData(datasetName, srcDataDirRoot + srcFileName, destDataDirRoot + destDirName, datasetSchema, fieldDelimiter)

Dataset:  taxi zone
.......................................................
....deleting destination directory - False
....reading source and saving as parquet
DataFrame[location_id: string, borough: string, zone: string, service_zone: string]
....done
Dataset:  trip month
.......................................................
....deleting destination directory - False
....reading source and saving as parquet
DataFrame[trip_month: string, month_name_short: string, month_name_full: string]
....done
Dataset:  rate code
.......................................................
....deleting destination directory - False
....reading source and saving as parquet
DataFrame[rate_code_id: int, description: string]
....done
Dataset:  payment type
.......................................................
....deleting destination directory - False
....reading source and saving as parquet
DataFrame[payment_type: int, abbreviation: string, description: string]
....done
Dataset:  trip type
.............

##### 4.3. Validate load

In [14]:
# Show the entries found directly under the destination data root
display(dbutils.fs.ls(destDataDirRoot))

path,name,size,modificationTime
gs://nyctaxi-silver/nyctaxi/reference/vendor-lookup/,vendor-lookup/,0,0
gs://nyctaxi-silver/nyctaxi/reference/rate-code-lookup/,rate-code-lookup/,0,0
gs://nyctaxi-silver/nyctaxi/reference/taxi-zone-lookup/,taxi-zone-lookup/,0,0
gs://nyctaxi-silver/nyctaxi/reference/trip-type-lookup/,trip-type-lookup/,0,0
gs://nyctaxi-silver/nyctaxi/reference/trip-month-lookup/,trip-month-lookup/,0,0
gs://nyctaxi-silver/nyctaxi/reference/payment-type-lookup/,payment-type-lookup/,0,0


### 5. Create Hive tables

In [15]:
%sql
-- IF NOT EXISTS keeps this cell re-runnable when the catalog is already present
CREATE CATALOG IF NOT EXISTS synapse_nyc_reference
MANAGED LOCATION 'gs://databricks-internal-catalog/nyctaxi';


<Empty result set>

In [16]:
%sql
-- IF NOT EXISTS keeps this cell re-runnable when the schema is already present
CREATE SCHEMA IF NOT EXISTS synapse_nyc_reference.nyctaxi;

<Empty result set>

In [17]:
# Function to convert Spark schema to SQL DDL format
def convert_schema_to_sql_ddl(schema):
    """
    Convert a Spark StructType schema to SQL DDL format.

    Parameters:
    - schema: A StructType schema (any object exposing ``fields``, whose items
      have ``name`` and ``dataType.simpleString()``)

    Returns:
    - A string with SQL DDL column definitions in parentheses

    Type mapping is driven by ``dataType.simpleString()``. Note that Spark
    reports LongType as "bigint", ShortType as "smallint" and ByteType as
    "tinyint"; these are mapped to the matching SQL types. Decimal types keep
    their precision/scale. Any type not listed below falls back to STRING.

    Example:
    StructType([
        StructField("location_id", StringType(), True),
        StructField("borough", StringType(), True)
    ])

    Returns:
    (
        location_id STRING,
        borough STRING
    )
    """
    # Keys are simpleString() values; "integer" and "long" are kept as
    # aliases so inputs accepted by the previous implementation still work.
    spark_to_sql = {
        "string": "STRING",
        "int": "INT",
        "integer": "INT",
        "bigint": "BIGINT",
        "long": "BIGINT",
        "smallint": "SMALLINT",
        "tinyint": "TINYINT",
        "double": "DOUBLE",
        "float": "FLOAT",
        "boolean": "BOOLEAN",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
        "binary": "BINARY",
    }

    column_definitions = []
    for field in schema.fields:
        field_type = field.dataType.simpleString()
        if field_type.startswith("decimal"):
            # e.g. "decimal(10,2)" -> "DECIMAL(10,2)"
            sql_type = field_type.upper()
        else:
            # Default to STRING for unknown types
            sql_type = spark_to_sql.get(field_type, "STRING")

        column_definitions.append(f"        {field.name} {sql_type}")

    # Format as a parenthesized list
    return "(\n" + ",\n".join(column_definitions) + "\n)"



In [18]:
def create_table(schema: str, table_name: str, schema_fields: "StructType", parquet_dir: str, location: str):
    """
    (Re)create a Parquet-backed table pointing at an existing directory.

    Parameters:
    - schema: name of the schema to switch to before creating the table
    - table_name: name of the table to drop (if present) and create
    - schema_fields: StructType describing the columns; when it is None or has
      no fields, the column list is omitted from the CREATE TABLE statement
    - parquet_dir: directory name, relative to ``location``, holding the data
    - location: root path; a trailing "/" is tolerated
    """
    # Build the column clause up front. The original inlined a nested f-string
    # (a SyntaxError before Python 3.12) and tested a set literal, which is
    # always truthy, so the "no columns" branch could never be taken.
    has_columns = schema_fields is not None and len(schema_fields.fields) > 0
    column_clause = convert_schema_to_sql_ddl(schema_fields) if has_columns else ""

    # Avoid "//" in the path when the root already ends with a slash
    table_path = f"{location.rstrip('/')}/{parquet_dir}"

    spark.sql(f"use {schema};")
    spark.sql(f"DROP TABLE IF EXISTS {table_name};")
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} {column_clause} USING parquet LOCATION '{table_path}';")



In [19]:
# One entry per lookup table: (table name, schema, parquet directory name)
referenceTables = [
    ("taxi_zone_lookup", taxiZoneSchema, "taxi-zone-lookup"),
    ("trip_month_lookup", tripMonthNameSchema, "trip-month-lookup"),
    ("rate_code_lookup", rateCodeSchema, "rate-code-lookup"),
    ("payment_type_lookup", paymentTypeSchema, "payment-type-lookup"),
    ("trip_type_lookup", tripTypeSchema, "trip-type-lookup"),
    ("vendor_lookup", vendorSchema, "vendor-lookup"),
]

# Create every table in the same schema, rooted at the destination directory
for lookupTableName, lookupSchema, lookupDirName in referenceTables:
    create_table(
        schema="synapse_nyc_reference.nyctaxi",
        table_name=lookupTableName,
        schema_fields=lookupSchema,
        parquet_dir=lookupDirName,
        location=destDataDirRoot,
    )




In [20]:
%sql
-- Inspect the rows of the taxi zone lookup table
SELECT * FROM synapse_nyc_reference.nyctaxi.taxi_zone_lookup;

location_id,borough,zone,service_zone
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham Gardens,Boro Zone
4,Manhattan,Alphabet City,Yellow Zone
5,Staten Island,Arden Heights,Boro Zone
6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
7,Queens,Astoria,Boro Zone
8,Queens,Astoria Park,Boro Zone
9,Queens,Auburndale,Boro Zone
10,Queens,Baisley Park,Boro Zone


In [21]:
%sql
-- Inspect the rows of the trip month lookup table
SELECT *
FROM synapse_nyc_reference.nyctaxi.trip_month_lookup;

trip_month,month_name_short,month_name_full
1,JAN,January
2,FEB,February
3,MAR,March
4,APR,April
5,MAY,May
6,JUN,June
7,JUL,July
8,AUG,August
9,SEP,September
10,OCT,October


In [22]:
%sql
-- Inspect the rows of the rate code lookup table
SELECT *
FROM synapse_nyc_reference.nyctaxi.rate_code_lookup;

rate_code_id,description
1,Standard rate
2,JFK
3,Newark
4,Nassau or Westchester
5,Negotiated fare
6,Group ride


In [23]:
%sql
-- Inspect the rows of the payment type lookup table
SELECT *
FROM synapse_nyc_reference.nyctaxi.payment_type_lookup;

payment_type,abbreviation,description
1,,Credit card
2,,Cash
3,,No charge
4,,Dispute
5,,Unknown
6,,Voided trip
7,CAS,Cash
8,CASH,Cash
9,CRD,Credit card
10,CRE,Credit card


In [24]:
%sql
-- Inspect the rows of the trip type lookup table
SELECT *
FROM synapse_nyc_reference.nyctaxi.trip_type_lookup;

trip_type,description
1,Street-hail
2,Dispatch


In [25]:
%sql
-- Inspect the rows of the vendor lookup table
SELECT *
FROM synapse_nyc_reference.nyctaxi.vendor_lookup;

vendor_id,abbreviation,description
1,CMT,"Creative, Mobile Technologies, LLC"
2,VTS,VeriFone Inc.
3,DDS,Digital Dispatch Systems
