# What's in this exercise?
We will run the common functions notebook so we can reuse capability defined there, and then...<BR>
1) Load green taxi data from the staging zone, homogenize schemas over the years, and persist to the raw information zone, in Delta format<BR> 
2) Create external table definition<BR>
3) Optimize the table

In [121]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType,LongType,FloatType,DoubleType, TimestampType
from pyspark.sql.functions import *



In [122]:
#Storage account name and access key are kept in a Databricks secret scope, never in the notebook
storageAccountName = dbutils.secrets.get(
    scope="databricks-warehouse",
    key="storage-name",
)
storageAccountKey = dbutils.secrets.get(
    scope="databricks-warehouse",
    key="storage-key",
)



In [123]:
#Register the storage account key with Spark so the abfss:// paths used below can be read and written
spark.conf.set(
    f"fs.azure.account.key.{storageAccountName}.dfs.core.windows.net",
    storageAccountKey,
)



In [124]:
#Source and destination root directories; both end with "/" and later cells append to them directly
#Root dir for the source CSV files
srcDataDirRoot = f"abfss://bronze@{storageAccountName}.dfs.core.windows.net/transactional-data/"
#Root dir of the green taxi Delta output
destDataDirRoot = f"abfss://silver@{storageAccountName}.dfs.core.windows.net/nyctaxi/transactions/green-taxi/"

#Canonical ordered column list for green taxi across years to homogenize schema
canonicalTripSchemaColList = [
    "taxi_type",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "store_and_fwd_flag",
    "rate_code_id",
    "pickup_location_id",
    "dropoff_location_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_type",
    "trip_year",
    "trip_month",
]



#### 1. Execute notebook with common/reusable functions

In [125]:
%run "../../../../01-General/2-CommonFunctions.ipynb"

Code from file 'file:///home/dinhnn/git/personal/Databricks-NYC-Taxi/Workspace/01-General/2-CommonFunctions.ipynb':
 import os
import math
import glob
import re
prqShrinkageFactor = 0.19 #We found a saving in space of 81% with Parquet
def analyzeTables(databaseAndTable):
  try:
    print("Table: " + databaseAndTable)
    print("....refresh table")
    sql("REFRESH TABLE " + databaseAndTable)
    print("....analyze table")
    sql("ANALYZE TABLE " + databaseAndTable + " COMPUTE STATISTICS")
    print("....done")
  except Exception as e:
    return e
def calcOutputFileCountTxtToPrq(srcDataFile, targetedFileSizeMB):
  try:
    estFileCount = int(math.floor((os.path.getsize(srcDataFile) * prqShrinkageFactor) / (targetedFileSizeMB * 1024 * 1024)))
    if(estFileCount == 0):
      return 1 
    else:
      return estFileCount
  except Exception as e:
    return e
#Delete residual files from job operation (_SUCCESS, _start*, _committed*)
#Should be called with '/dbfs/mnt/...'
def recursivelyD



#### 2. Define schema for source data
Different years have different schemas - fields added/removed

In [14]:
#Schema for data based on year and month
#
#The periods share most of their columns, so each schema is assembled from the
#column groups below. Every group lists (column name, Spark type) pairs in the
#order the columns appear in the source CSV; all fields are nullable.

_greenTripLeadCols = [
    ("vendor_id", IntegerType),
    ("pickup_datetime", TimestampType),
    ("dropoff_datetime", TimestampType),
    ("store_and_fwd_flag", StringType),
    ("rate_code_id", IntegerType),
]

_greenTripLocationIdCols = [
    ("pickup_location_id", IntegerType),
    ("dropoff_location_id", IntegerType),
]

_greenTripCoordinateCols = [
    ("pickup_longitude", DoubleType),
    ("pickup_latitude", DoubleType),
    ("dropoff_longitude", DoubleType),
    ("dropoff_latitude", DoubleType),
]

_greenTripChargeCols = [
    ("passenger_count", IntegerType),
    ("trip_distance", DoubleType),
    ("fare_amount", DoubleType),
    ("extra", DoubleType),
    ("mta_tax", DoubleType),
    ("tip_amount", DoubleType),
    ("tolls_amount", DoubleType),
    ("ehail_fee", DoubleType),
]

_greenTripSurchargeCols = [
    ("improvement_surcharge", DoubleType),
]

_greenTripTotalCols = [
    ("total_amount", DoubleType),
    ("payment_type", IntegerType),
    ("trip_type", IntegerType),
]

_greenTripJunkCols = [
    ("junk1", StringType),
    ("junk2", StringType),
]

def _greenTripStruct(*colGroups):
  #Concatenate the column groups in the given order into one StructType of nullable fields
  return StructType([StructField(colName, colType(), True)
                     for colGroup in colGroups
                     for colName, colType in colGroup])

#2017
greenTripSchema2017H1 = _greenTripStruct(
    _greenTripLeadCols,
    _greenTripLocationIdCols,
    _greenTripChargeCols,
    _greenTripSurchargeCols,
    _greenTripTotalCols)

#Second half of 2016
greenTripSchema2016H2 = _greenTripStruct(
    _greenTripLeadCols,
    _greenTripLocationIdCols,
    _greenTripChargeCols,
    _greenTripSurchargeCols,
    _greenTripTotalCols,
    _greenTripJunkCols)

#2015 second half of the year and 2016 first half of the year
greenTripSchema2015H22016H1 = _greenTripStruct(
    _greenTripLeadCols,
    _greenTripCoordinateCols,
    _greenTripChargeCols,
    _greenTripSurchargeCols,
    _greenTripTotalCols)

#2015 first half of the year
greenTripSchema2015H1 = _greenTripStruct(
    _greenTripLeadCols,
    _greenTripCoordinateCols,
    _greenTripChargeCols,
    _greenTripSurchargeCols,
    _greenTripTotalCols,
    _greenTripJunkCols)

#August 2013 through 2014 (no improvement_surcharge column)
greenTripSchemaPre2015 = _greenTripStruct(
    _greenTripLeadCols,
    _greenTripCoordinateCols,
    _greenTripChargeCols,
    _greenTripTotalCols,
    _greenTripJunkCols)



#### 3. Some functions

In [15]:
#1) Function to determine schema for a given year and month
#Input:  Year and month
#Output: StructType for applicable schema, or None when the period is not covered
#Sample call: print(getTaxiSchema(2014, 6))

def getTaxiSchema(tripYear, tripMonth):
  """Pick the source CSV schema that applies to the given trip year and month.

  Prints which case matched ("case 1" .. "case 5", or "case default") and
  returns the corresponding schema object. Periods outside August 2013
  through June 2017 match no case and yield None.
  """
  #August 2013 through 2014
  if (tripYear == 2013 and tripMonth > 7) or tripYear == 2014:
    print("case 1")
    return greenTripSchemaPre2015

  #First half of 2015
  if tripYear == 2015 and tripMonth < 7:
    print("case 2")
    return greenTripSchema2015H1

  #Second half of 2015 and first half of 2016
  if (tripYear == 2015 and tripMonth > 6) or (tripYear == 2016 and tripMonth < 7):
    print("case 3")
    return greenTripSchema2015H22016H1

  #Second half of 2016
  if tripYear == 2016 and tripMonth > 6:
    print("case 4")
    return greenTripSchema2016H2

  #First half of 2017
  if tripYear == 2017 and tripMonth < 7:
    print("case 5")
    return greenTripSchema2017H1

  print("case default")
  return None



In [16]:
#2) Function to add columns to dataframe as required to homogenize schema
#Input:  Dataframe, year and month
#Output: Dataframe with homogenized schema
#Sample call: taxiFormattedDF = getSchemaHomogenizedDataframe(taxiDF, 2014, 6)

def _withZeroLocationIds(tripDF):
  #Add integer 0 placeholders for the pickup/dropoff location ids
  return (tripDF.withColumn("pickup_location_id", lit(0).cast("integer"))
                .withColumn("dropoff_location_id", lit(0).cast("integer")))

def _withBlankCoordinates(tripDF):
  #Add empty-string placeholders for the four coordinate columns
  for coordCol in ("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"):
    tripDF = tripDF.withColumn(coordCol, lit(""))
  return tripDF

def _withStringCoordinates(tripDF):
  #Re-type the four coordinate columns as strings, one at a time, using the
  #same temp-column / drop / rename sequence for each
  for coordCol in ("pickup_longitude", "dropoff_longitude", "pickup_latitude", "dropoff_latitude"):
    tempCol = "temp_" + coordCol
    tripDF = (tripDF.withColumn(tempCol, col(coordCol).cast("string"))
                    .drop(coordCol)
                    .withColumnRenamed(tempCol, coordCol))
  return tripDF

def _withPartitionAndTypeCols(tripDF):
  #Derive trip_year / trip_month from pickup_datetime and tag the taxi type
  return (tripDF.withColumn("trip_year",substring(col("pickup_datetime"),0, 4))
                .withColumn("trip_month",substring(col("pickup_datetime"),6,2))
                .withColumn("taxi_type",lit("green")))

def _withJunkCols(tripDF):
  #Add empty-string junk1 / junk2 columns
  return (tripDF.withColumn("junk1",lit(""))
                .withColumn("junk2",lit("")))

def getSchemaHomogenizedDataframe(sourceDF,tripYear,tripMonth):
  """Add the columns a period's data lacks so every period exposes the same columns.

  sourceDF is the dataframe read with the schema for (tripYear, tripMonth).
  The result always carries trip_year, trip_month and taxi_type, plus
  placeholder location ids or coordinates depending on the period. A period
  outside August 2013 through June 2017 is returned unchanged.
  """
  #August 2013 through 2014: also lacks improvement_surcharge
  if (tripYear == 2013 and tripMonth > 7) or tripYear == 2014:
    homogenizedDF = _withZeroLocationIds(sourceDF)
    homogenizedDF = homogenizedDF.withColumn("improvement_surcharge",lit(0).cast("double"))
    homogenizedDF = _withPartitionAndTypeCols(homogenizedDF)
    return _withStringCoordinates(homogenizedDF)

  #First half of 2015
  if tripYear == 2015 and tripMonth < 7:
    homogenizedDF = _withZeroLocationIds(sourceDF)
    homogenizedDF = _withPartitionAndTypeCols(homogenizedDF)
    return _withStringCoordinates(homogenizedDF)

  #Second half of 2015 and first half of 2016
  if (tripYear == 2015 and tripMonth > 6) or (tripYear == 2016 and tripMonth < 7):
    homogenizedDF = _withZeroLocationIds(sourceDF)
    homogenizedDF = _withJunkCols(homogenizedDF)
    homogenizedDF = _withPartitionAndTypeCols(homogenizedDF)
    return _withStringCoordinates(homogenizedDF)

  #Second half of 2016
  if tripYear == 2016 and tripMonth > 6:
    homogenizedDF = _withBlankCoordinates(sourceDF)
    return _withPartitionAndTypeCols(homogenizedDF)

  #First half of 2017
  if tripYear == 2017 and tripMonth < 7:
    homogenizedDF = _withBlankCoordinates(sourceDF)
    homogenizedDF = _withPartitionAndTypeCols(homogenizedDF)
    return _withJunkCols(homogenizedDF)

  #Uncovered period: nothing to add
  return sourceDF



#### 4. Read CSV, homogenize schema across years, save in Delta format

In [None]:
# To make Hive Parquet format compatible with Spark Parquet format
spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

# Disable delta optimizeWrite for consistent number of partition (when write from 2nd time, spark with delta format can reduce number of partitions)
spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "false")

#Green taxi data is loaded from 2013/08 through 2017/06
for tripYear in range(2013,2018):
    startMonth = 8 if tripYear == 2013 else 1
    endMonth = 6 if tripYear == 2017 else 12

    for tripMonth in range(startMonth,endMonth+1):
        #Source path
        srcDataFile = "{}year={}/month={:02d}/type=green/green_tripdata_{}-{:02d}.csv".format(srcDataDirRoot,tripYear,tripMonth,tripYear,tripMonth)
        print ("Year={}; Month={}".format(tripYear,tripMonth))
        print (srcDataFile)

        #Source schema
        taxiSchema = getTaxiSchema(tripYear,tripMonth)

        #Read source data. The dataframe is consumed exactly once (by the write
        #below), so it is not cached: caching every month without unpersisting
        #only accumulates cached data across iterations.
        taxiDF = (spark.read.format("csv")
                      .option("header", "true")
                      .schema(taxiSchema)
                      .option("delimiter",",")
                      .load(srcDataFile))

        #Add additional columns to homogenize schema across years
        taxiFormattedDF = getSchemaHomogenizedDataframe(taxiDF, tripYear, tripMonth)

        #Order all columns to align with the canonical schema for green taxi
        taxiCanonicalDF = taxiFormattedDF.select(canonicalTripSchemaColList)

        #Partition directory this month is written to (destDataDirRoot ends with "/")
        year_month_dir = "{}trip_year={}/trip_month={:02d}".format(destDataDirRoot,tripYear,tripMonth)

        #Remember the files already present for this month so they can be removed
        #after the overwrite. On a first load the directory does not exist yet,
        #which is not an error: there is simply nothing to clean up.
        try:
            old_files = dbutils.fs.ls(year_month_dir)
        except Exception as lsError:
            if "FileNotFoundException" not in str(lsError):
                raise
            old_files = []

        #Write one Delta data file for the month, replacing only this month's partition
        (taxiCanonicalDF.repartition(1)
            .write
            .option("compression", "zstd")
            .format("delta")
            .mode("overwrite")
            .option("replaceWhere", f"trip_year = '{tripYear}' and trip_month = '{tripMonth:02d}'")
            .partitionBy("trip_year","trip_month")
            .save(destDataDirRoot))

        #Physically delete the files that were in the partition before the overwrite.
        #NOTE(review): this removes files by hand instead of leaving them to Delta's
        #own cleanup, so earlier table versions for this month presumably cannot be
        #read afterwards - confirm that is intended.
        for f in old_files:
            dbutils.fs.rm(f.path, False)

Year=2013; Month=8
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2013/month=08/type=green/green_tripdata_2013-08.csv
case 1
Year=2013; Month=9
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2013/month=09/type=green/green_tripdata_2013-09.csv
case 1
Year=2013; Month=10
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2013/month=10/type=green/green_tripdata_2013-10.csv
case 1
Year=2013; Month=11
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2013/month=11/type=green/green_tripdata_2013-11.csv
case 1
Year=2013; Month=12
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2013/month=12/type=green/green_tripdata_2013-12.csv
case 1
Year=2014; Month=1
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2014/month=01/type=green/green_tripdata_2014-01.csv
case 1
Year=2014; Month=2
abfss://bronze@[REDACTED].dfs.core.windows.net/transactional-data/year=2014/month=02

#### 5. Create external table definition

In [127]:
def create_table(schema: str, table_name: str, parquet_dir: str, location: str):
    """Drop and re-create a Delta table definition over an existing storage path.

    Args:
        schema: Name passed to ``use``; becomes the current schema for the
            session (the caller in this notebook passes ``catalog.schema``).
        table_name: Table to (re)create inside ``schema``.
        parquet_dir: Optional sub-directory under ``location`` that holds the
            table files; pass ``""`` when the files live directly in ``location``.
        location: Root storage URI of the table.

    Leading/trailing slashes on ``location`` and ``parquet_dir`` are trimmed
    before joining, so the resulting LOCATION never contains an empty path
    segment (previously a ``location`` ending in "/" combined with an empty
    ``parquet_dir`` produced a path ending in "//").
    """
    base_path = location.rstrip("/")
    sub_path = parquet_dir.strip("/")
    table_path = f"{base_path}/{sub_path}" if sub_path else base_path

    spark.sql(f"use {schema};")
    # Drop first so the definition is always rebuilt against table_path,
    # even if a table with this name was registered earlier.
    spark.sql(f"DROP TABLE IF EXISTS {table_name};")
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} USING delta LOCATION '{table_path}';")



In [128]:
#Register the green taxi table directly over the root directory written by the load loop
create_table(
    schema="synapse_nyc_reference.nyctaxi",
    table_name="green_taxi_trips_raw",
    parquet_dir="",
    location=destDataDirRoot,
)



In [130]:
%sql
-- Reconcile the table's transaction log with the files actually present in storage.
-- NOTE(review): the load loop deletes files from the table directory by hand; this
-- repair is presumably here to clean up after that - confirm it is still required.
FSCK REPAIR TABLE `synapse_nyc_reference`.`nyctaxi`.green_taxi_trips_raw;

<Empty result set>

In [124]:
%sql
-- Spot-check the loaded rows.
-- NOTE(review): the table is referenced without its catalog here; this presumably
-- relies on the `use` statement issued by create_table - confirm on a fresh session.
select * from nyctaxi.green_taxi_trips_raw;

taxi_type,vendor_id,pickup_datetime,dropoff_datetime,store_and_fwd_flag,rate_code_id,pickup_location_id,dropoff_location_id,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,trip_year,trip_month
green,2,2013-08-01T08:14:37Z,2013-08-01T09:09:06Z,N,1,0,0,0.0,0.0,0.0,0.0,1,0.0,21.25,0.0,0.0,0.0,0.0,,0,21.25,2,,2013,8
green,2,2013-08-01T09:13:00Z,2013-08-01T11:38:00Z,N,1,0,0,0.0,0.0,0.0,0.0,2,0.0,74.5,0.0,0.5,0.0,0.0,,0,75.0,2,,2013,8
green,2,2013-08-01T09:48:00Z,2013-08-01T09:49:00Z,N,5,0,0,0.0,0.0,0.0,0.0,1,0.0,1.0,0.1,0.0,0.0,1.0,,0,2.1,2,,2013,8
green,2,2013-08-01T10:38:35Z,2013-08-01T10:38:51Z,N,1,0,0,0.0,0.0,0.0,0.0,1,0.0,3.25,0.0,0.0,0.0,0.0,,0,3.25,2,,2013,8
green,2,2013-08-01T11:51:45Z,2013-08-01T12:03:52Z,N,1,0,0,0.0,0.0,0.0,0.0,1,0.0,8.5,0.0,0.5,0.0,0.0,,0,9.0,2,,2013,8
green,2,2013-08-01T14:33:39Z,2013-08-01T15:49:00Z,N,1,0,0,0.0,0.0,0.0,0.0,1,0.0,9.0,0.0,0.5,0.0,0.0,,0,9.5,2,,2013,8
green,2,2013-08-01T17:19:00Z,2013-08-01T17:19:00Z,N,1,0,0,0.0,0.0,0.0,0.0,1,0.0,2.5,1.0,0.5,0.0,0.0,,0,4.0,2,,2013,8
green,2,2013-08-01T17:22:00Z,2013-08-01T17:22:00Z,N,1,0,0,-73.9377670288086,40.75848007202149,-73.9377670288086,40.75848007202149,1,0.0,2.5,1.0,0.5,0.0,5.33,,0,9.33,2,,2013,8
green,2,2013-08-01T17:24:00Z,2013-08-01T17:25:00Z,N,1,0,0,-73.93792724609375,40.757843017578125,-73.93792724609375,40.757843017578125,1,0.0,2.5,1.0,0.5,0.0,1.11,,0,5.11,2,,2013,8
green,2,2013-08-01T19:21:09Z,2013-08-01T19:22:30Z,N,1,0,0,0.0,0.0,0.0,0.0,5,0.0,3.0,1.0,0.5,0.0,0.0,,0,4.5,1,,2013,8


In [131]:
%sql
-- Total number of rows in the table, across all loaded months
select count(1) from nyctaxi.green_taxi_trips_raw;

count(1)
59036972
