In [0]:
# Run this cell only when the workspace is NOT using Unity Catalog.
# When enabled, it sets the storage account access key in the Spark session
# configuration (fs.azure.account.key.<account>.dfs.core.windows.net),
# presumably so that abfss:// paths on that account can be read directly.

# To use it, uncomment the three lines below and replace the placeholders:
# *** Data Lake Name ***, *** Data Lake Access Key ***
# NOTE(review): avoid saving a real access key in the notebook source; consider
# loading it from a secret store instead (confirm what this workspace provides).

#storage_account_name = "*** Data Lake Name ***"
#storage_account_access_key = "*** Data Lake Access Key ***"

#spark.conf.set(f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net", storage_account_access_key)

### A. Define File Path

In [0]:
# Location of the raw Taxi Zones CSV in the data lake (abfss:// URI).
# Replace the container name, the data lake (storage account) name and,
# if required, the relative file path with the values for your environment.
taxiZonesContainerName = "taxidata"
taxiZonesDataLakeName = "mstrainingdatalake"
taxiZonesRelativePath = "Raw/TaxiZones.csv"

# Assemble the full URI from the parts above:
#   abfss://<container>@<data lake>.dfs.core.windows.net/<relative path>
taxiZonesFilePath = (
    f"abfss://{taxiZonesContainerName}@{taxiZonesDataLakeName}"
    f".dfs.core.windows.net/{taxiZonesRelativePath}"
)

# Echo the resolved path so a wrong container/account is easy to spot
print(taxiZonesFilePath)

### B. Read File with Inferred Schema

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# --- Disabled: explicit schema definition (kept for reference only) ---
# Written as comments rather than a bare triple-quoted string so that the
# notebook does not echo the text as this cell's output.
#
# NOTE(review): the original header labelled this "Create schema for Taxi Zones Data",
# but the fields below (lpep_pickup_datetime, fare_amount, ...) look like taxi trip
# columns rather than taxi-zone columns — confirm before enabling it for this file.
#
# yellowTaxiSchema = (
#                         StructType
#                         ([
#                             StructField("VendorId"               , IntegerType()   , True),
#                             StructField("lpep_pickup_datetime"   , TimestampType() , True),
#                             StructField("lpep_dropoff_datetime"  , TimestampType() , True),
#                             StructField("passenger_count"        , DoubleType()    , True),
#                             StructField("trip_distance"          , DoubleType()    , True),
#                             StructField("RatecodeID"             , DoubleType()    , True),
#                             StructField("store_and_fwd_flag"     , StringType()    , True),
#                             StructField("PULocationID"           , IntegerType()   , True),
#                             StructField("DOLocationID"           , IntegerType()   , True),
#                             StructField("payment_type"           , IntegerType()   , True),
#                             StructField("fare_amount"            , DoubleType()    , True),
#                             StructField("extra"                  , DoubleType()    , True),
#                             StructField("mta_tax"                , DoubleType()    , True),
#                             StructField("tip_amount"             , DoubleType()    , True),
#                             StructField("tolls_amount"           , DoubleType()    , True),
#                             StructField("improvement_surcharge"  , DoubleType()    , True),
#                             StructField("total_amount"           , DoubleType()    , True),
#                             StructField("congestion_surcharge"   , DoubleType()    , True),
#                             StructField("airport_fee"            , DoubleType()    , True)
#                         ])
#                    )

In [0]:
# Load the Taxi Zones CSV into a DataFrame. The first row supplies the column
# names ("header") and Spark infers the column types from the data ("inferSchema").
# An explicit schema could be supplied instead via .schema(...) before .csv(...);
# note that yellowTaxiSchema is not defined by any live code visible in this notebook.
taxiZonesDF = spark.read.options(header="true", inferSchema="true").csv(taxiZonesFilePath)

# Show the resulting column names and inferred types
taxiZonesDF.printSchema()

### D. Transform Data

#### D.2. Rename Columns

In [0]:
# Rename the "service_zone" column to "ServiceZone".
# withColumnRenamed is documented as a no-op when the source column is absent,
# so re-running this cell leaves the DataFrame unchanged.
taxiZonesDF = taxiZonesDF.withColumnRenamed("service_zone", "ServiceZone")

### E. Save Data to Data Lake as Spark (Delta) Table

#### E.1. Create Catalog and Schema in Unity Catalog

In [0]:
%sql

-- Create the catalog that will hold the table; IF NOT EXISTS means an
-- already-existing catalog does not cause an error.
CREATE CATALOG IF NOT EXISTS taxicatalog;

-- Create the "rides" schema inside the "taxicatalog" catalog, again only if missing.
CREATE SCHEMA IF NOT EXISTS taxicatalog.rides;

-- For Hive Metastore
-- (disabled alternative that creates the "rides" schema under hive_metastore; uncomment to use)
-- CREATE SCHEMA IF NOT EXISTS hive_metastore.rides;

#### E.2. Save DataFrame as Delta Table

In [0]:
# Persist the Taxi Zones DataFrame as a Delta table named
# taxicatalog.rides.taxizones; "overwrite" mode replaces any data already
# stored in that table.
taxiZonesDF.write.saveAsTable(
    "taxicatalog.rides.taxizones",
    format="delta",
    mode="overwrite",
)