## Stream Customers Data From Cloud Files to Delta Lake using Auto Loader
1. Read files from cloud storage using Auto Loader
1. Transform the dataframe to add the following columns
    -   `file_path`: cloud path of the source file
    -   `ingestion_date`: current timestamp
1. Write the transformed data stream to Delta Lake Table

### 1. Read files using Auto Loader

In [0]:
%sql
-- Make newpavancatalog.bronze the default namespace for the SQL that follows.
USE CATALOG newpavancatalog;
USE SCHEMA bronze;

In [0]:
%sql
-- Register the ADLS "customers" landing folder as an external volume in
-- newpavancatalog.bronze so it can be addressed through a /Volumes/... path.
USE CATALOG newpavancatalog;
USE SCHEMA bronze;

-- The volume name is spelled in lowercase so it matches the
-- /Volumes/newpavancatalog/bronze/volume_landing_stream/ paths used by the
-- Auto Loader cells in this notebook.
-- NOTE(review): Unity Catalog is understood to store object names in
-- lowercase, so this should resolve to the same volume as before - confirm.
CREATE EXTERNAL VOLUME IF NOT EXISTS volume_landing_stream
    LOCATION 'abfss://landing@parkaru15sa.dfs.core.windows.net/customers';

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, TimestampType

# Explicit schema for the customers feed: one nullable column per add() call.
# NOTE(review): no cell visible in this notebook passes customers_schema to a
# reader; the Auto Loader cell infers types and applies schema hints instead.
customers_schema = (
    StructType()
    .add("customer_id", IntegerType())
    .add("customer_name", StringType())
    .add("date_of_birth", DateType())
    .add("telephone", StringType())
    .add("email", StringType())
    .add("member_since", DateType())
    .add("created_timestamp", TimestampType())
)

In [0]:
# Streaming read of the landing volume through Auto Loader ("cloudFiles").
# All reader settings are passed as one mapping:
#   - the incoming files are JSON
#   - the tracked schema is kept under the volume's _schema folder
#   - column types are inferred, with the two date columns and the timestamp
#     column pinned through schema hints
customers_df = (
    spark.readStream
    .format("cloudFiles")
    .options(**{
        "cloudFiles.format": "json",
        "cloudFiles.schemaLocation": "/Volumes/newpavancatalog/bronze/volume_landing_stream/_schema",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.schemaHints": "date_of_birth DATE, member_since DATE, created_timestamp TIMESTAMP",
    })
    .load("/Volumes/newpavancatalog/bronze/volume_landing_stream/")
)

In [0]:
# Preview the records read from the landing volume.
# NOTE(review): customers_df is built from spark.readStream, so this presumably
# starts a live streaming preview that keeps running until cancelled - confirm.
display(customers_df)

### 2. Transform the dataframe to add the following columns
- `file_path`: cloud path of the source file
- `ingestion_date`: current timestamp

In [0]:
from pyspark.sql.functions import current_timestamp, col

# Step 1: record which cloud file each row came from, taken from the
# _metadata.file_path column.
customers_with_path_df = customers_df.withColumn("file_path", col("_metadata.file_path"))

# Step 2: stamp each row with the current timestamp as its ingestion date.
customers_transformed_df = customers_with_path_df.withColumn("ingestion_date", current_timestamp())

### 3. Write the transformed data stream to Delta Lake Table

In [0]:
# Start the streaming write into the bronze Delta table. The returned query
# handle is kept in streaming_query so the stream can be stopped later.
streaming_query = (
    customers_transformed_df.writeStream
    .format("delta")
    # Checkpoint data for this stream is kept under the landing volume.
    .options(checkpointLocation="/Volumes/newpavancatalog/bronze/volume_landing_stream/_checkpoint_stream")
    .toTable("newpavancatalog.bronze.customers_autoloader")
)

In [0]:
# Uncomment and run to stop the streaming write once ingestion has been verified:
# streaming_query.stop()

In [0]:
%sql
-- Inspect the rows the stream has written to the bronze table so far.
SELECT
    *
FROM
    newpavancatalog.bronze.customers_autoloader;