In [0]:
%sql
-- Set the session's default catalog and schema so unqualified object names
-- in later statements resolve under newpavancatalog.bronze.
USE CATALOG newpavancatalog;
USE SCHEMA bronze;

In [0]:
%sql
-- Make sure the volume below is created under newpavancatalog.bronze.
USE CATALOG newpavancatalog;
USE SCHEMA bronze;

-- Register the ADLS "hr_data" landing folder as an external volume.
-- The identifier is written in lowercase so it matches the /Volumes/... path
-- used by the streaming reader; Unity Catalog stores object names in
-- lowercase, so this resolves to the same volume as the mixed-case spelling.
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE EXTERNAL VOLUME IF NOT EXISTS volume_landing_stream_hr
    LOCATION 'abfss://landing@parkaru15sa.dfs.core.windows.net/hr_data';

In [0]:
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType, 
    DateType, TimestampType, ArrayType
)

# Explicit schema for the incoming JSON records. Fields are appended in the
# same order as before and all keep the default nullable=True.
#   job              - nested object with a title and a salary (both strings)
#   spoken_languages - array of objects, each with a language and a level
#   prev_company     - array of strings (company names)
customers_schema = (
    StructType()
    .add("id", IntegerType())
    .add("first_name", StringType())
    .add("last_name", StringType())
    .add("gender", StringType())
    .add("city", StringType())
    .add(
        "job",
        StructType()
        .add("title", StringType())
        .add("salary", StringType()),
    )
    .add(
        "spoken_languages",
        ArrayType(
            StructType()
            .add("language", StringType())
            .add("level", StringType())
        ),
    )
    .add("prev_company", ArrayType(StringType()))
)

In [0]:
# Show the schema object as cell output so the field layout can be checked
# by eye before it is handed to the streaming reader.
display(customers_schema)

In [0]:
from pyspark.sql.functions import col, parse_json
# Streaming reader over the JSON files in the HR landing volume. The schema is
# supplied explicitly (customers_schema) rather than inferred.
hr_df = (
    spark.readStream
    .schema(customers_schema)
    .format("json")
    .load("/Volumes/newpavancatalog/bronze/volume_landing_stream_hr/")
)

# NOTE(review): display() on a streaming DataFrame presumably starts a live
# preview query that keeps running until it is cancelled - confirm, and stop
# it manually after inspecting the output.
display(hr_df)

In [0]:
from pyspark.sql.functions import current_timestamp, col

# Keep every parsed column and append two audit columns:
#   file_path      - copied from the reader's _metadata.file_path field
#   ingestion_date - current_timestamp() evaluated when the row is processed
customers_transformed_df = hr_df.select(
    "*",
    col("_metadata.file_path").alias("file_path"),
    current_timestamp().alias("ingestion_date"),
)

In [0]:
# Start the streaming write of the enriched records into the Delta table
# newpavancatalog.bronze.hr_stream. Progress is tracked under the checkpoint
# directory so a restarted query resumes where it left off.
# NOTE(review): the checkpoint path uses the volume "volume_landing_stream",
# not the "volume_landing_stream_hr" volume the data is read from - confirm
# that this volume exists and that the path is the intended one.
hr_stream_writer = (
    customers_transformed_df.writeStream
    .option(
        "checkpointLocation",
        "/Volumes/newpavancatalog/bronze/volume_landing_stream/hr_data/",
    )
    .format("delta")
    .outputMode("append")  # the default output mode, spelled out for readers
)
streaming_query = hr_stream_writer.toTable("newpavancatalog.bronze.hr_stream")

In [0]:
# Stop the streaming write started in the previous cell.
# NOTE(review): with "Run All" this executes right after the query is started,
# presumably before any micro-batch has been committed - confirm whether the
# stream should be given time to process before it is stopped.
streaming_query.stop()