**Import packages**

In [0]:
import sys

# Make the project-local helper modules uploaded to DBFS importable.
# Python's import system only searches local filesystem paths, so a `dbfs:/`
# URI entry is never resolved; the `/dbfs/` mount form of the same location
# is registered instead. The membership check keeps the cell idempotent when
# it is re-run in the same session.
if '/dbfs/FileStore/tables/' not in sys.path:
    sys.path.append('/dbfs/FileStore/tables/')

from schema_utils import validate_schema
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

**Define paths**

In [0]:
# Storage locations for the accounts dataset:
#   bronze_path - where the incoming data is read from
#   silver_path - where the cleaned data is written to
bronze_path, silver_path = (
    "abfss://bronze@stfinancedev.dfs.core.windows.net/bnz_accounts",
    "abfss://silver@stfinancedev.dfs.core.windows.net/s_accounts",
)

### **Schema Validation**

In [0]:
# Expected layout of the incoming accounts data.
# Each entry is (column name, Spark type, nullable); every column is nullable.
expected_schema = (
    StructType()
    .add('AccountID', StringType(), True)
    .add('CustomerID', StringType(), True)
    .add('BranchID', StringType(), True)
    .add('AccountType', StringType(), True)
    .add('Balance', DecimalType(18,2), True)
    .add('Currency', StringType(), True)
    .add('AccountStatus', StringType(), True)
    .add('DateOpened', TimestampType(), True)
    .add('HandledByEmployeeID', StringType(), True)
    .add('IFSC', StringType(), True)
)

In [0]:
# Read the incoming accounts data (parquet files) from the bronze location.
incoming_df = spark.read.parquet(bronze_path)

In [0]:
# Schema validation: hand the incoming DataFrame and the expected schema to the
# project-local helper imported from schema_utils.
# NOTE(review): what validate_schema does on a mismatch (raise vs. log) and how
# it uses the "Accounts" label are not visible in this notebook - confirm
# against schema_utils before relying on it as a hard gate.
validate_schema(incoming_df, expected_schema,"Accounts")

### **Incremental loading**

In [0]:
# Incremental load: select only the incoming rows that are newer than what the
# silver table already holds. The watermark is the latest DateOpened value
# currently stored in silver.
if DeltaTable.isDeltaTable(spark, silver_path):
    silver_df = DeltaTable.forPath(spark, silver_path).toDF()
    # `max` is expected to be pyspark.sql.functions.max (brought in by the
    # star import), not the Python builtin.
    max_ts = silver_df.agg(max(col("DateOpened")).alias("max_ts"))\
                      .first()["max_ts"]
else:
    # First run: the silver table does not exist yet, so there is no watermark.
    max_ts = None

if max_ts is None:
    # No usable watermark: either silver does not exist, or it is empty / holds
    # only NULL DateOpened values. Comparing against a NULL literal evaluates to
    # NULL for every row, which would silently filter out the whole batch, so
    # fall back to a full load instead.
    inc = incoming_df
else:
    # Strictly newer than the watermark; rows with a NULL DateOpened never
    # satisfy this comparison and are therefore not selected.
    inc = incoming_df.filter(col("DateOpened") > lit(max_ts))


In [0]:
# Show a 10-row sample of the rows selected for this run.
inc_preview = inc.limit(10)
inc_preview.display()

### **Data Validations**

In [0]:
# --- Null handling ---
# A missing balance is replaced with zero. The column name matches the
# expected schema's casing so the fill does not depend on case-insensitive
# column resolution.
df_clean = inc.fillna({'Balance': 0.00})
# Keep only rows with a usable key: AccountID must be non-NULL AND non-empty.
# The two conditions have to be combined with `&`; OR-ing a not-null check with
# an equality-to-"" check lets empty-string IDs through.
df_clean = df_clean.filter(col("AccountID").isNotNull() & (col("AccountID") != ""))

# --- Standardized formats ---
# Normalize DateOpened to a timestamp.
# NOTE(review): the pattern only matters if the column arrives as a string;
# the expected schema declares it as a timestamp already - confirm this step
# is still needed.
df_clean = df_clean.withColumn("DateOpened", to_timestamp(col("DateOpened"), "yyyy-MM-dd'T'HH:mm:ss"))
# Strip every character that is not an ASCII letter or digit from IFSC.
df_clean = df_clean.withColumn("IFSC", regexp_replace(col("IFSC"), r"[^A-Za-z0-9]", ""))

# --- Duplicates ---
# Keep a single row per AccountID. Which of the duplicate rows survives is not
# controlled here.
df_clean = df_clean.dropDuplicates(['AccountID'])




**Data count**

In [0]:
# Row accounting for this run: rows selected from bronze versus rows that
# survive cleaning. The difference covers both filtered-out rows and dropped
# duplicates. Each count() triggers its own Spark job.
incoming_rows, loaded_rows = inc.count(), df_clean.count()
rejected_rows = incoming_rows - loaded_rows
print(
    f"Incoming rows from bronze layer {incoming_rows}",
    f"Loaded rows to silver layer {loaded_rows}",
    f"Rejected rows {rejected_rows}",
    sep="\n",
)


### **Write to the silver layer**

In [0]:
# Append the cleaned batch to the silver location in Delta format.
silver_writer = df_clean.write.format("delta").mode("append")
silver_writer.save(silver_path)