**Import packages**

In [0]:
import sys
sys.path.append('/dbfs/FileStore/tables/')

from schema_utils import validate_schema
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta.tables import DeltaTable

**Define paths**

In [0]:
# Storage locations (abfss URIs) used by this notebook: the bronze path is
# read as the raw customer input, the silver path is where the cleaned
# Delta output is written.
bronze_path, silver_path = (
    "abfss://bronze@stfinancedev.dfs.core.windows.net/bnz_customers",
    "abfss://silver@stfinancedev.dfs.core.windows.net/s_customers",
)

### **Schema validation**

In [0]:
# Define schema
# Expected layout of the incoming customer data as (column name, Spark type)
# pairs, in column order. Every field is declared nullable.
_customer_columns = [
    ('CustomerID', IntegerType()),
    ('FirstName', StringType()),
    ('LastName', StringType()),
    ('DateOfBirth', DateType()),
    ('Gender', StringType()),
    ('Email', StringType()),
    ('PhoneNumber', StringType()),
    ('Address', StringType()),
    ('City', StringType()),
    ('State', StringType()),
    ('Pincode', StringType()),
    ('DateOfRegistration', TimestampType()),
    ('CustomerType', StringType()),
    ('PAN', StringType()),
    ('Aadhaar', StringType()),
]

expected_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in _customer_columns]
)


In [0]:
# Load incoming df: read the raw customer records stored as Parquet at the
# bronze location.
incoming_df = spark.read.parquet(bronze_path)

In [0]:
#Run validations
# Check incoming_df against expected_schema using the project helper imported
# from schema_utils. The third argument is presumably a dataset label used in
# the helper's messages — TODO confirm.
# NOTE(review): whether a mismatch raises or is only reported is decided
# inside validate_schema, which is not visible from this notebook.
validate_schema(incoming_df, expected_schema, "customer")

### **Incremental loading**

In [0]:
# Watermark = latest DateOfRegistration already present in the silver table.
# It stays None when the silver path is not a Delta table yet, or when the
# aggregate returns NULL (no non-null DateOfRegistration in silver).
# `max` below is the Spark aggregate brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin max.
max_ts = None
if DeltaTable.isDeltaTable(spark, silver_path):
    silver_df = DeltaTable.forPath(spark, silver_path).toDF()
    watermark_row = silver_df.agg(max(col("DateOfRegistration")).alias("max_ts")).first()
    max_ts = watermark_row['max_ts']

# With a watermark, keep only bronze rows registered strictly after it;
# without one, take everything from bronze.
inc_df = incoming_df.filter(col("DateOfRegistration") > lit(max_ts)) if max_ts else incoming_df

### **Data validation**

In [0]:
# Null handle: drop rows missing any of the mandatory fields.
df_notnull = inc_df.filter(
    col("CustomerID").isNotNull()
    & col("Email").isNotNull()
    & col("PhoneNumber").isNotNull()
    & col("PAN").isNotNull()
    & col("Aadhaar").isNotNull()
)

# Standardized format: strip every non-digit from PhoneNumber and Pincode,
# and convert the two date columns with their explicit patterns.
# NOTE(review): expected_schema already declares DateOfBirth as DateType and
# DateOfRegistration as TimestampType — confirm these conversions are still
# needed for the data that actually arrives.
df_standardized = (
    df_notnull
    .withColumn("PhoneNumber", regexp_replace(col("PhoneNumber"), r"[^0-9]", ""))
    .withColumn("Pincode", regexp_replace(col("Pincode"), r"[^0-9]", ""))
    .withColumn("DateOfBirth", to_date(col("DateOfBirth"), "MM-dd-yyyy"))
    .withColumn("DateOfRegistration", to_timestamp(col("DateOfRegistration"), "MM-dd-yyyy HH:mm:ss"))
)

# Duplicates handle: keep one row per CustomerID, the one with the latest
# DateOfRegistration. The window spec is deliberately NOT bound to the name
# `window`: that name is pyspark.sql.functions.window (from the wildcard
# import) and rebinding it would clobber the function for the rest of the
# kernel session.
# Rows tied on DateOfRegistration are numbered in arbitrary order, so which
# of them is kept is not deterministic.
latest_per_customer = Window.partitionBy("CustomerID").orderBy(col("DateOfRegistration").desc())

df_cust = (
    df_standardized
    .withColumn("rank", row_number().over(latest_per_customer))
    .filter(col("rank") == 1)
    .drop("rank")
)


**Data Count**

In [0]:
# Row accounting for this batch: rows selected from bronze versus rows that
# survive null filtering and de-duplication. The difference is reported as
# rejected. Each count() is a separate Spark action, and "loaded" is counted
# before the write cell runs.
incoming_rows, loaded_rows = inc_df.count(), df_cust.count()
rejected_rows = incoming_rows - loaded_rows

print(
    f"Incoming rows from bronze layer {incoming_rows}",
    f"Loaded rows to silver layer {loaded_rows}",
    f"Rejected rows {rejected_rows}",
    sep="\n",
)

### **Write in silver layer**

In [0]:
# Append the cleaned batch to the Delta table at the silver location.
# NOTE(review): append does not reconcile with rows already stored in silver;
# the de-duplication applied to df_cust only covers the current batch.
# Confirm a CustomerID cannot reappear in a later batch.
(
    df_cust.write
    .format("delta")
    .mode("append")
    .save(silver_path)
)