# Init

## Import Libraries



In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col

In [0]:
# Bronze (source-system) column name -> silver (business-friendly) column name.
RENAME_MAP = dict(
    cst_id="customer_id",
    cst_key="customer_number",
    cst_firstname="first_name",
    cst_lastname="last_name",
    cst_marital_status="marital_status",
    cst_gndr="gender",
    cst_create_date="created_date",
)



# Reading from bronze layer

In [0]:
# Load the raw CRM customer table from the bronze schema.
df = spark.read.table("workspace.bronze.crm_cust_info")

# Transforming crm_cust_info table

In [0]:
# Transformation plan for this table:
#   - trim whitespace in the string columns
#   - normalize the marital status and gender codes
#   - rename the columns
# The date column is left untouched: its format is already correct.

# Preview the raw data before any transformation is applied.
display(df)

## Trim whitespace from the string columns

In [0]:

# Trim leading/trailing whitespace from every string column.
# This is done in a single select() instead of calling withColumn() once per
# column: each withColumn() adds another projection to the query plan, and
# doing that in a loop can produce very large plans on wide tables.
# Non-string columns pass through unchanged and the column order is preserved.
df = df.select(
    *[
        trim(col(field.name)).alias(field.name)
        if isinstance(field.dataType, StringType)
        else col(field.name)
        for field in df.schema.fields
    ]
)

df.display()



## Normalization of the data

In [0]:
# Map the CRM codes to readable labels. The comparison is case-insensitive
# (values are upper-cased first); anything unrecognized, including NULL,
# becomes "n/a".
#
# The full labels ("Single", "Married", "Male", "Female") are accepted as well
# as the single-letter codes. Without that, re-running this cell on the
# already-normalized `df` would match none of the codes and silently rewrite
# every value to "n/a".
marital_status_key = F.upper(F.col("cst_marital_status"))
gender_key = F.upper(F.col("cst_gndr"))

df = (
    df
    .withColumn(
        "cst_marital_status",
        F.when(marital_status_key.isin("S", "SINGLE"), "Single")
         .when(marital_status_key.isin("M", "MARRIED"), "Married")
         .otherwise("n/a"),
    )
    .withColumn(
        "cst_gndr",
        F.when(gender_key.isin("M", "MALE"), "Male")
         .when(gender_key.isin("F", "FEMALE"), "Female")
         .otherwise("n/a"),
    )
)


In [0]:
# Inspect the result of the normalization step.
display(df)

## Remove Records with Missing Customer ID

In [0]:
# Keep only the rows that carry a customer ID.
df = df.where(F.col("cst_id").isNotNull())

## Rename the Columns

In [0]:
# Apply every bronze -> silver column rename declared in RENAME_MAP.
# withColumnRenamed is a no-op for a source column that does not exist.
for old_name in RENAME_MAP:
    new_name = RENAME_MAP[old_name]
    df = df.withColumnRenamed(old_name, new_name)



In [0]:
# Final check of the data before it is written out.
display(df)


# Write to the silver layer

In [0]:
# Persist the cleaned customers as a Delta table in the silver schema.
# The target is fully qualified with the `workspace` catalog so that it is the
# same table the bronze read and the verification query refer to, regardless
# of the session's current catalog.
# mode("overwrite") + overwriteSchema replace both the data and the schema of
# any existing table on every run.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.silver.crm_customers")
)

In [0]:
%sql
-- Read back the silver table to verify the write.
SELECT *
FROM workspace.silver.crm_customers