In [0]:
# Import required libraries
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/pavansaikandi@gmail.com/unified_pipeline/1_setup/02_config

In [0]:
# Confirm the schema-name variables are available in this session
# (presumably defined by the 02_config notebook run above - verify)
schema_names = (gold_schema, silver_schema, bronze_schema)
print(*schema_names)

In [0]:
# Declare the notebook parameters as text widgets: (name, default value, label)
for widget_name, default_value, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "customers", "Data Source"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Read the current widget values for this run
catalog, data_source = (
    dbutils.widgets.get(widget_name) for widget_name in ("catalog", "data_source")
)

# Glob pattern matching the CSV files in the data source's folder of the bucket
base_path = "s3://vitality-nutrition/" + data_source + "/*.csv"
print(base_path)

# Bronze

In [0]:
# Read the CSV files matched by base_path: first row is the header and
# column types are inferred by Spark
csv_reader = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
)

df = (
    csv_reader.load(base_path)
    # Stamp every row with the ingestion timestamp
    .withColumn("ingest_date", F.current_timestamp())
    # Keep all columns and append the source file name and size from _metadata
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

In [0]:
# Preview the first 10 rows of the loaded data
sample_rows = df.limit(10)
display(sample_rows)

In [0]:
# Print the column names and data types of the loaded DataFrame
df.printSchema()

In [0]:
# Save the loaded data as a Delta table in the Bronze schema, overwriting any
# existing contents; the delta.enableChangeDataFeed option is passed as "true"
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(bronze_table_name)
)

# Silver

In [0]:
# Read the Bronze table back and preview its first 10 rows
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"
df_bronze = spark.sql(f"SELECT * FROM {bronze_table_name}")
display(df_bronze.limit(10))

In [0]:
# List the customer_id values that appear more than once in Bronze
# (this cell only reports them; nothing is removed here)
rows_per_customer = df_bronze.groupBy("customer_id").count()
df_duplicate = rows_per_customer.filter("count > 1")
display(df_duplicate)

In [0]:
# Keep one row per customer_id and report the row count before and after
rows_before = df_bronze.count()
print('Rows before duplicates dropped: ', rows_before)

df_silver = df_bronze.dropDuplicates(['customer_id'])

rows_after = df_silver.count()
print('Rows after duplicates dropped: ', rows_after)

In [0]:
# Remove leading and trailing spaces from customer_name
trimmed_name = F.trim(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", trimmed_name)

In [0]:
# Show rows whose customer_name still differs from its trimmed form;
# an empty result means no padded names remain
name_col = F.col("customer_name")
untrimmed_rows = df_silver.filter(name_col != F.trim(name_col))
display(untrimmed_rows)

In [0]:
# List the distinct city values to spot misspelled names
distinct_cities = df_silver.select("city").distinct()
distinct_cities.show()

In [0]:
# Known misspellings, grouped under the correct city name
typos_by_city = {
    'Bengaluru': ['Bengaluruu', 'Bengalore'],
    'Hyderabad': ['Hyderabadd', 'Hyderbad'],
    'New Delhi': ['NewDelhi', 'NewDheli', 'NewDelhee'],
}

# typo --> correct name
city_mapping = {
    typo: correct_city
    for correct_city, typos in typos_by_city.items()
    for typo in typos
}

# Only these city names are kept after correction
allowed = list(typos_by_city)

# Replace the known typos first, then null out every value that is not an
# allowed city. Nulls stay null: when() without otherwise() returns null for
# rows that do not satisfy the condition.
city_col = F.col("city")
df_silver = (
    df_silver
    .replace(city_mapping, subset=['city'])
    .withColumn("city", F.when(city_col.isin(allowed), city_col))
)

In [0]:
# Sanity check: list the distinct city values that remain
remaining_cities = df_silver.select("city").distinct()
remaining_cities.show()

In [0]:
# List the distinct customer names to inspect their casing
distinct_names = df_silver.select("customer_name").distinct()
distinct_names.show()

In [0]:
# Capitalise the first letter of each word in customer_name.
# initcap returns null for a null input, so null names stay null.
title_cased_name = F.initcap(F.col("customer_name"))
df_silver = df_silver.withColumn("customer_name", title_cased_name)
                                 

In [0]:
# Sanity check: list the distinct customer names after the casing change
names_after_fix = df_silver.select('customer_name').distinct()
names_after_fix.show()

In [0]:
# Show, untruncated, the rows whose city is null
city_is_missing = F.col("city").isNull()
df_silver.filter(city_is_missing).show(truncate=False)

In [0]:
# Show all rows for the customers that have a missing city name
null_customer_names = [
    'Sprintx Nutrition',
    'Zenathlete Foods',
    'Primefuel Nutrition',
    'Recovery Lane',
]
name_matches = F.col("customer_name").isin(null_customer_names)
df_silver.filter(name_matches).show(truncate=False)

In [0]:
# City corrections confirmed by business team, keyed by customer_id
customer_city_fix = {
    789403: "New Delhi",  # Sprintx Nutrition
    789420: "Bengaluru",  # Zenathlete Foods
    789521: "Hyderabad",  # Primefuel Nutrition
    789603: "Hyderabad",  # Recovery Lane
}

# One (customer_id, fixed_city) row per correction
fix_rows = list(customer_city_fix.items())
df_fix = spark.createDataFrame(fix_rows, ["customer_id", "fixed_city"])

In [0]:
# Preview the customer_id / fixed_city correction rows
display(df_fix)

In [0]:
# Left-join the corrections onto the customers so that every customer row is
# kept; fixed_city is null for customers without a correction
join_key = 'customer_id'
df_silver = df_silver.join(df_fix, how='left', on=join_key)
display(df_silver)

In [0]:
# Use fixed_city only where city is null, then drop the helper column
city_or_fix = F.coalesce(F.col("city"), F.col("fixed_city"))
df_silver = df_silver.withColumn("city", city_or_fix).drop("fixed_city")

In [0]:
# Sanity check: show the rows for the customers whose city was missing
null_customer_names = [
    'Sprintx Nutrition',
    'Zenathlete Foods',
    'Primefuel Nutrition',
    'Recovery Lane',
]
is_fixed_customer = F.col("customer_name").isin(null_customer_names)
df_silver.filter(is_fixed_customer).show(truncate=False)

In [0]:
# Convert customer_id to string
df_silver = df_silver.withColumn("customer_id", F.col("customer_id").cast("string"))

# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
df_silver.printSchema()

In [0]:
# Standardizing Customer Attributes to Match Parent Company Data Model

# "customer" is "<customer_name>-<city>", with "Unknown" standing in for a null city
city_or_unknown = F.coalesce(F.col("city"), F.lit("Unknown"))
customer_label = F.concat_ws("-", F.col("customer_name"), city_or_unknown)
df_silver = df_silver.withColumn("customer", customer_label)

# Constant attributes, identical on every row
for column_name, constant_value in (
    ("market", "India"),
    ("platform", "Vitality Nutrition"),
    ("channel", "Acquisition"),
):
    df_silver = df_silver.withColumn(column_name, F.lit(constant_value))

In [0]:
# Review the Silver DataFrame
display(df_silver)

In [0]:
# Save the cleaned data as a Delta table in the Silver schema, overwriting any
# existing contents; mergeSchema and delta.enableChangeDataFeed are passed as "true"
silver_table_name = f"{catalog}.{silver_schema}.{data_source}"
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(silver_table_name)
)

# Gold

In [0]:
# Keep only the columns needed in Gold and save them as a Delta table
gold_columns = [
    "customer_id",
    "customer_name",
    "city",
    "customer",
    "market",
    "platform",
    "channel",
]
df_gold = df_silver.select(*gold_columns)

gold_table_name = f"{catalog}.{gold_schema}.vn_dim_{data_source}"
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(gold_table_name)
)

In [0]:
# Merging the data source with Parent

# Build the table names from the notebook parameters rather than hard-coding
# "fmcg.gold", so the merge reads the same Gold table this notebook writes
# (f"{catalog}.{gold_schema}.vn_dim_{data_source}") whatever the widget values.
# NOTE(review): the parent table name "dim_customers" is kept as a literal -
# confirm it lives in the same catalog and Gold schema.
parent_table_name = f"{catalog}.{gold_schema}.dim_customers"
child_table_name = f"{catalog}.{gold_schema}.vn_dim_{data_source}"

# Merge target: the parent customer dimension
delta_table = DeltaTable.forName(spark, parent_table_name)

# Merge source: the child customers, with customer_id exposed as
# customer_code, the column name used in the merge condition
df_child_customers = spark.table(child_table_name).select(
    F.col("customer_id").alias("customer_code"),
    "customer",
    "market",
    "platform",
    "channel"
)

In [0]:
# Upsert the child customers into the parent table, matching on customer_code:
# matched rows are updated with all source columns, unmatched rows are inserted
merge_builder = delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code = source.customer_code",
)
(
    merge_builder
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)