In [0]:
from pyspark.sql.functions import *
import dlt  # Delta Live Tables library

# Root of the Databricks volume that holds the source CSV files.
volume_base_path = "/Volumes/cronos_unity_catalog/cdata/banking_unity_catalog/datastore/"

# Logical dataset name -> full path of its CSV file under the volume root.
csv_files = {
    dataset: volume_base_path + file_name
    for dataset, file_name in (
        ("customer_master", "retail_banking_customer_master.csv"),
        ("transactions", "retail_banking_customer_transactions.csv"),
        ("credit_score", "retail_banking_credit_score.csv"),
        ("loans", "retail_banking_customer_loans.csv"),
    )
}

# Define Bronze Tables (Raw Ingestion Layer)
@dlt.table(name="bronze_customer_master", comment="Raw data for customer master")
def bronze_customer_master():
    """Load the customer master CSV as-is into the bronze layer.

    The file is read as a batch DataFrame, treating the first line as the
    header and letting Spark infer the column types.
    """
    source_path = csv_files["customer_master"]
    reader = spark.read.options(header=True, inferSchema=True)
    return reader.csv(source_path)

@dlt.table(name="bronze_transactions", comment="Raw data for transactions")
def bronze_transactions():
    """Load the customer transactions CSV as-is into the bronze layer.

    The file is read as a batch DataFrame, treating the first line as the
    header and letting Spark infer the column types.
    """
    source_path = csv_files["transactions"]
    reader = spark.read.options(header=True, inferSchema=True)
    return reader.csv(source_path)

@dlt.table(name="bronze_credit_score", comment="Raw data for credit scores")
def bronze_credit_score():
    """Load the credit score CSV as-is into the bronze layer.

    The file is read as a batch DataFrame, treating the first line as the
    header and letting Spark infer the column types.
    """
    source_path = csv_files["credit_score"]
    reader = spark.read.options(header=True, inferSchema=True)
    return reader.csv(source_path)

@dlt.table(name="bronze_loans", comment="Raw data for loans")
def bronze_loans():
    """Load the customer loans CSV as-is into the bronze layer.

    The file is read as a batch DataFrame, treating the first line as the
    header and letting Spark infer the column types.
    """
    source_path = csv_files["loans"]
    reader = spark.read.options(header=True, inferSchema=True)
    return reader.csv(source_path)

# Define Gold Tables (Transformed Layer)
@dlt.table(name="gold_customer_master", comment="Enriched customer master data with risk metrics")
def gold_customer_master():
    """Return de-duplicated customer master rows with age and income groupings.

    Reads ``bronze_customer_master``, drops exact duplicate rows and adds:

    * ``Age_Group``: "Youth" (Age < 30), "Adult" (30 <= Age < 60) or
      "Senior" (Age >= 60).
    * ``Income_Bracket_Group``: "Low Income" (< 50000), "Middle Income"
      (50000 <= value < 100000) or "High Income" (>= 100000).

    Every band is matched by an explicit condition, so a NULL ``Age`` or
    ``Income_Bracket`` yields a NULL group instead of silently falling into
    the top band ("Senior" / "High Income").
    """
    age = col("Age")
    income = col("Income_Bracket")

    # No .otherwise(): rows that match no branch (i.e. NULL input) stay NULL.
    age_group = (
        when(age < 30, "Youth")
        .when((age >= 30) & (age < 60), "Adult")
        .when(age >= 60, "Senior")
    )
    income_group = (
        when(income < 50000, "Low Income")
        .when((income >= 50000) & (income < 100000), "Middle Income")
        .when(income >= 100000, "High Income")
    )

    df = dlt.read("bronze_customer_master")
    return (
        df.dropDuplicates()
        .withColumn("Age_Group", age_group)
        .withColumn("Income_Bracket_Group", income_group)
    )

@dlt.table(name="gold_transactions", comment="Aggregated transactions data with spend analysis")
def gold_transactions():
    """Summarise transactions per customer.

    Reads ``bronze_transactions``, drops exact duplicate rows and produces one
    row per ``Customer_ID`` with the transaction count, average and total
    amount, total withdrawals, total deposits, and the net amount
    (deposits minus withdrawals).
    """
    amount = col("Transaction_Amount")

    def total_for(transaction_type):
        # Sum the amount over rows of one transaction type; other rows add 0.
        return sum(
            when(col("Transaction_Type") == transaction_type, amount).otherwise(0)
        )

    unique_rows = dlt.read("bronze_transactions").dropDuplicates()
    per_customer = unique_rows.groupBy("Customer_ID").agg(
        count("*").alias("Total_Transactions"),
        avg(amount).alias("Avg_Transaction_Amount"),
        sum(amount).alias("Total_Transaction_Amount"),
        total_for("Withdrawal").alias("Total_Withdrawals"),
        total_for("Deposit").alias("Total_Deposits"),
    )

    net_amount = col("Total_Deposits") - col("Total_Withdrawals")
    return per_customer.withColumn("Net_Transaction_Amount", net_amount)

@dlt.table(name="gold_credit_score", comment="Credit score data with customer risk profiles")
def gold_credit_score():
    """Summarise credit scores by risk level.

    Reads ``bronze_credit_score``, drops exact duplicate rows, derives
    ``Risk_Level`` from ``Credit_Score`` and returns one row per risk level
    with the number of rows (``Customer_Count``) and the mean score
    (``Avg_Credit_Score``).

    Risk bands: "High Risk" (< 580), "Medium Risk" (580 <= score < 670),
    "Low Risk" (>= 670). Rows with a NULL ``Credit_Score`` get a NULL
    ``Risk_Level`` and are counted in their own group rather than being
    reported as "Low Risk".
    """
    score = col("Credit_Score")

    # Every band has an explicit condition; with no .otherwise(), a NULL
    # score matches nothing and the resulting Risk_Level is NULL.
    risk_level = (
        when(score < 580, lit("High Risk"))
        .when((score >= 580) & (score < 670), lit("Medium Risk"))
        .when(score >= 670, lit("Low Risk"))
    )

    df = dlt.read("bronze_credit_score")
    return (
        df.dropDuplicates()
        .withColumn("Risk_Level", risk_level)
        .groupBy("Risk_Level")
        .agg(
            count("*").alias("Customer_Count"),
            avg("Credit_Score").alias("Avg_Credit_Score")
        )
    )

@dlt.table(name="gold_loans", comment="Loan metrics with customer debt analysis")
def gold_loans():
    """Summarise loans per customer.

    Reads ``bronze_loans``, drops exact duplicate rows and produces one row
    per ``Customer_ID`` with the loan count, total loan amount, total and
    average outstanding balance, and ``Debt_To_Loan_Ratio``
    (total outstanding balance / total loan amount).

    ``Debt_To_Loan_Ratio`` is NULL when the total loan amount is NULL or 0,
    so a zero denominator can never raise a divide-by-zero error (which
    Spark does when ANSI mode is enabled).
    """
    df = dlt.read("bronze_loans")
    per_customer = (
        df.dropDuplicates()
        .groupBy("Customer_ID")
        .agg(
            count("*").alias("Total_Loans"),
            sum("Loan_Amount").alias("Total_Loan_Amount"),
            sum("Outstanding_Balance").alias("Total_Outstanding_Balance"),
            avg("Outstanding_Balance").alias("Avg_Outstanding_Balance"),
        )
    )

    # Guard the division: a NULL or zero total falls through to NULL.
    debt_to_loan_ratio = when(
        col("Total_Loan_Amount") != 0,
        col("Total_Outstanding_Balance") / col("Total_Loan_Amount")
    ).otherwise(lit(None))

    return per_customer.withColumn("Debt_To_Loan_Ratio", debt_to_loan_ratio)

@dlt.table(name="gold_combined_insights", comment="Consolidated insights combining all bronze tables")
def gold_combined_insights():
    """Combine customer master data with transaction, credit and loan metrics.

    Left-joins onto ``bronze_customer_master``, by ``Customer_ID``:

    * per-customer transaction count, total and average amount,
    * the rows of ``bronze_credit_score``,
    * per-customer loan count, total loan amount and total outstanding balance.

    Then drops ``Credit_Score_Category`` and adds:

    * ``Risk_Level``: "High Risk" (Credit_Score < 580), "Medium Risk"
      (580 <= score < 670), "Low Risk" (>= 670); NULL when the customer has
      no credit score, instead of defaulting to "Low Risk".
    * ``Debt_To_Income_Ratio``: Total_Outstanding_Balance / Income_Bracket.
    * ``Financial_Health_Index``:
      (Total_Transaction_Amount - Total_Outstanding_Balance) / Income_Bracket.

    Both ratios are NULL when ``Income_Bracket`` is NULL or 0.

    NOTE(review): the credit score join is row-level; if a customer has more
    than one credit score row the customer's output rows multiply. Confirm
    the source holds at most one score per Customer_ID.
    """
    # De-duplicate the raw inputs first, as the other gold tables do, so that
    # duplicated bronze rows cannot inflate counts/totals or repeat customers.
    customer_master = dlt.read("bronze_customer_master").dropDuplicates()
    transactions = dlt.read("bronze_transactions").dropDuplicates()
    credit_scores = dlt.read("bronze_credit_score").dropDuplicates()
    loans = dlt.read("bronze_loans").dropDuplicates()

    # Pre-aggregate the one-to-many tables to one row per customer before joining.
    transaction_metrics = transactions.groupBy("Customer_ID").agg(
        count("*").alias("Total_Transactions"),
        sum("Transaction_Amount").alias("Total_Transaction_Amount"),
        avg("Transaction_Amount").alias("Avg_Transaction_Amount"),
    )
    loan_metrics = loans.groupBy("Customer_ID").agg(
        count("*").alias("Total_Loans"),
        sum("Loan_Amount").alias("Total_Loan_Amount"),
        sum("Outstanding_Balance").alias("Total_Outstanding_Balance"),
    )

    # Left joins keep every customer, even those without activity, score or loans.
    combined_df = (
        customer_master
        .join(transaction_metrics, "Customer_ID", "left")
        .join(credit_scores, "Customer_ID", "left")
        .join(loan_metrics, "Customer_ID", "left")
    )

    score = col("Credit_Score")
    income = col("Income_Bracket")

    # Explicit bands with no .otherwise(): a missing score gives a NULL level.
    risk_level = (
        when(score < 580, "High Risk")
        .when((score >= 580) & (score < 670), "Medium Risk")
        .when(score >= 670, "Low Risk")
    )

    # Shared guard so both income-based ratios treat NULL/zero income alike.
    has_usable_income = income.isNotNull() & (income != 0)

    debt_to_income_ratio = when(
        has_usable_income,
        col("Total_Outstanding_Balance") / income
    ).otherwise(lit(None))

    financial_health_index = when(
        has_usable_income,
        (col("Total_Transaction_Amount") - col("Total_Outstanding_Balance")) / income
    ).otherwise(lit(None))

    combined_insights = (
        combined_df.drop("Credit_Score_Category")
        .withColumn("Risk_Level", risk_level)
        .withColumn("Debt_To_Income_Ratio", debt_to_income_ratio)
        .withColumn("Financial_Health_Index", financial_health_index)
    )

    return combined_insights

  if (frame.filename.startswith("<command-") or re.search("^\/.*\/.ipykernel\/.*\/command-.*", frame.filename)):


Name,Type
Customer_ID,string
Name,string
Age,int
Gender,string
Income_Bracket,int
Account_Type,string
Customer_Segment,string
Total_Transactions,bigint
Total_Transaction_Amount,double
Avg_Transaction_Amount,double
