## Import All Utilities

Load utility functions that are shared across processing notebooks.

In [0]:
%run ../01_Utilities/config

In [0]:
%run ../01_Utilities/common_functions

In [0]:
%run ../01_Utilities/data_quality

## Configuration

Define the input path and the processing parameters used by this notebook.

In [0]:
# Input path for the Gold-layer customers dataset. get_gold_path is not defined in
# this notebook; presumably it is provided by the utility notebooks run above — confirm.
gold_customers_path = get_gold_path("customers")

In [0]:
# Storage format for the Gold layer, looked up under the "gold" key of FILE_FORMATS
# (FILE_FORMATS is not defined in this notebook; presumably it comes from the
# utility notebooks run above — confirm).
file_format = FILE_FORMATS["gold"]

# Echo the resolved settings so the cell output records what is about to be read.
print(
    "Processing Customers Data:",
    f"- Customers Source: {gold_customers_path}",
    f"File format: {file_format}",
    sep="\n",
)

## Load Required Data

In [0]:
# Import necessary libraries
from pyspark.sql.functions import col, count, when
from pyspark.sql.types import IntegerType, FloatType, DoubleType
import matplotlib.pyplot as plt
import seaborn as sns

In [0]:
# Read the Gold-layer customers dataset from gold_customers_path in the configured format.
print("Loading Gold layer Customers Data...")
tmp_customers_df = spark.read.format(file_format).load(gold_customers_path)

# Rebuild the DataFrame from the loaded rows with gold_customers_schema applied, and
# mark it for caching (cache() returns the same DataFrame, so it can be chained).
# NOTE(review): gold_customers_schema is not defined in this notebook; presumably it
# comes from one of the utility notebooks run above — confirm.
customers_df = spark.createDataFrame(tmp_customers_df.rdd, gold_customers_schema).cache()

# count() runs a job over the DataFrame, so the record total is reported here.
loaded_rows = customers_df.count()
print(f"Loaded {loaded_rows} customer records")

## Data Exploration and Profiling

In [0]:
# Preview the data: take at most 10 rows and render them with the notebook's display().
print("Customers Data Sample:")
customers_preview = customers_df.limit(10)
display(customers_preview)

In [0]:
# Print the column names and types of customers_df as a schema tree, so the
# structure of the loaded data can be checked before profiling.
print("Customers Data Schema:")
customers_df.printSchema()

In [0]:
# Correlation heatmap of the main customer attributes.
# These two imports duplicate the library-import cell earlier in the notebook; they
# are kept so this cell still runs if executed on its own.
import seaborn as sns
import matplotlib.pyplot as plt

# Select relevant columns
columns_of_interest = ["Age", "IsParent", "TotalSpending", "NoOfInvoices", "AverageSpending"]

# Cast every selected column to double on the Spark side before collecting to pandas.
# The Gold schema is not visible in this notebook: if any of these columns is a
# Boolean (with nulls) or Decimal type, pandas would receive an object-dtype column,
# which corr() either drops or fails on depending on the pandas version. Casting
# guarantees float64 columns, so all five attributes appear in the matrix.
# For columns that are already numeric the correlations are unchanged.
data = customers_df.select(
    [customers_df[name].cast("double").alias(name) for name in columns_of_interest]
).toPandas()

# Calculate correlation matrix (pandas default method; missing values are excluded pairwise)
correlation_matrix = data.corr()

# Plot heatmap on an explicit Figure/Axes rather than the implicit pyplot state
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()