### Prepare environment

In [0]:
%run ../environment/prepare_environment

In [0]:
%run ../environment/prepare_data

### Load raw data to bronze table.

The bronze table preserves the original schema as-is and only adds ingestion metadata: load time, source file details, a row ID, and a processing flag.

We use **Auto Loader** (`cloudFiles`) for efficient, incremental ingestion of new files without scanning the entire directory on every run.

To learn more about Auto Loader, see the [Databricks Auto Loader documentation](https://learn.microsoft.com/en-us/azure/databricks/ingestion/cloud-object-storage/auto-loader/).

In [0]:
import logging
from pyspark.sql import DataFrame
from pyspark.sql import functions as F

# Request ERROR-level logging on the root logger and create this notebook's logger.
# Note: basicConfig() is a no-op when the root logger already has handlers configured.
logging.basicConfig(level=logging.ERROR)
logger = logging.getLogger(__name__)

# Make ai_ml_in_practice the session's current catalog, so that schema-qualified
# (two-part) names used in later statements resolve inside it.
spark.sql("USE CATALOG ai_ml_in_practice")

# Prepare technical columns function
def add_technical_columns(df: DataFrame) -> DataFrame:
    """Append ingestion-metadata ("technical") columns to a DataFrame.

    The following columns are added, in this order:

    * ``_loaded_at``        - ``current_timestamp()`` at evaluation time.
    * ``_file_modified_at`` - taken from ``_metadata.file_modification_time``.
    * ``_source_file``      - taken from ``_metadata.file_name``.
    * ``_row_id``           - result of the SQL ``uuid()`` expression.
    * ``_is_processed``     - literal ``False``.

    Args:
        df: Input DataFrame. It must be able to resolve the ``_metadata``
            column (presumably the file-source metadata column - confirm
            against the reader that produces ``df``).

    Returns:
        A new DataFrame with the five technical columns added.
    """
    # Insertion order of the mapping defines the order of the new columns.
    technical_columns = {
        "_loaded_at": F.current_timestamp(),
        "_file_modified_at": F.col("_metadata.file_modification_time"),
        "_source_file": F.col("_metadata.file_name"),
        "_row_id": F.expr("uuid()"),
        "_is_processed": F.lit(False),
    }

    enriched = df
    for column_name, column_expr in technical_columns.items():
        enriched = enriched.withColumn(column_name, column_expr)
    return enriched

# Create the volume that stores Auto Loader schema and checkpoint files.
# The two-part name is resolved against the session's current catalog.
spark.sql("CREATE VOLUME IF NOT EXISTS telco_customer_churn_bronze.autoloader_files")
checkpoint_file_location = "/Volumes/ai_ml_in_practice/telco_customer_churn_bronze/autoloader_files"

# Run Auto Loader: incrementally ingest the raw CSV files into the bronze Delta table.
# NOTE(review): no "header" option is set, so the CSV reader default applies -
# confirm whether the source files contain a header row.
try:
    ingestion_query = (
        spark.readStream                                                           # Init streaming source
        .format("cloudFiles")                                                      # Specify cloudFiles format to use Auto Loader
        .option("cloudFiles.format", "csv")                                        # File type (csv)
        .option("inferSchema", "true")                                             # NOTE(review): likely redundant with cloudFiles.inferColumnTypes - confirm
        .option("delimiter", ",")                                                  # Set delimiter to ","
        .option("escape", '"')                                                     # Set escape character to '"'
        .option("cloudFiles.schemaLocation", checkpoint_file_location)             # Schema files path
        .option("cloudFiles.maxFilesPerTrigger", 1)                                # Max number of files processed per trigger
        .option("cloudFiles.inferColumnTypes", "true")                             # Infer column types instead of reading everything as string
        .load("/Volumes/ai_ml_in_practice/telco_customer_churn_raw/telco_data")    # Source files path
        .transform(add_technical_columns)                                          # Function to add technical columns
        .writeStream.format("delta")                                               # Output format (delta)
        .option("checkpointLocation", checkpoint_file_location)                    # Checkpoint files path
        .trigger(availableNow=True)                                                # Process all available files, then stop
        .toTable("telco_customer_churn_bronze.telco_bronze")                       # Output table; starts the query and returns its handle
    )
    # toTable() only starts the streaming query. Block until the available-now run
    # has finished so that (a) failures raised while processing files surface here
    # and are handled by the except clause below, and (b) later cells do not read
    # the bronze table while ingestion is still in progress.
    ingestion_query.awaitTermination()
except Exception as e:
    logger.error("Error during Autoloader ingestion:\n%s", e)
    # Bare raise re-raises the active exception with its original traceback.
    raise