In [0]:
import dlt
from pyspark.sql.functions import input_file_name, current_timestamp
import os

# Volume path - replace with your actual path
VOLUME_PATH = "/Volumes/newpavancatalog/bronze/f1files"

# Get all CSV files from volume (the listing happens once, when this cell runs)
csv_files = [f.path for f in dbutils.fs.ls(VOLUME_PATH) if f.name.endswith(".csv")]

# Create a bronze table for each CSV file
for csv_file in csv_files:
    # Extract table name from file name, e.g. "Lap-Times.csv" -> "lap_times"
    file_name = os.path.basename(csv_file)
    table_name = file_name.replace(".csv", "").replace("-", "_").replace(" ", "_").lower()

    @dlt.table(
        name=f"bronze_{table_name}",
        comment=f"Bronze table for {file_name}",
        table_properties={"quality": "bronze"}
    )
    def load_bronze_table(file_path=csv_file):
        """Stream one CSV file into its bronze table through Auto Loader.

        Args:
            file_path: Path of the CSV file to ingest. It is bound as a default
                argument so each table keeps its own path instead of the
                loop variable's final value.

        Returns:
            A streaming DataFrame holding the CSV columns plus an
            ``ingestion_time`` column set to the current timestamp.
        """
        return (
            spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "csv")
            .option("header", "true")
            # Auto Loader takes `cloudFiles.inferColumnTypes` for type inference;
            # the plain CSV `inferSchema` option is not the documented switch here.
            # NOTE(review): tables already created with string-only columns may
            # need a full refresh to pick up the typed schema - confirm.
            .option("cloudFiles.inferColumnTypes", "true")
            .load(file_path)
            .withColumn("ingestion_time", current_timestamp())
        )

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_circuits",
    comment="Cleaned and transformed circuits data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_circuits`` table from ``bronze_circuits``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_circuits")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_constructors",
    comment="Cleaned and transformed constructors data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_constructors`` table from ``bronze_constructors``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_constructors")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_drivers",
    comment="Cleaned and transformed drivers data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_drivers`` table from ``bronze_drivers``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_drivers")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_lap_times",
    comment="Cleaned and transformed lap times data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_lap_times`` table from ``bronze_lap_times``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_lap_times")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_pit_stops",
    comment="Cleaned and transformed pit stops data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_pit_stops`` table from ``bronze_pit_stops``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_pit_stops")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_qualifying",
    comment="Cleaned and transformed qualifying data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_qualifying`` table from ``bronze_qualifying``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_qualifying")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_races",
    comment="Cleaned and transformed races data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_races`` table from ``bronze_races``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_races")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, trim, lower

@dlt.table(
    name="silver_results",
    comment="Cleaned and transformed results data",
    table_properties={"quality": "silver"}
)
def silver_payments():
    """Build the ``silver_results`` table from ``bronze_results``.

    String columns are lower-cased and trimmed. Columns of any other type
    are passed through untouched so they keep their data type. Column names
    and column order are unchanged and no rows are filtered out.

    NOTE: the published table name comes from the decorator's ``name``
    argument, not from this function's name.

    Returns:
        DataFrame with the same columns as the bronze table.
    """
    df = dlt.read("bronze_results")
    # Restrict cleaning to string columns: trim(lower(...)) on other types
    # (e.g. a timestamp) would implicitly cast them to strings.
    string_columns = {name for name, dtype in df.dtypes if dtype == "string"}
    cleaned_df = df.select([
        trim(lower(col(c))).alias(c) if c in string_columns else col(c)
        for c in df.columns
    ])
    return cleaned_df

In [0]:
import dlt
from pyspark.sql.functions import col, count, avg, current_timestamp

@dlt.table(
    name="gold_driver_summary",
    comment="Aggregated summary of driver information",
    table_properties={"quality": "gold"}
)
def gold_driver_summary():
    """Per-nationality driver statistics derived from ``silver_drivers``.

    Returns:
        DataFrame with one row per ``nationality`` holding
        ``total_drivers`` (row count), ``avg_birth_year`` (mean of the first
        four characters of ``dob`` cast to int) and ``record_updated_time``
        (current timestamp).
    """
    # First four characters of dob, read as an integer.
    # NOTE(review): assumes dob starts with a four-digit year - confirm format.
    birth_year = col("dob").substr(1, 4).cast("int")

    per_nationality = dlt.read("silver_drivers").groupBy("nationality").agg(
        count("*").alias("total_drivers"),
        avg(birth_year).alias("avg_birth_year"),
    )

    return per_nationality.withColumn("record_updated_time", current_timestamp())


In [0]:
import dlt
from pyspark.sql.functions import col, count

@dlt.table(
    name="gold_constructors_summary",
    comment="Aggregated stats for each F1 constructor (team) across all races",
    table_properties={"quality": "gold"}
)
def gold_constructors_summary():
    """Aggregate result rows per constructor into race, finish and win counts.

    Reads ``silver_results`` and left-joins ``silver_constructors`` (on
    ``constructorId``) and ``silver_races`` (on ``raceId``).

    Returns:
        DataFrame with ``constructorId``, ``name``, ``nationality``,
        ``total_races`` (rows with a non-null ``raceId``), ``total_finishes``
        (rows with a non-null ``position``) and ``wins`` (rows whose
        ``position`` equals 1), ordered by ``wins`` descending.
    """
    # Function-scope import: `when` is not among this cell's top-level imports.
    from pyspark.sql.functions import when

    # Explicit aliases so the qualified column references below resolve
    # deterministically instead of relying on an implicit qualifier.
    constructors_df = dlt.read("silver_constructors").alias("silver_constructors")
    results_df = dlt.read("silver_results").alias("silver_results")
    races_df = dlt.read("silver_races").alias("silver_races")

    # Join constructors with results and races
    joined_df = (
        results_df.join(constructors_df, "constructorId", "left")
                  .join(races_df, "raceId", "left")
    )

    # Aggregate gold metrics; name/nationality are qualified with the
    # constructors alias to disambiguate them from other joined columns.
    gold_df = (
        joined_df.groupBy(
            col("constructorId"),
            col("silver_constructors.name"),
            col("silver_constructors.nationality")
        )
        .agg(
            # NOTE(review): this counts result rows, not distinct races -
            # confirm whether a distinct count is intended.
            count("raceId").alias("total_races"),
            count(col("position")).alias("total_finishes"),
            # count() skips NULLs only, so the non-winning rows must become
            # NULL; counting a 0/1 flag would count every non-null position.
            count(when(col("position") == 1, 1)).alias("wins")
        )
        .orderBy(col("wins").desc())
    )

    return gold_df