In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType, TimestampType

# Obtain the Spark session for this notebook under the application name "extras".
# getOrCreate() hands back an already-active session when there is one.
spark = (
    SparkSession.builder
    .appName("extras")
    .getOrCreate()
)

In [0]:
# Create the empty Delta table 'sales_pipeline_metrics'.
# Every column is declared non-nullable; the (name, type) pairs below are
# expanded into StructFields in the listed order.
schema = StructType(
    [
        StructField(column_name, column_type, nullable=False)
        for column_name, column_type in (
            ("run_id", StringType()),
            ("run_timestamp", TimestampType()),
            ("bronze_row_count", LongType()),
            ("silver_valid_count", LongType()),
            ("silver_quarantine_count", LongType()),
            ("fact_rows_inserted", LongType()),
            ("fact_rows_updated", LongType()),
            ("run_status", StringType()),
        )
    ]
)

# A zero-row DataFrame carrying only the schema.
metrics_table = spark.createDataFrame([], schema)
metrics_table.printSchema()

# NOTE: mode("overwrite") replaces whatever the table currently holds, so
# re-running this cell resets 'sales_pipeline_metrics' to empty.
(
    metrics_table.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sales_pipeline_metrics")
)

# Confirm through the catalog that the table is now registered.
print(
    "Table exists."
    if spark.catalog.tableExists("sales_pipeline_metrics")
    else "Table is not created."
)

In [0]:
# Drop the listed pipeline tables in one pass.
# IF EXISTS makes each drop a no-op when the table is already gone.
tables_to_drop = [
    "sales_bronze",
    "sales_silver",
    "sales_quarantine",
    "sales_fact",
]

for table_name in tables_to_drop:
    drop_statement = f"DROP TABLE IF EXISTS {table_name}"
    spark.sql(drop_statement)

In [0]:
# Create the empty Delta table 'sales_processed_files'
# (columns: file_name, processed_timestamp; both non-nullable).
schema = StructType(
    [
        StructField(column_name, column_type, nullable=False)
        for column_name, column_type in (
            ("file_name", StringType()),
            ("processed_timestamp", TimestampType()),
        )
    ]
)

# A zero-row DataFrame carrying only the schema.
file_processed_table = spark.createDataFrame([], schema)

# NOTE: mode("overwrite") replaces whatever the table currently holds, so
# re-running this cell resets 'sales_processed_files' to empty.
(
    file_processed_table.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("sales_processed_files")
)

# Confirm through the catalog that the table is now registered.
print(
    "Table exists."
    if spark.catalog.tableExists("sales_processed_files")
    else "Table is not created."
)

In [0]:
%sql
-- Ad-hoc inspection / maintenance queries for the pipeline bookkeeping tables.
-- Only the un-commented statement below runs; the commented ones are kept as
-- ready-to-use snippets (uncomment a single statement to run it instead).
--TRUNCATE TABLE sales_pipeline_metrics;
--select * from sales_processed_files order by processed_timestamp desc;
SELECT *
FROM sales_pipeline_metrics
ORDER BY run_timestamp DESC;
--DELETE FROM sales_pipeline_metrics where silver_quarantine_count = '1008';
--DELETE FROM sales_processed_files where file_name = 'sales_1000';
--show tables;



In [0]:
%sql
-- Peek at a single record of the sample IoT devices dataset.
-- The file is JSON, so it is queried through the json reader; the csv reader
-- would treat each raw JSON line as comma-separated text (_c0, _c1, ...)
-- instead of parsing it into named fields.
SELECT * FROM json.`dbfs:/databricks-datasets/iot/iot_devices.json` LIMIT 1

In [0]:
# Reference SQL from an earlier exploration (not executed here):
#   select * from csv.`dbfs:/databricks-datasets/online_retail/data-001/data.csv` where _c1 = '85123A' LIMIT 10

# List the contents of the sample IoT dataset directory; the listing is the
# cell's displayed output.
iot_dataset_dir = 'dbfs:/databricks-datasets/iot'
dbutils.fs.ls(iot_dataset_dir)
