In [0]:
from pyspark.sql.functions import *
import uuid
import json
import random
from datetime import datetime, timedelta
import pandas as pd

# Table names, one per layer (bronze / silver / gold)
bronze_table = "bronze_sales_tbl"
silver_table = "silver_sales_tbl"
gold_table = "gold_sales_tbl"

# Directory that receives the raw sales files (Auto Loader input)
sales_path = "/sales_order"

# Checkpoint directories, one per layer
bronze_checkpoint = "/checkpoint/bronze"
silver_checkpoint = "/checkpoint/silver"
gold_checkpoint = "/checkpoint/gold"

In [0]:
# Reset the demo environment: select the `hive_metastore` catalog, then drop and
# recreate the `medallion` database and make it the current database.
# WARNING: DROP ... CASCADE also removes every table inside `medallion`, so
# re-running this cell discards anything created by a previous run.
spark.sql("USE CATALOG hive_metastore")
spark.sql("DROP DATABASE IF EXISTS medallion CASCADE")
spark.sql("CREATE DATABASE medallion")
spark.sql("USE medallion")

In [0]:
# Sanity check: list the tables in the current database. When run right after
# the `medallion` database has been recreated, the listing should be empty.
spark.sql("SHOW TABLES").show()

In [0]:
# Static customer lookup table; each row is (customer_id, customer_name, region)
customer_data = [
    (1, "Alice", "North"), (2, "Bob", "South"), (3, "Charlie", "East"),
    (4, "Diana", "West"), (5, "Eve", "North"), (6, "Frank", "South"),
    (7, "Grace", "East"), (8, "Hank", "West"), (9, "Ivy", "North"),
    (10, "Jack", "South"), (11, "Kathy", "East"), (12, "Leo", "West"),
    (13, "Mona", "North"), (14, "Nina", "South"), (15, "Oscar", "East"),
]
customer_df = spark.createDataFrame(
    customer_data,
    schema=["customer_id", "customer_name", "region"],
)
# Persist as the Delta table `customers`, replacing any existing contents.
(
    customer_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("customers")
)

# Static product lookup table; each row is (product_id, product_name, unit_price)
product_data = [
    (101, "Laptop", 800), (102, "Mouse", 25), (103, "Keyboard", 45),
    (104, "Monitor", 180), (105, "Headphones", 60), (106, "Webcam", 70),
    (107, "Printer", 150), (108, "Tablet", 300), (109, "Smartphone", 600),
    (110, "Speakers", 120), (111, "Router", 90),
    (112, "External Hard Drive", 100),
]
product_df = spark.createDataFrame(
    product_data,
    schema=["product_id", "product_name", "unit_price"],
)
# Persist as the Delta table `products`, replacing any existing contents.
(
    product_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("products")
)

### Sales Data Generator Function

In [0]:
# Ensure the DBFS landing directory for the generated sales files exists
dbutils.fs.mkdirs(sales_path)

def generate_sales_data(
    num_records=5,
    output_dir=None,
    customer_ids=(1, 2, 3, 4, 5),
    product_ids=(101, 102, 103, 104, 105),
):
    """Write one newline-delimited JSON (NDJSON) file of random sales orders.

    Each record has the keys ``order_id`` (random UUID string), ``customer_id``,
    ``product_id``, ``quantity`` (1-5) and ``order_timestamp`` (a
    ``%Y-%m-%d %H:%M:%S`` string 1-60 minutes before ``datetime.now()``, i.e. in
    the local time zone of the machine running the notebook).

    Args:
        num_records: Number of records to generate. Zero (or a negative value)
            produces an empty file.
        output_dir: Local filesystem directory to write into. Defaults to
            ``/dbfs{sales_path}``, the local-path view of the DBFS landing
            directory. The directory is created if it does not exist.
        customer_ids: Non-empty sequence of customer ids to draw from.
        product_ids: Non-empty sequence of product ids to draw from.

    Raises:
        IndexError: If ``customer_ids`` or ``product_ids`` is empty while
            ``num_records`` is positive.
        OSError: If the output directory or file cannot be created.
    """
    # Local import: `os` is not among the notebook's top-level imports.
    import os

    if output_dir is None:
        output_dir = f"/dbfs{sales_path}"

    sales = [
        {
            "order_id": str(uuid.uuid4()),
            "customer_id": random.choice(customer_ids),
            "product_id": random.choice(product_ids),
            "quantity": random.randint(1, 5),
            "order_timestamp": (
                datetime.now() - timedelta(minutes=random.randint(1, 60))
            ).strftime("%Y-%m-%d %H:%M:%S"),
        }
        for _ in range(num_records)
    ]

    # Do not rely on a separate setup step having created the directory.
    os.makedirs(output_dir, exist_ok=True)
    # A fresh UUID per file keeps repeated calls from overwriting each other.
    file_name = f"{output_dir}/sales_{uuid.uuid4()}.json"

    written = 0
    with open(file_name, "w", encoding="utf-8") as f:
        for record in sales:
            f.write(json.dumps(record) + "\n")  # one JSON document per line
            written += 1

    # Report what was actually written rather than echoing the request.
    print(f"{written} sales records written to {file_name}")

def listStream():
    """Print the id and active flag of each query in ``spark.streams.active``."""
    for query in spark.streams.active:
        print("Id: ", query.id, "Streaming: ", query.isActive)

def stopStream():
    """Stop every query in ``spark.streams.active``, one at a time.

    For each query, ``stop()`` is called and then ``awaitTermination()`` before
    the next query is handled.
    """
    for query in spark.streams.active:
        query.stop()
        query.awaitTermination()

def cleanup():
    """Tear down the demo's streams, working directories and database.

    Steps, in order: stop all active streaming queries, recursively delete
    ``/checkpoint`` and ``/schema``, then drop the ``medallion`` database with
    CASCADE (which removes its tables too). The sales input directory is not
    touched.
    """
    stopStream()
    # NOTE(review): "/schema" is presumably the Auto Loader schema location;
    # it is not defined in the visible configuration -- confirm.
    for directory in ("/checkpoint", "/schema"):
        dbutils.fs.rm(directory, True)  # True = delete recursively
    spark.sql("DROP DATABASE IF EXISTS medallion CASCADE")