In [0]:
from pyspark.sql.functions import *
import uuid
import json
import random
from datetime import datetime, timedelta

In [0]:
# DBFS directory that receives the generated sales-order JSON files.
sales_path = "/mnt/files/sales_order"

# Start from a clean slate: recursively delete anything already at that path.
dbutils.fs.rm(sales_path, recurse=True)

In [0]:
# Select the catalog, then drop and recreate the `dltdb` database so every
# run starts with an empty schema (CASCADE also drops the tables inside it).
for setup_statement in (
    "USE CATALOG hive_metastore",
    "DROP DATABASE IF EXISTS dltdb CASCADE",
    "CREATE DATABASE dltdb",
):
    spark.sql(setup_statement)

# Make `dltdb` the current database; kept as the cell's trailing expression
# so the cell output is the same as before.
spark.sql("USE dltdb")

In [0]:
# Static product table: one (product_id, product_name, unit_price) tuple per product.
product_data = [
    (101, "Laptop", 800), (102, "Mouse", 25), (103, "Keyboard", 45),
    (104, "Monitor", 180), (105, "Headphones", 60), (106, "Webcam", 70),
    (107, "Printer", 150), (108, "Tablet", 300), (109, "Smartphone", 600),
    (110, "Speakers", 120), (111, "Router", 90), (112, "External Hard Drive", 100),
]
product_columns = ["product_id", "product_name", "unit_price"]
product_df = spark.createDataFrame(product_data, schema=product_columns)

# Save as the Delta table `products`, replacing any existing table of that name.
# The name is unqualified, so it resolves against the session's current database.
(
    product_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("products")
)

### Sales Data Generator Function

In [0]:
# Ensure the landing directory exists before generate_sales_data() opens a
# file inside it (an earlier cell recursively removes this path).
dbutils.fs.mkdirs(sales_path)

def generate_sales_data(num_records=5, output_dir=None):
    """Write one newline-delimited JSON file of random sales orders.

    Each record is a dict with the keys:
      * ``order_id``        - a fresh UUID4 string
      * ``customer_id``     - one of 1..5
      * ``product_id``      - one of 101..112
      * ``quantity``        - an integer from 0 to 5 inclusive
      * ``order_timestamp`` - "%Y-%m-%d %H:%M:%S" string, 1 to 60 minutes
                              before the current local time

    Args:
        num_records: Number of records to generate. Zero or a negative value
            produces an empty file.
        output_dir: Local directory to write into. It must already exist.
            When omitted, the ``/dbfs`` path of the notebook-level
            ``sales_path`` is used, which is the original behaviour.

    Returns:
        None. The path of the written file is printed.

    Raises:
        OSError: If the target file cannot be opened for writing.
    """
    if output_dir is None:
        # Resolved at call time so the notebook-level path is only needed
        # when the caller does not supply a directory.
        output_dir = f"/dbfs{sales_path}"

    customer_ids = [1, 2, 3, 4, 5]
    product_ids = [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112]

    sales = []
    for _ in range(num_records):
        sales.append({
            "order_id": str(uuid.uuid4()),
            "customer_id": random.choice(customer_ids),
            "product_id": random.choice(product_ids),
            # NOTE(review): 0 is a possible quantity; presumably intentional so
            # downstream data-quality checks have rows to reject - confirm.
            "quantity": random.randint(0, 5),
            "order_timestamp": (
                datetime.now() - timedelta(minutes=random.randint(1, 60))
            ).strftime("%Y-%m-%d %H:%M:%S"),
        })

    # A UUID in the file name keeps repeated calls from overwriting each other.
    file_name = f"{output_dir}/sales_{uuid.uuid4()}.json"

    with open(file_name, "w", encoding="utf-8") as f:
        for record in sales:
            f.write(json.dumps(record) + "\n")  # Write as newline-delimited JSON (NDJSON)

    # Report the number of records actually written, not the number requested
    # (the two differ when num_records is negative).
    print(f"{len(sales)} sales records written to {file_name}")

In [0]:
# Show what is currently in the sales landing directory.
landing_files = dbutils.fs.ls(sales_path)
display(landing_files)

In [0]:
# Write a new file of 200 random orders, then list the landing directory
# so the new file is visible.
generate_sales_data(num_records=200)
landing_files = dbutils.fs.ls(sales_path)
display(landing_files)

In [0]:
# NOTE(review): nothing in this notebook creates dltdb.gold_top_products;
# presumably a separate pipeline run populates it before this cell - confirm.
gold_top_products_df = spark.sql("SELECT * FROM dltdb.gold_top_products")
gold_top_products_df.display()

### Clean Up

In [0]:
# Recursively delete the generated sales files and the "/mnt/files/tables"
# directory (presumably the pipeline's storage location - confirm).
for cleanup_path in (sales_path, "/mnt/files/tables"):
    dbutils.fs.rm(cleanup_path, recurse=True)

# Drop the demo database together with every table inside it.
spark.sql("DROP DATABASE IF EXISTS dltdb CASCADE")