In [None]:
import time
import random
from datetime import datetime, timezone, timedelta
from concurrent.futures import ThreadPoolExecutor

In [None]:
def generate_ad_stream(
    volume_path: str,
    stream_type: str,
    advertiser_count: int,
    batch_interval_s: int,
    latency_max_s: int):
    """
    Simulates real-time data generation for an ad platform.

    Runs forever: every ``batch_interval_s`` seconds it builds one batch of
    ``advertiser_count`` random records and appends it, in Delta format, to
    ``volume_path``. The function relies on a global ``spark`` object being
    available in the notebook namespace (it is not defined in this function).

    Write failures are reported with ``print`` and the loop keeps going, so a
    transient Spark error does not stop the generator.

    Args:
        volume_path (str): The full path to the Volume where data will be saved (e.g., /Volumes/catalog/schema/landing).
        stream_type (str): The type of data to generate. Must be either 'paid_event' or 'budget_change'.
        advertiser_count (int): The number of distinct advertisers to simulate. Ids are drawn
            from 1..advertiser_count inclusive. A value <= 0 produces empty batches, which are skipped.
        batch_interval_s (int): The time interval in seconds between generating batches of data.
        latency_max_s (int): The maximum random latency, in seconds, subtracted from the batch
            time to produce each record's event time ("moment"). Negative values are treated as 0.

    Raises:
        ValueError: If ``stream_type`` is not one of the supported values. This is
            checked before the loop starts, so a misconfigured stream fails immediately.
    """
    # Fail fast: validate before announcing the stream or entering the infinite loop.
    _validate_stream_type(stream_type)

    print(f"🚀 Starting '{stream_type}' stream generator for path: {volume_path}")

    while True:
        now = datetime.now(timezone.utc)
        data = _build_ad_batch(stream_type, advertiser_count, latency_max_s, now)

        if not data:
            # Nothing to write, but still fall through to the sleep below so an
            # empty configuration does not turn into a busy loop.
            print("No data generated in this batch. Skipping write.")
        else:
            try:
                # Create a Spark DataFrame from the generated data
                df = spark.createDataFrame(data)

                # Write the DataFrame to the specified Delta path in append mode
                df.write.format("delta").mode("append").save(volume_path)
                print(f"Successfully wrote {len(data)} records to {volume_path} at {now.isoformat()}")

            except Exception as e:
                # Deliberate best-effort: report and retry on the next interval.
                print(f"Error writing to Spark: {e}")

        # Wait for the next interval
        time.sleep(batch_interval_s)


def _validate_stream_type(stream_type):
    """Raise ValueError unless ``stream_type`` is 'paid_event' or 'budget_change'."""
    if stream_type not in ('paid_event', 'budget_change'):
        raise ValueError("Invalid stream_type. Choose 'paid_event' or 'budget_change'.")


def _build_ad_batch(stream_type, advertiser_count, latency_max_s, now):
    """Return a list of ``advertiser_count`` random record dicts timestamped at or before ``now``."""
    _validate_stream_type(stream_type)
    max_delay_s = max(0, latency_max_s)

    batch = []
    for _ in range(advertiser_count):
        record = {
            # random.randint includes both endpoints, so the upper bound is
            # advertiser_count itself (not advertiser_count + 1).
            "advertiser_id": random.randint(1, advertiser_count),
            # Event time lags the batch time by a random delay in [0, max_delay_s].
            "moment": now - timedelta(seconds=random.uniform(0, max_delay_s)),
        }
        if stream_type == 'paid_event':
            # Simulate a paid click event
            record["amount"] = round(random.uniform(1.00, 1.40), 2)
        else:
            # Simulate a budget change event
            record["new_budget_value"] = round(random.uniform(10.00, 30.00), 2)
        batch.append(record)
    return batch


In [None]:
# --- Configuration for Running the Generators ---

# IMPORTANT: Replace these with your actual Catalog and Schema names
CATALOG_NAME = "ad_monitor"
SCHEMA_NAME = "landing"

# Settings for the data streams
advertiser_count = 5        # Simulate 5 different advertisers
batch_interval_s = 300      # Generate a new batch of data every 300 seconds
latency_max_s = 120         # Simulate a max latency of 120 seconds

# Define the streams to run: (volume_path, stream_type)
streams_to_run = [
    (f"/Volumes/{CATALOG_NAME}/{SCHEMA_NAME}/paid_events_stream", 'paid_event'),
    (f"/Volumes/{CATALOG_NAME}/{SCHEMA_NAME}/budget_changes_stream", 'budget_change')
]


def _report_stream_failure(stream_name, future):
    """Print the exception that ended a generator thread, if there was one."""
    if future.cancelled():
        print(f"Stream '{stream_name}' was cancelled before it started.")
        return
    error = future.exception()
    if error is not None:
        print(f"Stream '{stream_name}' stopped with an error: {error!r}")


# --- Launch Generators Concurrently ---
# Using ThreadPoolExecutor to run the infinite generator functions in parallel threads.
# This cell runs indefinitely. To stop it, interrupt or detach the notebook from the cluster.
print("Starting concurrent stream generation. Interrupt the kernel to stop.")

with ThreadPoolExecutor(max_workers=len(streams_to_run)) as executor:
    for path, stream_type in streams_to_run:
        future = executor.submit(
            generate_ad_stream,
            path,
            stream_type,
            advertiser_count,
            batch_interval_s,
            latency_max_s
        )
        # Without this, an exception raised inside a generator thread would be stored
        # on the discarded Future and never shown while the cell keeps blocking.
        # The default argument binds the current stream_type to this callback.
        future.add_done_callback(
            lambda done, name=stream_type: _report_stream_failure(name, done)
        )
    # The context manager will block here because the submitted tasks are infinite loops.
