In [1]:
# Labraries
from kafka import KafkaProducer
import json, time, random, uuid
from datetime import datetime, timedelta

In [2]:
# # Producer
producer = KafkaProducer(
    bootstrap_servers='kafka:9092',
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

In [3]:
# Orders Data (Transactional Facts)
def generate_orders(n=1000):
    customers = [f"CUST{str(i).zfill(4)}" for i in range(1, 201)]
    products = [
        {"id": "P001", "name": "Laptop", "price": 1200},
        {"id": "P002", "name": "Phone", "price": 800},
        {"id": "P003", "name": "Headphones", "price": 150},
        {"id": "P004", "name": "Monitor", "price": 300},
        {"id": "P005", "name": "Keyboard", "price": 60},
    ]
    start_date = datetime(2024, 1, 1)

    data = []
    for _ in range(n):
        cust = random.choice(customers)
        product = random.choice(products)
        qty = random.randint(1, 5)
        order_date = start_date + timedelta(days=random.randint(0, 180))
        data = {
            "order_id": str(uuid.uuid4()),
            "customer_id": cust,
            "product_id": product["id"],
            "product_name": product["name"],
            "quantity": qty,
            "price": product["price"],
            "total_value": qty * product["price"],
            "order_date": order_date.strftime("%Y-%m-%d")
        }
    return data
# Customer Events (Behavioral Data)
def generate_customer_events(n=1000):
    customers = [f"CUST{str(i).zfill(4)}" for i in range(1, 201)]
    actions = ["view_product", "add_to_cart", "remove_from_cart", "checkout", "wishlist"]
    products = ["P001", "P002", "P003", "P004", "P005"]

    data = []
    current_time = int(time.time())
    for _ in range(n):
        cust = random.choice(customers)
        action = random.choice(actions)
        prod = random.choice(products)
        timestamp = current_time - random.randint(0, 3600)

        data = {
            "event_id": str(uuid.uuid4()),
            "customer_id": cust,
            "product_id": prod,
            "action": action,
            "timestamp": timestamp
        }
    return data

while True:
    # Send data to "order_data" topic
    message_1 = generate_orders()
    producer.send("order_data", value=message_1)
    print("Sent:", message_1)
    time.sleep(5)
    # Send data to "customer_event" topic
    message_2 = generate_customer_events()
    producer.send("customer_event", value=message_2)
    print("Sent:", message_2)
    time.sleep(5)

Sent: {'order_id': 'fb02126c-68f7-43ca-b81e-50327ab4c6ea', 'customer_id': 'CUST0191', 'product_id': 'P003', 'product_name': 'Headphones', 'quantity': 5, 'price': 150, 'total_value': 750, 'order_date': '2024-04-25'}
Sent: {'event_id': '0b04e6ce-aae0-4851-a175-f033f282557d', 'customer_id': 'CUST0101', 'product_id': 'P002', 'action': 'checkout', 'timestamp': 1755774104}
Sent: {'order_id': 'c9c7cf68-408e-49e2-92eb-aecc3f61c530', 'customer_id': 'CUST0058', 'product_id': 'P005', 'product_name': 'Keyboard', 'quantity': 2, 'price': 60, 'total_value': 120, 'order_date': '2024-06-03'}
Sent: {'event_id': 'bc074afe-88d4-4c6e-aa19-4ce7e62caf7b', 'customer_id': 'CUST0121', 'product_id': 'P001', 'action': 'view_product', 'timestamp': 1755776150}
Sent: {'order_id': '53ac8787-f8db-4f8b-af15-51a4a5ddc546', 'customer_id': 'CUST0128', 'product_id': 'P004', 'product_name': 'Monitor', 'quantity': 4, 'price': 300, 'total_value': 1200, 'order_date': '2024-06-01'}
Sent: {'event_id': 'd65d2b7d-4a0b-4962-b31c-89

KeyboardInterrupt: 

In [4]:
from pyspark.sql import SparkSession

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct

# 1. Create Spark session
spark = (
    SparkSession.builder
    .appName("OrdersToKafka")
    .getOrCreate()
)

# 2. Read data from order_data folder (JSON files dropped there)
orders_df = (
    spark.readStream
    .format("json")               # use "csv" if your data is CSV
    .option("path", "/app/order_data")   # directory mounted in container
    .option("maxFilesPerTrigger", 1)     # simulate streaming by 1 file at a time
    .load()
)

# 3. Convert to Kafka-friendly format (key, value must be bytes)
# - For demo, no key
# - Value is entire record serialized as JSON
orders_to_kafka = (
    orders_df
    .selectExpr("CAST(NULL AS STRING) as key")  # optional key
    .selectExpr("CAST(NULL AS STRING) as key", "to_json(struct(*)) AS value")
)

# 4. Write stream to Kafka
query = (
    orders_to_kafka
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("topic", "orders")    # Kafka topic
    .option("checkpointLocation", "/tmp/spark-checkpoints/orders")  # required
    .outputMode("append")
    .start()
    .awaitTermination()
)
