In [3]:
# Generate two CSV files for Kafka testing: a small sample (20 rows) and a larger one (10,000 rows)
import random
from datetime import datetime, timedelta, timezone

import numpy as np
import pandas as pd

# make_orders draws from both the stdlib `random` module and NumPy's global
# generator, so both are seeded with the same fixed value to keep the
# generated data reproducible from run to run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

def make_orders(n_rows: int, start_datetime: datetime, hours_span: int = 48) -> pd.DataFrame:
    """Build a DataFrame of synthetic order events.

    Randomness comes from the global ``random`` and ``np.random`` generators,
    so seed both beforehand if reproducible output is needed.

    Args:
        n_rows: Number of order rows to generate.
        start_datetime: Latest possible event time; its date is also embedded
            in every ``order_id``. A naive value is used as-is and labelled
            UTC through the ``Z`` suffix. A timezone-aware value is converted
            to UTC first.
        hours_span: Each event time is ``start_datetime`` minus a random whole
            number of seconds between 0 and ``hours_span * 3600``.

    Returns:
        A DataFrame with one row per order and the columns ``order_id``,
        ``event_time``, ``customer_id``, ``product_sku``, ``quantity``,
        ``unit_price``, ``total_amount``, ``payment_method``, ``status``,
        ``channel`` and ``country``.
    """
    # isoformat() on an aware datetime already carries an offset ("+00:00"),
    # so appending "Z" to it would produce an invalid timestamp. Normalise
    # aware inputs to naive UTC so the "Z" suffix is always correct.
    if start_datetime.utcoffset() is not None:
        start_datetime = start_datetime.astimezone(timezone.utc).replace(tzinfo=None)

    # Event times: start minus a random offset, truncated to whole seconds and
    # rendered as ISO-8601 with a trailing "Z".
    max_offset_seconds = hours_span * 3600
    times = [
        (start_datetime - timedelta(seconds=random.randint(0, max_offset_seconds)))
        .replace(microsecond=0)
        .isoformat()
        + "Z"
        for _ in range(n_rows)
    ]

    # Sequential ids, 1-based and zero-padded, prefixed with the start date.
    date_tag = start_datetime.strftime("%Y%m%d")
    order_ids = [f"O-{date_tag}-{i:06d}" for i in range(1, n_rows + 1)]

    # NOTE: np.random.randint excludes the upper bound, so ids span 1000..1998.
    customer_ids = np.random.randint(1000, 1999, size=n_rows)

    skus = [f"SKU-{random.randint(10000, 99999)}" for _ in range(n_rows)]
    # Quantities are weighted towards small baskets.
    qty = np.random.choice([1, 2, 3, 4, 5], size=n_rows, p=[0.45, 0.25, 0.15, 0.10, 0.05])

    unit_price = np.round(np.random.uniform(1.5, 120.0, size=n_rows), 2)
    # Re-round after multiplying to clear floating-point noise in the product.
    total_amount = np.round(unit_price * qty, 2)

    payment_method = np.random.choice(
        ["card", "cash", "wallet", "transfer"],
        size=n_rows,
        p=[0.55, 0.20, 0.20, 0.05],
    )
    status = np.random.choice(
        ["NEW", "PAID", "CANCELLED", "REFUNDED"],
        size=n_rows,
        p=[0.20, 0.65, 0.10, 0.05],
    )
    channel = np.random.choice(["web", "mobile", "pos"], size=n_rows, p=[0.45, 0.35, 0.20])
    country = np.random.choice(
        ["LB", "AE", "SA", "US", "FR", "DE", "GB", "TR", "EG", "JO"],
        size=n_rows,
        p=[0.35, 0.08, 0.07, 0.10, 0.08, 0.07, 0.05, 0.07, 0.07, 0.06],
    )

    return pd.DataFrame({
        "order_id": order_ids,
        "event_time": times,
        "customer_id": customer_ids,
        "product_sku": skus,
        "quantity": qty,
        "unit_price": unit_price,
        "total_amount": total_amount,
        "payment_method": payment_method,
        "status": status,
        "channel": channel,
        "country": country,
    })

# Both datasets are anchored to the same reference instant.
# datetime.utcnow() is deprecated since Python 3.12 and emits a
# DeprecationWarning; take an aware UTC "now" instead and drop the tzinfo so
# make_orders still receives a naive UTC datetime, exactly as before.
now = datetime.now(timezone.utc).replace(tzinfo=None)
small_df = make_orders(20, now, hours_span=24)
big_df = make_orders(10_000, now, hours_span=72)

# NOTE: to_csv does not create missing directories, so c:/data must exist.
small_path = "c:/data/orders_small.csv"
big_path = "c:/data/orders_10k.csv"
small_df.to_csv(small_path, index=False)
big_df.to_csv(big_path, index=False)

# Last expression of the notebook cell: shows both paths and a 5-row preview.
small_path, big_path, small_df.head(5)


  now = datetime.utcnow()


('c:/data/orders_small.csv',
 'c:/data/orders_10k.csv',
             order_id            event_time  customer_id product_sku  quantity  \
 0  O-20250908-000001  2025-09-07T03:02:49Z         1102   SKU-83563         1   
 1  O-20250908-000002  2025-09-07T22:16:27Z         1435   SKU-36062         5   
 2  O-20250908-000003  2025-09-08T01:25:01Z         1860   SKU-95181         3   
 3  O-20250908-000004  2025-09-07T16:18:51Z         1270   SKU-81426         1   
 4  O-20250908-000005  2025-09-07T17:24:41Z         1106   SKU-64987         1   
 
    unit_price  total_amount payment_method status channel country  
 0       73.49         73.49       transfer   PAID     web      LB  
 1       21.71        108.55         wallet    NEW  mobile      LB  
 2        9.21         27.63         wallet    NEW  mobile      EG  
 3      113.94        113.94         wallet   PAID  mobile      TR  
 4      115.93        115.93           cash   PAID  mobile      FR  )