In [1]:
import numpy as np
import pandas as pd
import random
import time
import json
from datetime import datetime

# Initialize global variables for customer IDs
existing_customer_ids = list(range(1, 50001))  # Old customer IDs range from 1 to 50000
current_customer_id = 50001  # Starting ID for new customers

# Distribution for repeat counts of old customer IDs
repeat_count_distribution = {
    5: 0.17921508,
    4: 0.17712088,
    6: 0.14675500,
    3: 0.14162018,
    7: 0.10581744,
    2: 0.08378808,
    8: 0.06395361,
    9: 0.03624575,
    1: 0.03328568,
    10: 0.01850547,
    11: 0.00765188,
    12: 0.00404744,
    13: 0.00124846,
    14: 0.00052355,
    15: 0.00016109,
    16: 0.00004027,
    17: 0.00002014
}

# Function to generate a single synthetic data record
def generate_synthetic_record():
    global existing_customer_ids, current_customer_id

    # Generate Customer ID
    if random.random() < 0.95:  # 95% old customers
        repeat_counts = np.random.choice(
            list(repeat_count_distribution.keys()),
            p=list(repeat_count_distribution.values())
        )
        customer_id = random.choice(existing_customer_ids)
    else:  # 5% new customers
        customer_id = current_customer_id
        existing_customer_ids.append(current_customer_id)
        current_customer_id += 1

    # Generate synthetic categorical data
    product_category = np.random.choice(
        ["Electronics", "Clothing", "Home", "Books"],
        p=[0.25052, 0.250324, 0.250168, 0.248988]
    )

    payment_method = np.random.choice(
        ["Credit Card", "PayPal", "Cash"],
        p=[0.334188, 0.333764, 0.332048]
    )

    gender = np.random.choice(
        ["Male", "Female"],
        p=[0.502704, 0.497296]
    )

    # Generate synthetic numerical data
    product_price = np.random.normal(loc=254.74, scale=141.74)
    product_price = np.clip(product_price, 10, 500)  # Ensure values are within the range

    quantity = np.random.randint(1, 6)

    total_purchase_amount = np.random.normal(loc=2725.39, scale=1442.58)
    total_purchase_amount = np.clip(total_purchase_amount, 100, 5350)

    customer_age = np.random.normal(loc=43.80, scale=15.36)
    customer_age = np.clip(customer_age, 18, 70)  # Ensure values are within the range

    returns = np.random.choice(
        [1.0, 0.0, np.nan],
        p=[0.405904, 0.404568, 0.189528]
    )

    # Create a single record as a dictionary and ensure JSON compatibility
    record = {
        "Customer ID": int(customer_id),
        "Timestamp": datetime.now().isoformat(),
        "Product Category": product_category,
        "Payment Method": payment_method,
        "Gender": gender,
        "Product Price": float(product_price),
        "Quantity": int(quantity),
        "Total Purchase Amount": float(total_purchase_amount),
        "Customer Age": float(customer_age),
        "Returns": None if pd.isna(returns) else int(returns)
    }

    return record

# Function to stream synthetic data as JSON
def stream_synthetic_data():
    while True:
        try:
            record = generate_synthetic_record()
            json_data = json.dumps(record)  # Convert the record to JSON format
            print(json_data, flush=True)  # Stream the JSON data with immediate output
            time.sleep(random.randint(30, 90))
        except Exception as e:
            print(f"Error: {e}", flush=True)

# Example usage
if __name__ == "__main__":
    stream_synthetic_data()


{"Customer ID": 5151, "Timestamp": "2025-01-09T18:57:36.536806", "Product Category": "Electronics", "Payment Method": "PayPal", "Gender": "Female", "Product Price": 472.49064470284577, "Quantity": 2, "Total Purchase Amount": 2205.967203775179, "Customer Age": 65.9658095760099, "Returns": 1}


KeyboardInterrupt: 