In [2]:
# --- Imports ---
import random
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

# Single seed shared by every random source this script draws from
# (Faker, the stdlib `random` module and NumPy's global generator).
SEED = 42

fake = Faker()
for _seed_fn in (Faker.seed, random.seed, np.random.seed):
    _seed_fn(SEED)

# --- Params ---
# Number of rows generated for the product catalogue.
N_PRODUCTS = 200

# Category labels; every generated product is assigned one of these at random.
CATEGORIES = [
    "Tools", "Garden", "Electronics", "Furniture",
    "Sports", "DIY", "Home Decor",
]

# --- Generate products ---
def _make_product(index):
    """Return one synthetic product record for catalogue position *index*.

    The random draws happen in a fixed order (category, price, stock,
    delivery estimate, rating, return rate, then the Faker text fields) so
    that a seeded run yields the same sequence of values every time.
    """
    picked_category = random.choice(CATEGORIES)
    price = round(random.uniform(10, 300), 2)            # 10.00 .. 300.00
    units_in_stock = random.randint(0, 500)              # bounds inclusive
    promised_days = random.randint(1, 10)                # bounds inclusive
    stars = round(random.uniform(2.5, 5.0), 1)           # one decimal place
    share_returned = round(random.uniform(0.01, 0.25), 2)

    return {
        "product_id": f"P{index:04d}",
        "name": fake.catch_phrase(),
        "category": picked_category,
        "price": price,
        "stock_qty": units_in_stock,
        "avg_rating": stars,
        "return_rate": share_returned,
        "delivery_estimate_days": promised_days,
        "description": fake.paragraph(nb_sentences=3),
    }


products = [_make_product(n) for n in range(N_PRODUCTS)]
df_products = pd.DataFrame(products)

# --- Generate mock orders ---
N_ORDERS = 800


def _make_order():
    """Return one synthetic order for a product sampled from ``df_products``.

    The order date falls within the last 90 days relative to the day the
    script runs, so the dates shift with the run date even though the
    random sources are seeded.
    """
    item = df_products.sample(1).iloc[0]
    placed_on = fake.date_between(start_date="-90d", end_date="today")

    # Gaussian noise (mean 0, std 2 days), clamped at zero and truncated to
    # whole days: a delivery can slip past its estimate but is never early.
    slip_days = int(max(0, np.random.normal(0, 2)))

    promised_days = item["delivery_estimate_days"]
    promised_on = placed_on + timedelta(days=int(promised_days))
    arrived_on = promised_on + timedelta(days=slip_days)

    return {
        "order_id": fake.uuid4(),
        "product_id": item["product_id"],
        "order_date": placed_on,
        "estimated_delivery_days": promised_days,
        "actual_delivery_date": arrived_on,
        # Late means at least one full day of slip beyond the estimate.
        "delivered_late": arrived_on > promised_on,
        "customer_feedback": fake.sentence(nb_words=12),
    }


orders = [_make_order() for _ in range(N_ORDERS)]
df_orders = pd.DataFrame(orders)

# --- Save to CSVs ---
# Create the output folder up front: DataFrame.to_csv does not create missing
# parent directories, so writing into a fresh checkout without ../data would
# otherwise fail after all the data has already been generated.
output_dir = Path("../data")
output_dir.mkdir(parents=True, exist_ok=True)

df_products.to_csv(output_dir / "products.csv", index=False)
df_orders.to_csv(output_dir / "orders.csv", index=False)

# Quick visual check of what was written.
print("✅ Synthetic datasets generated!")
print(df_products.head())
print(df_orders.head())

✅ Synthetic datasets generated!
  product_id                               name    category   price  \
0      P0000      Sharable bifurcated algorithm         DIY   42.29   
1      P0001  Total needs-based instruction set       Tools  206.24   
2      P0002    Persevering national capability       Tools   37.17   
3      P0003       Synergized secondary archive         DIY  198.47   
4      P0004       Reduced interactive paradigm  Home Decor  262.10   

   stock_qty  avg_rating  return_rate  delivery_estimate_days  \
0        379         3.1         0.04                       5   
1        456         2.7         0.11                       9   
2        119         4.0         0.14                       9   
3        279         3.1         0.15                       7   
4        388         4.2         0.09                       3   

                                         description  
0  Development say quality throughout beautiful. ...  
1  Operation speak according south recen