In [3]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import os

fake = Faker()
# Seed every random source used below so that re-running the notebook
# reproduces the same files. Faker keeps its own random generator, separate
# from the stdlib `random` module, so it has to be seeded explicitly through
# the Faker.seed() class method.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Configuration: number of rows written to each generated table.
NUM_CUSTOMERS = 6_000_000
NUM_PRODUCTS = 10_000
NUM_TRANSACTIONS = 50_000_000
NUM_SUPPLIERS = 500
NUM_STORES = 100
# Number of rows built in memory and appended to the CSV per iteration for
# the two chunked generators (customers and transactions).
CUSTOMER_CHUNK_SIZE = 500_000
TRANSACTION_CHUNK_SIZE = 2_000_000

# Output directory (relative to the current working directory). It is created
# when this cell runs; exist_ok=True makes re-running the cell harmless.
OUTPUT_DIR = "retail_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Store Branches ---
def _store_label(index):
    """Return the alphabetic label for a 0-based index.

    Labels follow spreadsheet column naming: 0 -> "A", 25 -> "Z",
    26 -> "AA", 27 -> "AB", and so on, so every label consists only of
    uppercase ASCII letters no matter how many stores are generated.
    """
    label = ""
    remaining = index + 1  # bijective base-26 works on 1-based values
    while remaining > 0:
        remaining, letter = divmod(remaining - 1, 26)
        label = chr(65 + letter) + label
    return label


def generate_store_branches():
    """Generate the store dimension and write it to ``store_branches.csv``.

    Produces ``NUM_STORES`` rows with columns ``store_id`` (1-based),
    ``store_name``, ``city``, ``region``, ``size_sqft`` and ``open_date``,
    and writes them (without the index) into ``OUTPUT_DIR``.

    Store names are "MegaMart A" .. "MegaMart Z", then "MegaMart AA",
    "MegaMart AB", ... The previous ``chr(65 + i)`` scheme ran past "Z" after
    26 stores and emitted punctuation and control characters as names.

    Returns:
        None. The result is the CSV file on disk.
    """
    regions = ["Beirut", "Jnoub", "Chmel", "Bekaa", "Metn"]
    df = pd.DataFrame({
        "store_id": range(1, NUM_STORES + 1),
        "store_name": [f"MegaMart {_store_label(i)}" for i in range(NUM_STORES)],
        "city": [fake.city() for _ in range(NUM_STORES)],
        "region": [random.choice(regions) for _ in range(NUM_STORES)],
        # randint's upper bound is exclusive: sizes fall in 5000..29999.
        "size_sqft": np.random.randint(5000, 30000, size=NUM_STORES),
        "open_date": [fake.date_between(start_date="-10y", end_date="-1y") for _ in range(NUM_STORES)]
    })
    df.to_csv(os.path.join(OUTPUT_DIR, "store_branches.csv"), index=False)

# --- Suppliers ---
def generate_suppliers():
    """Generate the supplier dimension and write it to ``suppliers.csv``.

    Produces ``NUM_SUPPLIERS`` rows with columns ``supplier_id`` (1-based),
    ``supplier_name``, ``contact_person``, ``phone`` and ``country``; all
    text fields come from Faker. The file is written into ``OUTPUT_DIR``
    without the DataFrame index.
    """
    def column_of(make_value):
        # One fake value per supplier, produced by the given Faker provider.
        return [make_value() for _ in range(NUM_SUPPLIERS)]

    # Columns are filled one after another, in this order, so the sequence of
    # Faker calls is: all companies, then all names, phones, countries.
    suppliers = pd.DataFrame({
        "supplier_id": range(1, NUM_SUPPLIERS + 1),
        "supplier_name": column_of(fake.company),
        "contact_person": column_of(fake.name),
        "phone": column_of(fake.phone_number),
        "country": column_of(fake.country),
    })

    target_path = os.path.join(OUTPUT_DIR, "suppliers.csv")
    suppliers.to_csv(target_path, index=False)

# --- Products ---
def generate_products():
    """Generate the product dimension and write it to ``products.csv``.

    Produces ``NUM_PRODUCTS`` rows with columns ``product_id`` (1-based),
    ``product_name`` (a capitalised Faker word), ``category``, ``price``
    (uniform in [5.0, 500.0), rounded to 2 decimals) and ``supplier_id``
    (a random id in 1..NUM_SUPPLIERS). The file is written into
    ``OUTPUT_DIR`` without the DataFrame index.
    """
    categories = ["Electronics", "Clothing", "Grocery", "Home", "Sports"]
    row_ids = range(1, NUM_PRODUCTS + 1)

    # Each column is drawn in full before the next one, keeping the order in
    # which the random generators are consumed: names, categories, prices,
    # supplier ids.
    names = []
    for _ in row_ids:
        names.append(fake.word().capitalize())

    assigned_categories = []
    for _ in row_ids:
        assigned_categories.append(random.choice(categories))

    prices = np.random.uniform(5.0, 500.0, NUM_PRODUCTS).round(2)
    supplier_ids = np.random.randint(1, NUM_SUPPLIERS + 1, NUM_PRODUCTS)

    products = pd.DataFrame({
        "product_id": row_ids,
        "product_name": names,
        "category": assigned_categories,
        "price": prices,
        "supplier_id": supplier_ids,
    })

    target_path = os.path.join(OUTPUT_DIR, "products.csv")
    products.to_csv(target_path, index=False)

# --- Customers (Chunked) ---
def generate_customers():
    """Generate the customer dimension in chunks and write ``customers.csv``.

    Builds ``NUM_CUSTOMERS`` rows, ``CUSTOMER_CHUNK_SIZE`` at a time, with
    columns ``customer_id`` (1-based, contiguous across chunks), ``name``,
    ``gender``, ``age`` (18..69) and ``location``. The first chunk creates
    the file and writes the header; later chunks are appended without one.
    A progress line is printed after each chunk.
    """
    genders = ["Male", "Female"]
    target_path = os.path.join(OUTPUT_DIR, "customers.csv")

    for start in range(0, NUM_CUSTOMERS, CUSTOMER_CHUNK_SIZE):
        # The last chunk may be shorter than CUSTOMER_CHUNK_SIZE.
        stop = min(start + CUSTOMER_CHUNK_SIZE, NUM_CUSTOMERS)
        rows = stop - start

        # Columns are drawn one at a time, in this order, so the random
        # generators are consumed as: names, genders, ages, locations.
        names = [fake.name() for _ in range(rows)]
        gender_col = [random.choice(genders) for _ in range(rows)]
        ages = np.random.randint(18, 70, size=rows)
        locations = [fake.city() for _ in range(rows)]
        # NOTE: an "income" column (normal(50000, 15000) cast to int) is
        # currently disabled and is not written to the file.

        chunk = pd.DataFrame({
            "customer_id": range(start + 1, stop + 1),
            "name": names,
            "gender": gender_col,
            "age": ages,
            "location": locations,
        })

        is_first_chunk = (start == 0)
        chunk.to_csv(
            target_path,
            mode='w' if is_first_chunk else 'a',
            header=is_first_chunk,
            index=False,
        )
        print(f"✅ Customers: {start + 1} to {stop}")

# --- Helper to Load Product Prices ---
def load_product_prices():
    """Read ``products.csv`` from ``OUTPUT_DIR`` and return its price map.

    Returns:
        dict: maps each ``product_id`` in the file to its ``price``. If an
        id appears more than once, the last row wins.
    """
    products_path = os.path.join(OUTPUT_DIR, "products.csv")
    products = pd.read_csv(products_path)

    id_column = products['product_id']
    price_column = products['price']
    return {product_id: price for product_id, price in zip(id_column, price_column)}

# --- Transactions (Chunked) ---
def generate_transactions(product_prices):
    """Generate the transaction fact table in chunks and write ``transactions.csv``.

    Builds ``NUM_TRANSACTIONS`` rows, ``TRANSACTION_CHUNK_SIZE`` at a time,
    with columns ``transaction_id`` (1-based, contiguous across chunks),
    ``customer_id``, ``product_id``, ``store_id``, ``quantity`` (1..4),
    ``transaction_date`` (a day between 2020-01-01 and 2024-12-31) and
    ``total_amount`` (quantity * unit price, rounded to 2 decimals).
    The first chunk creates the file and writes the header; later chunks are
    appended without one. A progress line is printed after each chunk.

    Args:
        product_prices: mapping of product_id -> unit price, as returned by
            ``load_product_prices()``.

    Raises:
        KeyError: if a randomly drawn product id has no entry in
            ``product_prices``.
    """
    date_range = pd.date_range(start="2020-01-01", end="2024-12-31", freq='D')

    # Index the prices by product id once, so each chunk can resolve all of
    # its unit prices with a single vectorised lookup instead of a Python
    # loop doing one dict access per transaction.
    price_by_product = pd.Series(product_prices)

    for i in range(0, NUM_TRANSACTIONS, TRANSACTION_CHUNK_SIZE):
        # The last chunk may be shorter than TRANSACTION_CHUNK_SIZE.
        chunk_size = min(TRANSACTION_CHUNK_SIZE, NUM_TRANSACTIONS - i)

        # The order of these draws is kept fixed so that a seeded run keeps
        # producing the same rows.
        customer_ids = np.random.randint(1, NUM_CUSTOMERS + 1, size=chunk_size)
        product_ids = np.random.randint(1, NUM_PRODUCTS + 1, size=chunk_size)
        quantities = np.random.randint(1, 5, size=chunk_size)
        dates = np.random.choice(date_range, size=chunk_size)
        store_ids = np.random.randint(1, NUM_STORES + 1, size=chunk_size)

        # Label-based lookup: .loc raises KeyError when a product id is
        # missing from the price table, like the dict access it replaces.
        unit_prices = price_by_product.loc[product_ids].to_numpy()
        total_amounts = np.round(quantities * unit_prices, 2)

        df = pd.DataFrame({
            "transaction_id": range(i + 1, i + 1 + chunk_size),
            "customer_id": customer_ids,
            "product_id": product_ids,
            "store_id": store_ids,
            "quantity": quantities,
            "transaction_date": dates,
            "total_amount": total_amounts
        })

        mode = 'w' if i == 0 else 'a'
        header = (i == 0)
        df.to_csv(os.path.join(OUTPUT_DIR, "transactions.csv"), mode=mode, header=header, index=False)
        print(f"✅ Transactions: {i + 1} to {i + chunk_size}")

# --- MAIN ---
if __name__ == "__main__":
    # Run the generators in dependency order: the three small dimension
    # tables first, then customers, then transactions. Transactions need the
    # prices that generate_products() wrote to products.csv, so the price map
    # is loaded from disk just before they are generated.
    print("🔧 Generating suppliers...")
    generate_suppliers()

    print("🔧 Generating store branches...")
    generate_store_branches()

    print("🔧 Generating products...")
    generate_products()

    print("🔧 Generating customers in chunks...")
    generate_customers()

    print("🔧 Loading product prices for transactions...")
    product_prices = load_product_prices()

    print("🔧 Generating transactions in chunks...")
    generate_transactions(product_prices)

    # Report the configured directory rather than a hard-coded name, so the
    # message stays correct if OUTPUT_DIR is changed.
    print(f"\n✅ All files generated successfully in the '{OUTPUT_DIR}/' folder.")


🔧 Generating suppliers...
🔧 Generating store branches...
🔧 Generating products...
🔧 Generating customers in chunks...
✅ Customers: 1 to 500000
✅ Customers: 500001 to 1000000
✅ Customers: 1000001 to 1500000
✅ Customers: 1500001 to 2000000
✅ Customers: 2000001 to 2500000
✅ Customers: 2500001 to 3000000
✅ Customers: 3000001 to 3500000
✅ Customers: 3500001 to 4000000
✅ Customers: 4000001 to 4500000
✅ Customers: 4500001 to 5000000
✅ Customers: 5000001 to 5500000
✅ Customers: 5500001 to 6000000
🔧 Loading product prices for transactions...
🔧 Generating transactions in chunks...
✅ Transactions: 1 to 2000000
✅ Transactions: 2000001 to 4000000
✅ Transactions: 4000001 to 6000000
✅ Transactions: 6000001 to 8000000
✅ Transactions: 8000001 to 10000000
✅ Transactions: 10000001 to 12000000
✅ Transactions: 12000001 to 14000000
✅ Transactions: 14000001 to 16000000
✅ Transactions: 16000001 to 18000000
✅ Transactions: 18000001 to 20000000
✅ Transactions: 20000001 to 22000000
✅ Transactions: 22000001 to 24

In [2]:
pip install faker


Defaulting to user installation because normal site-packages is not writeable
Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB ? eta -:--:--
   --------------------------- ------------ 1.3/1.9 MB 5.2 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 5.3 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.1.0
Note: you may need to restart the kernel to use updated packages.


