In [1]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
import os

# One shared Faker instance is used by every generator below.
fake = Faker()

# Seed every random source so repeated runs produce identical files.
# Faker draws from its own random generator, which np.random.seed and
# random.seed do not touch, so it has to be seeded explicitly as well.
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
TRANSACTIONS_FILE = "retail_data/transactions.csv"
NUM_FEEDBACK = 2_500_000  # Feedback for 5% of 50M transactions
EMPLOYEES_PER_STORE = 15
MAX_PRODUCTS_PER_STORE = 1000

# Output path
OUTPUT_DIR = "retail_data"

# --- 1. Customer Feedback ---
def generate_customer_feedback():
    """Write ``customer_feedback.csv`` for a random sample of transactions.

    Reads the ``transaction_id`` and ``customer_id`` columns from
    ``TRANSACTIONS_FILE``, samples up to ``NUM_FEEDBACK`` rows without
    replacement, and gives each sampled row a sequential ``feedback_id``,
    an integer rating from 1 to 5, a fake sentence and a date between one
    year ago and today. The result is written to ``OUTPUT_DIR``.
    """
    transactions = pd.read_csv(TRANSACTIONS_FILE, usecols=["transaction_id", "customer_id"])

    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the sample size at what the transactions file actually holds.
    num_feedback = min(NUM_FEEDBACK, len(transactions))
    feedback_sample = transactions.sample(n=num_feedback)

    df = pd.DataFrame({
        "feedback_id": range(1, num_feedback + 1),
        "customer_id": feedback_sample["customer_id"].values,
        "transaction_id": feedback_sample["transaction_id"].values,
        "rating": np.random.randint(1, 6, size=num_feedback),
        "feedback_text": [fake.sentence(nb_words=random.randint(5, 15)) for _ in range(num_feedback)],
        "date": [fake.date_between(start_date="-1y", end_date="today") for _ in range(num_feedback)],
    })

    # Make sure the destination exists even if it differs from the input folder.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df.to_csv(os.path.join(OUTPUT_DIR, "customer_feedback.csv"), index=False)
    print("✅ customer_feedback.csv generated.")

# --- 2. Inventory per Store ---
def generate_inventory(num_stores=100, catalog_size=10_000, min_products_per_store=500):
    """Write ``inventory.csv`` with a random product assortment per store.

    Each store gets a random number of distinct products (between
    ``min_products_per_store`` and ``MAX_PRODUCTS_PER_STORE``), each with a
    stock quantity from 0 to 500 and a restock date within the last 60 days.

    Args:
        num_stores: Number of stores; store ids run from 1 to ``num_stores``.
        catalog_size: Number of products; product ids run from 1 to
            ``catalog_size``.
        min_products_per_store: Smallest assortment size drawn for a store.
    """
    # random.sample cannot draw more ids than the catalog holds, and
    # random.randint needs low <= high, so clamp both bounds up front.
    max_products = min(MAX_PRODUCTS_PER_STORE, catalog_size)
    min_products = min(min_products_per_store, max_products)

    inventory_data = []
    for store_id in range(1, num_stores + 1):
        num_products = random.randint(min_products, max_products)
        product_ids = random.sample(range(1, catalog_size + 1), num_products)
        inventory_data.extend(
            {
                "store_id": store_id,
                "product_id": product_id,
                "stock_quantity": random.randint(0, 500),
                "last_restock_date": fake.date_between(start_date="-60d", end_date="today"),
            }
            for product_id in product_ids
        )

    # Explicit columns keep the CSV header intact even when no rows exist.
    df = pd.DataFrame(
        inventory_data,
        columns=["store_id", "product_id", "stock_quantity", "last_restock_date"],
    )
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df.to_csv(os.path.join(OUTPUT_DIR, "inventory.csv"), index=False)
    print("✅ inventory.csv generated.")

# --- 3. Employees ---
def generate_employees(num_stores=100):
    """Write ``employees.csv`` and return the number of employees created.

    Every store receives ``EMPLOYEES_PER_STORE`` employees with sequential
    ids starting at 1, a fake name, a weighted-random role and a hire date
    between five years and one year ago.

    Args:
        num_stores: Number of stores; store ids run from 1 to ``num_stores``.

    Returns:
        The total number of employees, which is also the highest
        ``employee_id`` written.
    """
    roles = ["Cashier", "Store Manager", "Sales Associate", "Stock Clerk"]
    role_weights = [0.6, 0.1, 0.2, 0.1]

    employee_data = []
    for store_id in range(1, num_stores + 1):
        for _ in range(EMPLOYEES_PER_STORE):
            employee_data.append({
                # Ids are sequential across all stores, starting at 1.
                "employee_id": len(employee_data) + 1,
                "store_id": store_id,
                "name": fake.name(),
                "role": random.choices(roles, weights=role_weights)[0],
                "hire_date": fake.date_between(start_date="-5y", end_date="-1y"),
            })

    # Explicit columns keep the CSV header intact even when no rows exist.
    df = pd.DataFrame(
        employee_data,
        columns=["employee_id", "store_id", "name", "role", "hire_date"],
    )
    # This function runs first in the pipeline, so create the folder here
    # instead of assuming an earlier step already did.
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    df.to_csv(os.path.join(OUTPUT_DIR, "employees.csv"), index=False)
    print("✅ employees.csv generated.")
    return len(employee_data)  # total number of employees

# --- 4. Update Transactions with employee_id ---
def update_transactions_with_employees(total_employees):
    """Copy the transactions file, adding a random ``employee_id`` column.

    The input is streamed in chunks of two million rows so the whole file
    never has to fit in memory. Each row gets an id drawn uniformly from
    1 to ``total_employees`` inclusive, and the result is written to
    ``transactions_with_employee.csv`` in ``OUTPUT_DIR``.

    Args:
        total_employees: Highest valid employee id; must be at least 1.

    Raises:
        ValueError: If ``total_employees`` is smaller than 1.
    """
    # Fail before touching the large input rather than inside randint.
    if total_employees < 1:
        raise ValueError("total_employees must be at least 1")

    input_file = TRANSACTIONS_FILE
    output_file = os.path.join(OUTPUT_DIR, "transactions_with_employee.csv")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    chunk_size = 2_000_000

    # The chunked reader holds the input file open; the with-block closes it
    # even if writing a chunk fails part-way through.
    with pd.read_csv(input_file, chunksize=chunk_size) as transaction_reader:
        for chunk_index, chunk in enumerate(transaction_reader):
            is_first = chunk_index == 0
            chunk["employee_id"] = np.random.randint(1, total_employees + 1, size=len(chunk))
            # The first chunk truncates any old output and writes the header;
            # later chunks are appended without one.
            chunk.to_csv(output_file, mode='w' if is_first else 'a', header=is_first, index=False)
            print(f"✅ Appended {len(chunk)} transactions with employee_id")

    print("✅ transactions_with_employee.csv generated.")

# --- MAIN ---
if __name__ == "__main__":
    # Employees come first: the number created bounds the employee ids
    # assigned to transactions in the next step.
    print("🔧 Generating employees...")
    total_employees = generate_employees()

    print("🔧 Updating transactions with employee_id...")
    update_transactions_with_employees(total_employees)

    print("🔧 Generating customer feedback...")
    generate_customer_feedback()

    print("🔧 Generating inventory per store...")
    generate_inventory()

    print("\n🎉 All additional data generated successfully.")


🔧 Generating employees...
✅ employees.csv generated.
🔧 Updating transactions with employee_id...
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transactions with employee_id
✅ Appended 2000000 transac