In [2]:
import csv
import os
import random
from datetime import datetime, timedelta

from faker import Faker

# Seed both random sources (the stdlib `random` module and Faker's shared
# generator) so that Restart Kernel -> Run All regenerates identical CSVs.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Single Faker instance shared by every cell below.
fake = Faker()

In [3]:
# Dataset size configuration: how many rows to generate for each entity.
num_users = 10000       # total users; ids 1..num_users
num_sellers = 100       # the first num_sellers user ids are sellers
num_products = 1000
num_orders = 1000
num_flash_sales = 50
output_folder = "medium/"  # prefix for every generated CSV path

# Create the output folder up front; otherwise the first open(..., "w") in
# the cells below raises FileNotFoundError on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# Details:
# the number of cart items is uneven among users
# the number of order items is uneven among orders
# the number of products per flash sale is uneven among flash sales
# the number of products per seller is uneven among sellers
# the number of flash sales per product is uneven among products
# flash sale start times span January 2024 to November 2025

In [4]:
# Users.csv
# Columns: user_id,user_name,email,user_pw,user_role
# User ids 1..num_sellers get the SELLER role; every later id is a BUYER.

OUTPUT_FILE = f"{output_folder}Users.csv"


def _user_rows():
    """Yield one Users.csv row per user id, in ascending id order."""
    for uid in range(1, num_users + 1):
        role = "BUYER" if uid > num_sellers else "SELLER"
        # Faker calls run left to right: name, then email, then password.
        yield [uid, fake.name(), fake.email(), fake.password(length=10), role]


with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
    users_csv = csv.writer(out)
    users_csv.writerow(["user_id", "user_name", "email", "user_pw", "user_role"])
    users_csv.writerows(_user_rows())

print(f"Generated {num_users} records in {OUTPUT_FILE}")

Generated 10000 records in medium/Users.csv


In [5]:
# Products.csv
# Columns: product_id,seller_id,product_name,category,product_desc,price,
#          original_price,discount_rate,quantity_stock,flash_sale_id

OUTPUT_FILE = f"{output_folder}Products.csv"

# Product categories; each generated product is assigned one at random.
CATEGORIES = [
    "Smartphones",
    "Laptops",
    "Headphones",
    "Watches",
    "Shoes",
    "Clothing",
    "Kitchen Appliances",
    "Furniture",
    "Books",
    "Toys",
]

def generate_price_data():
    """Draw a random list price and discount rate and derive the sale price.

    Returns:
        tuple: ``(price, original_price, discount_rate)``, each rounded to
        two decimal places. The list price is drawn from [10, 500] and the
        discount rate from [0.05, 0.60].
    """
    list_price = random.uniform(10, 500)
    markdown = random.uniform(0.05, 0.60)
    # The sale price is computed from the unrounded draws, so the rounded
    # triple need not satisfy price == original_price * (1 - discount_rate).
    return (
        round(list_price * (1 - markdown), 2),
        round(list_price, 2),
        round(markdown, 2),
    )

def generate_flash_sale_id():
    """Pick a flash sale id for a product, or 0.

    With probability 0.5 returns a random id in 1..num_flash_sales;
    otherwise returns 0 (presumably meaning "not part of any flash sale").
    """
    if random.random() < 0.50:
        return random.randint(1, num_flash_sales)
    return 0

def generate_product_name(category):
    """Return a random product name for ``category``.

    Args:
        category: Category label used to look up the candidate names.

    Returns:
        str: One of the four names listed for the category, or
        "Generic Product" when the category is not in the table.
    """
    names_by_category = {
        "Smartphones": ["Galaxy S21", "iPhone 14", "Pixel 7", "OnePlus 11"],
        "Laptops": ["MacBook Pro", "Dell XPS 13", "HP Spectre", "Lenovo ThinkPad"],
        "Headphones": ["Sony WH-1000XM4", "AirPods Pro", "Bose QC45", "Jabra Elite 85h"],
        "Watches": ["Apple Watch Series 8", "Samsung Galaxy Watch", "Fossil Gen 6", "Garmin Fenix 7"],
        "Shoes": ["Nike Air Max", "Adidas Ultraboost", "Puma RS-X", "Reebok Nano X"],
        "Clothing": ["Levi's Jeans", "H&M T-Shirt", "Zara Jacket", "Uniqlo Hoodie"],
        "Kitchen Appliances": ["Instant Pot", "Ninja Blender", "KitchenAid Mixer", "Philips Airfryer"],
        "Furniture": ["Ikea Sofa", "Ashley Dining Table", "Wayfair Bed Frame", "West Elm Chair"],
        "Books": ["The Great Gatsby", "1984", "To Kill a Mockingbird", "The Catcher in the Rye"],
        "Toys": ["LEGO Star Wars", "Barbie Dreamhouse", "Hot Wheels Track", "Nerf Blaster"],
    }
    try:
        candidates = names_by_category[category]
    except KeyError:
        candidates = ["Generic Product"]
    # The fallback also goes through random.choice so that the random state
    # advances the same way whether or not the category is known.
    return random.choice(candidates)

def generate_product_description(category):
    """Return the fixed marketing blurb for ``category``.

    Args:
        category: Category label to look up.

    Returns:
        str: The category's description, or a generic description when the
        category is not in the table.
    """
    blurb_by_category = {
        "Smartphones": "A high-performance smartphone with cutting-edge features, perfect for staying connected and productive.",
        "Laptops": "A sleek and powerful laptop designed for professionals and gamers alike.",
        "Headphones": "Premium noise-cancelling headphones with crystal-clear sound and long-lasting comfort.",
        "Watches": "A stylish smartwatch with advanced health tracking and customizable watch faces.",
        "Shoes": "Durable and comfortable shoes designed for both sports and casual wear.",
        "Clothing": "Trendy and high-quality clothing that combines style and comfort for any occasion.",
        "Kitchen Appliances": "Essential kitchen appliance to simplify your cooking and save time.",
        "Furniture": "Modern and elegant furniture to enhance the look and feel of your home.",
        "Books": "An engaging and thought-provoking book that will captivate readers of all ages.",
        "Toys": "Fun and creative toys that inspire imagination and endless hours of play.",
    }
    if category in blurb_by_category:
        return blurb_by_category[category]
    return "A high-quality product designed to meet your needs."

def _product_rows():
    """Yield one Products.csv row per product id, in ascending id order."""
    for pid in range(1, num_products + 1):
        # Seller and category are drawn first; the remaining random draws
        # happen left to right inside the row literal below.
        seller_id = random.randint(1, num_sellers)
        category = random.choice(CATEGORIES)
        yield [
            pid,
            seller_id,
            generate_product_name(category),
            category,
            generate_product_description(category),
            *generate_price_data(),  # price, original_price, discount_rate
            random.randint(1, 500),  # quantity_stock
            generate_flash_sale_id(),
        ]


with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
    products_csv = csv.writer(out)
    products_csv.writerow([
        "product_id", "seller_id", "product_name", "category", "product_desc",
        "price", "original_price", "discount_rate", "quantity_stock", "flash_sale_id"
    ])
    products_csv.writerows(_product_rows())

print(f"Generated {num_products} records in {OUTPUT_FILE}")

Generated 1000 records in medium/Products.csv


In [6]:
# FlashSales.csv
# Columns: flash_sale_id,name,start_time,end_time

OUTPUT_FILE = f"{output_folder}FlashSales.csv"

def generate_flash_sale_name(flash_sale_id):
    """Build a random marketing-style name for a flash sale.

    The name has the form "<addon> <season phrase> <sale type>"; the addon
    is left out (with no stray space) when the empty addon is drawn.

    Args:
        flash_sale_id: Kept for call-site compatibility; it does not
            influence the generated name.

    Returns:
        str: The generated name, without leading or trailing whitespace.
    """
    optional_addons = ["", "Mega", "Super", "Ultimate", "One-Day", "Weekend"]
    season_phrases = [
        "Back-to-School", "Holiday", "Black Friday", "Cyber Monday", "Summer",
        "Winter", "Spring", "Fall", "Exclusive", "Limited Time"
    ]
    # Each entry needs its own comma: adjacent string literals are silently
    # concatenated, which previously merged "Flash Sale" and "Deals" into
    # the single bogus entry "Flash SaleDeals".
    sale_types = ["Sale", "Flash Sale", "Deals", "Discount", "Bonanza", "Extravaganza", "Event",
                  "Specials", "Offer", "Clearance", "Surprise"]
    # Draw order (addon, season, type) is evaluated left to right.
    parts = (
        random.choice(optional_addons),
        random.choice(season_phrases),
        random.choice(sale_types),
    )
    return " ".join(part for part in parts if part)

# Bounds of the window that start times are drawn from; parsed once because
# they are the same for every flash sale.
window_start = datetime.strptime("2024-01-01", "%Y-%m-%d")
window_end = datetime.strptime("2025-11-01", "%Y-%m-%d")

with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
    sales_csv = csv.writer(out)
    sales_csv.writerow(["flash_sale_id", "name", "start_time", "end_time"])

    for sale_id in range(1, num_flash_sales + 1):
        sale_name = generate_flash_sale_name(sale_id)
        opens_at = fake.date_time_between(start_date=window_start, end_date=window_end)
        # Flash sales last a whole number of days, between 1 and 14.
        duration = timedelta(days=random.randint(1, 14))
        sales_csv.writerow([sale_id, sale_name, opens_at, opens_at + duration])

print(f"Generated {num_flash_sales} records in {OUTPUT_FILE}")

Generated 50 records in medium/FlashSales.csv


In [7]:
# Orders.csv
# Columns: order_id,buyer_id,created_at
#   buyer_id -> Users.user_id (buyers only)

OUTPUT_FILE = f"{output_folder}Orders.csv"


def _order_rows():
    """Yield one Orders.csv row per order id, in ascending id order."""
    first_buyer_id = num_sellers + 1  # ids up to num_sellers are sellers
    for oid in range(1, num_orders + 1):
        # Buyer is drawn before the timestamp (left-to-right evaluation).
        yield [
            oid,
            random.randint(first_buyer_id, num_users),
            fake.date_time_between(start_date="-1y", end_date="now"),
        ]


with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
    orders_csv = csv.writer(out)
    orders_csv.writerow(["order_id", "buyer_id", "created_at"])
    orders_csv.writerows(_order_rows())

print(f"Generated {num_orders} records in {OUTPUT_FILE}")

Generated 1000 records in medium/Orders.csv


In [8]:
# OrderItems.csv
# Columns: order_items_id,order_id,product_id,quantity_sold
#   order_id   -> Orders.order_id
#   product_id -> Products.product_id

OUTPUT_FILE = f"{output_folder}OrderItems.csv"


def _order_item_rows():
    """Yield OrderItems rows; item ids run sequentially from 1 across all orders."""
    next_item_id = 1
    for oid in range(1, num_orders + 1):
        # Each order has 1-5 items.
        for _ in range(random.randint(1, 5)):
            # product_id is drawn before quantity_sold (left to right).
            yield [next_item_id, oid, random.randint(1, num_products), random.randint(1, 10)]
            next_item_id += 1


order_item_count = 0
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
    items_csv = csv.writer(out)
    items_csv.writerow(["order_items_id", "order_id", "product_id", "quantity_sold"])
    for record in _order_item_rows():
        items_csv.writerow(record)
        order_item_count += 1

print(f"Generated {order_item_count} records in {OUTPUT_FILE}")

Generated 3036 records in medium/OrderItems.csv


In [9]:
# CartItems.csv
# Columns: cart_item_id,user_id,product_id,quantity_added
#   user_id    -> Users.user_id
#   product_id -> Products.product_id

OUTPUT_FILE = f"{output_folder}CartItems.csv"


def _cart_entries():
    """Yield (user_id, product_id, quantity_added) for every cart item."""
    # Only buyers have carts; ids up to num_sellers are sellers.
    for buyer_id in range(num_sellers + 1, num_users + 1):
        # Each buyer has 1-10 cart items, each with a quantity of 1-10.
        for _ in range(random.randint(1, 10)):
            yield buyer_id, random.randint(1, num_products), random.randint(1, 10)


cart_rows_written = 0
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out:
    cart_csv = csv.writer(out)
    cart_csv.writerow(["cart_item_id", "user_id", "product_id", "quantity_added"])
    # enumerate supplies the sequential cart_item_id, starting at 1.
    for cart_rows_written, entry in enumerate(_cart_entries(), start=1):
        cart_csv.writerow([cart_rows_written, *entry])

print(f"Generated {cart_rows_written} records in {OUTPUT_FILE}")

Generated 54650 records in medium/CartItems.csv
