In [24]:
import pandas as pd
import random
from datetime import datetime, timedelta

In [25]:
# Product catalogue: category -> product description -> list of candidate
# unit prices. One price is picked at random per generated purchase.
products = {
    # Consumer electronics
    "Electronics": {
        "Mobile Phone": [9999, 14999, 19999, 24999, 29999, 49999],
        "Laptop": [49999, 59999, 69999, 89999],
        "Headphones": [499, 999, 1999, 2999, 4999],
        "Smartwatch": [1499, 2999, 4999, 6999, 9999],
        "Tablet": [19999, 24999, 29999, 34999],
        "Bluetooth Speaker": [899, 1499, 2999, 4499, 5999],
    },
    # Clothing and footwear
    "Fashion": {
        "T-Shirt": [199, 299, 499, 699, 999],
        "Jeans": [999, 1499, 1999, 2499],
        "Sneakers": [1999, 2499, 2999, 3999],
        "Jacket": [2499, 3499, 4499, 5999],
        "Saree": [1499, 1999, 2499, 3499],
        "Kurta": [999, 1299, 1599, 1999],
        "Shirt": [499, 899, 1299, 1499, 2199],
        "Shoes": [799, 999, 1499, 2499, 2999],
    },
    # Staple food items (pack size is part of the description)
    "Groceries": {
        "Rice (10kg)": [499, 699, 799, 999],
        "Wheat Flour (5kg)": [249, 349, 449, 599],
        "Milk (1L)": [50, 55, 60, 65, 70],
        "Tea (500g)": [149, 199, 249, 299],
        "Coffee (200g)": [199, 209, 249, 299],
        "Sugar (2kg)": [99, 129, 149, 179],
    },
    # Household appliances
    "Home Appliances": {
        "Washing Machine": [19999, 25999, 29999, 34999],
        "Refrigerator": [34999, 39999, 44999, 49999],
        "Microwave Oven": [9999, 12999, 14999, 19999],
        "Mixer Grinder": [2999, 3999, 4999, 5999],
        "Air Conditioner": [29999, 36999, 42999, 49999],
        "Ceiling Fan": [899, 1299, 1799, 2999, 4499, 5199],
    },
    # Furniture
    "Furniture": {
        "Sofa": [19999, 24999, 29999, 34999],
        "Dining Table": [9999, 12999, 15999, 19999],
        "Bed": [19999, 22999, 24999, 29999],
        "Chair": [999, 1499, 1999, 2499],
        "Study Table": [2999, 3999, 4999, 5999],
    },
    # Cosmetics and toiletries
    "Beauty & Personal Care": {
        "Perfume": [599, 999, 1499, 1999],
        "Shampoo": [199, 299, 399, 499],
        "Face Cream": [199, 299, 399, 599],
        "Body Lotion": [299, 399, 499, 699],
        "Lipstick": [199, 299, 399, 499],
        "Face Wash": [199, 399, 799, 999],
        "Hair Cream": [299, 499, 799],
    },
    # Toys and baby gear
    "Toys & Baby Products": {
        "Soft Toy": [399, 599, 799, 999],
        "Baby Stroller": [3999, 4999, 5999, 6999],
        "Kids Cycle": [2499, 2999, 3499, 3999],
        "Lego Set": [1499, 1999, 2499, 2999],
        "Puzzle Game": [599, 799, 999, 1299],
    },
    # Sports equipment and fitness gear
    "Sports & Fitness": {
        "Cricket Bat": [999, 1599, 1999, 2499],
        "Football": [499, 999, 1299, 1499],
        "Dumbbells": [1999, 2499, 2999, 3499],
        "Treadmill": [24999, 29999, 34999, 39999],
        "Yoga Mat": [599, 899, 999, 1299],
        "Hockeystick": [799, 1299, 1499, 2499],
    },
}

In [27]:
# Per-category (min, max) number of units in a single purchase.
# Both ends are inclusive because the bounds are fed to random.randint.
quantity_range = {
    "Electronics":            (1, 2),
    "Fashion":                (1, 4),
    "Groceries":              (1, 10),
    "Home Appliances":        (1, 2),
    "Furniture":              (1, 2),
    "Beauty & Personal Care": (1, 3),
    "Toys & Baby Products":   (1, 3),
    "Sports & Fitness":       (1, 2),
}

In [28]:
# (city, state) pairs; one pair is picked at random for each purchase.
cities_states = [
    ("Mumbai", "Maharashtra"),
    ("Delhi", "Delhi"),
    ("Bangalore", "Karnataka"),
    ("Chennai", "Tamil Nadu"),
    ("Hyderabad", "Telangana"),
    ("Pune", "Maharashtra"),
    ("Kolkata", "West Bengal"),
    ("Ahmedabad", "Gujarat"),
    ("Jaipur", "Rajasthan"),
    ("Lucknow", "Uttar Pradesh"),
    ("Surat", "Gujarat"),
    ("Bhopal", "Madhya Pradesh"),
    ("Indore", "Madhya Pradesh"),
    ("Patna", "Bihar"),
    ("Nagpur", "Maharashtra"),
    ("Coimbatore", "Tamil Nadu"),
    ("Visakhapatnam", "Andhra Pradesh"),
    ("Chandigarh", "Chandigarh"),
    ("Guwahati", "Assam"),
    ("Bhubaneswar", "Odisha"),
]

In [29]:
# Payment options; one is chosen uniformly at random per purchase.
payment_modes = [
    "Credit Card",
    "Debit Card",
    "Net Banking",
    "UPI",
    "Cash on Delivery",
]

In [33]:
# Step 1: Create a fixed pool of 70,000 unique customers.
num_customers = 70000

# Seed the global RNG so a fresh "Restart & Run All" regenerates the same data.
# Change or remove the seed to get a different synthetic dataset.
SEED = 42
random.seed(SEED)

# Sample six-digit IDs WITHOUT replacement. Drawing each ID independently with
# random.randint would produce repeated IDs, so one "customer" could end up in
# several tiers and be processed more than once downstream.
customer_ids = random.sample(range(100000, 1000000), num_customers)

# Step 2: Control purchase frequency (some customers shop more often)
customer_frequencies = {
    "Frequent": int(0.2 * num_customers),  # 20% shop very often (10-30 times)
    "Regular": int(0.5 * num_customers),   # 50% shop regularly (5-10 times)
    "Occasional": int(0.3 * num_customers) # 30% shop occasionally (1-4 times)
}

# Assign frequencies to customers by slicing one shuffled copy of the pool:
# the three tiers are disjoint by construction and together cover every ID.
# "Occasional" receives whatever remains after the first two tiers.
shuffled_ids = random.sample(customer_ids, num_customers)
n_frequent = customer_frequencies["Frequent"]
n_regular = customer_frequencies["Regular"]

frequent_customers = shuffled_ids[:n_frequent]
regular_customers = shuffled_ids[n_frequent:n_frequent + n_regular]
occasional_customers = shuffled_ids[n_frequent + n_regular:]

In [34]:
# Nominal target of 5 Lakh transactions. NOTE(review): nothing below reads
# num_records; the real row count is the sum of the per-customer purchase
# counts drawn in the loop.
num_records = 500000
data = []

# The tier lists hold tens of thousands of IDs, so an `in` test on a list would
# rescan them for every customer. Sets give the same answer in constant time.
frequent_lookup = set(frequent_customers)
regular_lookup = set(regular_customers)

# Loop-invariant lookups, built once instead of once per generated row.
category_names = list(products.keys())
descriptions_by_category = {name: list(items.keys()) for name, items in products.items()}
start_date = datetime(2022, 1, 1)

for customer_id in customer_ids:
    # Number of purchases depends on the customer's tier.
    if customer_id in frequent_lookup:
        purchases = random.randint(10, 30)  # Frequent shoppers
    elif customer_id in regular_lookup:
        purchases = random.randint(5, 10)  # Regular shoppers
    else:
        purchases = random.randint(1, 4)  # Occasional shoppers

    # Generate purchases for this customer
    for _ in range(purchases):
        # NOTE(review): randint is inclusive, so a day offset of 730 lands on
        # 2024-01-01 -- confirm that one day beyond 2023 is intended.
        invoice_date = start_date + timedelta(days=random.randint(0, 730),
                                              hours=random.randint(0, 23),
                                              minutes=random.randint(0, 59))
        category = random.choice(category_names)
        description = random.choice(descriptions_by_category[category])
        # NOTE(review): the numeric suffix is drawn per row, so the same product
        # gets different stock codes across rows -- confirm this is acceptable.
        stock_code = category[:3].upper() + str(random.randint(100, 999))
        price = random.choice(products[category][description])
        min_quantity, max_quantity = quantity_range[category]
        quantity = random.randint(min_quantity, max_quantity)
        discount_used = random.choice(["Yes", "No"])
        city, state = random.choice(cities_states)
        country = "India"
        payment_mode = random.choice(payment_modes)

        data.append([customer_id, invoice_date, stock_code, description, category, price, quantity,
                     discount_used, city, state, country, payment_mode])

# Convert to DataFrame (one row per generated purchase)
df = pd.DataFrame(data, columns=["Customer ID", "Invoice Date", "Stock Code", "Description", "Product Category",
                                 "Purchase Amount (₹)", "Quantity", "Discount Used", "City", "State", "Country",
                                 "Payment Mode"])

In [35]:
# Write the generated transactions to CSV in the working directory,
# omitting the DataFrame's integer index column.
df.to_csv(path_or_buf="e-commerce_dataset.csv", index=False)