In [1]:
import pandas as pd
import random
from faker import Faker
import uuid
import datetime

In [2]:
# One seed shared by Faker and the stdlib RNG so reruns draw the same values
SEED = 0

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

In [3]:
def random_time(start=datetime.datetime(2023, 1, 1), end="now"):
    """Return a random datetime between ``start`` and ``end``.

    Args:
        start: Earliest moment that may be returned; forwarded to Faker as
            ``start_date``. Defaults to 2023-01-01 00:00.
        end: Latest moment that may be returned; forwarded to Faker as
            ``end_date``. Defaults to ``"now"``, which is also Faker's default,
            so a bare ``random_time()`` call samples from 2023-01-01 until
            the moment of the call.

    Returns:
        The value produced by ``fake.date_time_between``.

    Note:
        With ``end="now"`` the sampling window depends on the wall clock, so
        seeded runs on different days can yield different timestamps. Pass a
        fixed ``end`` when the generated data must be reproducible.
    """
    return fake.date_time_between(start_date=start, end_date=end)

def random_customer():
    """Draw the demographics of one synthetic customer.

    Returns:
        A ``(age, gender)`` tuple: ``age`` is an integer from 18 to 65
        inclusive and ``gender`` is one of four fixed labels.
    """
    gender_options = ['Male', 'Female', 'Nonbinary', 'Prefer not to say']
    # Tuple elements evaluate left to right: age is drawn first, then gender.
    return random.randint(18, 65), random.choice(gender_options)

In [4]:
# Generate Customer Data
# 2,341 customers with IDs C0000000001 .. C0000002341 (number zero-padded to
# 10 digits), each paired with a freshly drawn (age, gender).
customer_data = [
    [f'C{customer_number:010}', *random_customer()]
    for customer_number in range(1, 2342)
]

customer_df = pd.DataFrame(
    data=customer_data,
    columns=['customer_id', 'age', 'gender'],
)
customer_df.to_csv('cafe_customer_data.csv', index=False)


In [5]:
# Generate Branch Data
# Candidate branch locations. Each entry holds the city name, the
# neighborhoods a branch may be placed in, and the area the city is filed under.
cities = [
    dict(city="New York", neighborhoods=["Brooklyn Heights", "Upper West Side"], area="East"),
    dict(city="Boston", neighborhoods=["South Boston", "North End"], area="East"),
    dict(city="Washington, D.C.", neighborhoods=["Capitol Hill", "Georgetown"], area="East"),
    dict(city="Atlanta", neighborhoods=["Buckhead"], area="South"),
    dict(city="Dallas", neighborhoods=["Uptown (Dallas)"], area="South"),
    dict(city="Austin", neighborhoods=["South Congress"], area="South"),
    dict(city="San Diego", neighborhoods=["North Park", "Little Italy"], area="West"),
    dict(city="Los Angeles", neighborhoods=["Silver Lake"], area="West"),
    dict(city="San Francisco", neighborhoods=["SOMA"], area="West"),
    dict(city="Chicago", neighborhoods=["Old Town", "West Loop"], area="North"),
    dict(city="Cleveland", neighborhoods=["Downtown"], area="North"),
    dict(city="Pittsburgh", neighborhoods=["Shadyside"], area="North"),
    dict(city="New Orleans", neighborhoods=["French Quarter"], area="South"),
    dict(city="Houston", neighborhoods=["The Heights"], area="South"),
    dict(city="Denver", neighborhoods=["LoDo"], area="West"),
    dict(city="Seattle", neighborhoods=["Ballard"], area="West"),
    dict(city="Phoenix", neighborhoods=["Arcadia"], area="West"),
    dict(city="Miami", neighborhoods=["Wynwood"], area="South"),
    dict(city="Philadelphia", neighborhoods=["Rittenhouse Square"], area="East"),
    dict(city="Portland", neighborhoods=["Pearl District"], area="West"),
    dict(city="Minneapolis", neighborhoods=["Uptown (Minneapolis)"], area="North"),
    dict(city="Baltimore", neighborhoods=["Fells Point"], area="East"),
]

# Maps a neighborhood name to its ZIP code(s).
# Values come in two shapes, and any consumer has to handle both:
#   * a single ZIP code as a string, or
#   * a list of ZIP code strings when several are given for the neighborhood.
# ZIP codes are strings, which keeps leading zeros such as in "02127".
neighborhood_zip_codes = {
    "Brooklyn Heights": "11201",
    "Upper West Side": ["10024", "10025"],
    "South Boston": "02127",
    "North End": "02113",
    "Capitol Hill": "20003",
    "Georgetown": "20007",
    "Buckhead": ["30305", "30309"],
    "Uptown (Dallas)": ["75204", "75219"],
    "South Congress": "78704",
    "North Park": "92104",
    "Little Italy": "92101",
    "Silver Lake": "90039",
    "SOMA": "94103",
    "Old Town": "60610",
    "West Loop": "60607",
    "Downtown": "44113",
    "Shadyside": "15232",
    "French Quarter": "70116",
    "The Heights": "77008",
    "LoDo": "80202",
    "Ballard": "98107",
    "Arcadia": "85018",
    "Wynwood": "33127",
    "Rittenhouse Square": "19103",
    "Pearl District": "97209",
    "Uptown (Minneapolis)": "55408",
    "Fells Point": "21231"
}

# Build 233 branches (B001 .. B233). Each one is placed in a randomly chosen
# city, then a randomly chosen neighborhood of that city.
branch_data = []
for branch_number in range(1, 234):
    location = random.choice(cities)
    hood = random.choice(location["neighborhoods"])

    zip_entry = neighborhood_zip_codes[hood]
    # A non-string entry is a list of candidate ZIP codes: pick one of them.
    if not isinstance(zip_entry, str):
        zip_entry = random.choice(zip_entry)

    branch_data.append(dict(
        branch_id=f"B{branch_number:03}",
        neighborhood=hood,
        zipcode=zip_entry,
        city=location["city"],
        area=location["area"],
    ))

branch_data_df = pd.DataFrame(data=branch_data)
branch_data_df.to_csv('cafe_branch_data.csv', index=False)

In [6]:
# Generate Product Data
# Menu rows as (product_name, price, category). The product_id is the row's
# 1-based position in this list, so the order of the rows must not change.
product_rows = [
    ('Latte', 4.50, 'Drinks'),
    ('Espresso', 3.00, 'Drinks'),
    ('Cappuccino', 4.00, 'Drinks'),
    ('Mocha', 4.75, 'Drinks'),
    ('Americano', 3.25, 'Drinks'),
    ('Flat White', 4.25, 'Drinks'),
    ('Macchiato', 3.50, 'Drinks'),
    ('Iced Coffee', 3.75, 'Drinks'),
    ('Tea', 2.50, 'Drinks'),
    ('Hot Chocolate', 3.75, 'Drinks'),
    ('Bagel', 2.50, 'Bakery'),
    ('Croissant', 3.50, 'Bakery'),
    ('Muffin', 2.75, 'Bakery'),
    ('Scone', 3.00, 'Bakery'),
    ('Pastrami Sandwich', 5.00, 'Sandwiches & Toasts'),
    ('Vegan Sandwich', 4.00, 'Sandwiches & Toasts'),
    ('Avocado&Egg Toast', 6.20, 'Sandwiches & Toasts'),
    ('Salad', 6.50, 'Lunch'),
    ('Soup', 4.50, 'Lunch'),
    ('Fruit Cup', 3.75, 'Lunch'),
    ('Smoothie', 5.50, 'Drinks'),
    ('Cookie', 2.00, 'Bakery'),
    ('Brownie', 2.50, 'Bakery'),
    ('Cake Slice', 4.00, 'Bakery'),
]

# Final catalogue: (product_id, product_name, price, category) tuples.
products = [
    (product_id, name, price, category)
    for product_id, (name, price, category) in enumerate(product_rows, start=1)
]

products_df = pd.DataFrame(
    data=products,
    columns=['product_id', 'product_name', 'price', 'category'],
)
products_df.to_csv('cafe_product_data.csv', index=False)

In [7]:
# Generate Order Data
#
# Writes two files:
#   * cafe_order_detail_data.csv - one row per (order_id, product_id, quantity)
#   * cafe_order_data.csv        - one row per order, with date and time columns
OPENING_HOUR = 7    # first hour of the day (inclusive) an order may fall in
CLOSING_HOUR = 21   # first hour of the day that counts as closed (exclusive)


def _shift_into_opening_hours(time_text):
    """Return ``time_text`` with its hour moved inside opening hours if needed.

    Args:
        time_text: Time of day as text that starts with a two-digit hour,
            e.g. ``"03:15:42"``.

    Returns:
        ``time_text`` unchanged when its hour lies in
        ``[OPENING_HOUR, CLOSING_HOUR)``. Otherwise the same text with only
        the leading hour swapped for a random, zero-padded hour in that range.
    """
    if OPENING_HOUR <= int(time_text[:2]) < CLOSING_HOUR:
        return time_text
    # Rewrite the first two characters only. str.replace() would also rewrite
    # minutes/seconds that happen to equal the hour ("03:03:03" -> "15:15:15"),
    # and an unpadded hour would break the HH:MM:SS layout ("7:15:42").
    # The upper bound is CLOSING_HOUR so a shifted order never lands in hour 21,
    # which is itself treated as closed.
    new_hour = random.randrange(OPENING_HOUR, CLOSING_HOUR)
    return f"{new_hour:02d}{time_text[2:]}"


ratings = [1, 2, 3, 4, 5]
payment_methods = ['Credit Card', 'Cash', 'Mobile Payment']

# Draw from plain lists instead of Series: the picks are the same, but this
# avoids Series indexing on every draw and does not depend on how
# random.choice() inspects its argument (some Python versions truth-test the
# sequence, which a Series does not support).
customer_ids = customer_df["customer_id"].tolist()
branch_ids = branch_data_df["branch_id"].tolist()

# Order headers: (order_id, customer_id, timestamp, branch_id, rating, payment_method)
orders = []
for i in range(1, 43232):
    orders.append((
        f"O{i:010}",
        random.choice(customer_ids),
        random_time(),
        random.choice(branch_ids),
        random.choice(ratings),
        random.choice(payment_methods),
    ))

# Order lines. NOTE: orders and products are sampled independently and with
# replacement, so an order may end up with no detail rows and the same
# (order_id, product_id) pair may appear on more than one row.
sales_data = []
for _ in range(100000):
    order_id = random.choice(orders)[0]
    product_id = random.choice(products)[0]
    quantity = random.randint(1, 5)
    sales_data.append([order_id, product_id, quantity])

sales_df = pd.DataFrame(sales_data, columns=['order_id', 'product_id', 'quantity'])
sales_df.to_csv('cafe_order_detail_data.csv', index=False)

orders_df = pd.DataFrame(orders, columns=['order_id', 'customer_id', 'timestamp', 'branch_id', 'rating', 'payment_method'])
# str(timestamp) looks like "YYYY-MM-DD HH:MM:SS"; split it into the two columns.
timestamp_text = orders_df['timestamp'].map(str)
orders_df['date'] = timestamp_text.map(lambda text: text.split(" ")[0])
orders_df['time'] = timestamp_text.map(lambda text: _shift_into_opening_hours(text.split(" ")[1]))
orders_df = orders_df.drop('timestamp', axis=1)
orders_df.to_csv('cafe_order_data.csv', index=False)
