# In [3]:
import csv
import random
from faker import Faker

# Initialize Faker
fake = Faker()

# Randomly decide whether a value should be treated as null
def generate_null(probability):
    """Return True with the given probability, otherwise False.

    A single number is drawn from ``random.random()`` and compared against
    ``probability``; the result is True when the draw falls below it.

    Args:
        probability: Threshold the random draw is compared against.

    Returns:
        bool: True when the draw is strictly less than ``probability``.
    """
    draw = random.random()
    return draw < probability

# Randomly swap a value for a garbage word
def generate_garbage(value, probability):
    """Return ``value``, or a word from ``fake.word()`` with the given probability.

    A single number is drawn from ``random.random()``; when it falls below
    ``probability`` the original value is discarded and the result of
    ``fake.word()`` is returned instead.

    Args:
        value: The value to pass through when no garbage is injected.
        probability: Threshold the random draw is compared against.

    Returns:
        Either ``value`` unchanged or the result of ``fake.word()``.
    """
    return fake.word() if random.random() < probability else value

# Randomly swap a value for a Gaussian-distributed sample
def generate_outlier(value, probability, mean, std_deviation):
    """Return ``value``, or a rounded Gaussian sample with the given probability.

    A single number is drawn from ``random.random()``; when it falls below
    ``probability`` the original value is replaced by a sample from
    ``random.gauss(mean, std_deviation)`` rounded to two decimal places.

    Args:
        value: The value to pass through when no replacement happens.
        probability: Threshold the random draw is compared against.
        mean: Mean passed to ``random.gauss``.
        std_deviation: Standard deviation passed to ``random.gauss``.

    Returns:
        Either ``value`` unchanged or the rounded Gaussian sample.
    """
    should_replace = random.random() < probability
    if not should_replace:
        return value

    sample = random.gauss(mean, std_deviation)
    return round(sample, 2)

# Function to generate synthetic business data
def generate_business_data(num_rows=1000):
    """Build a list of synthetic business records.

    Each record is a dict keyed by the CSV column names (BusinessName,
    Category, Revenue, Expenses, Profit, ProfitMargin, City, State, ZipCode,
    Latitude, Longitude, IsOpen, Wifi, OutdoorSeating, Delivery).

    Revenue, Expenses and ProfitMargin each have a 5% chance of being
    replaced by a Gaussian sample via ``generate_outlier``.

    NOTE: Profit is computed from the base revenue and expenses *before*
    outlier injection, so in rows where Revenue or Expenses was replaced,
    Profit will not equal Revenue - Expenses.

    Args:
        num_rows: Number of records to generate. Defaults to 1000, the
            count that was previously hard-coded. Zero or a negative number
            yields an empty list.

    Returns:
        list[dict]: One dict per generated business.
    """
    categories = ('Retail', 'Hospitality', 'Technology', 'Healthcare', 'Finance')
    open_states = ('Yes', 'No')
    wifi_options = ('Free', 'Paid', 'None')
    availability = ('Available', 'Not available')

    data = []
    for _ in range(num_rows):
        revenue = random.randint(1000, 100000)
        expenses = random.randint(500, 50000)
        profit = revenue - expenses
        # revenue is always >= 1000 here, so the division cannot hit zero.
        profit_margin = (profit / revenue) * 100

        data.append({
            'BusinessName': fake.company(),
            'Category': fake.random_element(elements=categories),
            'Revenue': generate_outlier(revenue, 0.05, 50000, 20000),
            'Expenses': generate_outlier(expenses, 0.05, 20000, 10000),
            'Profit': profit,
            'ProfitMargin': generate_outlier(profit_margin, 0.05, 15, 5),
            'City': fake.city(),
            'State': fake.state_abbr(),
            'ZipCode': fake.zipcode(),
            # Coordinates are rounded to six decimal places.
            'Latitude': round(fake.latitude(), 6),
            'Longitude': round(fake.longitude(), 6),
            'IsOpen': fake.random_element(elements=open_states),
            'Wifi': fake.random_element(elements=wifi_options),
            'OutdoorSeating': fake.random_element(elements=availability),
            'Delivery': fake.random_element(elements=availability),
        })

    return data

# Generate CSV data
fieldnames = ['BusinessName', 'Category', 'Revenue', 'Expenses', 'Profit', 'ProfitMargin',
              'City', 'State', 'ZipCode', 'Latitude', 'Longitude', 'IsOpen', 'Wifi', 'OutdoorSeating', 'Delivery']

# Build every row before touching the output file, so that a failure during
# generation cannot leave a truncated or header-only CSV on disk.
data = generate_business_data()

# newline='' lets the csv module control line endings itself; the encoding is
# given explicitly so the output does not depend on the platform default.
with open('synthetic_business_data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print("Synthetic business dataset generated successfully!")


# Output: Synthetic business dataset generated successfully!
