# In [4]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker
# One shared generator instance; the customer ID and customer name columns
# further down are drawn from it.
fake = Faker()

# Generate realistic data
# Number of rows in the synthetic dataset
num_records = 100

# Generate dates
# One entry per consecutive day starting 2023-01-01, rendered as text. About
# 1 in 10 entries (chosen at random) is written as MM-DD-YYYY to simulate
# inconsistent date formats; the rest use the timestamp's default string form.
dates = [
    day.strftime('%m-%d-%Y') if np.random.rand() <= 0.1 else str(day)
    for day in pd.date_range(start="2023-01-01", periods=num_records)
]

# Generate product names and categories
# `products` and `categories` are parallel lists: categories[i] belongs to products[i].
products = ['Laptop', 'Smartphone', 'Tablet', 'Monitor']
categories = ['Electronics', 'Electronics', 'Electronics', 'Electronics']
# Draw one product per row, uniformly at random with replacement.
product_choices = np.random.choice(products, size=num_records)
# Resolve each drawn product to its category through a name -> category table
# (product names are unique, so this matches a positional lookup).
_category_by_product = dict(zip(products, categories))
category_choices = [_category_by_product[name] for name in product_choices]

# Generate regions with some inconsistencies
regions = ['North', 'South', 'East', 'West']
# First pass: pick a region for every row.
region_choices = [np.random.choice(regions) for _row in range(num_records)]
# Second pass: lower-case about 1 in 10 of the picks (chosen at random) to
# simulate inconsistent capitalisation.
region_choices = [
    name.lower() if np.random.rand() <= 0.1 else name
    for name in region_choices
]

# Generate sales amounts with some outliers
# Normally distributed around 200 with standard deviation 50, rounded to 2 decimals.
sales = np.round(np.random.normal(loc=200, scale=50, size=num_records), 2)
# Every 10th row (0, 10, 20, ...) is scaled by 30 to act as an outlier.
sales[::10] *= 30

# Generate discounts with some negative values (to represent errors)
# Percentages drawn uniformly from [0, 20), rounded to 2 decimals.
discounts = np.round(np.random.uniform(low=0, high=20, size=num_records), 2)
# Every 15th row (0, 15, 30, ...) has its sign flipped to simulate a data error.
discounts[::15] *= -1

# Generate profit with some incorrect calculations
# Baseline: the sales amount after applying the discount percentage, rounded to 2 decimals.
profit = np.round(sales * (1 - discounts / 100), 2)
# Every 20th row (0, 20, 40, ...) is inflated by one random factor drawn from
# [1.5, 2.0); the same single draw is applied to all affected rows.
profit[::20] *= np.random.uniform(1.5, 2.0)

# Generate quantity sold with some outliers
# Random integers from 1 to 9 (the upper bound 10 is exclusive).
quantity = np.random.randint(low=1, high=10, size=num_records)
# Every 8th row (0, 8, 16, ...) is tripled to act as an outlier.
quantity[::8] *= 3

# Generate customer IDs with some duplicates and missing values
# Each ID is the first 8 characters of a value produced by fake.uuid4().
customer_ids = [fake.uuid4()[:8] for _ in range(num_records)]
# Blank out every 7th ID (rows 0, 7, 14, ...) to simulate missing values.
customer_ids[::7] = [None] * len(customer_ids[::7])
# Introduce duplicates: every 12th row (0, 12, 24, ...) takes the ID of the row
# right after it. The target slice stops before the last element so that both
# slices always have the same length; assigning to `customer_ids[::12]` raised
# ValueError whenever num_records % 12 == 1 (e.g. 13 or 97), because the final
# target row had no successor to copy from. For num_records = 100 the result is
# the same as before. A copied successor may itself be a blanked-out (None)
# entry, and rows that are multiples of both 7 and 12 lose their None here.
customer_ids[:-1:12] = customer_ids[1::12]

# Generate customer names
# One fake full name per row.
customer_names = [fake.name() for _ in range(num_records)]

# Create the DataFrame
# Column name -> generated values; insertion order determines the column order.
data = {}
data['Date'] = dates
data['Product'] = product_choices
data['Category'] = category_choices
data['Region'] = region_choices
data['Sales'] = sales
data['Discount'] = discounts
data['Profit'] = profit
data['Quantity'] = quantity
data['Customer ID'] = customer_ids
data['Customer Name'] = customer_names

df_realistic = pd.DataFrame(data=data)

# Save to CSV
# The path is relative, so the file lands in the current working directory;
# the DataFrame index is not written.
file_path_realistic = "Sales_Data_Practical.csv"
df_realistic.to_csv(path_or_buf=file_path_realistic, index=False)
