# In [18]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker and set seed for reproducibility
# Two random sources are used in this file and both are seeded here:
# NumPy's global generator (np.random.*) and Faker.
# NOTE: date-valued fields are anchored to the current date ('today' /
# datetime.now()), so those values still shift from one run day to the next.
fake = Faker()
np.random.seed(42)
Faker.seed(42)

# Generate core data dimensions
def generate_products(num_products=50):
    """Build a synthetic product catalogue.

    Args:
        num_products: Number of product rows to generate.

    Returns:
        A DataFrame with one row per product and the columns ``product_id``
        (sequential, starting at 1000), ``product_name``, ``category``,
        ``brand``, ``cost_price`` and ``base_price``. ``base_price`` is
        ``cost_price`` multiplied by a random markup drawn from [1.2, 2.0).
        Numeric columns are rounded to 2 decimals. ``num_products=0`` yields
        an empty frame that still carries all six columns.
    """
    categories = ['Laptop', 'Smartphone', 'Tablet', 'Camera', 'Headphones']
    brands = ['Apple', 'Samsung', 'Sony', 'Dell', 'Bose', 'Canon']
    columns = ['product_id', 'product_name', 'category', 'brand', 'cost_price']

    products = []
    for i in range(num_products):
        # NumPy draws stay in the order category -> brand -> cost.
        category = np.random.choice(categories)
        brand = np.random.choice(brands)
        cost_price = round(np.random.uniform(200, 2000), 2)
        products.append({
            'product_id': i + 1000,
            # Reuse the drawn brand so the name never contradicts the
            # 'brand' column (previously the two were picked independently).
            'product_name': f"{brand} {fake.bothify(text='???-####')}",
            'category': category,
            'brand': brand,
            'cost_price': cost_price,
        })

    # Explicit columns keep the schema intact even when no rows were generated.
    df = pd.DataFrame(products, columns=columns)
    df['base_price'] = df['cost_price'] * np.random.uniform(1.2, 2.0, size=num_products)
    return df.round(2)

# Build the product catalogue once with the default size; the transaction,
# inventory and competitor-pricing generators below all take it as input.
products_df = generate_products()

def generate_users(num_users=1000):
    """Create a table of fake user profiles.

    Args:
        num_users: How many user rows to produce.

    Returns:
        A DataFrame with one row per user holding ``user_id`` (sequential,
        starting at 5000), ``name``, ``email``, ``location`` (a country name)
        and ``signup_date`` (a date within the last two years, up to today).
    """
    first_user_id = 5000

    def _make_user(offset):
        # Faker calls run in key order: name, email, country, signup date.
        return {
            'user_id': offset + first_user_id,
            'name': fake.name(),
            'email': fake.email(),
            'location': fake.country(),
            'signup_date': fake.date_between(start_date='-2y', end_date='today'),
        }

    return pd.DataFrame([_make_user(offset) for offset in range(num_users)])

# Default-sized user table (1000 rows); sampled from when generating events.
users_df = generate_users()

# Generate transactional data with temporal relationships
def generate_transactional_data(products, users, days=90, num_events=20000):
    """Generate a synthetic event log of browse, purchase and review events.

    Each event pairs one randomly sampled user with one randomly sampled
    product and is dated a whole number of days (0 .. days-1) before the
    moment the function is called, so all events share the same time of day.

    Args:
        products: DataFrame providing at least ``product_id`` and
            ``base_price`` columns.
        users: DataFrame providing at least a ``user_id`` column.
        days: Size of the look-back window, in days.
        num_events: Total number of events to generate.

    Returns:
        A DataFrame sorted by ``event_timestamp`` (a ``'%Y-%m-%d %H:%M:%S'``
        string). Only the fields relevant to an event's type are filled in;
        the rest are None: ``session_duration`` for 'browse', ``quantity``
        and ``price_paid`` for 'purchase', ``rating`` and ``review_text``
        for 'review'. With ``num_events=0`` the result is an empty frame
        that still has every column.
    """
    columns = [
        'user_id', 'product_id', 'event_type', 'event_timestamp',
        'session_duration', 'quantity', 'price_paid', 'rating', 'review_text',
    ]
    data = []
    current_date = datetime.now()

    for _ in range(num_events):
        user = users.sample(1).iloc[0]
        product = products.sample(1).iloc[0]
        # int() hands timedelta a plain Python int rather than a NumPy scalar.
        days_ago = int(np.random.randint(0, days))
        event_date = current_date - timedelta(days=days_ago)

        # Generate different event types with probabilities
        event_type = np.random.choice(
            ['browse', 'purchase', 'review'],
            p=[0.6, 0.3, 0.1]
        )

        record = {
            'user_id': user['user_id'],
            'product_id': product['product_id'],
            'event_type': event_type,
            'event_timestamp': event_date.strftime('%Y-%m-%d %H:%M:%S'),
            'session_duration': np.random.randint(10, 600) if event_type == 'browse' else None,
            # randint's upper bound is exclusive: quantity is 1 or 2.
            'quantity': np.random.randint(1, 3) if event_type == 'purchase' else None,
            'price_paid': round(product['base_price'] * np.random.uniform(0.8, 1.2), 2) if event_type == 'purchase' else None,
            # Ratings fall in 1..5.
            'rating': np.random.randint(1, 6) if event_type == 'review' else None,
            'review_text': fake.sentence(nb_words=10) if event_type == 'review' else None
        }

        data.append(record)

    # Explicit columns keep sort_values working when no events were generated.
    return pd.DataFrame(data, columns=columns).sort_values('event_timestamp')

# Event log over the default 90-day window, drawn from the tables built above.
transactions_df = generate_transactional_data(products_df, users_df)

# Generate inventory data
def generate_inventory(products):
    """Produce one synthetic inventory record per product.

    Args:
        products: DataFrame with a ``product_id`` column.

    Returns:
        A DataFrame with ``product_id``, ``current_stock`` (random integer in
        0..99), ``last_restock_date`` (a date within the last 30 days, up to
        today) and ``lead_time`` (3, 7 or 14, drawn with probabilities
        0.6 / 0.3 / 0.1).
    """
    lead_time_options = [3, 7, 14]
    lead_time_weights = [0.6, 0.3, 0.1]

    def _stock_row(row):
        # Random draws happen in key order: stock, restock date, lead time.
        return {
            'product_id': row['product_id'],
            'current_stock': np.random.randint(0, 100),
            'last_restock_date': fake.date_between(start_date='-30d', end_date='today'),
            'lead_time': np.random.choice(lead_time_options, p=lead_time_weights),
        }

    return pd.DataFrame([_stock_row(row) for _, row in products.iterrows()])

# One inventory row per product in the catalogue.
inventory_df = generate_inventory(products_df)

# Generate competitor pricing data
def generate_competitor_pricing(products, days=30):
    """Generate daily competitor prices for every product.

    For each product, each of the last ``days`` days (today included) and
    each of four fixed competitors, a price is drawn uniformly within
    +/-15% of the product's ``base_price`` and rounded to 2 decimals.

    Args:
        products: DataFrame with ``product_id`` and ``base_price`` columns.
        days: Number of consecutive days to cover, counting back from today.

    Returns:
        A DataFrame with the columns ``product_id``, ``competitor_name``,
        ``price`` and ``date_tracked`` (``'%Y-%m-%d'`` string), containing
        ``len(products) * days * 4`` rows ordered by product, then day
        (most recent first), then competitor. An empty ``products`` frame
        yields an empty result that still has these columns.
    """
    competitors = ['Amazon', 'BestBuy', 'Walmart', 'Newegg']
    columns = ['product_id', 'competitor_name', 'price', 'date_tracked']

    # Take a single "now" snapshot and format each day once, so every product
    # gets the same set of dates even if the run crosses midnight.
    now = datetime.now()
    tracked_dates = [
        (now - timedelta(days=day)).strftime('%Y-%m-%d') for day in range(days)
    ]

    data = []
    # Iterating the two columns directly (instead of iterrows) keeps
    # product_id's original dtype; iterrows would upcast it to float when
    # all columns of `products` are numeric.
    for product_id, base_price in zip(products['product_id'], products['base_price']):
        for date_tracked in tracked_dates:
            for competitor in competitors:
                data.append({
                    'product_id': product_id,
                    'competitor_name': competitor,
                    'price': round(base_price * np.random.uniform(0.85, 1.15), 2),
                    'date_tracked': date_tracked,
                })

    return pd.DataFrame(data, columns=columns)

# Competitor prices for every catalogue product over the default 30 days.
competitor_pricing_df = generate_competitor_pricing(products_df)

# Save datasets to CSV
# Each dict key becomes the output file name ('<key>.csv'). The paths are
# relative, so files are written wherever the script/notebook is run from,
# and existing files with the same names are overwritten.
datasets = {
    'products': products_df,
    'users': users_df,
    'transactions': transactions_df,
    'inventory': inventory_df,
    'competitor_pricing': competitor_pricing_df
}

for name, df in datasets.items():
    # index=False: omit the DataFrame index column from the CSV.
    df.to_csv(f'{name}.csv', index=False)

print("Synthetic datasets generated successfully!")

# Output: Synthetic datasets generated successfully!
