In [6]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from itertools import product
import json

# Reproducibility: pin the global state of both pseudo-random generators used
# in this file (Python's `random` module and NumPy's legacy global RNG) to the
# same fixed seed.
random.seed(42)
np.random.seed(42)

def generate_truly_balanced_wifi_dataset(n_samples=50000, reference_time=None):
    """
    Generate a synthetic WiFi router dataset stratified over
    service group x device group x time period.

    Every (service group, device group, time period) combination receives
    ``n_samples // n_combinations`` rows; the remainder is handed out one row
    at a time to the first combinations in iteration order.

    Args:
        n_samples: Total number of rows to generate.
        reference_time: ``datetime`` that timestamps are generated relative to
            (each timestamp falls on or before it). Defaults to
            ``datetime.now()`` read once per call; pass a fixed value to get
            reproducible timestamps.

    Returns:
        pandas.DataFrame with one row per sample (temporal, device, user,
        service, network and quality columns plus the targets
        ``optimal_bandwidth_allocation`` and ``MOS``).
    """

    # Device definitions - organized by groups
    devices = {
        'work_computer': {
            'Work Laptop': {
                'max_bandwidth_capability': (20, 150),
                'compatible_services': ['Zoom', 'Microsoft Teams', 'Google Meet', 'Web Browsing', 'Cloud Productivity', 'File Sharing', 'Cloud Backup'],
                'business_priority': True
            },
            'Business Desktop': {
                'max_bandwidth_capability': (50, 200),
                'compatible_services': ['Microsoft Teams', 'Web Browsing', 'Cloud Productivity', 'File Sharing', 'Software Updates', 'Cloud Backup'],
                'business_priority': True
            }
        },
        'personal_computer': {
            'Personal Laptop': {
                'max_bandwidth_capability': (15, 100),
                'compatible_services': ['Netflix', 'YouTube', 'Steam Gaming', 'Web Browsing', 'Social Media', 'Online Shopping'],
                'business_priority': False
            }
        },
        'entertainment_device': {
            'Smart TV': {
                'max_bandwidth_capability': (25, 100),
                'compatible_services': ['Netflix', 'Disney+', 'Prime Video', 'YouTube'],
                'business_priority': False
            },
            'Gaming Console': {
                'max_bandwidth_capability': (30, 120),
                'compatible_services': ['Console Gaming', 'Netflix', 'YouTube', 'Software Updates'],
                'business_priority': False
            },
            'Streaming Device': {
                'max_bandwidth_capability': (20, 80),
                'compatible_services': ['Netflix', 'Disney+', 'Prime Video', 'YouTube'],
                'business_priority': False
            }
        },
        'mobile_device': {
            'Work Phone': {
                'max_bandwidth_capability': (5, 40),
                'compatible_services': ['WhatsApp Video', 'Web Browsing', 'Social Media', 'Cloud Productivity'],
                'business_priority': True
            },
            'Smartphone': {
                'max_bandwidth_capability': (3, 50),
                'compatible_services': ['YouTube', 'Social Media', 'Mobile Gaming', 'WhatsApp Video', 'Web Browsing', 'Online Shopping'],
                'business_priority': False
            },
            'Tablet': {
                'max_bandwidth_capability': (8, 60),
                'compatible_services': ['Netflix', 'YouTube', 'Disney+', 'Social Media', 'Web Browsing'],
                'business_priority': False
            }
        },
        'smart_home': {
            'Security Camera': {
                'max_bandwidth_capability': (2, 12),
                'compatible_services': ['Security Camera'],
                'business_priority': True
            },
            'Smart Speaker': {
                'max_bandwidth_capability': (0.5, 3),
                'compatible_services': ['Voice Assistant', 'Web Browsing'],
                'business_priority': False
            },
            'IoT Device': {
                'max_bandwidth_capability': (0.1, 2),
                'compatible_services': ['Smart Devices'],
                'business_priority': False
            }
        }
    }

    # Service definitions - organized by groups
    services = {
        'video_streaming': {
            'Netflix': {'bitrate_range': (3, 25), 'latency_tolerance': 200, 'packet_loss_tolerance': 1.0, 'jitter_tolerance': 50, 'priority_score': 5, 'business_priority': False},
            'YouTube': {'bitrate_range': (1, 15), 'latency_tolerance': 300, 'packet_loss_tolerance': 1.5, 'jitter_tolerance': 60, 'priority_score': 4, 'business_priority': False},
            'Disney+': {'bitrate_range': (3, 25), 'latency_tolerance': 200, 'packet_loss_tolerance': 1.0, 'jitter_tolerance': 50, 'priority_score': 5, 'business_priority': False},
            'Prime Video': {'bitrate_range': (3, 22), 'latency_tolerance': 200, 'packet_loss_tolerance': 1.0, 'jitter_tolerance': 50, 'priority_score': 5, 'business_priority': False}
        },
        'video_calling': {
            'Zoom': {'bitrate_range': (1.5, 8), 'latency_tolerance': 80, 'packet_loss_tolerance': 0.1, 'jitter_tolerance': 20, 'priority_score': 10, 'business_priority': True},
            'Microsoft Teams': {'bitrate_range': (1.2, 6), 'latency_tolerance': 100, 'packet_loss_tolerance': 0.2, 'jitter_tolerance': 25, 'priority_score': 10, 'business_priority': True},
            'Google Meet': {'bitrate_range': (1, 5), 'latency_tolerance': 120, 'packet_loss_tolerance': 0.3, 'jitter_tolerance': 30, 'priority_score': 9, 'business_priority': True},
            'WhatsApp Video': {'bitrate_range': (0.5, 3), 'latency_tolerance': 120, 'packet_loss_tolerance': 0.3, 'jitter_tolerance': 30, 'priority_score': 6, 'business_priority': False}
        },
        'gaming': {
            'Steam Gaming': {'bitrate_range': (1, 5), 'latency_tolerance': 30, 'packet_loss_tolerance': 0.05, 'jitter_tolerance': 10, 'priority_score': 7, 'business_priority': False},
            'Console Gaming': {'bitrate_range': (1, 6), 'latency_tolerance': 40, 'packet_loss_tolerance': 0.1, 'jitter_tolerance': 15, 'priority_score': 7, 'business_priority': False},
            'Mobile Gaming': {'bitrate_range': (0.5, 3), 'latency_tolerance': 60, 'packet_loss_tolerance': 0.2, 'jitter_tolerance': 25, 'priority_score': 5, 'business_priority': False}
        },
        'web_browsing': {
            'Web Browsing': {'bitrate_range': (0.5, 3), 'latency_tolerance': 300, 'packet_loss_tolerance': 2.0, 'jitter_tolerance': 100, 'priority_score': 6, 'business_priority': True},
            'Social Media': {'bitrate_range': (1, 4), 'latency_tolerance': 250, 'packet_loss_tolerance': 1.5, 'jitter_tolerance': 80, 'priority_score': 3, 'business_priority': False},
            'Cloud Productivity': {'bitrate_range': (1, 8), 'latency_tolerance': 200, 'packet_loss_tolerance': 0.5, 'jitter_tolerance': 50, 'priority_score': 8, 'business_priority': True},
            'Online Shopping': {'bitrate_range': (1, 5), 'latency_tolerance': 400, 'packet_loss_tolerance': 2.0, 'jitter_tolerance': 120, 'priority_score': 2, 'business_priority': False}
        },
        'file_transfer': {
            'Cloud Backup': {'bitrate_range': (5, 30), 'latency_tolerance': 2000, 'packet_loss_tolerance': 5.0, 'jitter_tolerance': 500, 'priority_score': 7, 'business_priority': True},
            'File Sharing': {'bitrate_range': (10, 50), 'latency_tolerance': 1000, 'packet_loss_tolerance': 3.0, 'jitter_tolerance': 300, 'priority_score': 8, 'business_priority': True},
            'Software Updates': {'bitrate_range': (10, 80), 'latency_tolerance': 1500, 'packet_loss_tolerance': 4.0, 'jitter_tolerance': 400, 'priority_score': 4, 'business_priority': False}
        },
        'smart_home': {
            'Security Camera': {'bitrate_range': (2, 8), 'latency_tolerance': 500, 'packet_loss_tolerance': 2.0, 'jitter_tolerance': 200, 'priority_score': 8, 'business_priority': True},
            'Smart Devices': {'bitrate_range': (0.1, 2), 'latency_tolerance': 800, 'packet_loss_tolerance': 3.0, 'jitter_tolerance': 400, 'priority_score': 4, 'business_priority': False},
            'Voice Assistant': {'bitrate_range': (0.2, 1.5), 'latency_tolerance': 300, 'packet_loss_tolerance': 1.5, 'jitter_tolerance': 150, 'priority_score': 5, 'business_priority': False}
        }
    }

    # Define stratification dimensions
    service_groups = list(services.keys())
    device_groups = list(devices.keys())
    time_periods = ['work_hours', 'peak_hours', 'off_hours']

    # Split n_samples evenly over the combinations (6 x 5 x 3 = 90);
    # with the default n_samples=50000 that is 555 per combination + 50 left over.
    total_combinations = len(service_groups) * len(device_groups) * len(time_periods)
    samples_per_combination = n_samples // total_combinations
    remaining_samples = n_samples % total_combinations

    print(f" PERFECT BALANCE PLAN:")
    print(f"    {len(service_groups)} service groups × {len(device_groups)} device groups × {len(time_periods)} time periods = {total_combinations} combinations")
    print(f"    {samples_per_combination} samples per combination (+ {remaining_samples} distributed)")

    # Read the clock once so every timestamp is relative to the same instant.
    if reference_time is None:
        reference_time = datetime.now()

    # Hours that belong to neither the work (9-17) nor the peak (18-22) window.
    off_hours_choices = list(range(0, 9)) + list(range(23, 24))

    data = []

    # Generate samples for EVERY combination to ensure balance
    for combination_count, (service_group, device_group, time_period) in enumerate(
            product(service_groups, device_groups, time_periods)):

        # The first `remaining_samples` combinations absorb one extra row each.
        samples_for_this_combo = samples_per_combination
        if combination_count < remaining_samples:
            samples_for_this_combo += 1

        devices_in_group = devices[device_group]
        services_in_group = services[service_group]
        device_names = list(devices_in_group)
        service_names = list(services_in_group)

        for _ in range(samples_for_this_combo):

            # Select device from this device group
            device_name = random.choice(device_names)
            device_info = devices_in_group[device_name]

            # Select service from this service group
            service_name = random.choice(service_names)
            service_info = services_in_group[service_name]

            # When the drawn pair is incompatible, re-draw a compatible service
            # from the same group 80% of the time; the other 20% (and groups
            # with no compatible service at all) stay as edge cases.
            is_compatible = service_name in device_info['compatible_services']
            if not is_compatible and random.random() > 0.2:
                compatible_services = [s for s in service_names
                                       if s in device_info['compatible_services']]
                if compatible_services:
                    service_name = random.choice(compatible_services)
                    service_info = services_in_group[service_name]

            # Sample hour and weekday according to the time period
            if time_period == 'work_hours':
                hour_of_day = random.randint(9, 17)
                day_of_week = random.randint(0, 4)  # Weekdays (Monday=0)
            elif time_period == 'peak_hours':
                hour_of_day = random.randint(18, 22)
                day_of_week = random.randint(0, 6)  # Any day
            else:  # off_hours
                hour_of_day = random.choice(off_hours_choices)
                day_of_week = random.randint(0, 6)

            # Build a timestamp that actually agrees with the sampled hour and
            # weekday: start up to 30 days back, step back (0-6 days) to the
            # requested weekday, then set the wall-clock hour and minute.
            days_back = random.randint(0, 30)
            minute_of_hour = random.randint(0, 59)
            sample_day = reference_time - timedelta(days=days_back)
            sample_day -= timedelta(days=(sample_day.weekday() - day_of_week) % 7)
            timestamp = sample_day.replace(hour=hour_of_day, minute=minute_of_hour)
            if timestamp > reference_time:
                # Same weekday one week earlier keeps the sample in the past.
                timestamp -= timedelta(days=7)

            # Time-based features
            # NOTE(review): the business-hours window (10-19) differs from the
            # 'work_hours' sampling window (9-17) above - confirm this is intended.
            is_weekend = day_of_week >= 5
            is_business_hours = 10 <= hour_of_day <= 19 and not is_weekend
            is_peak_hours = 18 <= hour_of_day <= 22
            is_wfh_core_hours = 10 <= hour_of_day <= 17 and not is_weekend

            # Business user probability depends on time and device type
            if is_business_hours and device_info['business_priority']:
                is_business_user = np.random.choice([True, False], p=[0.8, 0.2])
            elif is_business_hours:
                is_business_user = np.random.choice([True, False], p=[0.6, 0.4])
            elif is_weekend:
                is_business_user = np.random.choice([True, False], p=[0.2, 0.8])
            else:
                is_business_user = np.random.choice([True, False], p=[0.4, 0.6])

            # Network infrastructure
            base_router_capacity = np.random.uniform(100, 500)
            num_connected_devices = np.random.poisson(8) + 2

            # Network load based on time
            if is_wfh_core_hours:
                network_load_factor = 0.6
            elif is_peak_hours:
                network_load_factor = 0.7
            else:
                network_load_factor = 0.4

            # Utilization is the load factor plus noise, clamped to [0.1, 0.9]
            current_network_utilization = min(0.9, max(0.1,
                network_load_factor + np.random.normal(0, 0.1)))

            available_download_speed = base_router_capacity * (1 - current_network_utilization)
            available_upload_speed = available_download_speed * np.random.uniform(0.1, 0.3)

            # WiFi frequency and signal strength (clamped to [0.2, 1.0])
            bandwidth_frequency = random.choice(['2.4G', '5G'])
            if bandwidth_frequency == '5G':
                signal_strength = np.random.beta(4, 2) * (1 - np.random.uniform(0.05, 0.2))
            else:
                signal_strength = np.random.beta(3, 2) * (1 - np.random.uniform(0.15, 0.4))
            signal_strength = max(0.2, min(1.0, signal_strength))

            # Network quality metrics: worse with higher load and weaker signal
            base_latency = 12 + np.random.gamma(2, 8)
            latency = base_latency * (1 + current_network_utilization * 0.8) * (2.2 - signal_strength)

            jitter = (np.random.exponential(6) + 1) * (1 + current_network_utilization * 0.4)

            packet_loss = (np.random.exponential(0.2) * (2.5 - signal_strength) *
                           (1 + current_network_utilization * 1.2))
            packet_loss = min(8.0, packet_loss)

            # Health assessment: weighted blend of signal, headroom and loss
            overall_health = (signal_strength * 0.4 +
                              (1 - min(current_network_utilization, 1.0)) * 0.35 +
                              (1 - min(packet_loss/4.0, 1.0)) * 0.25)
            ping_test_status = 'Good' if overall_health > 0.75 else 'Fair' if overall_health > 0.45 else 'Poor'

            # Device capabilities
            device_max_bandwidth = np.random.uniform(*device_info['max_bandwidth_capability'])

            # Bandwidth calculation
            min_bitrate, max_bitrate = service_info['bitrate_range']

            target_bitrate = np.random.uniform(min_bitrate, max_bitrate)
            target_bitrate *= signal_strength

            # Business priority (subtle preference)
            if (service_info['business_priority'] and is_business_user and
                    is_business_hours and device_info['business_priority']):
                target_bitrate *= 1.2

            # Cap by device and link capacity, then floor at 60% of the
            # service's minimum bitrate
            optimal_bitrate = min(target_bitrate, device_max_bandwidth * 0.8,
                                  available_download_speed * 0.7)
            optimal_bitrate = max(optimal_bitrate, min_bitrate * 0.6)

            # Historical and usage metrics
            historical_avg_bitrate = optimal_bitrate * np.random.uniform(0.85, 1.15)
            # bitrate x duration / 8 / 1024.
            # NOTE(review): originally labelled "MB"; if bitrates are Mbit/s and
            # the 60-240 factor is seconds, /8 already yields MB and the extra
            # /1024 yields GB - confirm the intended unit.
            data_usage = (optimal_bitrate * np.random.uniform(60, 240)) / 8 / 1024

            # Quality satisfaction: 1.0 within tolerance, falling linearly to 0
            # at twice the tolerance
            latency_satisfaction = max(0, 1 - max(0, latency - service_info['latency_tolerance']) / service_info['latency_tolerance'])
            jitter_satisfaction = max(0, 1 - max(0, jitter - service_info['jitter_tolerance']) / service_info['jitter_tolerance'])
            packet_loss_satisfaction = max(0, 1 - max(0, packet_loss - service_info['packet_loss_tolerance']) / service_info['packet_loss_tolerance'])

            if optimal_bitrate >= min_bitrate:
                bitrate_satisfaction = min(1.0, optimal_bitrate / max_bitrate)
            else:
                bitrate_satisfaction = optimal_bitrate / min_bitrate

            quality_satisfaction = (
                bitrate_satisfaction * 0.35 +
                latency_satisfaction * 0.25 +
                packet_loss_satisfaction * 0.25 +
                jitter_satisfaction * 0.15
            )

            # Throughput calculation (efficiency clamped to [0.3, 1.0])
            throughput_efficiency = (
                signal_strength * 0.4 +
                (1 - current_network_utilization) * 0.3 +
                (1 - min(packet_loss/5.0, 1.0)) * 0.2 +
                np.random.uniform(0.85, 1.0) * 0.1
            )
            throughput_efficiency = max(0.3, min(1.0, throughput_efficiency))
            actual_throughput = optimal_bitrate * throughput_efficiency

            # MOS: satisfaction mapped onto 1-5, plus noise, clamped to [1, 5]
            MOS = 1 + (quality_satisfaction * 4)
            MOS = max(1.0, min(5.0, MOS + np.random.normal(0, 0.12)))

            record = {
                # Temporal features
                'timestamp': timestamp,
                'hour_of_day': hour_of_day,
                'day_of_week': day_of_week,
                'is_weekend': is_weekend,
                'is_business_hours': is_business_hours,
                'is_peak_hours': is_peak_hours,
                'is_wfh_core_hours': is_wfh_core_hours,

                # Device context
                'device_name': device_name,
                'device_group': device_group,
                'device_max_bandwidth': device_max_bandwidth,
                'device_is_business': device_info['business_priority'],

                # User context
                'is_business_user': is_business_user,

                # Service context
                'service_name': service_name,
                'service_group': service_group,
                'service_priority': service_info['priority_score'],
                'is_business_service': service_info['business_priority'],

                # Network infrastructure
                'download_speed': available_download_speed,
                'upload_speed': available_upload_speed,
                'bandwidth_frequency': bandwidth_frequency,
                'num_connected_devices': num_connected_devices,
                'network_utilization': current_network_utilization,

                # Network quality
                'latency': latency,
                'jitter': jitter,
                'packet_loss': packet_loss,
                'signal_strength': signal_strength,
                'ping_test_status': ping_test_status,

                # Usage and performance
                'historical_avg_bitrate': historical_avg_bitrate,
                'data_usage': data_usage,
                'quality_satisfaction': quality_satisfaction,
                'throughput': actual_throughput,
                'throughput_efficiency': throughput_efficiency,

                # Targets
                'optimal_bandwidth_allocation': optimal_bitrate,
                'MOS': MOS
            }

            data.append(record)

    return pd.DataFrame(data)

def validate_perfect_balance(df, n_service_groups=6, n_device_groups=5):
    """
    Print a balance report for a generated dataset and return the verdict.

    Checks that service groups, device groups and time periods each hold an
    equal share of the rows, and that service-group x device-group
    combination sizes have a coefficient of variation below 5%.

    Time periods are derived from ``hour_of_day`` using the sampling windows
    of the generator (work: 9-17, peak: 18-22, off: everything else), so the
    three buckets are disjoint and cover every row.

    Args:
        df: DataFrame with at least the columns ``service_group``,
            ``device_group`` and ``hour_of_day``.
        n_service_groups: Number of service groups expected (sets the
            expected share per group).
        n_device_groups: Number of device groups expected.

    Returns:
        True when every check passes, False otherwise (including for an
        empty DataFrame).
    """
    print("\n PERFECT BALANCE VALIDATION")
    print("=" * 60)

    total = len(df)
    if total == 0:
        # Nothing to measure; avoid dividing by zero below.
        print(" No samples to validate")
        print(f"\n OVERALL BALANCE STATUS: {' NEEDS IMPROVEMENT'}")
        return False

    def _report_share(name, count, expected, tolerance):
        """Print one share line; return True if within `tolerance` points."""
        percentage = (count / total) * 100
        deviation = abs(percentage - expected)
        within_tolerance = deviation < tolerance
        status = "" if within_tolerance else ""
        print(f"{status} {name}: {count:,} ({percentage:.2f}%) [Expected: {expected:.2f}%, Deviation: {deviation:.2f}%]")
        return within_tolerance

    # Service group balance
    print(" SERVICE GROUP BALANCE:")
    service_balanced = True
    expected_service = 100 / n_service_groups
    for group, count in df['service_group'].value_counts().sort_index().items():
        if not _report_share(group, count, expected_service, 1.0):
            service_balanced = False

    # Device group balance
    print("\n DEVICE GROUP BALANCE:")
    device_balanced = True
    expected_device = 100 / n_device_groups
    for group, count in df['device_group'].value_counts().sort_index().items():
        if not _report_share(group, count, expected_device, 1.0):
            device_balanced = False

    # Time period balance, bucketed by the hour windows used when sampling.
    # (The is_business_hours / is_peak_hours flags overlap at 18-19 and skip
    # hour 9, so they cannot be used to count the three periods.)
    print("\n TIME PERIOD BALANCE:")
    hours = df['hour_of_day']
    work_hours = int(hours.between(9, 17).sum())
    peak_hours = int(hours.between(18, 22).sum())
    off_hours = total - work_hours - peak_hours

    time_balanced = True
    expected_time = 100 / 3
    for count, name in [(work_hours, "work_hours"), (peak_hours, "peak_hours"), (off_hours, "off_hours")]:
        if not _report_share(name, count, expected_time, 2.0):
            time_balanced = False

    # Combination balance check
    print("\n COMBINATION BALANCE CHECK:")
    combinations = df.groupby(['service_group', 'device_group']).size()
    # Sample std is undefined (NaN) for a single combination; treat as no spread.
    combo_std = combinations.std() if len(combinations) > 1 else 0.0
    combo_mean = combinations.mean()
    combo_cv = combo_std / combo_mean  # Coefficient of variation

    combo_balanced = bool(combo_cv < 0.05)  # Less than 5% variation
    status = "" if combo_balanced else ""
    print(f"{status} Service-Device combinations: {len(combinations)} combinations")
    print(f"   Average samples per combination: {combo_mean:.1f}")
    print(f"   Standard deviation: {combo_std:.1f}")
    print(f"   Coefficient of variation: {combo_cv:.4f} ({'Good' if combo_cv < 0.05 else 'Needs improvement'})")

    # Overall balance verdict
    overall_balanced = bool(service_balanced and device_balanced and time_balanced and combo_balanced)
    print(f"\n OVERALL BALANCE STATUS: {' PERFECTLY BALANCED' if overall_balanced else ' NEEDS IMPROVEMENT'}")

    return overall_balanced

if __name__ == "__main__":
    # Script entry point: build the dataset, report its balance, persist it
    # as CSV, and print a short summary.
    print(" Generating TRULY BALANCED WiFi Dataset...")
    print(" Perfect stratification across ALL dimensions")

    df = generate_truly_balanced_wifi_dataset(n_samples=50000)

    # Run the balance report and branch on its verdict
    is_perfectly_balanced = validate_perfect_balance(df)

    if not is_perfectly_balanced:
        print("\n Balance needs fine-tuning")
    else:
        print("\n SUCCESS: Dataset is PERFECTLY BALANCED!")
        print(" Ready for unbiased ML model training")

    # Write the dataset to disk without the index column
    df.to_csv('perfectly_balanced_wifi_dataset.csv', index=False)
    print("\n Dataset saved as 'perfectly_balanced_wifi_dataset.csv'")
    print(f" Shape: {df.shape}")
    print(f" Features: {len(df.columns)}")

    # Headline figures
    service_device_groups = df.groupby(['service_group', 'device_group'])
    mean_throughput = df['throughput'].mean()
    mean_mos = df['MOS'].mean()

    print("\n DATASET SUMMARY:")
    print(f"   Total samples: {len(df):,}")
    print(f"   Service groups: {df['service_group'].nunique()} (perfectly balanced)")
    print(f"   Device groups: {df['device_group'].nunique()} (perfectly balanced)")
    print(f"   Unique combinations: {len(service_device_groups)}")
    print(f"   Average throughput: {mean_throughput:.2f} Mbps")
    print(f"   Average MOS: {mean_mos:.2f}")

    print("\n This dataset provides PERFECT BALANCE for optimal ML training!")

 Generating TRULY BALANCED WiFi Dataset...
 Perfect stratification across ALL dimensions
 PERFECT BALANCE PLAN:
    6 service groups × 5 device groups × 3 time periods = 90 combinations
    555 samples per combination (+ 50 distributed)

 PERFECT BALANCE VALIDATION
 SERVICE GROUP BALANCE:
 file_transfer: 8,325 (16.65%) [Expected: 16.67%, Deviation: 0.02%]
 gaming: 8,340 (16.68%) [Expected: 16.67%, Deviation: 0.01%]
 smart_home: 8,325 (16.65%) [Expected: 16.67%, Deviation: 0.02%]
 video_calling: 8,340 (16.68%) [Expected: 16.67%, Deviation: 0.01%]
 video_streaming: 8,340 (16.68%) [Expected: 16.67%, Deviation: 0.01%]
 web_browsing: 8,330 (16.66%) [Expected: 16.67%, Deviation: 0.01%]

 DEVICE GROUP BALANCE:
 entertainment_device: 9,999 (20.00%) [Expected: 20.00%, Deviation: 0.00%]
 mobile_device: 9,999 (20.00%) [Expected: 20.00%, Deviation: 0.00%]
 personal_computer: 10,001 (20.00%) [Expected: 20.00%, Deviation: 0.00%]
 smart_home: 9,999 (20.00%) [Expected: 20.00%, Deviation: 0.00%]
 work_