In [2]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

# Seed both Python's `random` module and NumPy's global RNG with a fixed
# value so that repeated runs of this script generate the same data.
# NOTE(review): the functions visible in this cell draw only from `random`;
# the NumPy seed matters only if other code uses `np.random`.
random.seed(42)
np.random.seed(42)

def generate_mac_address():
    """Return a random MAC address string.

    Six octets are drawn one after another with ``random.randint(0, 255)``
    and rendered as two-digit uppercase hexadecimal, joined by colons
    (``XX:XX:XX:XX:XX:XX``). The result depends on the state of the
    module-level ``random`` generator.
    """
    octets = []
    for _ in range(6):
        octets.append('%02X' % random.randint(0, 255))
    return ':'.join(octets)

# Services available within each service group. Insertion order matters: it
# determines the numeric group IDs (1001, 1002, ...) and service IDs.
_SERVICE_GROUPS = {
    "Social Media": ["Instagram", "Facebook", "LinkedIn", "Twitter"],
    "Gaming": ["Fortnite", "League of Legends", "Valorant"],
    "Streaming": ["YouTube", "Netflix", "Disney+"],
    "Shopping": ["Amazon", "Etsy", "eBay", "Walmart"],
    "Software": ["Gmail", "Slack", "WebEx", "GMeet"],
}

# Service-group sampling weights for each time slot. Defined once here rather
# than being rebuilt for every generated row.
_WORK_HOURS_WEIGHTS = {
    "Social Media": 0.15,
    "Gaming": 0.05,
    "Streaming": 0.10,
    "Shopping": 0.15,
    "Software": 0.55,
}
_EVENING_WEIGHTS = {
    "Social Media": 0.15,
    "Gaming": 0.30,
    "Streaming": 0.35,
    "Shopping": 0.15,
    "Software": 0.05,
}
_WEEKEND_WEIGHTS = {
    "Social Media": 0.20,
    "Gaming": 0.25,
    "Streaming": 0.30,
    "Shopping": 0.20,
    "Software": 0.05,
}
_DEFAULT_WEIGHTS = {
    "Social Media": 0.25,
    "Gaming": 0.10,
    "Streaming": 0.20,
    "Shopping": 0.15,
    "Software": 0.30,
}

# Per-group (low, high) bounds for the uniformly sampled network metrics, in
# the order: bandwidth, signal strength, packet loss, latency, jitter.
_METRIC_RANGES = {
    "Streaming": ((15, 100), (-70, -30), (0, 0.2), (10, 50), (1, 5)),
    "Gaming": ((5, 50), (-65, -25), (0, 0.1), (5, 30), (0.5, 3)),
    "Social Media": ((2, 30), (-75, -40), (0, 0.3), (20, 80), (2, 8)),
    "Shopping": ((1, 20), (-80, -45), (0, 0.4), (30, 100), (3, 10)),
    "Software": ((3, 40), (-70, -35), (0, 0.2), (15, 70), (1, 7)),
}

# Decimal places kept for each metric, in the same order as _METRIC_RANGES.
_METRIC_DECIMALS = (2, 2, 4, 2, 2)


def _service_group_weights(hour, is_weekend):
    """Return the service-group sampling weights for an hour / weekend flag."""
    if 9 <= hour <= 17 and not is_weekend:
        return _WORK_HOURS_WEIGHTS
    if 18 <= hour <= 23:
        # Evening takes precedence over the weekend profile.
        return _EVENING_WEIGHTS
    if is_weekend:
        return _WEEKEND_WEIGHTS
    # Remaining case: weekday hours before 09:00.
    return _DEFAULT_WEIGHTS


def generate_network_dataset(output_path='network_dataset.csv', start_date=datetime(2023, 1, 1),
                            end_date=datetime(2025, 3, 20), interval_minutes=30):
    """
    Generate a synthetic network dataset and save it as CSV.

    One record is produced per timestamp between ``start_date`` and
    ``end_date`` (both inclusive) at a spacing of ``interval_minutes``. Each
    record gets a random MAC address, a service group sampled with
    time-of-day / weekend dependent weights, a service from that group,
    uniformly sampled network metrics whose ranges depend on the group,
    temporal features, usage metrics and a device group.

    All randomness comes from the module-level ``random`` generator. The
    order of draws per record (MAC, group, service, five metrics, usage) is
    fixed, so seeding ``random`` beforehand reproduces the same dataset.

    Side effects: writes the CSV to ``output_path`` and prints progress and
    a per-group summary to stdout.

    Parameters:
    -----------
    output_path : str
        Path to save the CSV file
    start_date : datetime
        Start date for dataset
    end_date : datetime
        End date for dataset
    interval_minutes : int
        Time interval between data points in minutes. ``usage_minutes`` is
        scaled to this interval (``usage_percentage`` percent of it).

    Returns:
    --------
    pandas.DataFrame
        The generated dataset with columns ``timestamp``, ``service_group``,
        ``group_id``, ``service_name``, ``service_id``, ``bandwidth_speed``,
        ``signal_strength``, ``packet_loss``, ``latency``, ``jitter``,
        ``hour``, ``day_of_week``, ``month``, ``usage_percentage``,
        ``usage_minutes``, ``device_group`` and ``mac_address``.
    """
    # A timedelta frequency avoids the 'T' minute alias, which newer pandas
    # releases deprecate in favour of 'min'.
    date_range = pd.date_range(start=start_date, end=end_date,
                               freq=timedelta(minutes=interval_minutes))

    # Group IDs start at 1001; service IDs are <group index> * 100 + position,
    # e.g. the first group's services are 101, 102, ...
    group_ids = {group: 1001 + i for i, group in enumerate(_SERVICE_GROUPS)}
    service_ids = {
        service: (group_ids[group] - 1000) * 100 + position
        for group, services in _SERVICE_GROUPS.items()
        for position, service in enumerate(services, start=1)
    }

    print(f"Generating {len(date_range)} timestamps from {start_date} to {end_date}")
    print(f"Service groups: {list(_SERVICE_GROUPS.keys())}")

    # Fraction of the interval represented by one usage percentage point.
    # For the default 30-minute interval this is exactly 0.3.
    minutes_per_percent = interval_minutes / 100

    data = []
    for timestamp in date_range:
        # Generate a MAC address for this record (drawn first to keep the
        # random stream order stable).
        mac_address = generate_mac_address()

        # Temporal features
        hour = timestamp.hour
        day_of_week = timestamp.weekday()
        month = timestamp.month

        # Pick the service group using time-dependent weights, then a
        # specific service within that group.
        weights = _service_group_weights(hour, day_of_week >= 5)
        service_group = random.choices(
            list(weights.keys()),
            weights=list(weights.values())
        )[0]
        service_name = random.choice(_SERVICE_GROUPS[service_group])

        # Network metrics, sampled uniformly from the group's ranges.
        bandwidth, signal_strength, packet_loss, latency, jitter = [
            round(random.uniform(low, high), digits)
            for (low, high), digits in zip(_METRIC_RANGES[service_group], _METRIC_DECIMALS)
        ]

        # Usage metrics: minutes used are the given percentage of the interval.
        usage_percentage = round(random.uniform(10, 90), 2)
        usage_minutes = round(usage_percentage * minutes_per_percent, 2)

        # Device group: Software traffic on weekdays between 08:00 and 18:59
        # is attributed to work devices, everything else to personal devices.
        if service_group == "Software" and 8 <= hour <= 18 and day_of_week < 5:
            device_group = "work_device"
        else:
            device_group = "personal_device"

        data.append([
            timestamp,
            service_group,
            group_ids[service_group],
            service_name,
            service_ids[service_name],
            bandwidth,
            signal_strength,
            packet_loss,
            latency,
            jitter,
            hour,
            day_of_week,
            month,
            usage_percentage,
            usage_minutes,
            device_group,
            mac_address
        ])

    # Create DataFrame
    columns = [
        'timestamp', 'service_group', 'group_id', 'service_name', 'service_id',
        'bandwidth_speed', 'signal_strength', 'packet_loss', 'latency', 'jitter',
        'hour', 'day_of_week', 'month',
        'usage_percentage', 'usage_minutes', 'device_group', 'mac_address'
    ]

    df = pd.DataFrame(data, columns=columns)

    # Save to CSV
    df.to_csv(output_path, index=False)
    print(f"Dataset saved to {output_path}")
    print(f"Generated {len(df)} records with {len(df['service_group'].unique())} service groups")

    # Summary statistics, listed in order of first appearance in the data.
    print("\nSummary by Service Group:")
    for group in df['service_group'].unique():
        group_data = df[df['service_group'] == group]
        print(f"{group}: {len(group_data)} records, " +
              f"Avg Bandwidth: {group_data['bandwidth_speed'].mean():.2f} Mbps")

    return df

def generate_training_testing_datasets(output_dir='data', 
                                      train_file='train_network_data.csv',
                                      test_file='test_network_data.csv',
                                      train_start=datetime(2023, 1, 1),
                                      train_end=datetime(2024, 12, 31),
                                      test_start=datetime(2025, 1, 1),
                                      test_end=datetime(2025, 3, 20)):
    """
    Build a training dataset and a testing dataset over separate date ranges.

    The output directory is created when missing. Each dataset is produced
    by ``generate_network_dataset`` (training first, then testing) using
    that function's default sampling interval, and is written as a CSV file
    inside ``output_dir``.

    Parameters:
    -----------
    output_dir : str
        Directory that receives both CSV files
    train_file : str
        File name of the training CSV
    test_file : str
        File name of the testing CSV
    train_start : datetime
        First timestamp of the training range
    train_end : datetime
        Last timestamp of the training range
    test_start : datetime
        First timestamp of the testing range
    test_end : datetime
        Last timestamp of the testing range

    Returns:
    --------
    tuple
        (train_df, test_df) - The training and testing DataFrames
    """
    # Make sure the target directory exists before anything is written.
    os.makedirs(output_dir, exist_ok=True)

    train_path = os.path.join(output_dir, train_file)
    test_path = os.path.join(output_dir, test_file)

    # Training split
    print(f"Generating training data from {train_start} to {train_end}")
    train_df = generate_network_dataset(
        output_path=train_path, start_date=train_start, end_date=train_end
    )

    # Testing split
    print(f"Generating testing data from {test_start} to {test_end}")
    test_df = generate_network_dataset(
        output_path=test_path, start_date=test_start, end_date=test_end
    )

    return (train_df, test_df)

if __name__ == "__main__":
    # Settings for the single combined dataset written when run as a script.
    run_settings = {
        'output_path': 'network_dataset.csv',
        'start_date': datetime(2023, 1, 1),
        'end_date': datetime(2025, 3, 20),
        'interval_minutes': 30,
    }
    generate_network_dataset(**run_settings)

    # Optional: to produce separate training and testing CSV files, call
    # generate_training_testing_datasets() here as well.

Generating 38833 timestamps from 2023-01-01 00:00:00 to 2025-03-20 00:00:00
Service groups: ['Social Media', 'Gaming', 'Streaming', 'Shopping', 'Software']
Dataset saved to network_dataset.csv
Generated 38833 records with 5 service groups

Summary by Service Group:
Streaming: 9133 records, Avg Bandwidth: 57.61 Mbps
Shopping: 6148 records, Avg Bandwidth: 10.41 Mbps
Software: 9609 records, Avg Bandwidth: 21.52 Mbps
Gaming: 6574 records, Avg Bandwidth: 27.15 Mbps
Social Media: 7369 records, Avg Bandwidth: 16.00 Mbps
