In [None]:
import pandas as pd
import numpy as np

# --- Configuration ---
# Number of rows generated for each scenario, as laid out in the plan.
N_POINTS_NORMAL = 20000
N_POINTS_DEGRADATION = 5000
N_POINTS_INTERFERENCE = 5000
N_POINTS_INTERMITTENT = 5000
N_POINTS_SYNC = 5000
# Size of the combined dataset: the sum of every per-scenario count above.
TOTAL_POINTS = sum((
    N_POINTS_NORMAL,
    N_POINTS_DEGRADATION,
    N_POINTS_INTERFERENCE,
    N_POINTS_INTERMITTENT,
    N_POINTS_SYNC,
))

# Columns produced by every scenario generator: the six signal metrics
# followed by the class label (the label IS part of this list).
FEATURE_COLUMNS = [
    "SNR",
    "RSSI",
    "BER",
    "EVM",
    "Phase_Offset",
    "Frequency_Offset",
    "Label",
]
# Column order of the final dataset: the two metadata columns first, then
# everything the generators produce.
COLUMNS = ["Timestamp", "Scenario", *FEATURE_COLUMNS]


# --- Scenario Generation Functions ---

def generate_normal_data(n_points, *, rng=None):
    """Generate data for Scenario 1: Normal Operation.

    SNR is drawn from three discrete levels (18, 22, 25) with Gaussian
    jitter; RSSI follows SNR linearly plus noise; BER is exactly zero; EVM
    and the two offsets stay in narrow bands around their nominal values.

    Args:
        n_points: Number of rows to generate.
        rng: Optional random source exposing ``choice``, ``normal`` and
            ``uniform`` (for example ``np.random.RandomState(seed)`` or
            ``np.random.default_rng(seed)``). When omitted, the global
            ``np.random`` state is used, so existing callers and
            ``np.random.seed`` behave exactly as before.

    Returns:
        A DataFrame with one column per entry of FEATURE_COLUMNS and
        ``n_points`` rows; ``Label`` is 0 on every row.
    """
    if rng is None:
        rng = np.random

    print(f"Generating {n_points} data points for Normal Operation...")
    # NOTE: the order of the random draws below is kept fixed so that a
    # seeded run produces the same values as earlier versions.
    snr_levels = [18, 22, 25]
    snr = rng.choice(snr_levels, n_points) + rng.normal(0, 0.5, n_points)
    rssi = -40 + snr / 5 + rng.normal(0, 0.5, n_points)
    ber = np.zeros(n_points)
    evm = rng.uniform(0.5, 1.5, n_points)
    phase_offset = rng.normal(0, 0.05, n_points)
    frequency_offset = rng.normal(0, 0.01, n_points)
    label = np.zeros(n_points, dtype=int)

    # Build the frame column-wise straight from the arrays; the arrays are
    # listed in the same order as FEATURE_COLUMNS.
    series = (snr, rssi, ber, evm, phase_offset, frequency_offset, label)
    return pd.DataFrame(dict(zip(FEATURE_COLUMNS, series)))

def generate_gradual_degradation_data(n_points, *, rng=None):
    """Generate data for Scenario 2: Gradual Degradation.

    SNR ramps linearly from 20 down to 5 over the run (plus Gaussian
    noise). BER and EVM are derived from SNR, while the phase and frequency
    offsets ramp upward independently.

    Args:
        n_points: Number of rows to generate.
        rng: Optional random source exposing ``normal`` and ``uniform``
            (for example ``np.random.RandomState(seed)`` or
            ``np.random.default_rng(seed)``). When omitted, the global
            ``np.random`` state is used, so existing callers and
            ``np.random.seed`` behave exactly as before.

    Returns:
        A DataFrame with one column per entry of FEATURE_COLUMNS and
        ``n_points`` rows; ``Label`` is 1 on every row.
    """
    if rng is None:
        rng = np.random

    print(f"Generating {n_points} data points for Gradual Degradation...")
    # NOTE: the order of the random draws below is kept fixed so that a
    # seeded run produces the same values as earlier versions.
    snr = np.linspace(20, 5, n_points) + rng.normal(0, 0.5, n_points)
    rssi = -40 + snr / 5 + rng.normal(0, 1, n_points)
    # BER grows linearly once SNR falls below 10 and is scaled by a random
    # factor; the clamp already yields exactly 0 wherever SNR >= 10, so no
    # separate masking step is needed.
    ber = np.maximum(0, (10 - snr) * 0.01) * rng.uniform(0.5, 1.5, n_points)

    # EVM rises as SNR falls. The noise is added BEFORE clamping so the
    # final value can never be negative.
    base_evm = 20 - (snr * 0.9)
    evm = np.maximum(0, base_evm + rng.normal(0, 1, n_points))

    phase_offset = np.linspace(0.1, 0.8, n_points) + rng.normal(0, 0.1, n_points)
    frequency_offset = np.linspace(0.05, 0.3, n_points) + rng.normal(0, 0.05, n_points)
    label = np.ones(n_points, dtype=int)

    # Build the frame column-wise straight from the arrays; the arrays are
    # listed in the same order as FEATURE_COLUMNS.
    series = (snr, rssi, ber, evm, phase_offset, frequency_offset, label)
    return pd.DataFrame(dict(zip(FEATURE_COLUMNS, series)))

def generate_sudden_interference_data(n_points, *, rng=None):
    """Generate data for Scenario 3: Sudden Interference.

    The run starts from a steady baseline (SNR around 25, zero BER, low
    EVM). The rows between 45% and 55% of the run are then overwritten or
    perturbed to model an interference burst: SNR drops to [-5, 0), RSSI is
    pushed down, BER and EVM jump, and both offsets get extra noise.

    Args:
        n_points: Number of rows to generate.
        rng: Optional random source exposing ``normal`` and ``uniform``
            (for example ``np.random.RandomState(seed)`` or
            ``np.random.default_rng(seed)``). When omitted, the global
            ``np.random`` state is used, so existing callers and
            ``np.random.seed`` behave exactly as before.

    Returns:
        A DataFrame with one column per entry of FEATURE_COLUMNS and
        ``n_points`` rows; ``Label`` is 1 on every row.
    """
    if rng is None:
        rng = np.random

    print(f"Generating {n_points} data points for Sudden Interference...")
    # NOTE: the order of the random draws below is kept fixed so that a
    # seeded run produces the same values as earlier versions.
    # Baseline (undisturbed) signal for the whole run.
    snr = 25.0 + rng.normal(0, 0.5, n_points)
    rssi = -40 + snr / 5 + rng.normal(0, 0.5, n_points)
    ber = np.zeros(n_points)
    evm = rng.uniform(0.5, 1.5, n_points)
    phase_offset = rng.normal(0, 0.05, n_points)
    frequency_offset = rng.normal(0, 0.01, n_points)

    # Interference burst: the middle 10% of the run. For very small
    # n_points the window can be empty, in which case nothing is disturbed.
    drop_start = int(n_points * 0.45)
    drop_end = int(n_points * 0.55)
    burst = slice(drop_start, drop_end)
    burst_len = drop_end - drop_start

    snr[burst] = rng.uniform(-5, 0, burst_len)
    rssi[burst] += rng.uniform(-10, -5, burst_len)
    ber[burst] = rng.uniform(0.1, 0.4, burst_len)
    evm[burst] = rng.uniform(20, 40, burst_len)
    phase_offset[burst] += rng.normal(0, 0.5, burst_len)
    frequency_offset[burst] += rng.normal(0, 0.2, burst_len)

    # NOTE(review): every row is labelled 1, including the ~90% outside the
    # burst whose values come from the same undisturbed baseline. Confirm
    # that a scenario-level label (rather than a per-row one) is intended.
    label = np.ones(n_points, dtype=int)

    # Build the frame column-wise straight from the arrays; the arrays are
    # listed in the same order as FEATURE_COLUMNS.
    series = (snr, rssi, ber, evm, phase_offset, frequency_offset, label)
    return pd.DataFrame(dict(zip(FEATURE_COLUMNS, series)))

def generate_intermittent_faults_data(n_points, *, rng=None):
    """Generate data for Scenario 4: Intermittent Faults.

    SNR oscillates sinusoidally around 12 (amplitude 4, five full periods
    over the run) with Gaussian noise. BER and EVM are derived from SNR;
    the phase and frequency offsets follow slower sinusoids of the same
    time axis.

    Args:
        n_points: Number of rows to generate.
        rng: Optional random source exposing ``normal`` and ``uniform``
            (for example ``np.random.RandomState(seed)`` or
            ``np.random.default_rng(seed)``). When omitted, the global
            ``np.random`` state is used, so existing callers and
            ``np.random.seed`` behave exactly as before.

    Returns:
        A DataFrame with one column per entry of FEATURE_COLUMNS and
        ``n_points`` rows; ``Label`` is 1 on every row.
    """
    if rng is None:
        rng = np.random

    print(f"Generating {n_points} data points for Intermittent Faults...")
    # NOTE: the order of the random draws below is kept fixed so that a
    # seeded run produces the same values as earlier versions.
    base_snr = 12
    t = np.linspace(0, 10 * np.pi, n_points)
    snr = base_snr + 4 * np.sin(t) + rng.normal(0, 1, n_points)
    rssi = -45 + snr / 6 + rng.normal(0, 1.5, n_points)
    # BER is non-zero only while SNR is below 12, scaled by a random factor.
    ber = np.maximum(0, (12 - snr) * 0.005) * rng.uniform(0.5, 1.5, n_points)

    # EVM rises as SNR falls. The noise is added BEFORE clamping so the
    # final value can never be negative.
    base_evm = 18 - (snr * 0.8)
    evm = np.maximum(0, base_evm + rng.normal(0, 2, n_points))

    phase_offset = 0.2 + np.sin(t / 2) * 0.5 + rng.normal(0, 0.2, n_points)
    frequency_offset = 0.1 + np.sin(t / 3) * 0.2 + rng.normal(0, 0.1, n_points)
    label = np.ones(n_points, dtype=int)

    # Build the frame column-wise straight from the arrays; the arrays are
    # listed in the same order as FEATURE_COLUMNS.
    series = (snr, rssi, ber, evm, phase_offset, frequency_offset, label)
    return pd.DataFrame(dict(zip(FEATURE_COLUMNS, series)))

def generate_sync_issues_data(n_points, *, rng=None):
    """Generate data for Scenario 5: Synchronization Issues.

    SNR and RSSI stay around fixed values (15 and -42) while the phase and
    frequency offsets ramp upward from already-large values; BER and EVM
    are drawn uniformly from elevated ranges.

    Args:
        n_points: Number of rows to generate.
        rng: Optional random source exposing ``normal`` and ``uniform``
            (for example ``np.random.RandomState(seed)`` or
            ``np.random.default_rng(seed)``). When omitted, the global
            ``np.random`` state is used, so existing callers and
            ``np.random.seed`` behave exactly as before.

    Returns:
        A DataFrame with one column per entry of FEATURE_COLUMNS and
        ``n_points`` rows; ``Label`` is 1 on every row.
    """
    if rng is None:
        rng = np.random

    print(f"Generating {n_points} data points for Synchronization Issues...")
    # NOTE: the order of the random draws below is kept fixed so that a
    # seeded run produces the same values as earlier versions.
    snr = 15.0 + rng.normal(0, 1, n_points)
    rssi = -42 + rng.normal(0, 1, n_points)
    phase_offset = np.linspace(1.5, 2.5, n_points) + rng.normal(0, 0.1, n_points)
    frequency_offset = np.linspace(0.8, 1.2, n_points) + rng.normal(0, 0.05, n_points)
    ber = rng.uniform(0.01, 0.05, n_points)
    evm = rng.uniform(8, 15, n_points)
    label = np.ones(n_points, dtype=int)

    # Build the frame column-wise straight from the arrays; the arrays are
    # listed in the same order as FEATURE_COLUMNS.
    series = (snr, rssi, ber, evm, phase_offset, frequency_offset, label)
    return pd.DataFrame(dict(zip(FEATURE_COLUMNS, series)))

# --- Main Script Execution ---
if __name__ == "__main__":
    # Build the five scenarios in a fixed order (the order matters for the
    # sequence of random draws) and tag each frame with its scenario name
    # before anything is concatenated.
    df_normal = generate_normal_data(N_POINTS_NORMAL).assign(Scenario='Normal')
    df_degradation = generate_gradual_degradation_data(N_POINTS_DEGRADATION).assign(
        Scenario='Gradual_Degradation'
    )
    df_interference = generate_sudden_interference_data(N_POINTS_INTERFERENCE).assign(
        Scenario='Sudden_Interference'
    )
    df_intermittent = generate_intermittent_faults_data(N_POINTS_INTERMITTENT).assign(
        Scenario='Intermittent_Faults'
    )
    df_sync = generate_sync_issues_data(N_POINTS_SYNC).assign(Scenario='Sync_Issues')

    # Stack the scenarios on top of each other with a fresh 0..N-1 index.
    print("\nCombining all scenarios into a single dataset...")
    scenario_frames = [df_normal, df_degradation, df_interference, df_intermittent, df_sync]
    final_df = pd.concat(scenario_frames, ignore_index=True)

    # One timestamp per row, in seconds, assuming a 10 ms sampling interval.
    # Assigned before shuffling, so it records the generation order.
    sample_period_s = 0.01
    final_df['Timestamp'] = sample_period_s * np.arange(TOTAL_POINTS)

    # Put the columns into the final layout.
    final_df = final_df[COLUMNS]

    # Randomize the row order so scenarios are interleaved, then renumber.
    print("Shuffling the dataset...")
    shuffled = final_df.sample(frac=1)
    final_df = shuffled.reset_index(drop=True)

    # Write the result to disk without the index column.
    output_filename = "eVTOL_telemetry_dataset_v3.csv"
    print(f"Saving the final, corrected dataset to {output_filename}...")
    final_df.to_csv(output_filename, index=False)

    # Short summary of what was produced.
    print("\n--- Dataset Generation Complete ---")
    print(f"Total data points: {len(final_df)}")
    print("Class distribution:")
    print(final_df['Label'].value_counts())
    print("\nFirst 5 rows of the final dataset:")
    print(final_df.head())